In [1]:
# Cellule 1 — Imports de base et configuration

import numpy as np
import torch
import gymnasium as gym
from gymnasium import spaces

from sb3_contrib import MaskablePPO
from sb3_contrib.common.wrappers import ActionMasker

from stable_baselines3.common.callbacks import EvalCallback

from env.workshop_env import WorkshopEnv  # ton environnement atelier

SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)


<torch._C.Generator at 0x1a5de7e8bd0>

In [2]:
# Cellule 2 — Environnement d'entraînement avec masque d'actions

def mask_fn(env: WorkshopEnv):
    """
    Fonction utilisée par ActionMasker pour récupérer le masque des actions.
    """
    return env.get_action_mask()

# Environnement enveloppé pour PPO
env_train = ActionMasker(WorkshopEnv(), mask_fn)

print("Espace d'observation :", env_train.observation_space)
print("Espace d'actions     :", env_train.action_space)


Espace d'observation : Box(0.0, [1.008e+04 1.000e+00 1.000e+02 1.000e+00 1.000e+02 5.000e+01 5.000e+01
 5.000e+01 5.000e+01 1.008e+04 1.000e+03 1.000e+03 1.000e+03 4.000e+00
 5.000e+01 2.000e+02 1.439e+03 2.000e+03 2.000e+04 1.000e+00 1.000e+06
 1.000e+05 1.500e+01], (23,), float32)
Espace d'actions     : Discrete(201)


In [3]:
# ======================================================
# Cellule 3 — Chargement modèle DAgger + réglages PPO SAFE
# ======================================================

from stable_baselines3.common.utils import get_schedule_fn

MODEL_PATH = "student_dagger_final.zip"

# Chargement du modèle issu de DAgger
model_ppo = MaskablePPO.load(
    MODEL_PATH,
    env=env_train,
    device="cpu"
)

print("Modèle initial chargé depuis", MODEL_PATH)

# ======================================================
# Hyperparamètres SAFE pour un finetuning PPO NON destructif
# (on NE TOUCHE PAS à n_steps ni batch_size, pour ne pas casser le buffer)
# ======================================================

# 1) Learning rate très faible : micro-ajustements uniquement
NEW_LR = 1e-5
for param_group in model_ppo.policy.optimizer.param_groups:
    param_group["lr"] = NEW_LR
model_ppo.learning_rate = NEW_LR  # pour cohérence interne

# 2) clip_range doit être une FONCTION (schedule), pas un float
#    Ici on impose un clip constant à 0.05
model_ppo.clip_range = get_schedule_fn(0.05)

# 3) Réduire un peu l’exploration intrinsèque
model_ppo.ent_coef = 0.001

# On laisse n_steps et batch_size tels qu'ils ont été sauvés dans le modèle
print("\n=== Hyperparamètres PPO SAFE configurés ===")
print("learning_rate      =", NEW_LR)
print("clip_range         = constant schedule à 0.05")
print("ent_coef           =", model_ppo.ent_coef)
print("n_steps (conservé) =", model_ppo.n_steps)
print("batch_size (conservé) =", model_ppo.batch_size)
print("================================================\n")


Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Modèle initial chargé depuis student_dagger_final.zip

=== Hyperparamètres PPO SAFE configurés ===
learning_rate      = 1e-05
clip_range         = constant schedule à 0.05
ent_coef           = 0.001
n_steps (conservé) = 4096
batch_size (conservé) = 512





In [4]:
# Cellule 4 — Fonction d'évaluation sur N semaines complètes

def evaluate_model_weekly(model, n_episodes: int = 5, max_steps: int = 10080):
    """
    Évalue le modèle sur n_episodes semaines complètes.
    Retourne la liste des rewards.
    """
    rewards = []

    for ep in range(n_episodes):
        env_eval = WorkshopEnv()
        obs, info = env_eval.reset()
        total_reward = 0.0

        for t in range(max_steps):
            mask = env_eval.get_action_mask()
            action, _ = model.predict(
                obs,
                deterministic=True,
                action_masks=mask
            )
            obs, r, terminated, truncated, info = env_eval.step(action)
            total_reward += r

            if terminated or truncated:
                break

        rewards.append(total_reward)
        print(f"Episode {ep+1}/{n_episodes} — reward = {total_reward:.2f}")

    rewards = np.array(rewards, dtype=np.float32)
    print(f"\nReward moyen sur {n_episodes} semaines : {rewards.mean():.2f} ± {rewards.std():.2f}")
    return rewards


In [5]:
# Cellule 5 — Baseline : évaluation AVANT finetuning PPO

print("=== Évaluation AVANT PPO (policy issue de DAgger) ===")
rewards_before = evaluate_model_weekly(model_ppo, n_episodes=5)


=== Évaluation AVANT PPO (policy issue de DAgger) ===
Episode 1/5 — reward = 11012.98
Episode 2/5 — reward = 10339.32
Episode 3/5 — reward = 11129.16
Episode 4/5 — reward = 10696.86
Episode 5/5 — reward = 9855.36

Reward moyen sur 5 semaines : 10606.74 ± 465.22


In [6]:
# Cellule 6 — Environnement d'évaluation + callback

eval_env = ActionMasker(WorkshopEnv(), mask_fn)

eval_callback = EvalCallback(
    eval_env,
    n_eval_episodes=3,              # 3 semaines complètes pour une éval
    eval_freq=20_000,               # tous les 20k timesteps
    best_model_save_path="./ppo_safe_best",
    deterministic=True,
    render=False
)

print("Callback d'évaluation initialisé.")


Callback d'évaluation initialisé.


In [7]:
# Cellule 7 — Entraînement PPO avec contraintes "safe"

TOTAL_TIMESTEPS = 150_000
LOG_NAME = "ppo_safe_finetune_v1"

print(f"=== Entraînement PPO (safe) pour {TOTAL_TIMESTEPS} timesteps ===")

model_ppo.set_env(env_train)  # par sécurité

model_ppo.learn(
    total_timesteps=TOTAL_TIMESTEPS,
    tb_log_name=LOG_NAME,
    callback=eval_callback,
    progress_bar=True
)

print("=== Fin de l'entraînement PPO (safe) ===")


=== Entraînement PPO (safe) pour 150000 timesteps ===
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Logging to ./tb_dagger_hybrid\ppo_safe_finetune_v1_4


-----------------------------
| time/              |      |
|    fps             | 486  |
|    iterations      | 1    |
|    time_elapsed    | 8    |
|    total_timesteps | 4096 |
-----------------------------
-------------------------------------------
| time/                   |               |
|    fps                  | 400           |
|    iterations           | 2             |
|    time_elapsed         | 20            |
|    total_timesteps      | 8192          |
| train/                  |               |
|    approx_kl            | 0.00029079805 |
|    clip_fraction        | 0.0649        |
|    clip_range           | 0.05          |
|    entropy_loss         | -0.474        |
|    explained_variance   | 0.00217       |
|    learning_rate        | 0.0003        |
|    loss                 | 413           |
|    n_updates            | 10            |
|    policy_gradient_loss | -0.00277      |
|    value_loss           | 857           |
------------------------------------------

-------------------------------------------
| eval/                   |               |
|    mean_ep_length       | 1.01e+04      |
|    mean_reward          | 3.14e+03      |
| time/                   |               |
|    total_timesteps      | 20000         |
| train/                  |               |
|    approx_kl            | 0.00053530774 |
|    clip_fraction        | 0.0452        |
|    clip_range           | 0.05          |
|    entropy_loss         | -0.439        |
|    explained_variance   | -7.31e-05     |
|    learning_rate        | 0.0003        |
|    loss                 | 268           |
|    n_updates            | 40            |
|    policy_gradient_loss | -0.0021       |
|    value_loss           | 600           |
-------------------------------------------


---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1.01e+04 |
|    ep_rew_mean     | 1.4e+04  |
| time/              |          |
|    fps             | 225      |
|    iterations      | 5        |
|    time_elapsed    | 90       |
|    total_timesteps | 20480    |
---------------------------------
-------------------------------------------
| rollout/                |               |
|    ep_len_mean          | 1.01e+04      |
|    ep_rew_mean          | 1.4e+04       |
| time/                   |               |
|    fps                  | 238           |
|    iterations           | 6             |
|    time_elapsed         | 103           |
|    total_timesteps      | 24576         |
| train/                  |               |
|    approx_kl            | 0.00027974125 |
|    clip_fraction        | 0.0545        |
|    clip_range           | 0.05          |
|    entropy_loss         | -0.465        |
|    explained_variance   | 0.0181        |


-------------------------------------------
| eval/                   |               |
|    mean_ep_length       | 1.01e+04      |
|    mean_reward          | 2.87e+03      |
| time/                   |               |
|    total_timesteps      | 40000         |
| train/                  |               |
|    approx_kl            | 0.00019705904 |
|    clip_fraction        | 0.0406        |
|    clip_range           | 0.05          |
|    entropy_loss         | -0.384        |
|    explained_variance   | 5.77e-05      |
|    learning_rate        | 0.0003        |
|    loss                 | 329           |
|    n_updates            | 90            |
|    policy_gradient_loss | -0.00153      |
|    value_loss           | 716           |
-------------------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1.01e+04 |
|    ep_rew_mean     | 1.46e+04 |
| time/              |          |
|    fps             | 224      |
|   

-------------------------------------------
| eval/                   |               |
|    mean_ep_length       | 1.01e+04      |
|    mean_reward          | 922           |
| time/                   |               |
|    total_timesteps      | 60000         |
| train/                  |               |
|    approx_kl            | 0.00016708934 |
|    clip_fraction        | 0.0492        |
|    clip_range           | 0.05          |
|    entropy_loss         | -0.381        |
|    explained_variance   | 0.000143      |
|    learning_rate        | 0.0003        |
|    loss                 | 319           |
|    n_updates            | 140           |
|    policy_gradient_loss | -0.0016       |
|    value_loss           | 672           |
-------------------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1.01e+04 |
|    ep_rew_mean     | 1.51e+04 |
| time/              |          |
|    fps             | 222      |
|   

-------------------------------------------
| eval/                   |               |
|    mean_ep_length       | 1.01e+04      |
|    mean_reward          | 3.44e+03      |
| time/                   |               |
|    total_timesteps      | 80000         |
| train/                  |               |
|    approx_kl            | 0.00024990595 |
|    clip_fraction        | 0.0364        |
|    clip_range           | 0.05          |
|    entropy_loss         | -0.362        |
|    explained_variance   | -0.000196     |
|    learning_rate        | 0.0003        |
|    loss                 | 357           |
|    n_updates            | 190           |
|    policy_gradient_loss | -0.00149      |
|    value_loss           | 762           |
-------------------------------------------


---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1.01e+04 |
|    ep_rew_mean     | 1.57e+04 |
| time/              |          |
|    fps             | 221      |
|    iterations      | 20       |
|    time_elapsed    | 370      |
|    total_timesteps | 81920    |
---------------------------------
-------------------------------------------
| rollout/                |               |
|    ep_len_mean          | 1.01e+04      |
|    ep_rew_mean          | 1.57e+04      |
| time/                   |               |
|    fps                  | 225           |
|    iterations           | 21            |
|    time_elapsed         | 381           |
|    total_timesteps      | 86016         |
| train/                  |               |
|    approx_kl            | 0.00019713705 |
|    clip_fraction        | 0.033         |
|    clip_range           | 0.05          |
|    entropy_loss         | -0.398        |
|    explained_variance   | 0.00492       |


------------------------------------------
| eval/                   |              |
|    mean_ep_length       | 1.01e+04     |
|    mean_reward          | 1.01e+03     |
| time/                   |              |
|    total_timesteps      | 100000       |
| train/                  |              |
|    approx_kl            | 7.019212e-05 |
|    clip_fraction        | 0.0267       |
|    clip_range           | 0.05         |
|    entropy_loss         | -0.297       |
|    explained_variance   | -6.83e-05    |
|    learning_rate        | 0.0003       |
|    loss                 | 328          |
|    n_updates            | 240          |
|    policy_gradient_loss | -0.00109     |
|    value_loss           | 680          |
------------------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1.01e+04 |
|    ep_rew_mean     | 1.62e+04 |
| time/              |          |
|    fps             | 227      |
|    iterations      |

------------------------------------------
| eval/                   |              |
|    mean_ep_length       | 1.01e+04     |
|    mean_reward          | 2.54e+03     |
| time/                   |              |
|    total_timesteps      | 120000       |
| train/                  |              |
|    approx_kl            | 8.862999e-05 |
|    clip_fraction        | 0.0222       |
|    clip_range           | 0.05         |
|    entropy_loss         | -0.31        |
|    explained_variance   | 1.88e-05     |
|    learning_rate        | 0.0003       |
|    loss                 | 304          |
|    n_updates            | 290          |
|    policy_gradient_loss | -0.00108     |
|    value_loss           | 650          |
------------------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1.01e+04 |
|    ep_rew_mean     | 1.64e+04 |
| time/              |          |
|    fps             | 228      |
|    iterations      |

-------------------------------------------
| eval/                   |               |
|    mean_ep_length       | 1.01e+04      |
|    mean_reward          | -1.95e+03     |
| time/                   |               |
|    total_timesteps      | 140000        |
| train/                  |               |
|    approx_kl            | 0.00017972986 |
|    clip_fraction        | 0.0349        |
|    clip_range           | 0.05          |
|    entropy_loss         | -0.362        |
|    explained_variance   | 7.37e-05      |
|    learning_rate        | 0.0003        |
|    loss                 | 295           |
|    n_updates            | 340           |
|    policy_gradient_loss | -0.00146      |
|    value_loss           | 580           |
-------------------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1.01e+04 |
|    ep_rew_mean     | 1.65e+04 |
| time/              |          |
|    fps             | 231      |
|   

=== Fin de l'entraînement PPO (safe) ===


In [20]:
# ======================================================
# Cellule 8 — Évaluation du modèle PPO finetuné (CORRIGÉE)
# ======================================================

def evaluate_agent(model, n_episodes=5, max_steps=10080):
    rewards = []

    for ep in range(n_episodes):
        env = WorkshopEnv()            # environnement brut
        obs, info = env.reset()
        total_r = 0.0

        for t in range(max_steps):

            mask = env.get_action_mask()

            # PREDICTION CORRECTE POUR MASKABLEPPO
            action, _ = model.predict(
                obs,                    # simple vecteur
                deterministic=True,
                action_masks=mask       # masque passé séparément
            )

            obs, r, terminated, truncated, info = env.step(action)
            total_r += r

            if terminated or truncated:
                break

        rewards.append(total_r)
        print(f"Episode {ep+1}/{n_episodes} — reward = {total_r:.2f}")

    rewards = np.array(rewards, dtype=np.float32)
    print(f"\nReward moyen sur {n_episodes} semaines : "
          f"{rewards.mean():.2f} ± {rewards.std():.2f}")

    return rewards, rewards.mean(), rewards.std()


# ========= CHARGEMENT DU BEST MODEL ==========
BEST_MODEL_PATH = "./ppo_safe_best/best_model.zip"

try:
    best_model = MaskablePPO.load(BEST_MODEL_PATH, device="cpu")
    print(f"\nBest model chargé depuis : {BEST_MODEL_PATH}")
except Exception as e:
    print("\n⚠ Impossible de charger le best model, utilisation du model_ppo final.")
    best_model = model_ppo


# ========= ÉVALUATION ==========
print("\n=== Évaluation APRÈS PPO (policy finetunée) ===")
rewards_after, mean_after, std_after = evaluate_agent(best_model, n_episodes=5)



Best model chargé depuis : ./ppo_safe_best/best_model.zip

=== Évaluation APRÈS PPO (policy finetunée) ===
Episode 1/5 — reward = 13045.70
Episode 2/5 — reward = 12649.44
Episode 3/5 — reward = 12940.06
Episode 4/5 — reward = 13098.62
Episode 5/5 — reward = 12289.66

Reward moyen sur 5 semaines : 12804.70 ± 300.72


In [22]:
# ======================================================
# cellule 9 : Sauvegarde propre du modèle PPO finetuné
# ======================================================

import os
from datetime import datetime

# Dossier de sauvegarde
SAVE_DIR = "ppo_final_model"
os.makedirs(SAVE_DIR, exist_ok=True)

# Nom du fichier = date + heure pour versioning automatique
timestamp = datetime.now().strftime("%Y%m%d_%H%M")
MODEL_PATH = os.path.join(SAVE_DIR, f"ppo_finetuned_{timestamp}.zip")

# 1) Sauvegarde du modèle complet
best_model.save(MODEL_PATH)

# 2) Sauvegarde des hyperparamètres dans un fichier texte
with open(os.path.join(SAVE_DIR, "hyperparams.txt"), "w") as f:
    f.write("=== PPO Finetuned Hyperparameters ===\n")
    f.write(f"learning_rate = {best_model.learning_rate}\n")
    f.write(f"clip_range    = constant schedule 0.05\n")
    f.write(f"ent_coef      = {best_model.ent_coef}\n")
    f.write(f"n_steps       = {best_model.n_steps}\n")
    f.write(f"batch_size    = {best_model.batch_size}\n")
    f.write(f"timestamp     = {timestamp}\n")

# 3) Message confirmation
print("======================================")
print("Modèle PPO finetuné sauvegardé avec succès !")
print("Chemin :", MODEL_PATH)
print("Hyperparamètres enregistrés dans : ppo_final_model/hyperparams.txt")
print("======================================")


Modèle PPO finetuné sauvegardé avec succès !
Chemin : ppo_final_model\ppo_finetuned_20251211_1504.zip
Hyperparamètres enregistrés dans : ppo_final_model/hyperparams.txt
