In [1]:
# Cellule 1 — Imports et paramètres globaux
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sb3_contrib import MaskablePPO
from env.workshop_env import WorkshopEnv

SEED = 42
MAX_STEPS = 10080        # 1 semaine = 7 jours
BEST_MODEL_PATH = "./ppo_safe_best/best_model.zip"

np.random.seed(SEED)


In [2]:
# Cellule 2 — Chargement du modèle PPO entraîné

ppo = MaskablePPO.load(BEST_MODEL_PATH, device="cpu")
print(" Modèle PPO chargé :", BEST_MODEL_PATH)


 Modèle PPO chargé : ./ppo_safe_best/best_model.zip


In [3]:
# Cellule 3 — Politique experte

def expert_policy(obs: np.ndarray, env: WorkshopEnv) -> int:

    time = float(obs[0]) * env.max_time

    m1_busy = int(round(obs[1]))
    m2_busy = int(round(obs[3]))

    stock_raw = obs[5] * env.raw_capacity
    stock_p2_inter = obs[7] * env.raw_capacity

    backlog_p1 = obs[10] * 1000
    backlog_p2 = obs[11] * 1000
    backlog_total = backlog_p1 + backlog_p2

    def choose_k(backlog, k_max=5):
        if backlog <= 5: return 1
        elif backlog <= 15: return 2
        elif backlog <= 30: return 3
        elif backlog <= 60: return 4
        else: return min(5, int(k_max))

    # Commande MP
    target_raw = min(env.raw_capacity, backlog_total + 10)
    current_pipeline = stock_raw + obs[12] * 1000
    missing = target_raw - current_pipeline

    if missing > 0:
        return 149 + int(max(1, min(50, missing)))

    # P2 STEP2
    if not m2_busy and stock_p2_inter > 0:
        return 99 + choose_k(backlog_p2, min(5, stock_p2_inter))

    # P2 STEP1
    if not m1_busy and backlog_p2 > 0 and stock_raw > 0:
        return 49 + choose_k(backlog_p2, min(5, stock_raw))

    # P1
    if not m1_busy and backlog_p1 > 0 and stock_raw > 0:
        return choose_k(backlog_p1, min(5, stock_raw)) - 1

    return 200


In [4]:
# Cellule 4 — Politique experte avec masque d’actions

def expert_policy_masked(env: WorkshopEnv, obs: np.ndarray) -> int:
    mask = env.get_action_mask().astype(bool)
    a = expert_policy(obs, env)

    if not isinstance(a, (int, np.integer)) or a < 0 or a >= len(mask):
        return 200

    if not mask[a]:
        return 200 if mask[200] else int(np.where(mask)[0][0])

    return int(a)


In [5]:
# Cellule 5 — Décodage lisible de l’observation

def decode_obs(obs: np.ndarray, env: WorkshopEnv) -> dict:
    return {
        "time_min": int(obs[0] * env.max_time),
        "minute_of_day": int(obs[16] * 1440),

        "m1_busy": int(round(obs[1])),
        "m2_busy": int(round(obs[3])),

        "stock_raw": round(obs[5] * env.raw_capacity, 1),
        "stock_p1": round(obs[6] * env.raw_capacity, 1),
        "stock_p2_inter": round(obs[7] * env.raw_capacity, 1),
        "stock_p2": round(obs[8] * env.raw_capacity, 1),

        "backlog_p1": round(obs[10] * 1000, 1),
        "backlog_p2": round(obs[11] * 1000, 1),

        "raw_incoming": round(obs[12] * 1000, 1),
        "time_to_next_sell": int(obs[22] * 15),

        "reward_last_step": round(obs[21] * 100_000, 2),
        "reward_week": round(obs[20] * 1_000_000, 2),
    }


In [6]:
# Cellule 6 — Décodage lisible des actions

def decode_action(a: int):
    if a == 200:
        return ("WAIT", 0)
    if 0 <= a <= 49:
        return ("PROD_P1", a + 1)
    if 50 <= a <= 99:
        return ("PROD_P2_STEP1", a - 49)
    if 100 <= a <= 149:
        return ("PROD_P2_STEP2", a - 99)
    if 150 <= a <= 199:
        return ("ORDER_RAW", a - 149)
    return ("UNKNOWN", -1)


In [7]:
# Cellule 7 — Policy PPO masquée

# Interface propre pour appeler le PPO avec masque d’actions.

def ppo_policy_masked(env: WorkshopEnv, obs: np.ndarray) -> int:
    mask = env.get_action_mask()
    action, _ = ppo.predict(obs, deterministic=True, action_masks=mask)
    return int(action)


In [8]:
# Cellule 8 — Runner de simulation contextualisé

# Exécute une semaine et enregistre chaque décision avec son état complet.

def run_and_trace(policy_name, policy_fn):
    env = WorkshopEnv()
    obs, _ = env.reset(seed=SEED)

    rows = []
    total_reward = 0.0

    for t in range(MAX_STEPS):
        decoded = decode_obs(obs, env)
        mask = env.get_action_mask()

        action = policy_fn(env, obs)
        action_type, k = decode_action(action)

        obs, r, terminated, truncated, _ = env.step(action)
        total_reward += r

        rows.append({
            "policy": policy_name,
            "t": t,
            "action": action,
            "action_type": action_type,
            "k": k,
            "valid": int(mask[action]),
            "reward": r,
            "total_reward": total_reward,
            **decoded
        })

        if terminated or truncated:
            break

    return pd.DataFrame(rows)


In [9]:
# Cellule 9 — Exécution expert vs PPO

# On simule exactement la même semaine pour l’expert et pour PPO.

df_expert = run_and_trace("expert", expert_policy_masked)
df_ppo = run_and_trace("ppo", ppo_policy_masked)

print("Reward expert :", df_expert["total_reward"].iloc[-1])
print("Reward PPO    :", df_ppo["total_reward"].iloc[-1])


Reward expert : 12916.319999998537
Reward PPO    : 13113.059999998488


In [10]:
# Cellule 10 — Fusion et comparaison pas à pas

cmp = df_expert.merge(
    df_ppo,
    on="t",
    suffixes=("_expert", "_ppo")
)

cmp["same_action"] = cmp["action_expert"] == cmp["action_ppo"]
cmp.head(10)


Unnamed: 0,policy_expert,t,action_expert,action_type_expert,k_expert,valid_expert,reward_expert,total_reward_expert,time_min_expert,minute_of_day_expert,...,stock_p1_ppo,stock_p2_inter_ppo,stock_p2_ppo,backlog_p1_ppo,backlog_p2_ppo,raw_incoming_ppo,time_to_next_sell_ppo,reward_last_step_ppo,reward_week_ppo,same_action
0,expert,0,200,WAIT,0,1,-0.2,-0.2,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,False
1,expert,1,200,WAIT,0,1,-0.2,-0.4,1,1,...,0.0,0.0,0.0,0.0,0.0,0.0,14,20.0,20.0,True
2,expert,2,200,WAIT,0,1,-0.2,-0.6,2,2,...,0.0,0.0,0.0,0.0,0.0,0.0,13,-0.2,19.799999,True
3,expert,3,200,WAIT,0,1,-0.2,-0.8,3,3,...,0.0,0.0,0.0,0.0,0.0,0.0,12,-0.2,19.6,True
4,expert,4,200,WAIT,0,1,-0.2,-1.0,4,4,...,0.0,0.0,0.0,0.0,0.0,0.0,11,-0.2,19.4,True
5,expert,5,200,WAIT,0,1,-0.2,-1.2,5,5,...,0.0,0.0,0.0,0.0,0.0,0.0,10,-0.2,19.200001,True
6,expert,6,200,WAIT,0,1,-0.2,-1.4,6,6,...,0.0,0.0,0.0,0.0,0.0,0.0,9,-0.2,19.0,True
7,expert,7,200,WAIT,0,1,-0.2,-1.6,7,7,...,0.0,0.0,0.0,0.0,0.0,0.0,8,-0.2,18.799999,True
8,expert,8,200,WAIT,0,1,-0.2,-1.8,8,8,...,0.0,0.0,0.0,0.0,0.0,0.0,7,-0.2,18.6,True
9,expert,9,200,WAIT,0,1,-0.2,-2.0,9,9,...,0.0,0.0,0.0,0.0,0.0,0.0,6,-0.2,18.4,True


In [11]:
# Cellule 11 — Analyse des divergences

cmp[~cmp["same_action"]].head(20)


Unnamed: 0,policy_expert,t,action_expert,action_type_expert,k_expert,valid_expert,reward_expert,total_reward_expert,time_min_expert,minute_of_day_expert,...,stock_p1_ppo,stock_p2_inter_ppo,stock_p2_ppo,backlog_p1_ppo,backlog_p2_ppo,raw_incoming_ppo,time_to_next_sell_ppo,reward_last_step_ppo,reward_week_ppo,same_action
0,expert,0,200,WAIT,0,1,-0.2,-0.2,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,False
10,expert,10,200,WAIT,0,1,-0.2,-2.2,10,10,...,0.0,1.0,0.0,0.0,0.0,0.0,5,-0.2,18.200001,False
25,expert,25,200,WAIT,0,1,-0.2,-5.2,25,25,...,0.0,1.0,1.0,0.0,0.0,0.0,5,-0.2,30.4,False
30,expert,30,150,ORDER_RAW,1,1,-1.0,-7.02,30,30,...,0.0,1.0,1.0,0.0,0.0,0.0,0,-0.2,44.599998,False
31,expert,31,50,PROD_P2_STEP1,1,1,5.0,-2.02,31,31,...,0.0,1.0,1.0,0.0,0.0,0.0,14,-0.2,44.400002,False
32,expert,32,150,ORDER_RAW,1,1,-1.0,-3.02,32,32,...,0.0,1.0,1.0,0.0,0.0,0.0,13,-0.2,44.200001,False
40,expert,40,200,WAIT,0,1,-0.2,-4.62,40,40,...,0.0,2.0,2.0,0.0,0.0,0.0,5,-0.2,42.599998,False
41,expert,41,100,PROD_P2_STEP2,1,1,15.0,10.38,41,41,...,0.0,1.0,2.0,0.0,0.0,0.0,4,15.0,57.599998,False
42,expert,42,50,PROD_P2_STEP1,1,1,5.0,15.38,42,42,...,0.0,1.0,2.0,0.0,0.0,0.0,3,1.0,58.599998,False
43,expert,43,150,ORDER_RAW,1,1,-1.0,14.38,43,43,...,0.0,1.0,2.0,0.0,0.0,0.0,2,-0.2,58.400002,False


In [12]:
# Cellule 12 (SAFE) — Export CSV + petit résumé (sans plot)

print("Rows df_expert:", len(df_expert), "Rows df_ppo:", len(df_ppo), "Rows cmp:", len(cmp))
print("Columns cmp:", len(cmp.columns))

print("\nAperçu colonnes stocks/backlogs (expert):")
cols_check = ["stock_raw_expert","stock_p1_expert","stock_p2_inter_expert","stock_p2_expert",
              "backlog_p1_expert","backlog_p2_expert","raw_incoming_expert"]
print([c for c in cols_check if c in cmp.columns])

# Export CSV
cmp.to_csv("cmp_expert_vs_ppo.csv", index=False)
print("\n✓ CSV écrit : cmp_expert_vs_ppo.csv")


Rows df_expert: 10080 Rows df_ppo: 10080 Rows cmp: 10080
Columns cmp: 44

Aperçu colonnes stocks/backlogs (expert):
['stock_raw_expert', 'stock_p1_expert', 'stock_p2_inter_expert', 'stock_p2_expert', 'backlog_p1_expert', 'backlog_p2_expert', 'raw_incoming_expert']

✓ CSV écrit : cmp_expert_vs_ppo.csv


In [13]:
# Cellule 13 (SAFE) — Zoom lisible sur la 1ère divergence (sans explosion affichage)

div = cmp.loc[~cmp["same_action"]].head(1).copy()

cols = [
    # identifiants
    "t",

    # actions
    "action_expert", "action_type_expert", "k_expert",
    "action_ppo",    "action_type_ppo",    "k_ppo",

    # stocks/backlogs (EXPERT)
    "stock_raw_expert", "stock_p1_expert", "stock_p2_inter_expert", "stock_p2_expert",
    "backlog_p1_expert", "backlog_p2_expert",
    "raw_incoming_expert", "time_to_next_sell_expert",

    # machines (EXPERT)
    "m1_busy_expert", "m2_busy_expert",

    # rewards
    "reward_expert", "reward_ppo",
    "total_reward_expert", "total_reward_ppo"
]

# On garde uniquement les colonnes qui existent (robuste si tu changes des noms)
cols = [c for c in cols if c in div.columns]

div[cols]


Unnamed: 0,t,action_expert,action_type_expert,k_expert,action_ppo,action_type_ppo,k_ppo,stock_raw_expert,stock_p1_expert,stock_p2_inter_expert,...,backlog_p1_expert,backlog_p2_expert,raw_incoming_expert,time_to_next_sell_expert,m1_busy_expert,m2_busy_expert,reward_expert,reward_ppo,total_reward_expert,total_reward_ppo
0,0,200,WAIT,0,53,PROD_P2_STEP1,4,10.0,0.0,0.0,...,0.0,0.0,0.0,0,0,0,-0.2,20.0,-0.2,20.0


In [14]:
# Cellule 14 — Export divergences (seulement)

divergences = cmp[~cmp["same_action"]].copy()
divergences.to_csv("cmp_divergences_only.csv", index=False)

print("✓ CSV divergences écrit : cmp_divergences_only.csv")
print("Nb divergences :", len(divergences))


✓ CSV divergences écrit : cmp_divergences_only.csv
Nb divergences : 1961
