# 📊 Modélisation du retard des trains avec RandomForest et XGBoost

## 1. 🧹 Chargement et nettoyage des données

In [63]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import joblib

# Load the pre-cleaned dataset; the file uses ";" as its field separator
df = pd.read_csv(filepath_or_buffer="cleaned_dataset.csv", sep=";")

## 2. 🏗️ Feature Engineering

In [None]:
# Weighted delay score: a train counted in the 30-minute column weighs 2x and
# one in the 60-minute column 4x a train counted in the 15-minute column.
late_15 = df["trains_delayed_15min"]
late_30 = df["trains_delayed_30min"]
late_60 = df["trains_delayed_60min"]
df["total_delay_points"] = late_15 + 2 * late_30 + 4 * late_60

# Calendar quarter (1-4) extracted from the parsed "date" column
parsed_dates = pd.to_datetime(df["date"])
df["quarter"] = parsed_dates.dt.quarter

# 1 when the arrival station is among the 10 most frequent in the dataset, else 0
top_arrival_stations = df["arrival_station"].value_counts().head(10).index
df["is_major_arrival"] = df["arrival_station"].isin(top_arrival_stations).astype(int)

# Arrival delay relative to departure delay; the 0.1 offset keeps the
# denominator away from zero when the departure delay is 0.
# NOTE(review): this column is computed from "avg_arr_delay", which this
# notebook uses as the prediction target, so it carries the target's value
# and is only safe for descriptive analysis, not as a model input.
df["delay_ratio"] = df["avg_arr_delay"].div(df["avg_dep_delay"] + 0.1)

## 3. 📦 Définition des features et de la cible

In [None]:
# Model inputs.
# "delay_ratio" is deliberately NOT listed: it is computed as
# avg_arr_delay / (avg_dep_delay + 0.1), i.e. directly from the target, and
# "avg_dep_delay" is itself an input. Including it would let the models
# reconstruct the target exactly (target leakage), inflating every test
# metric, and it could not be computed at prediction time anyway.
features = [
    "route",
    "avg_dep_delay",
    "total_delay_points",
    "trains_delayed_30min",
    "trains_delayed_60min",
    "trains_delayed_15min",
    "cancelled_trains",
    "month",
    "quarter",
    "is_major_arrival",
    "pct_delay_external",
]

# Drop rows that are missing the target or any model input
df = df.dropna(subset=["avg_arr_delay"] + features)
# Keep only rows whose average arrival delay lies in [0, 30] (bounds included),
# the range the notebook treats as realistic
df = df[(df["avg_arr_delay"] >= 0) & (df["avg_arr_delay"] <= 30)]

# Feature matrix and target vector
X = df[features]
y = df["avg_arr_delay"]

## 4. ✂️ Séparation des données

In [66]:
# Hold out 14% of the rows for evaluation; the fixed seed makes the split reproducible
split_options = {"test_size": 0.14, "random_state": 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

## 5. 🧪 Prétraitement

In [67]:
# Columns that get a dedicated transformation; every other input column is
# forwarded unchanged through remainder="passthrough".
categorical_columns = ["route"]
scaled_columns = [
    "avg_dep_delay",
    "total_delay_points",
    "trains_delayed_15min",
    "trains_delayed_30min",
    "trains_delayed_60min",
]

# One-hot encode the route (categories unseen during fit are ignored at
# transform time instead of raising) and standardize the listed numeric columns.
one_hot_step = ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_columns)
scaling_step = ("num", StandardScaler(), scaled_columns)
preprocessor = ColumnTransformer(
    transformers=[one_hot_step, scaling_step],
    remainder="passthrough",
)

## 6. 🌳 Création du modèle RandomForest

In [68]:
# Random forest regressor: 300 trees, depth capped at 10, at least 5 samples
# required to split a node, sqrt(n_features) candidates per split, all cores.
rf_regressor = RandomForestRegressor(
    n_estimators=300,
    max_depth=10,
    min_samples_split=5,
    max_features="sqrt",
    random_state=42,
    n_jobs=-1,
)

# Preprocessing and regressor chained so both are fitted and applied together
rf_model = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("regressor", rf_regressor),
    ]
)

## 7. 🏋️ Entraînement du modèle RandomForest

In [69]:
# Fit the RandomForest pipeline on the training split, then predict the
# held-out rows (fit returns the fitted pipeline, so the calls can be chained)
y_pred_rf = rf_model.fit(X_train, y_train).predict(X_test)

## 8. ⚡ Création du modèle XGBoost

In [70]:
from sklearn.base import clone

# Gradient-boosted trees: 500 boosting rounds, depth 5, learning rate 0.1,
# with 80% row subsampling and 80% column subsampling per tree.
xgb_regressor = XGBRegressor(
    n_estimators=500,
    max_depth=5,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1,
)

# Pipeline does not copy its steps, so passing `preprocessor` directly would
# make this pipeline share (and refit) the very same transformer object used
# by the RandomForest pipeline. clone() yields an unfitted copy with identical
# parameters, keeping the two pipelines independent.
xgb_model = Pipeline(
    steps=[
        ("preprocessor", clone(preprocessor)),
        ("regressor", xgb_regressor),
    ]
)

## 9. 🏋️ Entraînement du modèle XGBoost

In [71]:
# Fit the XGBoost pipeline on the training split, then predict the held-out
# rows (fit returns the fitted pipeline, so the calls can be chained)
y_pred_xgb = xgb_model.fit(X_train, y_train).predict(X_test)

## 10. 📍 Affichage des graphiques de résultat des modèles

In [None]:
# One row per test sample: its month, the observed delay and both predictions.
# y_test aligns on the test index; the prediction arrays are positional.
results_df = X_test[["month"]].assign(
    real_delay=y_test,
    rf_pred=y_pred_rf,
    xgb_pred=y_pred_xgb,
)

# Average observed and predicted delay for each month
monthly_means = results_df.groupby("month").mean().reset_index()

# Tick labels for months 1..12 (kept in French: they are rendered on the figures)
month_labels = [
    "Jan", "Fév", "Mar", "Avr", "Mai", "Jun",
    "Jul", "Aoû", "Sep", "Oct", "Nov", "Déc",
]

# One figure per model, each comparing monthly observed vs predicted means
for pred_column, model_label in (("rf_pred", "RandomForest"), ("xgb_pred", "XGBoost")):
    plt.figure(figsize=(12, 6))
    plt.plot(
        monthly_means["month"],
        monthly_means["real_delay"],
        "o-",
        label="Réel",
        linewidth=2,
    )
    plt.plot(
        monthly_means["month"],
        monthly_means[pred_column],
        "s--",
        label=model_label,
        linewidth=2,
    )
    plt.xlabel("Mois")
    plt.ylabel("Retard moyen (minutes)")
    plt.title(f"Comparaison des retards réels et prédits ({model_label}) par mois")
    plt.xticks(range(1, 13), month_labels)
    plt.grid(True, linestyle="--", alpha=0.7)
    plt.legend()
    plt.show()

## 11. 📊 Évaluation des modèles

In [None]:
# Report R² and mean absolute error on the held-out split for each model.
# The XGBoost heading carries a leading newline to separate the two reports.
for heading, predictions in (
    ("=== RandomForest ===", y_pred_rf),
    ("\n=== XGBoost ===", y_pred_xgb),
):
    print(heading)
    print(f"R²: {r2_score(y_test, predictions):.3f}")
    print(f"MAE: {mean_absolute_error(y_test, predictions):.2f} min")

## 12. 💾 Sauvegarde du modèle

In [None]:
# Persist the fitted XGBoost pipeline (preprocessing + regressor) to disk.
# Only this one model is written, so the confirmation message names it
# instead of claiming that several models were saved.
model_path = "tardis_best_model.pkl"
joblib.dump(xgb_model, model_path)
print(f"\n✅ Modèle XGBoost sauvegardé dans {model_path}")