In [25]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

In [26]:
df_debit = pd.read_csv("../data/debit_predictions.csv")
df_taux = pd.read_csv("../data/taux_occupation_predictions.csv")
df_vitesse = pd.read_csv("../data/vitesse_prediction.csv")
df_temps = pd.read_csv("../data/temps_de_parcours_predictions.csv")

In [27]:
df_preds = df_debit.merge(df_taux, on=["heure_arrondie", "nom_du_troncon"])
df_preds = df_preds.merge(df_vitesse, on=["heure_arrondie", "nom_du_troncon"])
df_preds = df_preds.merge(df_temps, on=["heure_arrondie", "nom_du_troncon"])
df_preds.head()

Unnamed: 0,heure_arrondie,nom_du_troncon,debit_reel,debit_pred,taux_occupation_reel,taux_occupation_pred,vitesse_reelle,vitesse_predite,temps_de_parcours_reel,temps_de_parcours_pred
0,2025-03-08 06:00:00,3 Continents I,120.0,141.22934,-0.47788,-0.447675,20.552222,20.149572,0.432635,0.530357
1,2025-03-08 06:15:00,3 Continents I,120.0,136.64633,-0.47788,-0.451137,22.0,20.256884,0.401582,0.4984
2,2025-03-08 06:30:00,3 Continents I,120.0,140.86668,-0.413733,-0.475166,20.0,21.15414,0.401582,0.479862
3,2025-03-08 06:45:00,3 Continents I,60.0,141.61441,-0.47788,-0.451604,19.0,20.260792,0.45942,0.490813
4,2025-03-08 07:00:00,3 Continents I,60.0,84.411285,-0.606175,-0.467469,21.936,19.494596,0.416042,0.493972


In [28]:
df_preds.shape

(412473, 10)

In [29]:
# Chargement des données
df = pd.read_parquet("C:/Users/lisas/Downloads/df_final_15min_NoNan_20250505.parquet", engine="fastparquet")
df["heure_arrondie"] = pd.to_datetime(df["heure_arrondie"])
df = df.sort_values("heure_arrondie")

In [30]:
df_preds["heure_arrondie"] = pd.to_datetime(df_preds["heure_arrondie"])


In [31]:
df_train = df[df["heure_arrondie"] <= "2025-03-07"].copy()
df_test = df_preds.merge(df, on=["heure_arrondie", "nom_du_troncon"], how="left")

In [32]:
print(df_train.shape)
print(df_test.shape)

(1306256, 57)
(412473, 65)


In [36]:
print(df_train.columns)
print(df_test.columns)

Index(['nom_du_troncon', 'heure_arrondie', 'id_technique', 'id', 'debit',
       'longueur', 'taux_occupation', 'code_couleur', 'etat_du_trafic',
       'temps_de_parcours', 'vitesse', 'geo_point_2d', 'geometrie',
       'shape_geo', 'horodatage', 'type_geo', 'coordinates_geo',
       'horodatage_date', 'jour', 'is_vacances', 'is_ferie',
       'rounded_horodatage', 'date', 'temperature_2m', 'visibility',
       'precipitation', 'wind_speed_10m', 'gml_id', 'date_ech', 'code_qual',
       'lib_qual', 'coul_qual', 'date_dif', 'source', 'type_zone', 'code_zone',
       'lib_zone', 'code_no2', 'code_so2', 'code_o3', 'code_pm10', 'code_pm25',
       'x_wgs84', 'y_wgs84', 'x_reg', 'y_reg', 'epsg_reg', 'etat_indice',
       'geom_type', 'geom_coordinates', 'geo_point_2d_lon', 'geo_point_2d_lat',
       'has_event_near_troncon', 'weekday', 'hour', 'minute', 'troncon_enc',
       'etat_du_trafic_enc'],
      dtype='object')
Index(['heure_arrondie', 'nom_du_troncon', 'debit_reel', 'debit_pred',


In [33]:
features = ["debit_pred", "taux_occupation_pred", "vitesse_predite", "temps_de_parcours_pred", "is_vacances", "has_event_near_troncon"]
target = "etat_du_trafic"

In [34]:
# Encodage de la cible
le = LabelEncoder()
df_train["etat_du_trafic_enc"] = le.fit_transform(df_train["etat_du_trafic"])
df_test["etat_du_trafic_enc"] = le.transform(df_test["etat_du_trafic"])

In [41]:
features_train = ['debit', 'taux_occupation', 'vitesse', 'temps_de_parcours',
                  'hour', 'weekday', 'temperature_2m', 'visibility', 'precipitation', 'wind_speed_10m']

features_test = ['debit_pred', 'taux_occupation_pred', 'vitesse_predite', 'temps_de_parcours_pred',
                 'hour', 'weekday', 'temperature_2m', 'visibility', 'precipitation', 'wind_speed_10m']


In [42]:
print(df_test[features_test].isnull().sum())

debit_pred                0
taux_occupation_pred      0
vitesse_predite           0
temps_de_parcours_pred    0
hour                      0
weekday                   0
temperature_2m            0
visibility                0
precipitation             0
wind_speed_10m            0
dtype: int64


In [44]:
# Normalisation
scaler_train = StandardScaler()
scaler_test = StandardScaler()

X_train = scaler_train.fit_transform(df_train[features_train])
X_test = scaler_test.fit_transform(df_test[features_test]) 

In [45]:
y_train = df_train["etat_du_trafic_enc"]
y_test = df_test["etat_du_trafic_enc"]

In [46]:
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)

In [47]:

# Prédiction
y_pred = clf.predict(X_test)

In [48]:
print("\nClassification Report :")
print(classification_report(y_test, y_pred, target_names=le.classes_))



Classification Report :
              precision    recall  f1-score   support

      Bloqué       0.32      0.22      0.26      2654
       Dense       0.14      0.17      0.15      4459
      Fluide       0.99      0.99      0.99    403236
      Saturé       0.12      0.13      0.13      2124

    accuracy                           0.97    412473
   macro avg       0.39      0.38      0.38    412473
weighted avg       0.97      0.97      0.97    412473



In [49]:
pourcentages = df["etat_du_trafic"].value_counts(normalize=True) * 100
print(pourcentages)


etat_du_trafic
Fluide    97.652146
Dense      1.163213
Bloqué     0.658305
Saturé     0.526336
Name: proportion, dtype: float64


SMOTE n’est pas suffisant

Les classes minoritaires (Dense, Bloqué, Saturé) sont trop peu représentées.

SMOTE risque de générer des données artificielles peu fiables, voire d’aggraver l’overfitting

In [None]:
import pandas as pd
import numpy as np
from sklearn.utils import resample
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix

In [55]:
df_test_bal = df_test.copy()

# Séparation des classes
df_fluide = df_test_bal[df_test_bal["etat_du_trafic"] == "Fluide"]
df_minoritaires = df_test_bal[df_test_bal["etat_du_trafic"] != "Fluide"]

# Sous-échantillonnage de la classe majoritaire
n_samples = min(len(df_minoritaires), 3 * len(df_minoritaires))  # ou fixe, ex: 10000
df_fluide_down = resample(df_fluide, replace=False, n_samples=n_samples, random_state=42)

# Fusionner les données équilibrées
df_test_balanced = pd.concat([df_fluide_down, df_minoritaires])


In [57]:
# Features
features_train = ['debit', 'taux_occupation', 'vitesse', 'temps_de_parcours',
                  'hour', 'weekday', 'temperature_2m', 'visibility', 'precipitation', 'wind_speed_10m']

features_test = ['debit_pred', 'taux_occupation_pred', 'vitesse_predite', 'temps_de_parcours_pred',
                 'hour', 'weekday', 'temperature_2m', 'visibility', 'precipitation', 'wind_speed_10m']

target = 'etat_du_trafic'

# Encodage de la cible
le = LabelEncoder()
df_train_balanced["etat_du_trafic_enc"] = le.fit_transform(df_train_balanced[target])
df_test["etat_du_trafic_enc"] = le.transform(df_test[target])



# Normalisation
scaler_train = StandardScaler()
scaler_test = StandardScaler()

X_train = scaler_train.fit_transform(df_train_balanced[features_train])
X_test = scaler_test.fit_transform(df_train_balanced[features_test]) 

X_test = scaler.transform(df_test[features])
y_test = df_test["etat_du_trafic_enc"]

# Modèle
clf = LogisticRegression(class_weight="balanced", max_iter=1000, random_state=42)
clf.fit(X_train, y_train)


KeyError: "['debit_pred', 'taux_occupation_pred', 'vitesse_predite', 'temps_de_parcours_pred'] not in index"