## Ce code prédit les variables taux_occupation et temps_de_parcours avec un LSTM (prédiction de série temporelle) à partir des variables explicatives suivantes :
## 'taux_occupation', 'temps_de_parcours', 'temperature_2m', 'precipitation', 'visibility','wind_speed_10m','has_event_near_troncon', 'weekday', 'is_vacances', 'is_ferie'

# Import librairies

In [None]:
import pandas as pd
import numpy as np
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

# Import dataset

In [2]:
# Location of the input parquet file read by the next cell.
# NOTE(review): absolute, machine-specific path; it has to be adapted when the
# notebook is run from another machine or working directory.
link = '/Users/manu/Desktop/SUP/Projet 2/AI_model_urban_mobility/data/df_final_15min_NoNan_20250505.parquet'

In [3]:
# Load the traffic dataset from the parquet file into a DataFrame.
df_trafic = pd.read_parquet(link)

In [4]:
# Display the column names available in the loaded DataFrame.
df_trafic.columns

Index(['nom_du_troncon', 'heure_arrondie', 'id_technique', 'id', 'debit',
       'longueur', 'taux_occupation', 'code_couleur', 'etat_du_trafic',
       'temps_de_parcours', 'vitesse', 'geo_point_2d', 'geometrie',
       'shape_geo', 'horodatage', 'type_geo', 'coordinates_geo',
       'horodatage_date', 'jour', 'is_vacances', 'is_ferie',
       'rounded_horodatage', 'date', 'temperature_2m', 'visibility',
       'precipitation', 'wind_speed_10m', 'gml_id', 'date_ech', 'code_qual',
       'lib_qual', 'coul_qual', 'date_dif', 'source', 'type_zone', 'code_zone',
       'lib_zone', 'code_no2', 'code_so2', 'code_o3', 'code_pm10', 'code_pm25',
       'x_wgs84', 'y_wgs84', 'x_reg', 'y_reg', 'epsg_reg', 'etat_indice',
       'geom_type', 'geom_coordinates', 'geo_point_2d_lon', 'geo_point_2d_lat',
       'has_event_near_troncon', 'weekday', 'hour', 'minute', 'troncon_enc'],
      dtype='object')

In [None]:
# --- Select the compute device: CUDA GPU when available, CPU otherwise ---
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

In [None]:
# --- Working copy of the data, ordered per segment and time ---
# `date_time` is the datetime-parsed version of `heure_arrondie`. `assign`
# returns a new frame, so `df_trafic` itself is left untouched.
df = df_trafic.assign(date_time=pd.to_datetime(df_trafic['heure_arrondie']))
# Rows are grouped by encoded segment id, chronologically inside each segment.
df = df.sort_values(by=['troncon_enc', 'date_time'])

In [16]:
# Size of the embedding table used for the segment ids.
# nn.Embedding(n, d) only accepts indices in [0, n - 1], so the table has to
# cover the largest encoded id. Counting distinct ids is only sufficient when
# the encoding has no gaps (in that case both expressions give the same value);
# with gaps, the largest id would index past the end of the table.
# NOTE(review): assumes `troncon_enc` holds non-negative integer codes — confirm.
n_troncons = int(df['troncon_enc'].max()) + 1

In [6]:
# --- Numeric features to standardise ---
# The first two entries are the prediction targets; the sequence builder
# relies on that order when it slices the target columns.
features = ['taux_occupation', 'temps_de_parcours', 'temperature_2m', 'precipitation', 'visibility','wind_speed_10m',
            'has_event_near_troncon', 'weekday', 'is_vacances', 'is_ferie']
scaler = StandardScaler()
# Fit the scaler on the training period only, so that the mean / standard
# deviation used for standardisation carry no information from the evaluation
# period (data leakage); the fitted transform is then applied to every row.
# NOTE: this cutoff must stay identical to the date used for the train/test
# split.
_train_rows = df['date_time'] < '2025-03-08'
scaler.fit(df.loc[_train_rows, features])
df[features] = scaler.transform(df[features])

In [7]:
# --- Copy the encoded segment id into a dedicated column on every row ---
# NOTE(review): `troncon_feature` is not listed in `features`, and the code
# visible in this file reads `troncon_enc` directly — confirm this column is
# still needed.
df['troncon_feature'] = df['troncon_enc']

In [8]:
# --- Chronological train / test split ---
# Rows strictly before 2025-03-08 are used for training, rows from that date
# onwards for evaluation.
train_df = df.loc[df['date_time'].lt('2025-03-08')]
test_df = df.loc[df['date_time'].ge('2025-03-08')]

In [9]:
# --- Sequence creation (per segment) ---
# Number of consecutive time steps given to the model as input history.
# NOTE(review): with the 15-minute sampling suggested by the dataset file name,
# 96 steps would correspond to 24 hours — confirm.
SEQ_LEN = 96

In [11]:
def create_sequences(data, seq_len, feature_cols=None):
    """Build sliding-window samples for every road segment.

    Rows are grouped by ``troncon_enc`` and ordered by ``heure_arrondie``
    inside each group. Every run of ``seq_len`` consecutive rows becomes one
    input window; the target is the first two feature columns of the row that
    immediately follows the window.

    Args:
        data: DataFrame containing ``troncon_enc``, ``heure_arrondie`` and the
            feature columns.
        seq_len: Number of rows per input window (must be >= 1).
        feature_cols: Column names used as features. Defaults to the
            module-level ``features`` list. The first two columns are the
            targets.

    Returns:
        Tuple ``(xs, ys, tr_ids)`` of NumPy arrays:
        ``xs`` of shape ``(n, seq_len, n_features)``, ``ys`` of shape
        ``(n, 2)`` and ``tr_ids`` of shape ``(n,)`` holding the segment id of
        each window. Segments with ``seq_len`` rows or fewer contribute
        nothing; when no window can be built at all, correctly shaped empty
        arrays are returned.

    Raises:
        ValueError: If ``seq_len`` is smaller than 1.
    """
    if seq_len < 1:
        raise ValueError(f"seq_len must be >= 1, got {seq_len}")
    cols = features if feature_cols is None else list(feature_cols)

    xs, ys, tr_ids = [], [], []
    for troncon_id, group in data.groupby('troncon_enc'):
        feats = group.sort_values('heure_arrondie')[cols].to_numpy()
        n_windows = len(feats) - seq_len
        if n_windows <= 0:
            # Not enough history to build a window followed by a target row.
            continue
        # View of shape (len - seq_len + 1, n_features, seq_len): all windows
        # at once, without a Python-level loop over rows.
        windows = np.lib.stride_tricks.sliding_window_view(feats, seq_len, axis=0)
        # Drop the last window (no row follows it) and put the time axis
        # before the feature axis.
        xs.append(windows[:n_windows].transpose(0, 2, 1))
        ys.append(feats[seq_len:, :2])  # [taux_occupation, temps_de_parcours]
        tr_ids.append(np.full(n_windows, troncon_id))

    if not xs:
        return (
            np.empty((0, seq_len, len(cols))),
            np.empty((0, min(2, len(cols)))),
            np.empty(0, dtype=np.int64),
        )
    # Concatenation copies the strided views into regular, independent arrays.
    return np.concatenate(xs), np.concatenate(ys), np.concatenate(tr_ids)

# Build the windows separately for each split, so that no training window
# contains rows from the test period (and vice versa).
X_train, y_train, tr_train = create_sequences(train_df, SEQ_LEN)
X_test, y_test, tr_test = create_sequences(test_df, SEQ_LEN)

In [13]:
# --- PyTorch dataset ---
class TrafficDataset(Dataset):
    """Dataset of (input window, segment id, target) samples.

    The three inputs are converted to tensors once, at construction time:
    ``X`` and ``y`` as float32, ``troncon_ids`` as int64.
    """

    def __init__(self, X, y, troncon_ids):
        float_kwargs = {"dtype": torch.float32}
        self.X = torch.tensor(X, **float_kwargs)
        self.y = torch.tensor(y, **float_kwargs)
        # Integer dtype, as required for indexing an embedding table.
        self.troncon_ids = torch.tensor(troncon_ids, dtype=torch.long)

    def __len__(self):
        """Return the number of samples (first dimension of ``X``)."""
        return self.X.size(0)

    def __getitem__(self, idx):
        """Return the sample at ``idx`` as ``(window, segment id, target)``."""
        window = self.X[idx]
        segment = self.troncon_ids[idx]
        target = self.y[idx]
        return window, segment, target

# Wrap the NumPy arrays of each split into datasets.
train_dataset = TrafficDataset(X_train, y_train, tr_train)
test_dataset = TrafficDataset(X_test, y_test, tr_test)

# Training batches are reshuffled every epoch; the test loader keeps the
# sample order so predictions stay aligned with the order of the sequences.
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

In [14]:
# --- LSTM model with a learned embedding of the segment id ---
class LSTMWithTronconEmbedding(nn.Module):
    """LSTM regressor conditioned on a learned segment embedding.

    The embedding vector of the segment is repeated along the time axis and
    concatenated to the input features of every time step. The LSTM output at
    the last time step is mapped to ``output_dim`` values by a linear layer.

    Args:
        input_dim: Number of input features per time step.
        embedding_dim: Size of the segment embedding vector.
        hidden_dim: Hidden size of the LSTM.
        output_dim: Number of predicted values.
        num_layers: Number of stacked LSTM layers.
        n_troncons: Number of rows of the embedding table.
    """

    def __init__(self, input_dim, embedding_dim, hidden_dim=64, output_dim=2, num_layers=2, n_troncons=1):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=n_troncons, embedding_dim=embedding_dim)
        self.lstm = nn.LSTM(
            input_size=input_dim + embedding_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
        )
        self.fc = nn.Linear(in_features=hidden_dim, out_features=output_dim)

    def forward(self, x, troncon_id):
        """Predict from ``x`` (batch, seq_len, input_dim) and ``troncon_id`` (batch,)."""
        n_steps = x.size(1)
        segment_vec = self.embedding(troncon_id)  # (batch, embedding_dim)
        # Repeat the segment vector for every time step without copying memory.
        tiled = segment_vec[:, None, :].expand(-1, n_steps, -1)  # (batch, seq_len, embedding_dim)
        lstm_in = torch.cat((x, tiled), dim=-1)  # (batch, seq_len, input_dim + embedding_dim)
        hidden_seq, _ = self.lstm(lstm_in)
        # Only the output of the last time step feeds the regression head.
        return self.fc(hidden_seq[:, -1])

In [17]:
# --- Initialisation of the model, the loss and the optimiser ---
# `device` and `n_troncons` come from earlier cells, which therefore have to
# be executed before this one.
model = LSTMWithTronconEmbedding(
    input_dim=len(features),
    embedding_dim=8,
    hidden_dim=64,
    output_dim=2,
    num_layers=2,
    n_troncons=n_troncons
).to(device)

# Mean squared error over both targets, optimised with Adam.
loss_fn = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

NameError: name 'device' is not defined

In [None]:
# --- Training ---
EPOCHS = 20
for epoch in range(EPOCHS):
    model.train()
    total_loss = 0
    for X_batch, tr_ids, y_batch in train_loader:
        # Move the whole batch to the device the model lives on.
        X_batch, tr_ids, y_batch = X_batch.to(device), tr_ids.to(device), y_batch.to(device)

        # Standard step: reset gradients, forward, loss, backward, update.
        optimizer.zero_grad()
        preds = model(X_batch, tr_ids)
        loss = loss_fn(preds, y_batch)
        loss.backward()
        optimizer.step()

        # Accumulate the batch loss to report the epoch average below.
        total_loss += loss.item()

    print(f"Epoch {epoch+1}/{EPOCHS} - Loss: {total_loss / len(train_loader):.4f}")

In [None]:
# --- Evaluation ---
model.eval()
all_preds, all_targets, all_tr_enc = [], [], []

# No gradients are needed for inference.
with torch.no_grad():
    for X_batch, tr_ids, y_batch in test_loader:
        X_batch, tr_ids = X_batch.to(device), tr_ids.to(device)
        preds = model(X_batch, tr_ids).cpu().numpy()

        # Collect per-batch results as NumPy arrays, in loader order.
        all_preds.append(preds)
        all_targets.append(y_batch.numpy())
        all_tr_enc.append(tr_ids.cpu().numpy())

y_pred, y_true, tr_enc = (
    np.concatenate(parts) for parts in (all_preds, all_targets, all_tr_enc)
)

# Mean squared error per target column (values are in the scaled space).
mse = np.square(y_true - y_pred).mean(axis=0)
print(f"\nTest MSE - taux_occupation: {mse[0]:.4f}, temps_de_parcours: {mse[1]:.4f}")

# --- Build the timestamp and segment-name columns matching each prediction ---

def _collect_seq_column(data, seq_len, column, offset=0):
    """Return one value of ``column`` per sequence, in sequence order.

    Rows are grouped by ``troncon_enc`` and ordered by ``heure_arrondie``
    inside each group, i.e. the same traversal as the sequence builder. A
    group of ``n`` rows yields ``n - seq_len`` sequences; for the sequence
    starting at row ``i`` the value of row ``i + offset`` is returned.

    Raises:
        ValueError: If ``offset`` is outside ``[0, seq_len]`` (rows beyond
            that range do not exist for the last sequences).
    """
    if not 0 <= offset <= seq_len:
        raise ValueError(f"offset must be in [0, {seq_len}], got {offset}")
    values = []
    for _, group in data.groupby('troncon_enc'):
        n_seq = len(group) - seq_len
        if n_seq <= 0:
            # Too short to produce any sequence.
            continue
        ordered = group.sort_values('heure_arrondie')[column]
        values.extend(ordered.iloc[offset:offset + n_seq].tolist())
    return values


def get_seq_start_times(data, seq_len, offset=0):
    """Return one ``heure_arrondie`` value per sequence.

    With the default ``offset=0`` this is the timestamp of the first row of
    each input window; ``offset=seq_len`` gives the timestamp of the row that
    is predicted.
    """
    return _collect_seq_column(data, seq_len, 'heure_arrondie', offset)


def get_seq_troncon_names(data, seq_len, offset=0):
    """Return one ``nom_du_troncon`` value per sequence (see ``offset`` above)."""
    return _collect_seq_column(data, seq_len, 'nom_du_troncon', offset)


# Each target is the row located SEQ_LEN steps after the start of its input
# window, so the exported timestamp is taken at that row: this keeps
# `heure_arrondie` aligned with the real / predicted values written below.
heure_arrondie_seq = get_seq_start_times(test_df, SEQ_LEN, offset=SEQ_LEN)
nom_du_troncon_seq = get_seq_troncon_names(test_df, SEQ_LEN)

# --- DataFrames exported to CSV (one file per target) ---
# NOTE: y_true / y_pred are in the standardised space produced by the scaler.

df_taux = pd.DataFrame({
    'heure_arrondie': heure_arrondie_seq,
    'nom_du_troncon': nom_du_troncon_seq,
    'taux_occupation_reel': y_true[:, 0],
    'taux_occupation_pred': y_pred[:, 0]
})

df_temps = pd.DataFrame({
    'heure_arrondie': heure_arrondie_seq,
    'nom_du_troncon': nom_du_troncon_seq,
    'temps_de_parcours_reel': y_true[:, 1],
    'temps_de_parcours_pred': y_pred[:, 1]
})

df_taux.to_csv('taux_occupation_predictions.csv', index=False)
df_temps.to_csv('temps_de_parcours_predictions.csv', index=False)

print("Fichiers CSV sauvegardés.")

In [None]:

# --- Real vs predicted curves, one figure per target variable ---
# y_true and y_pred must already exist (they are produced by the evaluation
# step); column 0 is taux_occupation and column 1 is temps_de_parcours.
for col_idx, var_name in enumerate(('taux_occupation', 'temps_de_parcours')):
    plt.figure(figsize=(14, 4))
    plt.plot(y_true[:, col_idx], label=f'Réel - {var_name}', alpha=0.7)
    plt.plot(y_pred[:, col_idx], label=f'Prédit - {var_name}', alpha=0.7)
    plt.title(f"Comparaison Réel vs Prédit : {var_name}")
    plt.xlabel("Échantillons (temps)")
    plt.ylabel("Valeur normalisée")
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.show()
