# Recherche Predictive - Vision 2026 (Explication des Performances)

Ce notebook clarifie la difference entre la **MAE de 0.00** observee dans le script d'entrainement et la **MAE plus elevee** observee lors de la prediction du mois de decembre.

In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import xgboost as xgb
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit
import joblib
import os

In [None]:
# 1. Data loading and feature engineering
# Read the raw admissions export and parse the admission-date column.
df_adm = pd.read_csv('../data/raw/admissions_hopital_pitie_2024.csv')
df_adm['date_entree'] = pd.to_datetime(df_adm['date_entree'])

# Count rows per distinct `date_entree` value, then reindex onto a continuous
# daily calendar so that days without any admission show up with a count of 0.
# NOTE(review): assumes `date_entree` carries no time-of-day component,
# otherwise the grouping is per timestamp rather than per day -- confirm.
admissions_per_day = df_adm.groupby('date_entree').size().rename('admissions')
daily_data = admissions_per_day.to_frame().asfreq('D', fill_value=0)

def create_features(df, holidays=None):
    """Add calendar, holiday, lag and rolling-window features to a daily series.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``DatetimeIndex`` and an ``admissions`` column. The input
        is not modified; the function works on a copy.
    holidays : iterable of date-like, optional
        Holiday dates used for ``is_holiday`` and ``days_to_holiday``.
        Defaults to the 2024 list below plus 2025-01-01, so that the last
        days of December measure their distance to New Year's Day instead of
        falling back to the "no upcoming holiday" sentinel.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the feature columns added. Rows containing any
        NaN are dropped, which removes at least the first 14 rows because
        ``lag14`` is undefined there.
    """
    # Value of `days_to_holiday` when no listed holiday lies on or after a day.
    no_upcoming_holiday = 365

    if holidays is None:
        holidays = ['2024-01-01', '2024-04-01', '2024-05-01', '2024-05-08',
                    '2024-05-09', '2024-05-20', '2024-07-14', '2024-08-15',
                    '2024-11-01', '2024-11-11', '2024-12-25',
                    # Look-ahead past the year end: without it, 26-31 Dec
                    # would get the 365 sentinel although a holiday is
                    # only a few days away.
                    '2025-01-01']

    df = df.copy()
    index = df.index

    # Cyclical encodings of month and day of week.
    df['month_sin'] = np.sin(2 * np.pi * index.month / 12)
    df['month_cos'] = np.cos(2 * np.pi * index.month / 12)
    df['day_sin'] = np.sin(2 * np.pi * index.dayofweek / 7)
    df['day_cos'] = np.cos(2 * np.pi * index.dayofweek / 7)

    # Holiday flag: compare calendar dates as 'YYYY-MM-DD' strings.
    holiday_dates = pd.DatetimeIndex(pd.to_datetime(list(holidays))).sort_values()
    holiday_keys = holiday_dates.strftime('%Y-%m-%d')
    df['is_holiday'] = index.strftime('%Y-%m-%d').isin(holiday_keys).astype(int)

    # Days until the next holiday (0 on the holiday itself), computed in one
    # vectorised pass: binary-search each day in the sorted holiday calendar.
    naive_index = index.tz_localize(None) if index.tz is not None else index
    day_values = naive_index.values.astype('datetime64[D]')
    holiday_days = holiday_dates.values.astype('datetime64[D]')
    n_holidays = len(holiday_days)
    if n_holidays:
        pos = np.searchsorted(holiday_days, day_values, side='left')
        has_next = pos < n_holidays
        # Clamp so the fancy-index stays in bounds; clamped entries are
        # overwritten by the sentinel in np.where below.
        next_holiday = holiday_days[np.minimum(pos, n_holidays - 1)]
        gap_days = (next_holiday - day_values).astype('int64')
        df['days_to_holiday'] = np.where(has_next, gap_days, no_upcoming_holiday)
    else:
        df['days_to_holiday'] = no_upcoming_holiday

    # Position within the year / month.
    # NOTE: the day-of-year period is fixed at 365, including in leap years.
    df['dayofyear'] = index.dayofyear
    df['dayofyear_sin'] = np.sin(2 * np.pi * df['dayofyear'] / 365)
    df['dayofyear_cos'] = np.cos(2 * np.pi * df['dayofyear'] / 365)
    df['weekofyear'] = index.isocalendar().week.astype(int)
    df['dayofmonth'] = index.day

    # Lagged targets.
    admissions = df['admissions']
    df['lag1'] = admissions.shift(1)
    df['lag2'] = admissions.shift(2)
    df['lag7'] = admissions.shift(7)
    df['lag14'] = admissions.shift(14)

    # Rolling statistics over the *previous* days only: shifting by one keeps
    # the current day's value out of its own features.
    previous = admissions.shift(1)
    df['roll_mean_3'] = previous.rolling(window=3).mean()
    df['roll_mean_7'] = previous.rolling(window=7).mean()
    df['roll_max_7'] = previous.rolling(window=7).max()
    df['roll_min_7'] = previous.rolling(window=7).min()
    df['roll_std_7'] = previous.rolling(window=7).std()

    return df.dropna()

# Build the engineered feature table from the daily admissions series.
features_df = create_features(daily_data)

# Column to predict, and the model inputs. The list order is the column order
# handed to the model, so it is kept exactly as is.
TARGET = 'admissions'
FEATURES = [
    'month_sin', 'month_cos', 'day_sin', 'day_cos',
    'is_holiday', 'days_to_holiday',
    'dayofyear_sin', 'dayofyear_cos', 'weekofyear', 'dayofmonth',
    'lag1', 'lag2', 'lag7', 'lag14',
    'roll_mean_3', 'roll_mean_7', 'roll_max_7', 'roll_min_7', 'roll_std_7',
]

# Chronological hold-out: the final 30 rows are the test set and everything
# before them is the training set (no shuffling).
train, test = features_df.iloc[:-30], features_df.iloc[-30:]
X_train, X_test = train[FEATURES], test[FEATURES]
y_train, y_test = train[TARGET], test[TARGET]

## 2. Optimisation via GridSearchCV
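Nous entrainons le modele sur les donnees 'Train', c'est-a-dire toutes les lignes sauf les 30 dernieres (soit environ Janvier-Novembre). La recherche d'hyperparametres utilise une validation croisee temporelle (`TimeSeriesSplit`) : chaque pli est valide sur des dates posterieures a celles utilisees pour l'entrainement.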

In [None]:
# Candidate hyper-parameters: a single (large) number of trees and a single
# objective, crossed with two learning rates and two tree depths.
param_grid = dict(
    n_estimators=[2000],
    learning_rate=[0.01, 0.05],
    max_depth=[12, 15],
    objective=['reg:squarederror'],
)

# Time-ordered folds: each validation fold lies after its training fold.
tscv = TimeSeriesSplit(n_splits=3)

# Exhaustive search over the grid, scored by (negated) mean absolute error.
grid = GridSearchCV(
    estimator=xgb.XGBRegressor(),
    param_grid=param_grid,
    cv=tscv,
    scoring='neg_mean_absolute_error',
    verbose=1,
)
grid.fit(X_train, y_train)

# Best-scoring candidate, as refit by GridSearchCV on the full training set.
model = grid.best_estimator_

## 3. Train MAE (Memorisation) vs Test MAE (Prediction)

C'est ici que se trouve la reponse a votre question :
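- **Train MAE** : Mesure a quel point le modele a memorise le passe. Avec des arbres tres profonds (profondeur 12 ou 15, selon le meilleur modele retenu par la recherche) et 2000 estimateurs, elle atteint **~0.00**.
- **Test MAE** : Mesure a quel point le modele peut predire les 30 derniers jours (essentiellement le mois de Decembre), qu'il n'a jamais vus pendant l'entrainement.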

In [None]:
# Predict on the rows the model was fitted on and on the held-out rows.
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

# Report both errors side by side: fit quality vs. hold-out quality.
train_mae = mean_absolute_error(y_train, y_train_pred)
test_mae = mean_absolute_error(y_test, y_test_pred)
print(f"MAE sur TRAIN (Memorisation) : {train_mae:.4f}")
print(f"MAE sur TEST (Decembre Unseen) : {test_mae:.2f}")

# Visual check of the fit on the training set (last 30 training days).
train_sample = train.tail(30)
fig_train = go.Figure()
fig_train.add_trace(
    go.Scatter(
        x=train_sample.index,
        y=y_train.iloc[-30:],
        name='Reel (Jan-Nov)',
        line={'color': 'gray'},
    )
)
fig_train.add_trace(
    go.Scatter(
        x=train_sample.index,
        y=y_train_pred[-30:],
        name='Pred (Memorisation)',
        line={'color': 'cyan', 'dash': 'dash'},
    )
)
fig_train.update_layout(title='Verification Memorisation : MAE quasi nulle', template='plotly_dark')
fig_train.show()

# Actual vs. predicted on the held-out period.
fig_test = go.Figure()
fig_test.add_trace(
    go.Scatter(
        x=test.index,
        y=y_test,
        name='Reel (Decembre)',
        line={'color': '#005ba1', 'width': 3},
    )
)
fig_test.add_trace(
    go.Scatter(
        x=test.index,
        y=y_test_pred,
        name='Prediction (Unseen)',
        line={'color': '#c8102e', 'dash': 'dot', 'width': 3},
    )
)
fig_test.update_layout(title='Realite : Performance sur le Futur Unseen', template='plotly_dark')
fig_test.show()