# Recherche Prédictive - Vision 2026 (LightGBM Ultra-Tuning)


In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import lightgbm as lgb
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit
import joblib
import os

In [2]:
# Load the raw admissions file and derive a gap-free daily admissions series.
df_adm = pd.read_csv('../data/raw/admissions_hopital_pitie_2024.csv')
df_adm['date_entree'] = pd.to_datetime(df_adm['date_entree'])

# One input row per admission -> number of rows per entry date.
# NOTE(review): the grouping key is the raw timestamp; this assumes
# 'date_entree' carries no time-of-day component - confirm against the CSV.
admissions_per_day = df_adm.groupby('date_entree').size()

# Put the counts on a continuous daily calendar; days absent from the file
# are inserted with 0 admissions.
daily_data = admissions_per_day.to_frame('admissions').asfreq('D', fill_value=0)

# Public holidays in France for 2024, used when the caller supplies none.
_FRENCH_HOLIDAYS_2024 = (
    '2024-01-01', '2024-04-01', '2024-05-01', '2024-05-08',
    '2024-05-09', '2024-05-20', '2024-07-14', '2024-08-15',
    '2024-11-01', '2024-11-11', '2024-12-25',
)


def create_features_advanced(df, holidays=None):
    """Build calendar, holiday, lag and rolling features for a daily series.

    Args:
        df: DataFrame indexed by a ``DatetimeIndex`` (one row per day) that
            contains an ``'admissions'`` column. It is not modified.
        holidays: Optional iterable of holiday dates (anything accepted by
            ``pd.to_datetime``). Defaults to the French public holidays of
            2024.

    Returns:
        A copy of ``df`` with the feature columns appended, restricted to the
        rows where every feature is defined (the first rows, which lack
        enough history for the longest lag/rolling window, are dropped).
    """
    df = df.copy()
    idx = df.index

    # Basic cyclical encodings (month of year, day of week)
    df['month_sin'] = np.sin(2 * np.pi * idx.month / 12)
    df['month_cos'] = np.cos(2 * np.pi * idx.month / 12)
    df['day_sin'] = np.sin(2 * np.pi * idx.dayofweek / 7)
    df['day_cos'] = np.cos(2 * np.pi * idx.dayofweek / 7)

    # Holidays
    if holidays is None:
        holidays = _FRENCH_HOLIDAYS_2024
    holiday_dates = pd.to_datetime(list(holidays)).sort_values()
    # Compare calendar days as strings so a time-of-day on the index is ignored.
    holiday_keys = holiday_dates.strftime('%Y-%m-%d')
    df['is_holiday'] = idx.strftime('%Y-%m-%d').isin(holiday_keys).astype(int)

    # Days until the next holiday (0 on a holiday); 365 when none is left.
    # A binary search on the sorted holidays replaces a per-row scan.
    hol_values = holiday_dates.to_numpy()
    idx_values = idx.to_numpy()
    if hol_values.size:
        pos = np.searchsorted(hol_values, idx_values, side='left')
        has_next = pos < hol_values.size
        # Clamp so the lookup stays in range; clamped rows are masked below.
        next_holiday = hol_values[np.minimum(pos, hol_values.size - 1)]
        gap_days = (next_holiday - idx_values) // np.timedelta64(1, 'D')
        df['days_to_holiday'] = np.where(has_next, gap_days, 365)
    else:
        df['days_to_holiday'] = 365

    # Extended calendar features.
    # The period follows the actual year length: with a fixed 365, day 366 of
    # a leap year would receive the same encoding as 1 January.
    year_length = np.where(idx.is_leap_year, 366, 365)
    doy_angle = 2 * np.pi * np.asarray(idx.dayofyear, dtype=float) / year_length
    df['dayofyear_sin'] = np.sin(doy_angle)
    df['dayofyear_cos'] = np.cos(doy_angle)
    # Assign positionally, like every other calendar column.
    df['weekofyear'] = idx.isocalendar().week.astype(int).to_numpy()
    df['dayofmonth'] = idx.day
    df['quarter'] = idx.quarter
    df['is_month_start'] = idx.is_month_start.astype(int)
    df['is_month_end'] = idx.is_month_end.astype(int)

    # Lags: the past week day by day, then weekly steps up to four weeks
    for lag in [1, 2, 3, 4, 5, 6, 7, 14, 21, 28]:
        df[f'lag{lag}'] = df['admissions'].shift(lag)

    # Rolling statistics computed on the series shifted by one day, so the
    # window never includes the value of the row being described.
    previous_days = df['admissions'].shift(1)
    for window in [3, 7, 14, 28]:
        rolling = previous_days.rolling(window=window)
        df[f'roll_mean_{window}'] = rolling.mean()
        df[f'roll_std_{window}'] = rolling.std()
        df[f'roll_max_{window}'] = rolling.max()

    return df.dropna()

# Build the model matrix from the daily series.
features_df = create_features_advanced(daily_data)

TARGET = 'admissions'
# Every column except the target is used as a predictor.
FEATURES = features_df.columns.drop(TARGET).tolist()

# Chronological hold-out: the last 30 rows form the test set (meant to
# approximate December), everything before them is used for training.
train_df, test_df = features_df.iloc[:-30], features_df.iloc[-30:]
X_train, X_test = train_df[FEATURES], test_df[FEATURES]
y_train, y_test = train_df[TARGET], test_df[TARGET]

print(f"Dimensions Train: {X_train.shape}, Test: {X_test.shape}")

Dimensions Train: (307, 35), Test: (30, 35)


## 2. GridSearch Profond (Objectif MAE)
L'utilisation de `objective='regression_l1'` est cruciale, car elle force LightGBM à minimiser directement l'écart absolu (MAE).

In [None]:
# Search space: two candidate values per hyper-parameter (2**6 = 64 combinations).
param_grid = dict(
    n_estimators=[2000, 3000],
    learning_rate=[0.005, 0.01],
    num_leaves=[63, 127],
    max_depth=[-1, 20],
    min_child_samples=[5, 20],
    feature_fraction=[0.8, 0.9],
)

# Note: this can take a while to run.
# Time-ordered folds: rows are split in their chronological order, not shuffled.
tscv = TimeSeriesSplit(n_splits=5)

# L1 objective so the booster optimises absolute error, matching the scorer.
base_model = lgb.LGBMRegressor(objective='regression_l1', random_state=42, verbose=-1)
grid = GridSearchCV(
    estimator=base_model,
    param_grid=param_grid,
    scoring='neg_mean_absolute_error',
    cv=tscv,
    n_jobs=-1,
    verbose=1,
)

print("Execution du tuning intensif...")
grid.fit(X_train, y_train)

# Model for the best combination found by the search.
best_lgb = grid.best_estimator_

Execution du tuning intensif...
Fitting 5 folds for each of 64 candidates, totalling 320 fits


## 3. Évaluation Finale (Objectif MAE ~ 10)
Voyons si nous pouvons passer sous le plafond d'une MAE de 90.

In [None]:
# Compare the cross-validated score with the score on the hold-out window.
mae_val = -grid.best_score_  # the scorer is the negated MAE; flip the sign back
y_pred = best_lgb.predict(X_test)
mae_test = mean_absolute_error(y_test, y_pred)

print(f"Points de Tuning Trouves : {grid.best_params_}")
print(f"MAE de Validation (Fold Moyen) : {mae_val:.2f}")
print(f"MAE FINALE SUR TEST (Decembre) : {mae_test:.2f}")

# Overlay the predictions on the observed admissions for the test period.
observed_trace = go.Scatter(
    x=test_df.index,
    y=y_test,
    name='Donnees Reelles',
    line=dict(color='#005ba1', width=3),
)
predicted_trace = go.Scatter(
    x=test_df.index,
    y=y_pred,
    name='LightGBM Ultra-Tuned',
    line=dict(color='#ff7f0e', dash='dot', width=3),
)
fig = go.Figure(data=[observed_trace, predicted_trace])
fig.update_layout(title='Performance LightGBM : Objectif MAE Minimal', template='plotly_dark')
fig.show()


In [None]:

# Feature importances of the tuned model, to see which inputs drive it.
importances = pd.DataFrame({'feature': FEATURES, 'importance': best_lgb.feature_importances_})
importances = importances.sort_values('importance', ascending=False).head(15)
fig_imp = px.bar(importances, x='importance', y='feature', orientation='h', title='Top 15 Features Impactantes')
# Horizontal bar categories are drawn bottom-to-top in data order, which
# would put the most important feature at the bottom; order the axis by bar
# length so the largest bar is on top.
fig_imp.update_layout(template='plotly_dark', yaxis=dict(categoryorder='total ascending'))
fig_imp.show()