# LightGBM 

In [17]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import RandomizedSearchCV, TimeSeriesSplit
import warnings
warnings.filterwarnings('ignore')

In [18]:
# 1. "Super-signal" feature engineering
# Load the raw admissions export and parse the admission timestamps.
df_adm = pd.read_csv('../data/raw/admissions_hopital_pitie_2024.csv')
df_adm['date_entree'] = pd.to_datetime(df_adm['date_entree'])

# Daily admission counts. The grouping key is normalized to midnight so that
# rows carrying a time of day still collapse into their calendar day; without
# this, counts would be per distinct timestamp and `asfreq('D')` would keep
# only the timestamps that happen to fall on the daily grid. For date-only
# input the normalization is a no-op. Days with no admission are filled with 0.
dd = (
    df_adm.groupby(df_adm['date_entree'].dt.normalize())
    .size()
    .rename('admissions')
    .asfreq('D', fill_value=0)
)

def create_super_features(df_ts, n_harmonics=3, ewma_spans=(7, 28),
                          lags=(1, 2, 7, 14, 21),
                          winter_months=(11, 12, 1, 2)):
    """Build the feature table used to forecast daily admissions.

    Args:
        df_ts: Series of admission counts. Its index must expose the
            ``dayofyear``, ``month`` and ``dayofweek`` attributes (i.e. a
            ``DatetimeIndex``).
        n_harmonics: Number of yearly Fourier harmonics (sin/cos pairs).
        ewma_spans: Spans of the exponentially weighted means.
        lags: Shifts, in rows, used for the lagged-target columns.
        winter_months: Month numbers flagged by ``is_winter``.

    Returns:
        A DataFrame indexed like ``df_ts`` holding the ``admissions`` target
        followed by the Fourier, EWMA, calendar and lag columns. Rows that
        contain a NaN (produced by the shifts at the start of the series)
        are dropped.
    """
    df = pd.DataFrame(index=df_ts.index)
    df['admissions'] = df_ts.values

    # Fourier terms (complex seasonality) with a 365.25-day period.
    day_of_year = np.asarray(df.index.dayofyear)
    for k in range(1, n_harmonics + 1):
        angle = 2 * np.pi * k * day_of_year / 365.25
        df[f'fourier_sin_{k}'] = np.sin(angle)
        df[f'fourier_cos_{k}'] = np.cos(angle)

    # Momentum (EWMA). The series is shifted by one row first so a row's
    # feature never includes that row's own target value.
    previous_day = df['admissions'].shift(1)
    for span in ewma_spans:
        df[f'ewma_{span}'] = previous_day.ewm(span=span).mean()

    # Business calendar features.
    df['month'] = df.index.month
    df['is_winter'] = df['month'].isin(list(winter_months)).astype(int)
    df['day'] = df.index.dayofweek

    # Lagged copies of the target.
    for lag in lags:
        df[f'lag_{lag}'] = df['admissions'].shift(lag)

    return df.dropna()

# Build the feature table, then separate the predictors from the target.
full_df = create_super_features(dd)
y = full_df['admissions']
X = full_df.drop(columns='admissions')

# Hold out the last 3 months (90 rows) as the test window; the split is
# chronological, so no shuffling is involved.
test_days = 90
X_train = X.iloc[:-test_days]
X_test = X.iloc[-test_days:]
y_train = y.iloc[:-test_days]
y_test = y.iloc[-test_days:]

In [None]:
# 2. Tuning with the 'DART' booster (generally more accurate for time series)
# Candidate values sampled by the randomized search.
param_dist = {
    'num_leaves': [31, 63, 127, 255],
    'learning_rate': [0.01, 0.05, 0.1],
    'n_estimators': [1000, 2000],
    'feature_fraction': [0.7, 0.8, 0.9],
    'bagging_fraction': [0.8],
    'bagging_freq': [5],
    'lambda_l1': [0, 0.1, 1],
    'lambda_l2': [0, 0.1, 1]
}

# Expanding-window splits keep every validation fold after its training fold.
tscv = TimeSeriesSplit(n_splits=5)
rs = RandomizedSearchCV(
    # LightGBM selects the boosting algorithm through `boosting_type`
    # (aliases `boosting` / `boost`). `booster` is the XGBoost spelling:
    # LightGBM ignores it as an unknown parameter, so DART was never used.
    lgb.LGBMRegressor(objective='regression_l1', boosting_type='dart', random_state=42, verbose=-1, n_jobs=-1),
    param_distributions=param_dist,
    n_iter=30,
    cv=tscv,
    scoring='neg_mean_absolute_error',
    # Seed the parameter sampling so the search is reproducible.
    random_state=42
)

print("Recherche du modèle ultime en cours...")
rs.fit(X_train, y_train)
best_lgbm = rs.best_estimator_

Recherche du modèle ultime en cours...


In [None]:
# Score the tuned model on the held-out window.
preds = best_lgbm.predict(X_test)
mae = mean_absolute_error(y_test, preds)
print(f"\nMAE FINAL (Ultra-Optimisé) : {mae:.2f}")

import matplotlib.pyplot as plt

# Overlay the forecast on the observed admissions for the test period.
fig, ax = plt.subplots(figsize=(15, 6))
ax.plot(y_test.index, y_test, label='Réel', color='#1a3a5f', alpha=0.5)
ax.plot(
    y_test.index,
    preds,
    label='LightGBM DART + Fourier',
    color='#c8102e',
    linewidth=2,
)
ax.set_title(f'Vision 2026 : Précision Maximale (MAE: {mae:.2f})')
ax.legend()
plt.show()