# Recherche Prédictive - Vision 2026 (Version Hybride Optimisée)

Ce notebook présente l'approche hybride **Linear Regression + XGBoost** avec une étape cruciale de **GridSearchCV** pour l'optimisation des hyperparamètres.

In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import xgboost as xgb
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit
import joblib
import os

In [None]:
# Data preparation: load the raw admissions export and build a gap-free daily series.
df_adm = pd.read_csv('../data/raw/admissions_hopital_pitie_2024.csv')
df_adm['date_entree'] = pd.to_datetime(df_adm['date_entree'])
# Bucket by calendar day. Truncating to midnight keeps a time-of-day component
# from splitting one day into several groups that asfreq('D') would then discard.
# The column in df_adm itself is left untouched.
admission_day = df_adm['date_entree'].dt.normalize()
daily_data = (
    df_adm.groupby(admission_day)
    .size()
    .rename('admissions')
    .to_frame()
    # Days without any recorded admission become explicit zeros rather than gaps.
    .asfreq('D', fill_value=0)
)

def create_features(df, target='admissions'):
    """Build the calendar, lag and rolling-window features used by the models.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame indexed by a ``DatetimeIndex`` and containing the ``target``
        column. The input is not modified; the work is done on a copy.
    target : str, default 'admissions'
        Name of the column the lag and rolling features are derived from.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the feature columns appended, minus every row that
        contains a NaN. ``lag14`` is undefined for the first 14 rows, so at
        least those rows are always dropped.
    """
    out = df.copy()
    idx = out.index

    # Cyclical encodings: map month and weekday onto the unit circle so that
    # December/January and Sunday/Monday end up close to each other.
    month_angle = 2 * np.pi * idx.month / 12
    weekday_angle = 2 * np.pi * idx.dayofweek / 7
    out['month_sin'] = np.sin(month_angle)
    out['month_cos'] = np.cos(month_angle)
    out['day_sin'] = np.sin(weekday_angle)
    out['day_cos'] = np.cos(weekday_angle)

    # Raw calendar positions.
    out['dayofyear'] = idx.dayofyear
    out['weekofyear'] = idx.isocalendar().week.astype(int)

    # Past values of the target: previous day, one week ago, two weeks ago.
    series = out[target]
    for lag in (1, 7, 14):
        out[f'lag{lag}'] = series.shift(lag)

    # Rolling statistics are computed on the series shifted by one day, so the
    # window covers the 7 days *before* the current row and never the row itself.
    trailing_week = series.shift(1).rolling(window=7)
    out['roll_mean_7'] = trailing_week.mean()
    out['roll_std_7'] = trailing_week.std()

    return out.dropna()

# Build the model matrix; create_features drops the rows that contain NaN lags.
features_df = create_features(daily_data)
FEATURES = ['month_sin', 'month_cos', 'day_sin', 'day_cos', 'dayofyear', 
            'weekofyear', 'lag1', 'lag7', 'lag14', 'roll_mean_7', 'roll_std_7']
TARGET = 'admissions'

# Chronological hold-out: the most recent TEST_DAYS rows are kept out of training.
TEST_DAYS = 30
if len(features_df) <= TEST_DAYS:
    # Fail here with a clear message instead of fitting on an empty training set.
    raise ValueError(
        f"Historique insuffisant : {len(features_df)} lignes exploitables, "
        f"il en faut plus de {TEST_DAYS}."
    )

train = features_df.iloc[:-TEST_DAYS]
test = features_df.iloc[-TEST_DAYS:]
X_train, y_train = train[FEATURES], train[TARGET]
X_test, y_test = test[FEATURES], test[TARGET]

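## 1. Baseline Linéaire
On définit la tendance et le niveau de base à l'aide d'une régression linéaire ; ses résidus sur le jeu d'entraînement serviront de cible à XGBoost.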

In [None]:
# Stage 1 of the hybrid: a linear regression fitted on the training features.
lr = LinearRegression().fit(X_train, y_train)
# In-sample errors of the linear stage; the boosted stage is trained on them.
baseline_fit = lr.predict(X_train)
residuals_train = y_train - baseline_fit

## 2. Optimisation des Hyperparamètres (Grid Search)
Recherche des meilleurs réglages pour XGBoost sur les résidus, avec une validation croisée temporelle (`TimeSeriesSplit`).

In [None]:
# Candidate hyperparameters: 2 x 2 x 2 x 2 = 16 combinations.
param_grid = dict(
    n_estimators=[300, 700],
    learning_rate=[0.03, 0.05],
    max_depth=[4, 6],
    subsample=[0.8, 1.0],
)

# Time-ordered folds, so each validation fold comes after its training fold.
tscv = TimeSeriesSplit(n_splits=3)
grid = GridSearchCV(
    estimator=xgb.XGBRegressor(objective='reg:squarederror'),
    param_grid=param_grid,
    scoring='neg_mean_absolute_error',
    cv=tscv,
)
# The boosted model learns the residuals left by the linear baseline.
grid.fit(X_train, residuals_train)

best_xgb = grid.best_estimator_
print(f"Meilleurs parametres : {grid.best_params_}")

## 3. Évaluation Finale
Combinaison de la tendance linéaire et des résidus prédits par le modèle XGBoost optimisé, évaluée sur les 30 derniers jours.

In [None]:
# Hybrid forecast = linear stage + boosted prediction of its residuals.
linear_part = lr.predict(X_test)
residual_part = best_xgb.predict(X_test)
y_pred = linear_part + residual_part

mae_final = mean_absolute_error(y_test, y_pred)
print(f"MAE Finale : {mae_final:.2f}")

# Observed values against the hybrid forecast over the hold-out window.
actual_trace = go.Scatter(
    x=test.index,
    y=y_test,
    name='Donnees Reelles',
    line={'color': '#005ba1'},
)
forecast_trace = go.Scatter(
    x=test.index,
    y=y_pred,
    name='Hybride Optimise (LR+XGB)',
    line={'color': '#c8102e', 'dash': 'dash'},
)
fig = go.Figure(data=[actual_trace, forecast_trace])
fig.update_layout(title='Prediction Optimisee via GridSearch', template='plotly_dark')
fig.show()