In [279]:
# Libraries
# ==============================================================================
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import mean_absolute_error,mean_absolute_percentage_error,mean_squared_error
from skforecast.datasets import fetch_dataset

# Importation des modeles
from sklearn.linear_model import Ridge, LinearRegression
from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMRegressor
import xgboost as xgb

from skforecast.preprocessing import RollingFeatures
from skforecast.recursive import ForecasterRecursiveMultiSeries
from skforecast.model_selection import (
    TimeSeriesFold,
    backtesting_forecaster_multiseries,
    grid_search_forecaster_multiseries,
    bayesian_search_forecaster_multiseries
)
from skforecast.plot import set_dark_theme

In [280]:
# Constantes
TEST_SIZE = 24
VAL_SIZE = 12
LAG_SIZE = 6
WINDOW_SIZE = 4

In [281]:
# Liste des modèles à comparer
models = {
    'XGBoost': {
        "alg": xgb.XGBRegressor(objective='reg:squarederror', verbosity=0),
        "params_grid_search": {
            "eta": [0.1, 0.3],
            "n_estimators": [50, 100]
        }
    },
    "RegressionLineaire": {
        "alg": LinearRegression(),
        "params_grid_search": None
    },
    'Ridge': {
        "alg": Ridge(alpha=1.0),
        "params_grid_search": {
            "alpha": [0.1, 0.5, 1.0]
        }
    },
    'LightGBM': {
        "alg": LGBMRegressor(n_estimators=100, learning_rate=0.2, verbosity=-1),
        "params_grid_search": {
            "learning_rate": [0.1, 0.2],
            "n_estimators": [50, 100]
        }
    },
    'RandomForrest': {
        "alg": RandomForestRegressor(random_state=42, criterion="absolute_error"),
        "params_grid_search": {
            "n_estimators": [50, 100],
            "max_depth": [None, 10, 20]
        }
    }
}

# Pour stocker les performances
performance = {
    name: {"rmse": [], "mae": [], "mape": []}
    for name in models
}
best_parameters = {name: {} for name in models}


In [282]:
def load_data(path="dataset/train.csv"):
    """
    Charger les données à partir d'un fichier CSV ou base de donnée."""
    # Charger les données
    # if type(path)=="str":
    data = pd.read_csv(path, delimiter=',', header=0, parse_dates=True, index_col=0)
    data = data.sort_index()
    data = data.sort_values(by=["item", "store"])
    return data

In [283]:
# def resample_data(data, freq='W'):
#     """
#     Pour chaque couple (item, store), étend les dates à une fréquence journalière (avec ventes = 0 si manquant),
#     puis fait un resample à la fréquence spécifiée (ex: hebdomadaire).
#     """
#     data = data.copy()
#     # data['date'] = pd.to_datetime(data['date'])
#     # data.set_index('date', inplace=True)
    
#     # Créer la plage de dates complète pour ce groupe
#     full_range = pd.date_range(start=data.index.min(), end=data.index.max(), freq='D')

#     full_data = []

#     # Boucle sur chaque groupe item/store
#     for (item, store), group in data.groupby(['item', 'store']):
        
#         # Reindex pour inclure toutes les dates avec 0 si manquant
#         group = group.reindex(full_range, fill_value=0)
#         group.index.name = 'date'

#         # Ajouter les colonnes manquantes (item, store)
#         group['item'] = item
#         group['store'] = store

#         # Resample à la fréquence souhaitée
#         group_resampled = group.resample(freq).sum()  # ou .mean() selon ton besoin

#         # Remettre item/store
#         group_resampled['item'] = item
#         group_resampled['store'] = store

#         full_data.append(group_resampled)

#     # Fusionner tous les groupes
#     result = pd.concat(full_data).reset_index()
#     return result


In [284]:
def pivot_full_series(data, start_date, end_date):
    """
    Crée un DataFrame avec full_range en index et chaque colonne = produit-store.
    Les dates manquantes sont remplies avec 0.
    """
    # Convertir en datetime
    data = data.copy()
    
    # S'assurer que l'index est bien la date
    if 'date' in data.columns:
        data['date'] = pd.to_datetime(data['date'])
        data = data.set_index('date')

    # Créer la clé colonne "produit-store"
    data['produit_store'] = data['item'].astype(str) + '-' + data['store'].astype(str)

    # Pivot
    pivot = data.pivot_table(
        index='date',
        columns='produit_store',
        values='sales',  # <-- à adapter si ta colonne s'appelle différemment
        aggfunc='sum'
    )

    # Reindex sur full_range
    full_range = pd.date_range(start=start_date, end=end_date, freq='D')
    pivot = pivot.reindex(full_range)

    # Remplir les valeurs manquantes avec 0
    pivot.fillna(0, inplace=True)

    return pivot


In [285]:
data=load_data()
data

Unnamed: 0_level_0,store,item,sales
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2013-01-01,1,1,13
2013-01-02,1,1,11
2013-01-03,1,1,14
2013-01-04,1,1,13
2013-01-05,1,1,10
...,...,...,...
2017-12-27,10,50,63
2017-12-28,10,50,59
2017-12-29,10,50,74
2017-12-30,10,50,62


In [286]:
df_pivot = pivot_full_series(data, start_date=data.index.min(), end_date=data.index.max())

In [287]:
df_pivot.shape

(1826, 500)

In [288]:
df_weekly = df_pivot.resample('W').sum()
# df_weekly.reset_index(inplace=True)

In [289]:
df_weekly

produit_store,1-1,1-10,1-2,1-3,1-4,1-5,1-6,1-7,1-8,1-9,...,9-1,9-10,9-2,9-3,9-4,9-5,9-6,9-7,9-8,9-9
2013-01-06,73,84,98,96,78,68,65,44,101,95,...,152,183,196,194,176,128,150,127,203,184
2013-01-13,66,117,116,87,107,78,78,47,86,81,...,185,237,260,233,199,148,173,139,237,215
2013-01-20,80,103,113,110,84,56,62,63,99,95,...,184,257,255,235,220,156,160,149,254,226
2013-01-27,70,95,100,102,75,58,50,63,100,86,...,170,217,237,228,228,168,157,148,268,202
2013-02-03,86,100,112,118,99,73,66,76,101,123,...,184,262,287,232,244,173,170,143,252,258
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2017-12-03,133,178,195,186,157,122,106,109,196,167,...,355,420,495,412,377,258,293,255,465,376
2017-12-10,126,130,142,166,138,100,95,102,161,128,...,263,344,387,346,319,208,239,194,373,319
2017-12-17,117,144,177,165,145,107,95,80,144,126,...,273,334,391,378,338,249,228,211,365,298
2017-12-24,97,148,150,169,141,93,98,82,162,144,...,252,310,355,340,326,224,234,205,376,325


In [290]:
data_train=df_weekly.iloc[:-TEST_SIZE].copy()
data_test=df_weekly.iloc[-TEST_SIZE:].copy()

In [291]:
print("Train shape:", data_train.shape)
print("Test shape:", data_test.shape)

Train shape: (237, 500)
Test shape: (24, 500)


In [292]:
# Create and train ForecasterRecursiveMultiSeries
# ==============================================================================
forecaster = ForecasterRecursiveMultiSeries(
                 regressor          = LGBMRegressor(random_state=123, verbose=-1),
                 lags               = 24,
                 window_features    = RollingFeatures(stats=['mean', 'mean'], window_sizes=[24, 48]),
                 encoding           = 'ordinal',
                 transformer_series = None,
                 transformer_exog   = None,
                 weight_func        = None,
                 series_weights     = None,
                 differentiation    = None,
                 dropna_from_series = False,
                 fit_kwargs         = None,
                 forecaster_id      = None
             )

forecaster.fit(series=data_train, store_in_sample_residuals=True)
forecaster

In [293]:
# Predictions and prediction intervals
# ==============================================================================
steps = 24

# Predictions for item_1
predictions = forecaster.predict(steps=TEST_SIZE)
display(predictions.head(20))

# # Interval predictions for item_1 and item_2
# predictions_intervals = forecaster.predict_interval(
#     steps    = steps,
#     levels   = ['item_1', 'item_2'],
#     method   = "conformal",
#     interval = 0.9
# )
# display(predictions_intervals.head(3))

Unnamed: 0,level,pred
2017-07-23,1-1,202.702179
2017-07-23,1-10,250.78966
2017-07-23,1-2,302.255588
2017-07-23,1-3,260.994714
2017-07-23,1-4,232.596984
2017-07-23,1-5,161.839061
2017-07-23,1-6,160.383454
2017-07-23,1-7,154.617289
2017-07-23,1-8,261.767379
2017-07-23,1-9,239.544235


In [294]:
predictions.index.name = 'date'
pivot=predictions.pivot_table(
    index="date",
    columns='level',
    values='pred',
    aggfunc='sum')

In [295]:
pivot=pivot.astype(np.int32)

In [296]:
def compute_metrics_per_column(y_true, y_pred):
    metrics = {}
    for col in y_true.columns:
        rmse = np.sqrt(mean_squared_error(y_true[col], y_pred[col]))
        mae = mean_absolute_error(y_true[col], y_pred[col])
        mape = mean_absolute_percentage_error(y_true[col], y_pred[col]) * 100
        metrics[col] = {'RMSE': rmse, 'MAE': mae, 'MAPE': mape}
    return pd.DataFrame(metrics).T  # transpose for readability

metrics_df = compute_metrics_per_column(data_test, pivot)
print(metrics_df)

           RMSE        MAE       MAPE
1-1   31.945918  27.708333  19.748187
1-10  24.753788  18.916667  11.041645
1-2   38.299804  32.458333  16.407158
1-3   32.280025  27.083333  14.533243
1-4   27.449954  22.583333  13.278279
...         ...        ...        ...
9-5   33.387248  27.791667  10.022262
9-6   41.251263  34.666667  12.011830
9-7   34.589859  27.208333  10.859973
9-8   65.668676  57.708333  12.436892
9-9   37.554960  31.791667   7.943645

[500 rows x 3 columns]


In [297]:
metrics_df.describe()

Unnamed: 0,RMSE,MAE,MAPE
count,500.0,500.0,500.0
mean,38.8275,31.942417,9.791003
std,13.651705,11.610874,4.620556
min,13.9239,10.458333,2.699975
25%,29.191358,23.833333,6.53012
50%,36.512266,29.8125,8.863404
75%,45.188466,36.958333,12.100826
max,113.823145,99.875,29.948641


In [300]:
metrics_df.index
metrics_1=[metric for metric in metrics_df.index if metric.endswith('1')]

In [305]:
metrics_df.loc[metrics_1].describe()

Unnamed: 0,RMSE,MAE,MAPE
count,50.0,50.0,50.0
mean,38.24904,31.6575,10.33721
std,11.90653,10.266393,4.252557
min,19.075508,14.708333,3.350905
25%,30.974158,25.635417,7.143053
50%,38.23904,31.208333,9.826738
75%,43.06172,35.145833,12.288594
max,74.546406,63.416667,20.64961


In [306]:
metrics_1

['1-1',
 '10-1',
 '11-1',
 '12-1',
 '13-1',
 '14-1',
 '15-1',
 '16-1',
 '17-1',
 '18-1',
 '19-1',
 '2-1',
 '20-1',
 '21-1',
 '22-1',
 '23-1',
 '24-1',
 '25-1',
 '26-1',
 '27-1',
 '28-1',
 '29-1',
 '3-1',
 '30-1',
 '31-1',
 '32-1',
 '33-1',
 '34-1',
 '35-1',
 '36-1',
 '37-1',
 '38-1',
 '39-1',
 '4-1',
 '40-1',
 '41-1',
 '42-1',
 '43-1',
 '44-1',
 '45-1',
 '46-1',
 '47-1',
 '48-1',
 '49-1',
 '5-1',
 '50-1',
 '6-1',
 '7-1',
 '8-1',
 '9-1']