In [1]:
import pandas as pd
import numpy as np
from supervised.automl import AutoML
from dateutil.relativedelta import relativedelta

Using `tqdm.autonotebook.tqdm` in notebook mode. Use `tqdm.tqdm` instead to force console mode (e.g. in jupyter console)


## DF Original

In [2]:
df_canada = pd.read_csv('canada.csv')
df_canada.head()

Unnamed: 0,date,QUALITY_FLAG,cid,industry_raw,E_TTM_period_date,E_TTM_ammor_intangibles,E_TTM_asset_writedown,E_TTM_assets_gro_five,E_TTM_capex,E_TTM_cash_acquisitions,...,E_G_cost_debt,E_G_cash_ratio,E_G_ebitda_cov,E_G_ret_on_asset,E_G_ret_on_inv_cap,E_G_net_to_cash,E_G_perm_assets_ratio,return_1q,target_net_income,target_cash_operations
0,2002-01-03,True,SP-065996,,2001-10-31,0.0,0.0,0.0,-12.738,-3.336,...,-0.036902,2.309257,-165.453488,0.130018,0.101871,-0.068216,0.41423,,-5.548,-5.809
1,2002-01-08,True,SP-002396,,2001-09-30,3.078,0.0,0.0,-20.889,-68.22,...,-0.103778,0.0,-2.685925,0.071119,0.06743,-0.004881,0.595752,,6.114,-11.388
2,2002-01-08,True,SP-006704,,2001-09-30,0.0,0.0,0.0,-17.971623,0.0,...,-0.036358,2.451612,2.26246,-0.069781,-0.039238,-0.045993,0.775432,,8.957867,-1.346965
3,2002-01-08,True,SP-008644,,2001-09-30,0.0,0.0,0.0,-34.7,0.0,...,-0.117333,0.878049,-4.852273,-0.169833,-0.155712,-0.316372,0.773996,,71.2,-9.4
4,2002-01-08,True,SP-013994,,2001-09-30,0.0,0.0,0.0,-1403.0,-133.0,...,-0.078051,0.0,-14.569697,0.109798,0.078497,-0.157934,0.921832,,-165.0,-419.0


## Copie du DF Original - sera envoyé à la fonciton

In [3]:
df_model = df_canada.copy()

df_model['date'] = pd.to_datetime(df_model['date'], errors='coerce')
df_model.sort_values(by=['cid', 'date'], inplace=True)

In [4]:
# Retirer les lignes où Quality_Flag est False
df_model = df_model[df_model['QUALITY_FLAG'] == True]

# (FACULTATIF) Exclure les banques
# df_model = df_model[df_model['industry'] != 'Banks']

In [5]:
# Définir la liste des colonnes "ratios" qu’on veut conserver
# ---------------------------
ratio_keywords = [
    '_on_assets_ratio',
    '_on_rev_ratio',
    '_on_tot_cap_ratio'
]

# On va chercher toutes les colonnes qui contiennent l’un de ces substrings
ratio_cols = [
    col for col in df_model.columns
    if any(rk in col for rk in ratio_keywords)
]

# ---------------------------
# Définir les colonnes minimales qu’on veut absolument
# ---------------------------
mandatory_cols = ['date', 'cid', 'target_net_income']

# ---------------------------
# Construire la liste finale des colonnes à garder
# ---------------------------
columns_to_keep = mandatory_cols + ratio_cols

# ---------------------------
# Créer le df_model_final avec seulement ces colonnes
# ---------------------------
df_model_final = df_model[columns_to_keep].copy()

# ---------------------------
# (Optionnel) trier par (cid, date)
# ---------------------------
df_model_final.sort_values(by=['cid', 'date'], inplace=True)


In [6]:
# Compter le nombre total de NaN dans tout le DataFrame
total_nan = df_model_final.isna().sum().sum()
print(f"Nombre total de valeurs NaN dans df_model_final : {total_nan}")

# Retirer les lignes qui contiennent AU MOINS un NaN
df_model_final.dropna(inplace=True)

# Vérifier à nouveau qu’il n’y a plus de NaN
total_nan_apres = df_model_final.isna().sum().sum()
print(f"Nombre total de valeurs NaN après suppression : {total_nan_apres}")

Nombre total de valeurs NaN dans df_model_final : 124606
Nombre total de valeurs NaN après suppression : 0


## DF Test (5 ans seulement), pour tester rapidement la fonciton

In [7]:
# Date la plus récente du DataFrame
max_date = df_model_final['date'].max()

# Date de coupure (5 ans avant)
cutoff_date = max_date - pd.DateOffset(years=5)

# Filtrer pour ne garder que les 5 dernières années
df_test = df_model_final[df_model_final['date'] >= cutoff_date].copy()

print(df_test['date'].min(), df_test['date'].max())
print(df_test.shape)

2019-09-24 00:00:00 2024-09-19 00:00:00
(7730, 195)


## Fonction Principale avec AutoML et Rolling Window

In [8]:
def pipeline_rolling_windows(data, date_col, target_col, train_years, val_years, test_years, buffer_months=0):
    """
    Pipeline direct pour la rolling window avec AutoML et cross-validation personnalisée.
    Ajoute les périodes dans le DataFrame final pour validation.
    """
    # Conversion de la colonne date
    data[date_col] = pd.to_datetime(data[date_col])
    start_date = data[date_col].min()
    end_date = data[date_col].max()

    predictions_all = []  # Liste pour stocker toutes les prédictions

    while start_date + relativedelta(years=train_years + val_years + test_years) <= end_date:
        # Définir les périodes
        train_end = start_date + relativedelta(years=train_years) - pd.Timedelta(days=1)
        tampon_1_end = train_end + relativedelta(months=buffer_months)
        val_start = tampon_1_end + pd.Timedelta(days=1)
        val_end = val_start + relativedelta(years=val_years) - pd.Timedelta(days=1)
        tampon_2_end = val_end + relativedelta(months=buffer_months)
        test_start = tampon_2_end + pd.Timedelta(days=1)
        test_end = test_start + relativedelta(years=test_years) - pd.Timedelta(days=1)

        # Filtrer les données
        train_data = data.loc[(data[date_col] >= start_date) & (data[date_col] <= train_end)]
        val_data = data.loc[(data[date_col] >= val_start) & (data[date_col] <= val_end)]
        test_data = data.loc[(data[date_col] >= test_start) & (data[date_col] <= test_end)]

        if len(train_data) == 0 or len(val_data) == 0 or len(test_data) == 0:
            print(f"Fenêtre {start_date.year}-{test_end.year} : données insuffisantes, sautée.")
            start_date += relativedelta(years=1)
            continue

        # Configurer et entraîner AutoML
        print(f"Fenêtre {start_date.year}-{test_end.year} : entraînement de AutoML...")
        automl = AutoML(mode="Perform", algorithms=["Xgboost"])
        custom_cv = [(train_data.index, val_data.index)]
        automl.fit(
            train_data.drop(columns=[target_col, date_col, 'cid']),
            train_data[target_col], cv=custom_cv
        )

        # Prédire sur le test set
        test_preds = test_data[[date_col, target_col]].copy()
        test_preds["predicted"] = automl.predict(test_data.drop(columns=[target_col, date_col, 'cid']))
        test_preds["window"] = f"{start_date.year}-{test_end.year}"
        test_preds["cid"] = test_data["cid"].values

        # Ajouter les périodes pour validation
        test_preds["train_start"] = start_date
        test_preds["train_end"] = train_end
        test_preds["tampon_1"] = tampon_1_end
        test_preds["val_start"] = val_start
        test_preds["val_end"] = val_end
        test_preds["tampon_2"] = tampon_2_end
        test_preds["test_start"] = test_start
        test_preds["test_end"] = test_end

        # Sauvegarder les prédictions
        predictions_all.append(test_preds)

        # Avancer la fenêtre
        start_date += relativedelta(years=1)

    predictions_df = pd.concat(predictions_all, ignore_index=True)
    return predictions_df

## Appel de fonction

In [None]:
predictions_df = pipeline_rolling_windows(
    data=df_test, 
    date_col="date", 
    target_col="target_net_income", 
    train_years=2, 
    val_years=1, 
    test_years=1, 
    buffer_months=1
)