In [1]:
import pandas as pd
import numpy as np
from supervised.automl import AutoML
from dateutil.relativedelta import relativedelta

Using `tqdm.autonotebook.tqdm` in notebook mode. Use `tqdm.tqdm` instead to force console mode (e.g. in jupyter console)


In [2]:
def generate_random_dataset(n_actions=50, start_date="2010-01-01", end_date="2020-12-31"):
    """
    Génère un DataFrame fictif avec des données temporelles pour plusieurs actions.
    - n_actions : nombre d'actions uniques (identifiants `sid`).
    - start_date : date de début.
    - end_date : date de fin.
    """
    # Générer une plage de dates quotidienne
    date_range = pd.date_range(start=start_date, end=end_date, freq="Q")  # Fréquence trimestrielle
    
    # Générer des identifiants pour les actions
    sids = [f"SID-{i:04d}" for i in range(n_actions)]
    
    # Liste pour stocker les données
    data = []
    
    for sid in sids:
        for date in date_range:
            # Générer des valeurs aléatoires pour les variables explicatives et la cible
            market_cap_usd = np.random.uniform(100, 10000)  # Capitalisation aléatoire
            price_close_usd = np.random.uniform(10, 500)    # Prix de clôture
            trading_volume = np.random.uniform(1000, 100000)  # Volume de trading
            target = np.random.choice([0, 1])  # Variable cible (0 ou 1)
            
            # Ajouter la ligne au dataset
            data.append([date, sid, market_cap_usd, price_close_usd, trading_volume, target])
    
    # Convertir en DataFrame
    df = pd.DataFrame(data, columns=["date", "sid", "market_cap_usd", "price_close_usd", "trading_volume", "target"])
    return df

# Générer un dataset fictif
df_random = generate_random_dataset(n_actions=10, start_date="2015-01-01", end_date="2020-12-31")

# Afficher un aperçu
df_random.head()



Unnamed: 0,date,sid,market_cap_usd,price_close_usd,trading_volume,target
0,2015-03-31,SID-0000,795.551595,31.984442,4348.168854,1
1,2015-06-30,SID-0000,7009.859884,138.910387,18736.503774,1
2,2015-09-30,SID-0000,5887.8192,449.156804,50930.600842,0
3,2015-12-31,SID-0000,6879.403292,265.277646,70100.691097,1
4,2016-03-31,SID-0000,5909.939219,75.278946,74092.584406,1


In [None]:
def pipeline_rolling_windows(data, date_col, target_col, train_years, val_years, test_years, buffer_months):
    """
    Pipeline direct pour la rolling window avec AutoML.
    """
    data[date_col] = pd.to_datetime(data[date_col])  # Assurer le bon format de date
    start_date = data[date_col].min()
    end_date = data[date_col].max()

    predictions_all = []  # Liste pour stocker toutes les prédictions

    while start_date + relativedelta(years=train_years + val_years + test_years) <= end_date:
        # Définir les périodes
        train_end = start_date + relativedelta(years=train_years) - pd.Timedelta(days=1)
        tampon_1_start = train_end + pd.Timedelta(days=1)
        tampon_1_end = tampon_1_start + relativedelta(months=buffer_months) - pd.Timedelta(days=1)
        val_start = tampon_1_end + pd.Timedelta(days=1)
        val_end = val_start + relativedelta(years=val_years) - pd.Timedelta(days=1)
        tampon_2_start = val_end + pd.Timedelta(days=1)
        tampon_2_end = tampon_2_start + relativedelta(months=buffer_months) - pd.Timedelta(days=1)
        test_start = tampon_2_end + pd.Timedelta(days=1)
        test_end = test_start + relativedelta(years=test_years) - pd.Timedelta(days=1)

        # Filtrer les données pour chaque période
        train_data = data.loc[(data[date_col] >= start_date) & (data[date_col] <= train_end)]
        val_data = data.loc[(data[date_col] >= val_start) & (data[date_col] <= val_end)]
        test_data = data.loc[(data[date_col] >= test_start) & (data[date_col] <= test_end)]

        # Configurer et entraîner AutoML
        automl = AutoML(mode="Perform", algorithms=["Xgboost"])
        automl.fit(
            train_data.drop(columns=[target_col, date_col]),
            train_data[target_col]
        )

        # Prédire sur le test set
        test_preds = test_data[[date_col, target_col]].copy()
        test_preds["predicted"] = automl.predict(test_data.drop(columns=[target_col, date_col]))
        test_preds["window"] = f"{start_date.year}-{test_end.year}"  # Identifier la fenêtre

        # Sauvegarder les prédictions
        predictions_all.append(test_preds)

        # Avancer la fenêtre
        start_date += relativedelta(years=1)

    # Concaténer toutes les prédictions
    predictions_df = pd.concat(predictions_all, ignore_index=True)

    return print(predictions_df)

In [5]:
def pipeline_rolling_windows(data, date_col, target_col, train_years, val_years, test_years, buffer_months):
    """
    Pipeline direct pour la rolling window avec AutoML et cross-validation personnalisée.
    """
    data[date_col] = pd.to_datetime(data[date_col])  # Assurer le bon format de date
    start_date = data[date_col].min()
    end_date = data[date_col].max()

    predictions_all = []  # Liste pour stocker toutes les prédictions

    while start_date + relativedelta(years=train_years + val_years + test_years) <= end_date:
        # Définir les périodes
        train_end = start_date + relativedelta(years=train_years) - pd.Timedelta(days=1)
        tampon_1_start = train_end + pd.Timedelta(days=1)
        tampon_1_end = tampon_1_start + relativedelta(months=buffer_months) - pd.Timedelta(days=1)
        val_start = tampon_1_end + pd.Timedelta(days=1)
        val_end = val_start + relativedelta(years=val_years) - pd.Timedelta(days=1)
        tampon_2_start = val_end + pd.Timedelta(days=1)
        tampon_2_end = tampon_2_start + relativedelta(months=buffer_months) - pd.Timedelta(days=1)
        test_start = tampon_2_end + pd.Timedelta(days=1)
        test_end = test_start + relativedelta(years=test_years) - pd.Timedelta(days=1)

        # Filtrer les données pour chaque période
        train_data = data.loc[(data[date_col] >= start_date) & (data[date_col] <= train_end)]
        val_data = data.loc[(data[date_col] >= val_start) & (data[date_col] <= val_end)]
        test_data = data.loc[(data[date_col] >= test_start) & (data[date_col] <= test_end)]

        # Générer les indices pour la cross-validation personnalisée
        custom_cv = [(train_data.index, val_data.index)]

        # Configurer et entraîner AutoML avec validation personnalisée
        automl = AutoML(
            mode="Perform",
            algorithms=["Xgboost"]
        )
        
        automl.fit(
            train_data.drop(columns=[target_col, date_col]),
            train_data[target_col], cv=custom_cv
        )

        # Prédire sur le test set
        test_preds = test_data[[date_col, target_col]].copy()
        test_preds["predicted"] = automl.predict(test_data.drop(columns=[target_col, date_col]))
        test_preds["window"] = f"{start_date.year}-{test_end.year}"  # Identifier la fenêtre

        # Sauvegarder les prédictions
        predictions_all.append(test_preds)

        # Avancer la fenêtre
        start_date += relativedelta(years=1)

    # Concaténer toutes les prédictions
    predictions_df = pd.concat(predictions_all, ignore_index=True)

    return predictions_df  # Retourner le DataFrame final

In [7]:
predictions = pipeline_rolling_windows(
    data=df_random,           # Votre DataFrame d'entrée
    date_col="date",          # Colonne des dates
    target_col="target",      # Colonne cible (0 ou 1)
    train_years=2,            # 2 ans pour l'entraînement
    val_years=1,              # 1 an pour la validation
    test_years=1,             # 1 an pour le test
    buffer_months=1           # Tampon de 1 mois
)

# Afficher un aperçu des prédictions
print(predictions.head())

# Analyser les résultats
accuracy = (predictions["predicted"] == predictions["target"]).mean()
print(f"Accuracy globale : {accuracy:.2%}")

AutoML directory: AutoML_4
The task is binary_classification with evaluation metric logloss
AutoML will use algorithms: ['Xgboost']
AutoML will ensemble available models
AutoML steps: ['simple_algorithms', 'default_algorithms', 'not_so_random', 'golden_features', 'insert_random_feature', 'features_selection', 'hill_climbing_1', 'hill_climbing_2', 'ensemble']
Skip simple_algorithms because no parameters were generated.
* Step default_algorithms will try to check up to 1 model
1_Default_Xgboost logloss 0.670802 trained in 7.37 seconds (1-sample predict time 0.0069 seconds)
* Step not_so_random will try to check up to 4 models
2_Xgboost logloss 0.66521 trained in 2.46 seconds (1-sample predict time 0.0066 seconds)
3_Xgboost logloss 0.673821 trained in 1.53 seconds (1-sample predict time 0.0063 seconds)
4_Xgboost logloss 0.691386 trained in 1.08 seconds (1-sample predict time 0.0066 seconds)
5_Xgboost logloss 0.691386 trained in 1.15 seconds (1-sample predict time 0.0071 seconds)
* Step go

In [11]:
predictions

Unnamed: 0,date,target,predicted,window
0,2018-06-30,1,1,2015-2019
1,2018-09-30,1,1,2015-2019
2,2018-12-31,0,1,2015-2019
3,2019-03-31,0,0,2015-2019
4,2018-06-30,1,1,2015-2019
...,...,...,...,...
75,2020-03-31,1,1,2016-2020
76,2019-06-30,0,1,2016-2020
77,2019-09-30,1,0,2016-2020
78,2019-12-31,1,0,2016-2020


In [15]:
def pipeline_rolling_windows(data, date_col, target_col, train_years, val_years, test_years, buffer_months=0):
    """
    Pipeline direct pour la rolling window avec AutoML et cross-validation personnalisée.
    Ajoute les périodes dans le DataFrame final pour validation.
    """
    # Conversion de la colonne date
    data[date_col] = pd.to_datetime(data[date_col])
    start_date = data[date_col].min()
    end_date = data[date_col].max()

    predictions_all = []  # Liste pour stocker toutes les prédictions

    while start_date + relativedelta(years=train_years + val_years + test_years) <= end_date:
        # Définir les périodes
        train_end = start_date + relativedelta(years=train_years) - pd.Timedelta(days=1)
        tampon_1_end = train_end + relativedelta(months=buffer_months)
        val_start = tampon_1_end + pd.Timedelta(days=1)
        val_end = val_start + relativedelta(years=val_years) - pd.Timedelta(days=1)
        tampon_2_end = val_end + relativedelta(months=buffer_months)
        test_start = tampon_2_end + pd.Timedelta(days=1)
        test_end = test_start + relativedelta(years=test_years) - pd.Timedelta(days=1)

        # Filtrer les données
        train_data = data.loc[(data[date_col] >= start_date) & (data[date_col] <= train_end)]
        val_data = data.loc[(data[date_col] >= val_start) & (data[date_col] <= val_end)]
        test_data = data.loc[(data[date_col] >= test_start) & (data[date_col] <= test_end)]

        if len(train_data) == 0 or len(val_data) == 0 or len(test_data) == 0:
            print(f"Fenêtre {start_date.year}-{test_end.year} : données insuffisantes, sautée.")
            start_date += relativedelta(years=1)
            continue

        # Configurer et entraîner AutoML
        print(f"Fenêtre {start_date.year}-{test_end.year} : entraînement de AutoML...")
        automl = AutoML(mode="Perform", algorithms=["Xgboost"])
        custom_cv = [(train_data.index, val_data.index)]
        automl.fit(
            train_data.drop(columns=[target_col, date_col]),
            train_data[target_col], cv=custom_cv
        )

        # Prédire sur le test set
        test_preds = test_data[[date_col, target_col]].copy()
        test_preds["predicted"] = automl.predict(test_data.drop(columns=[target_col, date_col]))
        test_preds["window"] = f"{start_date.year}-{test_end.year}"

        # Ajouter les périodes pour validation
        test_preds["train_start"] = start_date
        test_preds["train_end"] = train_end
        test_preds["tampon_1"] = tampon_1_end
        test_preds["val_start"] = val_start
        test_preds["val_end"] = val_end
        test_preds["tampon_2"] = tampon_2_end
        test_preds["test_start"] = test_start
        test_preds["test_end"] = test_end

        # Sauvegarder les prédictions
        predictions_all.append(test_preds)

        # Avancer la fenêtre
        start_date += relativedelta(years=1)

    predictions_df = pd.concat(predictions_all, ignore_index=True)
    return predictions_df

In [16]:
predictions_df = pipeline_rolling_windows(
    data=df_random, 
    date_col="date", 
    target_col="target", 
    train_years=2, 
    val_years=1, 
    test_years=1, 
    buffer_months=2
)

# Affiche un aperçu des périodes
print(predictions_df[["window", "train_start", "train_end", "val_start", "val_end", "test_start", "test_end"]].head())

Fenêtre 2015-2019 : entraînement de AutoML...
AutoML directory: AutoML_8
The task is binary_classification with evaluation metric logloss
AutoML will use algorithms: ['Xgboost']
AutoML will ensemble available models
AutoML steps: ['simple_algorithms', 'default_algorithms', 'not_so_random', 'golden_features', 'insert_random_feature', 'features_selection', 'hill_climbing_1', 'hill_climbing_2', 'ensemble']
Skip simple_algorithms because no parameters were generated.
* Step default_algorithms will try to check up to 1 model
1_Default_Xgboost logloss 0.670802 trained in 1.97 seconds (1-sample predict time 0.0071 seconds)
* Step not_so_random will try to check up to 4 models
2_Xgboost logloss 0.66521 trained in 1.33 seconds (1-sample predict time 0.0071 seconds)
3_Xgboost logloss 0.673821 trained in 1.55 seconds (1-sample predict time 0.0067 seconds)
4_Xgboost logloss 0.691386 trained in 1.18 seconds (1-sample predict time 0.0074 seconds)
5_Xgboost logloss 0.691386 trained in 1.15 seconds (1

In [17]:
predictions_df

Unnamed: 0,date,target,predicted,window,train_start,train_end,tampon_1,val_start,val_end,tampon_2,test_start,test_end
0,2018-09-30,1,1,2015-2019,2015-03-31,2017-03-30,2017-05-30,2017-05-31,2018-05-30,2018-07-30,2018-07-31,2019-07-30
1,2018-12-31,0,1,2015-2019,2015-03-31,2017-03-30,2017-05-30,2017-05-31,2018-05-30,2018-07-30,2018-07-31,2019-07-30
2,2019-03-31,0,0,2015-2019,2015-03-31,2017-03-30,2017-05-30,2017-05-31,2018-05-30,2018-07-30,2018-07-31,2019-07-30
3,2019-06-30,0,1,2015-2019,2015-03-31,2017-03-30,2017-05-30,2017-05-31,2018-05-30,2018-07-30,2018-07-31,2019-07-30
4,2018-09-30,1,0,2015-2019,2015-03-31,2017-03-30,2017-05-30,2017-05-31,2018-05-30,2018-07-30,2018-07-31,2019-07-30
...,...,...,...,...,...,...,...,...,...,...,...,...
75,2020-06-30,1,0,2016-2020,2016-03-31,2018-03-30,2018-05-30,2018-05-31,2019-05-30,2019-07-30,2019-07-31,2020-07-30
76,2019-09-30,1,0,2016-2020,2016-03-31,2018-03-30,2018-05-30,2018-05-31,2019-05-30,2019-07-30,2019-07-31,2020-07-30
77,2019-12-31,1,0,2016-2020,2016-03-31,2018-03-30,2018-05-30,2018-05-31,2019-05-30,2019-07-30,2019-07-31,2020-07-30
78,2020-03-31,1,1,2016-2020,2016-03-31,2018-03-30,2018-05-30,2018-05-31,2019-05-30,2019-07-30,2019-07-31,2020-07-30
