In [6]:
import pandas as pd
import numpy as np
from supervised.automl import AutoML
from dateutil.relativedelta import relativedelta

Using `tqdm.autonotebook.tqdm` in notebook mode. Use `tqdm.tqdm` instead to force console mode (e.g. in jupyter console)


In [7]:
def generate_random_dataset(n_actions=50, start_date="2010-01-01", end_date="2020-12-31", seed=None):
    """
    Build a synthetic quarterly panel dataset for several stocks.

    One row is produced per (stock, quarter-end date) pair, holding three
    uniformly drawn feature values and a random binary target.

    Parameters
    ----------
    n_actions : int
        Number of distinct stock identifiers (``sid``), formatted as
        ``SID-0000``, ``SID-0001``, ...
    start_date, end_date : str or datetime-like
        Bounds of the date range; only the quarter-end dates falling inside
        these bounds are generated.
    seed : int or None, optional
        When given, values are drawn from a dedicated
        ``np.random.RandomState(seed)`` so the result is reproducible.
        When ``None`` (default), the global ``np.random`` state is used.

    Returns
    -------
    pandas.DataFrame
        Columns ``date``, ``sid``, ``market_cap_usd``, ``price_close_usd``,
        ``trading_volume`` and ``target``.
    """
    # Dedicated generator when a seed is supplied, global NumPy state otherwise
    rng = np.random.RandomState(seed) if seed is not None else np.random

    # Quarter-end dates (quarters ending in Mar/Jun/Sep/Dec). An explicit offset
    # object is used instead of the "Q" string alias, which is deprecated in
    # recent pandas releases.
    quarter_ends = pd.date_range(
        start=start_date,
        end=end_date,
        freq=pd.offsets.QuarterEnd(startingMonth=12),
    )

    # One identifier per stock
    sids = [f"SID-{i:04d}" for i in range(n_actions)]

    # Collect plain rows and build the DataFrame once at the end
    rows = []
    for sid in sids:
        for date in quarter_ends:
            market_cap_usd = rng.uniform(100, 10000)    # random market capitalisation
            price_close_usd = rng.uniform(10, 500)      # random closing price
            trading_volume = rng.uniform(1000, 100000)  # random trading volume
            target = rng.choice([0, 1])                 # random binary target
            rows.append([date, sid, market_cap_usd, price_close_usd, trading_volume, target])

    return pd.DataFrame(
        rows,
        columns=["date", "sid", "market_cap_usd", "price_close_usd", "trading_volume", "target"],
    )

# Seed NumPy's global generator so the synthetic data is identical on every run
np.random.seed(42)

# Build a small synthetic dataset: 10 stocks, quarterly dates from 2015 to 2020
df_random = generate_random_dataset(n_actions=10, start_date="2015-01-01", end_date="2020-12-31")

# Preview the first rows
df_random.head()



Unnamed: 0,date,sid,market_cap_usd,price_close_usd,trading_volume,target
0,2015-03-31,SID-0000,4148.729684,272.90882,43949.779527,1
1,2015-06-30,SID-0000,100.688595,66.082892,30598.580427,0
2,2015-09-30,SID-0000,6095.174361,338.3729,91240.908293,1
3,2015-12-31,SID-0000,9256.32932,275.487674,79261.468646,0
4,2016-03-31,SID-0000,4170.560452,175.532574,40081.561634,0


In [None]:
def _iter_rolling_windows(first_date, last_date, train_years, val_years, test_years, buffer_months):
    """Yield (window_start, train_end, test_start, test_end) for each yearly-shifted window that fits before last_date."""
    one_day = pd.Timedelta(days=1)
    buffer = pd.DateOffset(months=buffer_months)

    window_start = first_date
    while True:
        # Layout: train | buffer | validation | buffer | test (offsets applied sequentially)
        train_stop = window_start + pd.DateOffset(years=train_years)
        val_start = train_stop + buffer
        val_stop = val_start + pd.DateOffset(years=val_years)
        test_start = val_stop + buffer
        test_end = test_start + pd.DateOffset(years=test_years) - one_day

        # Written as "not <=" so that NaT bounds (empty data) also stop the loop
        if not (test_end <= last_date):
            return

        yield window_start, train_stop - one_day, test_start, test_end
        window_start = window_start + pd.DateOffset(years=1)


def pipeline_rolling_windows(data, date_col, target_col, train_years, val_years, test_years, buffer_months):
    """
    Train AutoML on rolling time windows and collect the test-period predictions.

    Each window is laid out as train | buffer | validation | buffer | test and
    is shifted forward by one year until the whole window (buffers included) no
    longer fits inside the dates available in ``data``. The validation period
    only positions the test period; it is not passed to AutoML, which is fitted
    on the training slice alone.

    Parameters
    ----------
    data : pandas.DataFrame
        Input data; it is not modified (a copy is used internally).
    date_col : str
        Name of the date column (converted with ``pd.to_datetime``).
    target_col : str
        Name of the target column.
    train_years, val_years, test_years : int
        Length of the training, validation and test periods, in years.
    buffer_months : int
        Length, in months, of the gap inserted after the training period and
        after the validation period.

    Returns
    -------
    pandas.DataFrame
        One row per test observation with columns ``date_col``, ``target_col``,
        ``predicted`` and ``window`` (``"<start year>-<test end year>"``).
        Empty (with those columns) when no window fits in the data.
    """
    # Work on a copy so the caller's DataFrame keeps its original date column
    data = data.copy()
    data[date_col] = pd.to_datetime(data[date_col])
    dates = data[date_col]

    non_feature_cols = [target_col, date_col]
    predictions_all = []

    windows = _iter_rolling_windows(
        dates.min(), dates.max(), train_years, val_years, test_years, buffer_months
    )
    for window_start, train_end, test_start, test_end in windows:
        # Both bounds are inclusive
        train_data = data.loc[dates.between(window_start, train_end)]
        test_data = data.loc[dates.between(test_start, test_end)]

        # Nothing to fit or nothing to predict for this window (sparse data)
        if train_data.empty or test_data.empty:
            continue

        # A fresh AutoML model is configured and trained for every window
        automl = AutoML(mode="Perform", algorithms=["Xgboost"])
        automl.fit(train_data.drop(columns=non_feature_cols), train_data[target_col])

        # Predict on the test period and tag the rows with their window
        test_preds = test_data[[date_col, target_col]].copy()
        test_preds["predicted"] = automl.predict(test_data.drop(columns=non_feature_cols))
        test_preds["window"] = f"{window_start.year}-{test_end.year}"
        predictions_all.append(test_preds)

    if not predictions_all:
        # pd.concat raises on an empty list; return an empty frame with the same schema
        return pd.DataFrame(columns=[date_col, target_col, "predicted", "window"])

    return pd.concat(predictions_all, ignore_index=True)

In [None]:
# Rolling-window run: 2-year training, 1-year validation, 1-year test, 1-month buffers
predictions = pipeline_rolling_windows(
    df_random, "date", "target",
    train_years=2, val_years=1, test_years=1, buffer_months=1,
)

AutoML directory: AutoML_1
The task is binary_classification with evaluation metric logloss
AutoML will use algorithms: ['Xgboost']
AutoML will ensemble available models
AutoML steps: ['simple_algorithms', 'default_algorithms', 'not_so_random', 'golden_features', 'insert_random_feature', 'features_selection', 'hill_climbing_1', 'hill_climbing_2', 'ensemble']
Skip simple_algorithms because no parameters were generated.
* Step default_algorithms will try to check up to 1 model
1_Default_Xgboost logloss 0.690826 trained in 6.88 seconds (1-sample predict time 0.0069 seconds)
* Step not_so_random will try to check up to 4 models
2_Xgboost logloss 0.691655 trained in 3.56 seconds (1-sample predict time 0.0063 seconds)
3_Xgboost logloss 0.692445 trained in 1.49 seconds (1-sample predict time 0.0066 seconds)
4_Xgboost logloss 0.693538 trained in 1.14 seconds (1-sample predict time 0.0063 seconds)
5_Xgboost logloss 0.693538 trained in 2.33 seconds (1-sample predict time 0.0073 seconds)
* Step g