In [11]:
import pandas as pd
import numpy as np
from supervised.automl import AutoML
from dateutil.relativedelta import relativedelta

In [12]:
def generate_random_dataset(n_actions=50, start_date="2010-01-01", end_date="2020-12-31"):
    """
    Génère un DataFrame fictif avec des données temporelles pour plusieurs actions.
    - n_actions : nombre d'actions uniques (identifiants `sid`).
    - start_date : date de début.
    - end_date : date de fin.
    """
    # Générer une plage de dates quotidienne
    date_range = pd.date_range(start=start_date, end=end_date, freq="Q")  # Fréquence trimestrielle
    
    # Générer des identifiants pour les actions
    sids = [f"SID-{i:04d}" for i in range(n_actions)]
    
    # Liste pour stocker les données
    data = []
    
    for sid in sids:
        for date in date_range:
            # Générer des valeurs aléatoires pour les variables explicatives et la cible
            market_cap_usd = np.random.uniform(100, 10000)  # Capitalisation aléatoire
            price_close_usd = np.random.uniform(10, 500)    # Prix de clôture
            trading_volume = np.random.uniform(1000, 100000)  # Volume de trading
            target = np.random.choice([0, 1])  # Variable cible (0 ou 1)
            
            # Ajouter la ligne au dataset
            data.append([date, sid, market_cap_usd, price_close_usd, trading_volume, target])
    
    # Convertir en DataFrame
    df = pd.DataFrame(data, columns=["date", "sid", "market_cap_usd", "price_close_usd", "trading_volume", "target"])
    return df

# Générer un dataset fictif
df_random = generate_random_dataset(n_actions=1, start_date="2010-01-01", end_date="2020-12-31")

# Afficher un aperçu
df_random.head()



Unnamed: 0,date,sid,market_cap_usd,price_close_usd,trading_volume,target
0,2010-03-31,SID-0000,2654.188472,248.45754,78524.10384,1
1,2010-06-30,SID-0000,2252.233162,207.012006,17039.051407,1
2,2010-09-30,SID-0000,2694.283201,208.748945,44093.922041,1
3,2010-12-31,SID-0000,2859.998104,411.653767,61144.956565,0
4,2011-03-31,SID-0000,2165.327331,109.877148,59997.967085,1


In [13]:
df_random.shape

(44, 6)

In [14]:
df_random.tail()

Unnamed: 0,date,sid,market_cap_usd,price_close_usd,trading_volume,target
39,2019-12-31,SID-0000,1887.190949,458.216054,72983.557465,1
40,2020-03-31,SID-0000,9369.32107,94.924184,16779.016944,1
41,2020-06-30,SID-0000,1005.9957,96.032391,89046.028344,0
42,2020-09-30,SID-0000,6787.150838,83.508567,67452.439032,0
43,2020-12-31,SID-0000,1842.68638,335.711638,33946.451903,1


In [3]:
df_random.to_csv("data.csv", index=False)

In [15]:
def pipeline_rolling_windows(data, date_col, target_col, train_years, val_years, test_years, buffer_months=0):
    """
    Pipeline direct pour la rolling window avec AutoML et cross-validation personnalisée.
    Ajoute les périodes dans le DataFrame final pour validation.
    """
    # Conversion de la colonne date
    data[date_col] = pd.to_datetime(data[date_col])
    start_date = data[date_col].min()
    end_date = data[date_col].max()

    predictions_all = []  # Liste pour stocker toutes les prédictions

    while start_date + relativedelta(years=train_years + val_years + test_years) <= end_date:
        # Définir les périodes
        train_end = start_date + relativedelta(years=train_years) - pd.Timedelta(days=1)
        tampon_1_end = train_end + relativedelta(months=buffer_months)
        val_start = tampon_1_end + pd.Timedelta(days=1)
        val_end = val_start + relativedelta(years=val_years) - pd.Timedelta(days=1)
        tampon_2_end = val_end + relativedelta(months=buffer_months)
        test_start = tampon_2_end + pd.Timedelta(days=1)
        test_end = test_start + relativedelta(years=test_years) - pd.Timedelta(days=1)

        # Filtrer les données
        train_data = data.loc[(data[date_col] >= start_date) & (data[date_col] <= train_end)]
        val_data = data.loc[(data[date_col] >= val_start) & (data[date_col] <= val_end)]
        test_data = data.loc[(data[date_col] >= test_start) & (data[date_col] <= test_end)]

        if len(train_data) == 0 or len(val_data) == 0 or len(test_data) == 0:
            print(f"Fenêtre {start_date.year}-{test_end.year} : données insuffisantes, sautée.")
            start_date += relativedelta(years=1)
            continue

        # Configurer et entraîner AutoML
        print(f"Fenêtre {start_date.year}-{test_end.year} : entraînement de AutoML...")
        automl = AutoML(mode="Perform", algorithms=["Xgboost"])
        custom_cv = [(train_data.index, val_data.index)]
        automl.fit(
            train_data.drop(columns=[target_col, date_col]),
            train_data[target_col], cv=custom_cv
        )

        # Prédire sur le test set
        test_preds = test_data[[date_col, target_col]].copy()
        test_preds["predicted"] = automl.predict(test_data.drop(columns=[target_col, date_col]))
        test_preds["window"] = f"{start_date.year}-{test_end.year}"
        test_preds["sid"] = test_data["sid"].values

        # Ajouter les périodes pour validation
        test_preds["train_start"] = start_date
        test_preds["train_end"] = train_end
        test_preds["tampon_1"] = tampon_1_end
        test_preds["val_start"] = val_start
        test_preds["val_end"] = val_end
        test_preds["tampon_2"] = tampon_2_end
        test_preds["test_start"] = test_start
        test_preds["test_end"] = test_end

        # Sauvegarder les prédictions
        predictions_all.append(test_preds)

        # Avancer la fenêtre
        start_date += relativedelta(years=1)

    predictions_df = pd.concat(predictions_all, ignore_index=True)
    return predictions_df

In [16]:
predictions_df = pipeline_rolling_windows(
    data=df_random, 
    date_col="date", 
    target_col="target", 
    train_years=2, 
    val_years=1, 
    test_years=1, 
    buffer_months=1
)

Fenêtre 2010-2014 : entraînement de AutoML...
AutoML directory: AutoML_1
The task is binary_classification with evaluation metric logloss
AutoML will use algorithms: ['Xgboost']
AutoML will ensemble available models
AutoML steps: ['simple_algorithms', 'default_algorithms', 'not_so_random', 'golden_features', 'insert_random_feature', 'features_selection', 'hill_climbing_1', 'hill_climbing_2', 'ensemble']
Skip simple_algorithms because no parameters were generated.
* Step default_algorithms will try to check up to 1 model
1_Default_Xgboost logloss 0.095361 trained in 16.47 seconds (1-sample predict time 0.0064 seconds)
* Step not_so_random will try to check up to 4 models
2_Xgboost logloss 0.693147 trained in 5.17 seconds (1-sample predict time 0.0058 seconds)
3_Xgboost logloss 0.093797 trained in 13.21 seconds (1-sample predict time 0.0063 seconds)
4_Xgboost logloss 0.693147 trained in 0.75 seconds (1-sample predict time 0.0061 seconds)
5_Xgboost logloss 0.693147 trained in 0.76 seconds

In [17]:
predictions_df.to_csv("rolling_window_2010-2010.csv", index=False)