In [7]:
import pandas as pd
import numpy as np

# Génération des données mensuelles
data = pd.DataFrame({
    'date': pd.date_range(start='2024-01-01', end='2024-11-15', freq='D'),
    'value': np.random.randn(320)  # 320 jours de données aléatoires
})

# Vérifions les données
data.head()

Unnamed: 0,date,value
0,2024-01-01,0.417771
1,2024-01-02,0.14771
2,2024-01-03,0.415387
3,2024-01-04,0.355727
4,2024-01-05,1.21272


In [12]:
from dateutil.relativedelta import relativedelta

def generate_rolling_windows(data, date_col, value_col):
    windows = []
    start_date = data[date_col].min()  # Date de début
    end_date = data[date_col].max()    # Date de fin

    while start_date + relativedelta(months=2) <= end_date:
        # Définir les périodes
        train_start = start_date
        train_end = start_date + relativedelta(months=1) - pd.Timedelta(days=1)
        tampon_start = train_end + pd.Timedelta(days=1)
        tampon_end = tampon_start + relativedelta(months=1) - pd.Timedelta(days=1)
        test_start = tampon_end + pd.Timedelta(days=1)
        test_end = test_start + relativedelta(months=1) - pd.Timedelta(days=1)

        # Sélectionner les données pour chaque période
        train_data = data[(data[date_col] >= train_start) & (data[date_col] <= train_end)]
        test_data = data[(data[date_col] >= test_start) & (data[date_col] <= test_end)]

        # Ajouter la fenêtre
        windows.append({
            "train": train_data[value_col].values,
            "test": test_data[value_col].values,
            "train_dates": train_data[date_col].values,
            "test_dates": test_data[date_col].values
        })

        # Avancer la fenêtre
        start_date += relativedelta(months=1)

    return windows

# Génération des fenêtres
windows = generate_rolling_windows(data, date_col='date', value_col='value')

# Exemple de la première fenêtre
print("Première fenêtre :")
print("Train data :", windows[0]['train'])
print("Test data :", windows[0]['test'])

for i, window in enumerate(windows):  # Afficher les trois premières fenêtres
    print(f"Fenêtre {i+1}:")
    print(f"  Train: {window['train_dates'][0]} à {window['train_dates'][-1]}")
    print(f"  Test: {window['test_dates'][0]} à {window['test_dates'][-1]}")

Première fenêtre :
Train data : [ 0.41777101  0.14771018  0.41538702  0.35572692  1.21271964 -1.12783985
 -0.77634768  0.85477269 -0.52453757  0.04500113 -0.45794129 -0.07197561
 -0.17439521 -0.3114376  -1.03750446  0.54134572  0.04332726 -0.00932834
 -0.00931244 -0.16556516 -1.81437237 -0.50740087 -0.74127616  0.59366618
  0.58083429  0.18523792  0.309144    0.21953357 -0.31433325  0.23269736
 -1.10638609]
Test data : [-0.98246279  0.64554702  0.25470427  0.5866218  -0.15961168 -0.859782
 -0.48598429 -1.14012626 -0.00882407 -0.66240595 -0.09839219 -0.58293684
 -0.20644869 -0.65705357  0.40269647 -1.66491387  0.75584886 -0.45148793
 -1.10707025  0.01560546 -0.37267852 -0.30099136  0.32917901 -0.16687011
 -0.10765961  0.47088448 -0.20223732  1.33721065  0.98560333  0.22431766
  2.16402484]
Fenêtre 1:
  Train: 2024-01-01T00:00:00.000000000 à 2024-01-31T00:00:00.000000000
  Test: 2024-03-01T00:00:00.000000000 à 2024-03-31T00:00:00.000000000
Fenêtre 2:
  Train: 2024-02-01T00:00:00.00000000

In [1]:
import pyarrow as pa
import pyarrow.ipc as ipc
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from supervised.automl import AutoML
from sklearn.metrics import classification_report
from dateutil.relativedelta import relativedelta

Using `tqdm.autonotebook.tqdm` in notebook mode. Use `tqdm.tqdm` instead to force console mode (e.g. in jupyter console)


In [2]:
# Chemin vers le fichier .arrow
file_path = '/Users/dominicprenovost/Programmation/AutoML/AutoML-Practice/pricing-canada.arrow'

# Lecture du fichier .arrow
with pa.memory_map(file_path, 'r') as source:
    table = ipc.RecordBatchFileReader(source).read_all()

# Conversion en DataFrame Pandas
df = table.to_pandas()

df['date'] = pd.to_datetime(df['date'])

df = df.dropna()

# Affichage du DataFrame
df.head()

Unnamed: 0,date,sid,market_cap_rank_quarterly,market_cap_rep,market_cap_usd,E_D_RAW_dividend_per_share_ex_date,E_D_RAW_CSHOC_shares_outstanding,E_D_RAW_CSHTRD_trading_volume_share,E_D_RAW_price_close_usd,E_D_RAW_price_open_usd,E_D_RAW_price_close_rep,E_D_RAW_price_open_rep,E_D_RAW_price_close_trd,E_D_RAW_price_close_unadj_trd,E_D_RAW_price_close_unadj_usd,E_D_ADJ_price_close_usd,E_D_RAW_split_rate,E_D_RAW_TRF,trading_value_1d_usd
0,2024-01-02,SP-00109601C,238.0,1141.31215,857.206365,0.0,10813000.0,2575.0,79.275535,78.862446,105.55,105.0,105.55,105.55,79.275535,79.275535,1.0,1.638521,204134.5
1,2024-01-02,SP-00118601C,27.0,26753.097226,26753.097226,0.0,494378000.0,899914.0,54.11466,54.662941,54.11466,54.662941,72.05,72.05,54.11466,54.11466,1.0,1.538673,48698540.0
2,2024-01-02,SP-00126201C,311.0,574.93937,431.820241,0.0,38509000.0,7302.0,11.213489,11.266064,14.93,15.0,14.93,14.93,11.213489,112.134888,1.0,2.773217,81880.9
3,2024-01-02,SP-00126307C,212.0,1320.93825,992.118305,0.0,103603000.0,81704.0,9.576154,9.839029,12.75,13.1,12.75,12.75,9.576154,9.576154,1.0,1.051654,782410.1
4,2024-01-02,SP-00182801C,115.0,4396.64862,3302.194918,0.0,100849000.0,192105.0,29.01387,28.91623,38.63,38.5,38.63,38.63,29.01387,116.055478,1.0,2.692941,5573709.0


In [5]:
def generate_rolling_windows(data, date_col, value_col):
    windows = []
    start_date = data[date_col].min()  # Date de début
    end_date = data[date_col].max()    # Date de fin

    while start_date + relativedelta(months=2) <= end_date:
        # Définir les périodes
        train_start = start_date
        train_end = start_date + relativedelta(months=1) - pd.Timedelta(days=1)
        tampon_start = train_end + pd.Timedelta(days=1)
        tampon_end = tampon_start + relativedelta(months=1) - pd.Timedelta(days=1)
        test_start = tampon_end + pd.Timedelta(days=1)
        test_end = test_start + relativedelta(months=1) - pd.Timedelta(days=1)

        # Sélectionner les données pour chaque période
        train_data = data[(data[date_col] >= train_start) & (data[date_col] <= train_end)]
        test_data = data[(data[date_col] >= test_start) & (data[date_col] <= test_end)]

        # Ajouter la fenêtre
        windows.append({
            "train": train_data[value_col].values,
            "test": test_data[value_col].values,
            "train_dates": train_data[date_col].values,
            "test_dates": test_data[date_col].values
        })

        # Avancer la fenêtre
        start_date += relativedelta(months=1)

    return windows

# Génération des fenêtres
windows = generate_rolling_windows(df, date_col='date', value_col='sid')

# Exemple de la première fenêtre
print("Première fenêtre :")
print("Train data :", windows[0]['train'])
print("Test data :", windows[0]['test'])

Première fenêtre :
Train data : ['SP-00109601C' 'SP-00118601C' 'SP-00126201C' ... 'SP-27812001C'
 'SP-27823401C' 'SP-32957901C']
Test data : ['SP-00109601C' 'SP-00118601C' 'SP-00126201C' ... 'SP-27812001C'
 'SP-27823401C' 'SP-32957901C']


In [3]:
def generate_rolling_windows(data, date_col, sid_col, train_years, val_years, test_years, buffer_months):
    """
    Crée des rolling windows pour des données organisées par action et par date.
    """
    # Convertir la colonne date en format datetime si ce n'est pas déjà fait
    data[date_col] = pd.to_datetime(data[date_col])
    
    # Trouver les dates limites
    start_date = data[date_col].min()
    end_date = data[date_col].max()
    
    rolling_splits = []

    while start_date + relativedelta(years=train_years + val_years + test_years) <= end_date:
        # Définir les périodes
        train_start = start_date
        train_end = train_start + relativedelta(years=train_years) - relativedelta(days=1)
        
        tampon_1_start = train_end + relativedelta(days=1)
        tampon_1_end = tampon_1_start + relativedelta(months=buffer_months) - relativedelta(days=1)
        
        val_start = tampon_1_end + relativedelta(days=1)
        val_end = val_start + relativedelta(years=val_years) - relativedelta(days=1)
        
        tampon_2_start = val_end + relativedelta(days=1)
        tampon_2_end = tampon_2_start + relativedelta(months=buffer_months) - relativedelta(days=1)
        
        test_start = tampon_2_end + relativedelta(days=1)
        test_end = test_start + relativedelta(years=test_years) - relativedelta(days=1)
        
        # Sélectionner les données pour chaque période
        train_data = data[(data[date_col] >= train_start) & (data[date_col] <= train_end)]
        val_data = data[(data[date_col] >= val_start) & (data[date_col] <= val_end)]
        test_data = data[(data[date_col] >= test_start) & (data[date_col] <= test_end)]
        
        # Sauvegarder les résultats
        rolling_splits.append({
            "train": train_data,
            "validation": val_data,
            "test": test_data,
            "dates": {
                "train": (train_start, train_end),
                "validation": (val_start, val_end),
                "test": (test_start, test_end)
            }
        })
        
        # Avancer la fenêtre de 1 an
        start_date += relativedelta(years=1)

    return rolling_splits

# Exemple d'utilisation
rolling_windows = generate_rolling_windows(
    data=df,  # Remplacez par votre DataFrame
    date_col="date",
    sid_col="sid",
    train_years=2,
    val_years=1,
    test_years=1,
    buffer_months=1
)

# Affichage d'une fenêtre
print("Exemple d'une fenêtre :")
print("Train dates :", rolling_windows[0]["dates"]["train"])
print("Validation dates :", rolling_windows[0]["dates"]["validation"])
print("Test dates :", rolling_windows[0]["dates"]["test"])

Exemple d'une fenêtre :


IndexError: list index out of range

In [13]:
import pandas as pd
from supervised.automl import AutoML
from dateutil.relativedelta import relativedelta

def pipeline_rolling_windows(data, date_col, target_col, train_years, val_years, test_years, buffer_months, output_file):
    """
    Pipeline direct pour la rolling window avec AutoML.
    """
    data[date_col] = pd.to_datetime(data[date_col])  # Assurer le bon format de date
    start_date = data[date_col].min()
    end_date = data[date_col].max()

    predictions_all = []  # Liste pour stocker toutes les prédictions

    while start_date + relativedelta(years=train_years + val_years + test_years) <= end_date:
        # Définir les périodes
        train_end = start_date + relativedelta(years=train_years) - pd.Timedelta(days=1)
        tampon_1_start = train_end + pd.Timedelta(days=1)
        tampon_1_end = tampon_1_start + relativedelta(months=buffer_months) - pd.Timedelta(days=1)
        val_start = tampon_1_end + pd.Timedelta(days=1)
        val_end = val_start + relativedelta(years=val_years) - pd.Timedelta(days=1)
        tampon_2_start = val_end + pd.Timedelta(days=1)
        tampon_2_end = tampon_2_start + relativedelta(months=buffer_months) - pd.Timedelta(days=1)
        test_start = tampon_2_end + pd.Timedelta(days=1)
        test_end = test_start + relativedelta(years=test_years) - pd.Timedelta(days=1)

        # Filtrer les données pour chaque période
        train_data = data.loc[(data[date_col] >= start_date) & (data[date_col] <= train_end)]
        val_data = data.loc[(data[date_col] >= val_start) & (data[date_col] <= val_end)]
        test_data = data.loc[(data[date_col] >= test_start) & (data[date_col] <= test_end)]

        # Configurer et entraîner AutoML
        automl = AutoML(mode="Perform", algorithms=["Xgboost"])
        automl.fit(
            train_data.drop(columns=[target_col, date_col]),
            train_data[target_col]
        )

        # Prédire sur le test set
        test_preds = test_data[[date_col, target_col]].copy()
        test_preds["predicted"] = automl.predict(test_data.drop(columns=[target_col, date_col]))
        test_preds["window"] = f"{start_date.year}-{test_end.year}"  # Identifier la fenêtre

        # Sauvegarder les prédictions
        predictions_all.append(test_preds)

        # Avancer la fenêtre
        start_date += relativedelta(years=1)

    # Concaténer toutes les prédictions et les sauvegarder dans un fichier CSV
    predictions_df = pd.concat(predictions_all, ignore_index=True)
    predictions_df.to_csv(output_file, index=False)
    print(f"Prédictions sauvegardées dans {output_file}")

    return predictions_df