In [2]:
import pyarrow as pa
import pyarrow.ipc as ipc
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from supervised.automl import AutoML
from sklearn.metrics import classification_report
from dateutil.relativedelta import relativedelta

Using `tqdm.autonotebook.tqdm` in notebook mode. Use `tqdm.tqdm` instead to force console mode (e.g. in jupyter console)


In [8]:
# Location of the Arrow IPC file holding the pricing data
file_path = '/Users/dominicprenovost/Programmation/AutoML/AutoML-Practice/pricing-canada.arrow'

# Memory-map the file and read every record batch into a single Arrow table
with pa.memory_map(file_path, 'r') as source:
    reader = ipc.RecordBatchFileReader(source)
    table = reader.read_all()

# Arrow table -> pandas, parse 'date' as datetime, then discard rows with any missing value
df = (
    table.to_pandas()
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
    .dropna()
)

# Preview the first rows
df.head()

Unnamed: 0,date,sid,market_cap_rank_quarterly,market_cap_rep,market_cap_usd,E_D_RAW_dividend_per_share_ex_date,E_D_RAW_CSHOC_shares_outstanding,E_D_RAW_CSHTRD_trading_volume_share,E_D_RAW_price_close_usd,E_D_RAW_price_open_usd,E_D_RAW_price_close_rep,E_D_RAW_price_open_rep,E_D_RAW_price_close_trd,E_D_RAW_price_close_unadj_trd,E_D_RAW_price_close_unadj_usd,E_D_ADJ_price_close_usd,E_D_RAW_split_rate,E_D_RAW_TRF,trading_value_1d_usd
0,2024-01-02,SP-00109601C,238.0,1141.31215,857.206365,0.0,10813000.0,2575.0,79.275535,78.862446,105.55,105.0,105.55,105.55,79.275535,79.275535,1.0,1.638521,204134.5
1,2024-01-02,SP-00118601C,27.0,26753.097226,26753.097226,0.0,494378000.0,899914.0,54.11466,54.662941,54.11466,54.662941,72.05,72.05,54.11466,54.11466,1.0,1.538673,48698540.0
2,2024-01-02,SP-00126201C,311.0,574.93937,431.820241,0.0,38509000.0,7302.0,11.213489,11.266064,14.93,15.0,14.93,14.93,11.213489,112.134888,1.0,2.773217,81880.9
3,2024-01-02,SP-00126307C,212.0,1320.93825,992.118305,0.0,103603000.0,81704.0,9.576154,9.839029,12.75,13.1,12.75,12.75,9.576154,9.576154,1.0,1.051654,782410.1
4,2024-01-02,SP-00182801C,115.0,4396.64862,3302.194918,0.0,100849000.0,192105.0,29.01387,28.91623,38.63,38.5,38.63,38.63,29.01387,116.055478,1.0,2.692941,5573709.0


In [11]:
# Total number of missing cells over the whole frame
# (a single scalar, despite the variable name)
nan_per_column = df.isna().to_numpy().sum()
print(nan_per_column)

0


In [6]:
# Show the dtype of every column of the loaded DataFrame
df.dtypes

date                                   datetime64[ns]
sid                                            object
market_cap_rank_quarterly                     float64
market_cap_rep                                float64
market_cap_usd                                float64
E_D_RAW_dividend_per_share_ex_date            float64
E_D_RAW_CSHOC_shares_outstanding              float64
E_D_RAW_CSHTRD_trading_volume_share           float64
E_D_RAW_price_close_usd                       float64
E_D_RAW_price_open_usd                        float64
E_D_RAW_price_close_rep                       float64
E_D_RAW_price_open_rep                        float64
E_D_RAW_price_close_trd                       float64
E_D_RAW_price_close_unadj_trd                 float64
E_D_RAW_price_close_unadj_usd                 float64
E_D_ADJ_price_close_usd                       float64
E_D_RAW_split_rate                            float64
E_D_RAW_TRF                                   float64
trading_value_1d_usd        

In [7]:
# (number of rows, number of columns) of the DataFrame
df.shape

(87067, 19)

In [12]:
# Keep only the date and the three other columns of interest
selected_columns = ['date', 'market_cap_usd', 'E_D_RAW_TRF', 'trading_value_1d_usd']
df_subset = df.loc[:, selected_columns]

# Show the reduced DataFrame
print(df_subset)

            date  market_cap_usd  E_D_RAW_TRF  trading_value_1d_usd
0     2024-01-02      857.206365     1.638521          2.041345e+05
1     2024-01-02    26753.097226     1.538673          4.869854e+07
2     2024-01-02      431.820241     2.773217          8.188090e+04
3     2024-01-02      992.118305     1.051654          7.824101e+05
4     2024-01-02     3302.194918     2.692941          5.573709e+06
...          ...             ...          ...                   ...
90438 2024-11-15      230.287148     4.360436          1.739504e+05
90439 2024-11-15    22014.339361     2.119717          3.576080e+07
90440 2024-11-15     1833.141642     4.446972          6.543943e+05
90441 2024-11-15      652.491810     2.572208          1.466988e+05
90442 2024-11-15     1021.309864     1.000000          8.945528e+06

[87067 rows x 4 columns]


In [1]:
# Convert the selected columns to a NumPy array.
# NOTE: this cell needs `df_subset` (built in the column-selection cell) to exist;
# the NameError recorded in this cell's output comes from running it on a kernel
# where that cell had not been executed yet.
matrix = df_subset.to_numpy()

# Display the array
print(matrix)

NameError: name 'df_subset' is not defined

## Rolling Window

In [9]:
def create_rolling_windows_real_data(data, date_col, features_cols, window_size=1):
    """Build monthly rolling train / buffer / test windows over a dated DataFrame.

    Every window is made of three consecutive periods of ``window_size`` months
    each: a training period, a buffer ("tampon") period that is skipped, and a
    test period. After each window the start date advances by one calendar month.
    A window is produced only while its test period starts on or before the last
    date present in ``data``, so the final windows may have a test period that is
    only partially covered by the data.

    Args:
        data: DataFrame holding the observations.
        date_col: Name of the datetime column of ``data`` used to slice periods.
        features_cols: Column labels extracted as features for each period.
        window_size: Length, in months, of each of the three periods (>= 1).

    Returns:
        A list of dicts, one per window, with the keys ``"train_features"``,
        ``"test_features"``, ``"train_dates"`` and ``"test_dates"`` (NumPy arrays).
        The list is empty when ``data`` has no valid date.

    Raises:
        ValueError: If ``window_size`` is smaller than 1.
    """
    if window_size < 1:
        raise ValueError("window_size must be at least 1 month")

    windows = []
    dates = data[date_col]
    start_date = dates.min()
    end_date = dates.max()

    # Empty frame (or a date column without any valid value): nothing to split.
    if pd.isna(start_date):
        return windows

    period = pd.DateOffset(months=window_size)
    one_day = pd.Timedelta(days=1)

    # Train + buffer span 2 * window_size months, so the test period begins that
    # far after the window start. The bound scales with window_size; a fixed
    # 2 months would emit windows whose test period is entirely outside the
    # data as soon as window_size > 1.
    while start_date + pd.DateOffset(months=2 * window_size) <= end_date:
        # Period boundaries (all inclusive)
        train_start = start_date
        train_end = train_start + period - one_day
        tampon_start = train_end + one_day
        tampon_end = tampon_start + period - one_day
        test_start = tampon_end + one_day
        test_end = test_start + period - one_day

        # Rows falling inside the train and test periods; the buffer is skipped
        train_data = data[dates.between(train_start, train_end)]
        test_data = data[dates.between(test_start, test_end)]

        windows.append({
            "train_features": train_data[features_cols].values,
            "test_features": test_data[features_cols].values,
            "train_dates": train_data[date_col].values,
            "test_dates": test_data[date_col].values
        })

        # Slide the window forward by one month
        start_date += pd.DateOffset(months=1)

    return windows

# Use every column except 'date' as a feature, then build the windows on the full frame
features_columns = df.columns.difference(['date'])
rolling_windows = create_rolling_windows_real_data(
    data=df,
    date_col='date',
    features_cols=features_columns,
)

In [10]:
# Inspect the content of the first window
first_window = rolling_windows[0]
print("Première fenêtre :")
print("Train dates:", first_window['train_dates'])
print("Test dates:", first_window['test_dates'])
print("Train features shape:", first_window['train_features'].shape)
print("Test features shape:", first_window['test_features'].shape)

# First and last date of the train / test periods for the first three windows
for number, window in enumerate(rolling_windows[:3], start=1):
    train_dates = window['train_dates']
    test_dates = window['test_dates']
    print(f"Fenêtre {number}:")
    print(f"  Train: {train_dates[0]} à {train_dates[-1]}")
    print(f"  Test: {test_dates[0]} à {test_dates[-1]}")

Première fenêtre :
Train dates: ['2024-01-02T00:00:00.000000000' '2024-01-02T00:00:00.000000000'
 '2024-01-02T00:00:00.000000000' ... '2024-02-01T00:00:00.000000000'
 '2024-02-01T00:00:00.000000000' '2024-02-01T00:00:00.000000000']
Test dates: ['2024-03-04T00:00:00.000000000' '2024-03-04T00:00:00.000000000'
 '2024-03-04T00:00:00.000000000' ... '2024-04-01T00:00:00.000000000'
 '2024-04-01T00:00:00.000000000' '2024-04-01T00:00:00.000000000']
Train features shape: (9453, 18)
Test features shape: (8215, 18)
Fenêtre 1:
  Train: 2024-01-02T00:00:00.000000000 à 2024-02-01T00:00:00.000000000
  Test: 2024-03-04T00:00:00.000000000 à 2024-04-01T00:00:00.000000000
Fenêtre 2:
  Train: 2024-02-02T00:00:00.000000000 à 2024-03-01T00:00:00.000000000
  Test: 2024-04-02T00:00:00.000000000 à 2024-05-01T00:00:00.000000000
Fenêtre 3:
  Train: 2024-03-04T00:00:00.000000000 à 2024-04-01T00:00:00.000000000
  Test: 2024-05-02T00:00:00.000000000 à 2024-05-31T00:00:00.000000000


## Binary Classification Example


In [26]:
# Path to the .arrow file
file_path = '/Users/dominicprenovost/Programmation/AutoML/AutoML-Practice/pricing-canada.arrow'

# Read the whole Arrow IPC file into a single table
with pa.memory_map(file_path, 'r') as source:
    table = ipc.RecordBatchFileReader(source).read_all()

# Convert to a pandas DataFrame
df = table.to_pandas()

# Select the target by name instead of relying on it being the last column,
# so a change in column order cannot leak the target into the features.
target_col = "trading_value_1d_usd"
X = df.drop(columns=[target_col])
y = df[target_col]

# Fixed seed so the train/test split is the same on every run.
# NOTE(review): this is a random split over dated rows, so train and test
# overlap in time — confirm this is intended for this example.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

automl = AutoML()
automl.fit(X_train, y_train)

predictions = automl.predict(X_test)

# The target is continuous (the AutoML log reports "The task is regression"),
# so sklearn's classification_report does not apply to these predictions.



Linear algorithm was disabled.
AutoML directory: AutoML_5
The task is regression with evaluation metric rmse
AutoML will use algorithms: ['Baseline', 'Decision Tree', 'Random Forest', 'Xgboost', 'Neural Network']
AutoML will ensemble available models
AutoML steps: ['simple_algorithms', 'default_algorithms', 'ensemble']
* Step simple_algorithms will try to check up to 2 models
1_Baseline rmse 60594611.475944 trained in 0.16 seconds
2_DecisionTree rmse 28422235.883344 trained in 4.8 seconds
* Step default_algorithms will try to check up to 3 models
3_Default_Xgboost rmse 12907268.46649 trained in 1.04 seconds
4_Default_NeuralNetwork rmse 33461371.182789 trained in 1.05 seconds
5_Default_RandomForest rmse 18086113.013295 trained in 5.85 seconds
* Step ensemble will try to check up to 1 model
Ensemble rmse 11666373.238693 trained in 0.06 seconds
AutoML fit time: 15.44 seconds
AutoML best model: Ensemble


In [None]:
import pandas as pd
import pyarrow as pa
import pyarrow.ipc as ipc
from supervised.automl import AutoML
from sklearn.metrics import mean_squared_error, r2_score

# Path to the .arrow file
file_path = '/Users/dominicprenovost/Programmation/AutoML/AutoML-Practice/pricing-canada.arrow'

# Read the whole Arrow IPC file into a single table
with pa.memory_map(file_path, 'r') as source:
    table = ipc.RecordBatchFileReader(source).read_all()

# Convert to a pandas DataFrame
df = table.to_pandas()

target_col = 'trading_value_1d_usd'

# Parse 'date' and derive the calendar month used to build the folds.
# NOTE(review): dt.month is the month number only, so the same month of two
# different years would end up in the same fold — confirm the data covers a
# single calendar year.
df['date'] = pd.to_datetime(df['date'])
df['month'] = df['date'].dt.month

# 'date' itself is not used as a feature (reassign instead of mutating in place)
df = df.drop(columns=['date'])

# Rows without a target value can neither be trained on nor scored
df = df.dropna(subset=[target_col])

# One dict of metrics per fold
results = []

# Distinct months, in increasing order
unique_months = sorted(df['month'].unique())

# Expanding window: train on every month up to and including i, test on month i + 1
for i in range(len(unique_months) - 1):
    train_months = unique_months[:i + 1]
    test_month = unique_months[i + 1]

    # Rows belonging to the training months / the test month
    train_data = df[df['month'].isin(train_months)]
    test_data = df[df['month'] == test_month]

    # Split features (X) and target (y); 'month' is only a fold label
    X_train = train_data.drop(columns=[target_col, 'month'])
    y_train = train_data[target_col]
    X_test = test_data.drop(columns=[target_col, 'month'])
    y_test = test_data[target_col]

    # A fresh AutoML run per fold
    automl = AutoML(
        mode="Perform",
        total_time_limit=600,  # time budget, in seconds, for each fold
        eval_metric="rmse"
    )

    # Train
    automl.fit(X_train, y_train)

    # Predict on the held-out month
    predictions = automl.predict(X_test)

    # RMSE as the square root of the MSE: the `squared=False` keyword of
    # mean_squared_error is deprecated and no longer accepted by recent
    # scikit-learn releases, while this form works on every version.
    rmse = mean_squared_error(y_test, predictions) ** 0.5
    r2 = r2_score(y_test, predictions)

    # Store the fold metrics
    results.append({
        'train_months': train_months,
        'test_month': test_month,
        'rmse': rmse,
        'r2': r2
    })

    print(f"Train Months: {train_months}, Test Month: {test_month}, RMSE: {rmse}, R2: {r2}")

# Collect the per-fold metrics in a DataFrame for analysis
results_df = pd.DataFrame(results)
print(results_df)