In [7]:
import pandas as pd
import numpy as np
from supervised.automl import AutoML
from dateutil.relativedelta import relativedelta

## DF Original

In [8]:
df_canada = pd.read_csv('canada.csv')
df_canada.head()

Unnamed: 0,date,QUALITY_FLAG,cid,industry_raw,E_TTM_period_date,E_TTM_ammor_intangibles,E_TTM_asset_writedown,E_TTM_assets_gro_five,E_TTM_capex,E_TTM_cash_acquisitions,...,E_G_cost_debt,E_G_cash_ratio,E_G_ebitda_cov,E_G_ret_on_asset,E_G_ret_on_inv_cap,E_G_net_to_cash,E_G_perm_assets_ratio,return_1q,target_net_income,target_cash_operations
0,2002-01-03,True,SP-065996,,2001-10-31,0.0,0.0,0.0,-12.738,-3.336,...,-0.036902,2.309257,-165.453488,0.130018,0.101871,-0.068216,0.41423,,-5.548,-5.809
1,2002-01-08,True,SP-002396,,2001-09-30,3.078,0.0,0.0,-20.889,-68.22,...,-0.103778,0.0,-2.685925,0.071119,0.06743,-0.004881,0.595752,,6.114,-11.388
2,2002-01-08,True,SP-006704,,2001-09-30,0.0,0.0,0.0,-17.971623,0.0,...,-0.036358,2.451612,2.26246,-0.069781,-0.039238,-0.045993,0.775432,,8.957867,-1.346965
3,2002-01-08,True,SP-008644,,2001-09-30,0.0,0.0,0.0,-34.7,0.0,...,-0.117333,0.878049,-4.852273,-0.169833,-0.155712,-0.316372,0.773996,,71.2,-9.4
4,2002-01-08,True,SP-013994,,2001-09-30,0.0,0.0,0.0,-1403.0,-133.0,...,-0.078051,0.0,-14.569697,0.109798,0.078497,-0.157934,0.921832,,-165.0,-419.0


## Copie du DF Original - sera envoyé à la fonciton

In [9]:
df_model = df_canada.copy()

df_model['date'] = pd.to_datetime(df_model['date'], errors='coerce')
df_model.sort_values(by=['cid', 'date'], inplace=True)

In [10]:
# Retirer les lignes où Quality_Flag est False
df_model = df_model[df_model['QUALITY_FLAG'] == True]

# (FACULTATIF) Exclure les banques
# df_model = df_model[df_model['industry'] != 'Banks']

In [11]:
# Définir la liste des colonnes "ratios" qu’on veut conserver
# ---------------------------
ratio_keywords = [
    '_on_assets_ratio',
    '_on_rev_ratio',
    '_on_tot_cap_ratio'
]

# On va chercher toutes les colonnes qui contiennent l’un de ces substrings
ratio_cols = [
    col for col in df_model.columns
    if any(rk in col for rk in ratio_keywords)
]

# ---------------------------
# Définir les colonnes minimales qu’on veut absolument
# ---------------------------
mandatory_cols = ['date', 'cid', 'target_net_income']

# ---------------------------
# Construire la liste finale des colonnes à garder
# ---------------------------
columns_to_keep = mandatory_cols + ratio_cols

# ---------------------------
# Créer le df_model_final avec seulement ces colonnes
# ---------------------------
df_model_final = df_model[columns_to_keep].copy()

# ---------------------------
# (Optionnel) trier par (cid, date)
# ---------------------------
df_model_final.sort_values(by=['cid', 'date'], inplace=True)


In [12]:
# Compter le nombre total de NaN dans tout le DataFrame
total_nan = df_model_final.isna().sum().sum()
print(f"Nombre total de valeurs NaN dans df_model_final : {total_nan}")

# Retirer les lignes qui contiennent AU MOINS un NaN
df_model_final.dropna(inplace=True)

# Vérifier à nouveau qu’il n’y a plus de NaN
total_nan_apres = df_model_final.isna().sum().sum()
print(f"Nombre total de valeurs NaN après suppression : {total_nan_apres}")

Nombre total de valeurs NaN dans df_model_final : 124606
Nombre total de valeurs NaN après suppression : 0


## DF Test (5 ans seulement), pour tester rapidement la fonciton

In [13]:
# Date la plus récente du DataFrame
max_date = df_model_final['date'].max()

# Date de coupure (5 ans avant)
cutoff_date = max_date - pd.DateOffset(years=6)

# Filtrer pour ne garder que les 5 dernières années
df_test = df_model_final[df_model_final['date'] >= cutoff_date].copy()

print(df_test['date'].min(), df_test['date'].max())
print(df_test.shape)

2018-09-20 00:00:00 2024-09-19 00:00:00
(9362, 195)


## Fonction Principale avec AutoML et Rolling Window

In [14]:
def pipeline_rolling_windows(data, date_col, target_col, train_years, val_years, test_years, buffer_months=0):
    """
    Pipeline direct pour la rolling window avec AutoML et cross-validation personnalisée.
    Ajoute les périodes dans le DataFrame final pour validation.
    """
    # Conversion de la colonne date
    data[date_col] = pd.to_datetime(data[date_col])
    start_date = data[date_col].min()
    end_date = data[date_col].max()

    predictions_all = []  # Liste pour stocker toutes les prédictions

    while start_date + relativedelta(years=train_years + val_years + test_years) <= end_date:
        # Définir les périodes
        train_end = start_date + relativedelta(years=train_years) - pd.Timedelta(days=1)
        tampon_1_end = train_end + relativedelta(months=buffer_months)
        val_start = tampon_1_end + pd.Timedelta(days=1)
        val_end = val_start + relativedelta(years=val_years) - pd.Timedelta(days=1)
        tampon_2_end = val_end + relativedelta(months=buffer_months)
        test_start = tampon_2_end + pd.Timedelta(days=1)
        test_end = test_start + relativedelta(years=test_years) - pd.Timedelta(days=1)

        # Filtrer les données
        train_data = data.loc[(data[date_col] >= start_date) & (data[date_col] <= train_end)]
        val_data = data.loc[(data[date_col] >= val_start) & (data[date_col] <= val_end)]
        test_data = data.loc[(data[date_col] >= test_start) & (data[date_col] <= test_end)]

        if len(train_data) == 0 or len(val_data) == 0 or len(test_data) == 0:
            print(f"Fenêtre {start_date.year}-{test_end.year} : données insuffisantes, sautée.")
            start_date += relativedelta(years=1)
            continue

        # Configurer et entraîner AutoML
        print(f"Fenêtre {start_date.year}-{test_end.year} : entraînement de AutoML...")
        automl = AutoML(mode="Perform", algorithms=["Xgboost"])
        custom_cv = [(train_data.index, val_data.index)]
        automl.fit(
            train_data.drop(columns=[target_col, date_col, 'cid']),
            train_data[target_col], cv=custom_cv
        )

        # Prédire sur le test set
        test_preds = test_data[[date_col, target_col]].copy()
        test_preds["predicted"] = automl.predict(test_data.drop(columns=[target_col, date_col, 'cid']))
        test_preds["window"] = f"{start_date.year}-{test_end.year}"
        test_preds["cid"] = test_data["cid"].values

        # Ajouter les périodes pour validation
        test_preds["train_start"] = start_date
        test_preds["train_end"] = train_end
        test_preds["tampon_1"] = tampon_1_end
        test_preds["val_start"] = val_start
        test_preds["val_end"] = val_end
        test_preds["tampon_2"] = tampon_2_end
        test_preds["test_start"] = test_start
        test_preds["test_end"] = test_end

        # Sauvegarder les prédictions
        predictions_all.append(test_preds)

        # Avancer la fenêtre
        start_date += relativedelta(years=1)

    predictions_df = pd.concat(predictions_all, ignore_index=True)
    return predictions_df

## Appel de fonction

In [15]:
predictions_df = pipeline_rolling_windows(
    data=df_test, 
    date_col="date", 
    target_col="target_net_income", 
    train_years=2, 
    val_years=1, 
    test_years=1, 
    buffer_months=1
)

Fenêtre 2018-2022 : entraînement de AutoML...
AutoML directory: AutoML_1
The task is regression with evaluation metric rmse
AutoML will use algorithms: ['Xgboost']
AutoML will ensemble available models
AutoML steps: ['simple_algorithms', 'default_algorithms', 'not_so_random', 'golden_features', 'insert_random_feature', 'features_selection', 'hill_climbing_1', 'hill_climbing_2', 'ensemble']
Skip simple_algorithms because no parameters were generated.
* Step default_algorithms will try to check up to 1 model
1_Default_Xgboost rmse 373.079877 trained in 11.51 seconds (1-sample predict time 0.035 seconds)
* Step not_so_random will try to check up to 4 models
2_Xgboost rmse 373.297357 trained in 9.9 seconds (1-sample predict time 0.0349 seconds)
3_Xgboost rmse 369.928487 trained in 10.17 seconds (1-sample predict time 0.0352 seconds)
4_Xgboost rmse 371.726329 trained in 6.18 seconds (1-sample predict time 0.0351 seconds)
5_Xgboost rmse 376.126961 trained in 10.58 seconds (1-sample predict t

In [16]:
predictions_df.to_csv("df_can_test.csv", index=False)

In [18]:
# Générer des valeurs aléatoires entre -1 et 1
np.random.seed(42)  # Fixer la graine pour la reproductibilité
predictions_df['predicted'] = np.random.uniform(-1, 1, size=len(predictions_df))  # Remplace 'n' par la taille désirée

In [19]:
# Assure que les colonnes 'date' sont au bon format datetime dans les deux DataFrames
predictions_df['date'] = pd.to_datetime(predictions_df['date'])
df_canada['date'] = pd.to_datetime(df_canada['date'])

# Faire la jointure sur 'cid' et 'date'
merged_df = predictions_df.merge(df_canada[['cid', 'date', 'return_1q']], on=['cid', 'date'], how='left')

In [20]:
merged_df

Unnamed: 0,date,target_net_income,predicted,window,cid,train_start,train_end,tampon_1,val_start,val_end,tampon_2,test_start,test_end,return_1q
0,2022-02-17,-240.562,-0.250920,2018-2022,MISSING,2018-09-20,2020-09-19,2020-10-19,2020-10-20,2021-10-19,2021-11-19,2021-11-20,2022-11-19,
1,2022-03-04,-240.562,0.901429,2018-2022,MISSING,2018-09-20,2020-09-19,2020-10-19,2020-10-20,2021-10-19,2021-11-19,2021-11-20,2022-11-19,
2,2022-04-28,49.957,0.463988,2018-2022,MISSING,2018-09-20,2020-09-19,2020-10-19,2020-10-20,2021-10-19,2021-11-19,2021-11-20,2022-11-19,
3,2022-07-28,-322.760,0.197317,2018-2022,MISSING,2018-09-20,2020-09-19,2020-10-19,2020-10-20,2021-10-19,2021-11-19,2021-11-20,2022-11-19,
4,2022-10-27,210.290,-0.687963,2018-2022,MISSING,2018-09-20,2020-09-19,2020-10-19,2020-10-20,2021-10-19,2021-11-19,2021-11-20,2022-11-19,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3063,2023-09-12,2.276,0.247298,2019-2023,SP-278234,2019-09-20,2021-09-19,2021-10-19,2021-10-20,2022-10-19,2022-11-19,2022-11-20,2023-11-19,-0.131886
3064,2023-03-21,-6.998,0.297010,2019-2023,SP-329579,2019-09-20,2021-09-19,2021-10-19,2021-10-20,2022-10-19,2022-11-19,2022-11-20,2023-11-19,1.296319
3065,2023-05-15,116.437,-0.465196,2019-2023,SP-329579,2019-09-20,2021-09-19,2021-10-19,2021-10-20,2022-10-19,2022-11-19,2022-11-20,2023-11-19,0.532269
3066,2023-08-08,66.105,-0.969779,2019-2023,SP-329579,2019-09-20,2021-09-19,2021-10-19,2021-10-20,2022-10-19,2022-11-19,2022-11-20,2023-11-19,-0.231280


In [24]:
# Extraire l'année à partir de la colonne 'date'
merged_df['year'] = merged_df['date'].dt.year

In [25]:
merged_df['year']

0       2022
1       2022
2       2022
3       2022
4       2022
        ... 
3063    2023
3064    2023
3065    2023
3066    2023
3067    2023
Name: year, Length: 3068, dtype: int32

In [28]:
# Voir les valeurs uniques et leur compte dans la colonne 'cid'
value_counts = merged_df['year'].value_counts()
print(value_counts)

year
2022    1575
2023    1466
2021      27
Name: count, dtype: int64


In [29]:
df_2021 = merged_df[merged_df['year'] == 2021].copy()


Unnamed: 0,date,target_net_income,predicted,window,cid,train_start,train_end,tampon_1,val_start,val_end,tampon_2,test_start,test_end,return_1q,year
156,2021-11-23,-72.0,0.344271,2018-2022,SP-011443,2018-09-20,2020-09-19,2020-10-19,2020-10-20,2021-10-19,2021-11-19,2021-11-20,2022-11-19,0.060325,2021
167,2021-12-16,-4.732,-0.626963,2018-2022,SP-013252,2018-09-20,2020-09-19,2020-10-19,2020-10-20,2021-10-19,2021-11-19,2021-11-20,2022-11-19,0.139527,2021
205,2021-12-09,27.1,-0.981606,2018-2022,SP-014312,2018-09-20,2020-09-19,2020-10-19,2020-10-20,2021-10-19,2021-11-19,2021-11-20,2022-11-19,0.000565,2021
277,2021-12-09,-9.3,0.299928,2018-2022,SP-015083,2018-09-20,2020-09-19,2020-10-19,2020-10-20,2021-10-19,2021-11-19,2021-11-20,2022-11-19,0.01796,2021
281,2021-12-14,-5.313006,-0.32401,2018-2022,SP-015096,2018-09-20,2020-09-19,2020-10-19,2020-10-20,2021-10-19,2021-11-19,2021-11-20,2022-11-19,-0.149961,2021
393,2021-12-01,-8.18,0.552826,2018-2022,SP-020392,2018-09-20,2020-09-19,2020-10-19,2020-10-20,2021-10-19,2021-11-19,2021-11-20,2022-11-19,0.032507,2021
394,2021-12-02,-8.18,-0.318393,2018-2022,SP-020392,2018-09-20,2020-09-19,2020-10-19,2020-10-20,2021-10-19,2021-11-19,2021-11-20,2022-11-19,0.032507,2021
463,2021-11-23,33.031,0.166738,2018-2022,SP-021577,2018-09-20,2020-09-19,2020-10-19,2020-10-20,2021-10-19,2021-11-19,2021-11-20,2022-11-19,-0.218407,2021
471,2021-12-20,16.2,0.939758,2018-2022,SP-021894,2018-09-20,2020-09-19,2020-10-19,2020-10-20,2021-10-19,2021-11-19,2021-11-20,2022-11-19,0.09463,2021
512,2021-12-08,2.521,0.865457,2018-2022,SP-023994,2018-09-20,2020-09-19,2020-10-19,2020-10-20,2021-10-19,2021-11-19,2021-11-20,2022-11-19,0.009628,2021


In [21]:
def create_weighted_portfolios(predictions_df, df_canada, return_col, lower_threshold=0.4, upper_threshold=0.6):
    """
    Crée un portefeuille pondéré basé sur les prédictions et calcule les rendements pondérés par année.
    
    Args:
    - predictions_df (pd.DataFrame): DataFrame contenant les prédictions et les identifiants 'cid'.
    - df_canada (pd.DataFrame): DataFrame contenant les rendements futurs et les identifiants 'cid'.
    - return_col (str): Nom de la colonne des rendements futurs dans df_canada.
    - lower_threshold (float): Seuil inférieur pour les positions courtes.
    - upper_threshold (float): Seuil supérieur pour les positions longues.
    
    Returns:
    - result_df (pd.DataFrame): DataFrame contenant les rendements pondérés des portefeuilles par année.
    """
    # Joindre les deux DataFrames sur 'cid' et 'date'
    merged_df = predictions_df.merge(df_canada[['cid', 'date', return_col]], on=['cid', 'date'], how='left')
    
    # Extraire l'année à partir de la colonne 'date'
    merged_df['year'] = merged_df['date'].dt.year
    
    # Initialiser une liste pour stocker les résultats
    results = []

    # Grouper par année
    for year, group in merged_df.groupby('year'):
        # Sélectionner les actions longues et courtes selon les seuils
        selected = group[(group['predicted'] > upper_threshold) | (group['predicted'] < lower_threshold)]
        
        if len(selected) > 0:
            # Attribuer des poids égaux à chaque action sélectionnée
            selected['weight'] = 1 / len(selected)
            
            # Calculer le rendement pondéré du portefeuille
            weighted_return = (selected['weight'] * selected[return_col]).sum()
            
            # Ajouter le résultat à la liste
            results.append({'year': year, 'weighted_return': weighted_return})
        else:
            # Si aucune action ne respecte les seuils, le rendement est NaN
            results.append({'year': year, 'weighted_return': float('nan')})

    # Convertir les résultats en DataFrame
    result_df = pd.DataFrame(results)
    
    return result_df

In [22]:
test_returns = create_weighted_portfolios(predictions_df, df_canada, 'return_1q', lower_threshold=0.5, upper_threshold=0.5)

In [23]:
test_returns

Unnamed: 0,year,weighted_return
0,2021,-0.064398
1,2022,-0.019383
2,2023,0.024381
