In [27]:
import pandas as pd
import numpy as np
from supervised.automl import AutoML
from dateutil.relativedelta import relativedelta

## DF Original

In [28]:
df_canada = pd.read_csv('canada_updated.csv')
df_canada.head()

Unnamed: 0,date,QUALITY_FLAG,cid,industry_raw,E_TTM_period_date,E_TTM_ammor_intangibles,E_TTM_asset_writedown,E_TTM_assets_gro_five,E_TTM_capex,E_TTM_cash_acquisitions,...,E_G_ebitda_cov,E_G_ret_on_asset,E_G_ret_on_inv_cap,E_G_net_to_cash,E_G_perm_assets_ratio,return_1q,target_net_income,target_cash_operations,binary_target_net_income,binary_target_cash_operations
0,2002-01-03,True,SP-065996,,2001-10-31,0.0,0.0,0.0,-12.738,-3.336,...,-165.453488,0.130018,0.101871,-0.068216,0.41423,,,,0,0
1,2002-01-08,True,SP-002396,,2001-09-30,3.078,0.0,0.0,-20.889,-68.22,...,-2.685925,0.071119,0.06743,-0.004881,0.595752,,,,0,0
2,2002-01-08,True,SP-006704,,2001-09-30,0.0,0.0,0.0,-17.971623,0.0,...,2.26246,-0.069781,-0.039238,-0.045993,0.775432,,,,0,0
3,2002-01-08,True,SP-008644,,2001-09-30,0.0,0.0,0.0,-34.7,0.0,...,-4.852273,-0.169833,-0.155712,-0.316372,0.773996,,,,0,0
4,2002-01-08,True,SP-013994,,2001-09-30,0.0,0.0,0.0,-1403.0,-133.0,...,-14.569697,0.109798,0.078497,-0.157934,0.921832,,,,0,0


In [15]:
df_canada.tail()

Unnamed: 0,date,QUALITY_FLAG,cid,industry_raw,E_TTM_period_date,E_TTM_ammor_intangibles,E_TTM_asset_writedown,E_TTM_assets_gro_five,E_TTM_capex,E_TTM_cash_acquisitions,...,E_G_ebitda_cov,E_G_ret_on_asset,E_G_ret_on_inv_cap,E_G_net_to_cash,E_G_perm_assets_ratio,return_1q,target_net_income,target_cash_operations,binary_target_net_income,binary_target_cash_operations
47360,2024-12-12,True,SP-035955,"Oil, Gas & Consumable Fuels",2024-10-31,0.03,0.0,47.407,0.0,0.0,...,-0.0,0.016859,0.021506,0.381318,0.194369,,,,0,0
47361,2024-12-12,True,SP-063476,Software,2024-10-31,17.107,0.0,7.1326,-1.977,-43.277,...,-189.097521,0.098301,0.118097,-0.06088,0.540278,,,,0,0
47362,2024-12-12,True,SP-064686,Specialty Retail,2024-10-31,0.0,0.0,9.6318,-20.609,0.0,...,,0.068533,-0.002755,0.046886,0.83164,,,,0,0
47363,2024-12-12,True,SP-122154,Metals & Mining,2024-10-31,0.435,0.0,10.5561,-8.181,0.0,...,-32.123982,0.197987,0.247023,-0.088436,0.377709,,,,0,0
47364,2024-12-13,True,SP-064624,Energy Equipment & Services,2024-09-30,0.0,0.0,0.0,0.0,0.0,...,,,0.112821,,,,,,0,0


In [3]:
df_canada.shape

(47365, 670)

## Copie du DF Original - sera envoyé à la fonciton

In [29]:
df_model = df_canada.copy()

df_model['date'] = pd.to_datetime(df_model['date'], errors='coerce')
df_model.sort_values(by=['cid', 'date'], inplace=True)

In [30]:
# Retirer les lignes où Quality_Flag est False
df_model = df_model[df_model['QUALITY_FLAG'] == True]

# (FACULTATIF) Exclure les banques
# df_model = df_model[df_model['industry'] != 'Banks']

In [31]:
def select_features(
    df,
    include_agro=False,
    include_rgro=False,
    include_tcgro=False,
    include_ratios_assets=False,
    include_ratios_rev=False,
    include_ratios_totcap=False,
    mandatory_cols=None
):
    """
    Sélectionne dynamiquement les colonnes d'un DataFrame en fonction
    des familles de variables explicatives demandées,
    en plaçant d'abord les colonnes obligatoires (mandatory_cols).
    """
    import pandas as pd
    
    # 1) Gérer la liste mandatory_cols (par défaut : vide ou ['cid','date'] selon besoin)
    if mandatory_cols is None:
        mandatory_cols = []
    
    # 2) Définir les "familles" de motifs
    family_patterns = {
        'agro': ['_agro_1q', '_agro_4q'],
        'rgro': ['_rgro_1q', '_rgro_4q'],
        'tcgro': ['_tcgro_1q', '_tcgro_4q'],
        'ratios_assets': ['_on_assets_ratio'],
        'ratios_rev': ['_on_rev_ratio'],
        'ratios_totcap': ['_on_tot_cap_ratio']
    }
    
    # 3) Construire la liste des motifs à inclure
    patterns_to_keep = []
    if include_agro:
        patterns_to_keep += family_patterns['agro']
    if include_rgro:
        patterns_to_keep += family_patterns['rgro']
    if include_tcgro:
        patterns_to_keep += family_patterns['tcgro']
    
    if include_ratios_assets:
        patterns_to_keep += family_patterns['ratios_assets']
    if include_ratios_rev:
        patterns_to_keep += family_patterns['ratios_rev']
    if include_ratios_totcap:
        patterns_to_keep += family_patterns['ratios_totcap']
    
    # 4) Retrouver toutes les colonnes du df qui matchent nos motifs
    matched_cols = set()
    for pat in patterns_to_keep:
        for col in df.columns:
            if pat in col:
                matched_cols.add(col)
    # => matched_cols est un set() de colonnes
    
    # 5) Conserver l'ordre original des colonnes matched, 
    #    en filtrant df.columns dans l'ordre d'origine
    matched_cols_in_order = [c for c in df.columns if c in matched_cols]
    
    # 6) Construire l'ordre final :
    #    - d'abord mandatory_cols (dans l'ordre donné),
    #    - puis les matched_cols (dans l'ordre d'origine)
    #    - attention aux colonnes obligatoires qui n'existent pas, 
    #      ou aux duplications
    #    - on fait donc une intersection + un set() pour éviter 
    #      les collisions.
    
    # Intersection pour ne pas inclure des mandatory inexistantes
    mandatory_cols_in_df = [c for c in mandatory_cols if c in df.columns]
    
    # Puis on concatène en évitant toute duplication
    columns_to_keep_ordered = mandatory_cols_in_df + [
        c for c in matched_cols_in_order if c not in mandatory_cols_in_df
    ]
    
    # 7) Extraire le sous-DataFrame
    df_filtered = df[columns_to_keep_ordered].copy()
    
    # 8) (Optionnel) trier par cid/date si elles sont présentes
    if 'cid' in df_filtered.columns and 'date' in df_filtered.columns:
        df_filtered.sort_values(by=['cid', 'date'], inplace=True)
    
    return df_filtered

In [32]:
df_model_final = select_features(
    df_model,
    include_agro=True,          # inclut _agro_1q, _agro_4q
    include_ratios_assets=True, # inclut _on_assets_ratio
    mandatory_cols=['cid', 'date', 'binary_target_net_income']  # je garde la target
)

In [33]:
df_model_final.head()

Unnamed: 0,cid,date,binary_target_net_income,E_G_ammor_intangibles_agro_1q,E_G_asset_writedown_agro_1q,E_G_assets_gro_five_agro_1q,E_G_capex_agro_1q,E_G_cash_acquisitions_agro_1q,E_G_cash_equi_agro_1q,E_G_cash_financing_agro_1q,...,E_G_total_assets_on_assets_ratio,E_G_total_capital_on_assets_ratio,E_G_total_debt_on_assets_ratio,E_G_total_div_on_assets_ratio,E_G_total_expenses_on_assets_ratio,E_G_total_intangibles_on_assets_ratio,E_G_total_liabilities_on_assets_ratio,E_G_total_revenues_on_assets_ratio,E_G_unusual_items_on_assets_ratio,E_G_working_cap_on_assets_ratio
110,MISSING,2002-02-11,0,,,,,,,,...,1.0,0.858209,0.000635,0.0,0.763373,0.572756,0.142426,0.237885,-5.906412,0.195206
126,MISSING,2002-02-14,0,-0.890015,-0.000209,0.0,0.103063,-0.029078,-0.044276,-0.022887,...,1.0,0.507081,0.009273,0.0,1.593679,0.17338,0.502192,0.929858,-0.601081,0.297643
307,MISSING,2002-03-12,0,0.0,0.0,0.0,0.0,0.0,-0.074434,0.036171,...,1.0,0.988729,0.0,-0.096602,0.000989,0.054931,0.011271,0.053153,-0.190438,0.000509
360,MISSING,2002-04-04,0,0.051806,0.0,-0.015582,-0.028148,-0.134068,0.061842,-0.032078,...,1.0,0.736008,0.221261,-0.018629,0.278409,0.607747,0.485253,0.365472,-0.013537,0.142876
365,MISSING,2002-04-05,0,-0.056509,0.0,0.0,0.02099,0.121196,-0.059799,0.036784,...,1.0,0.903683,0.230682,-0.052332,0.296085,0.163999,0.326999,0.360444,-0.182719,-0.024323


In [8]:
df_all = select_features(
    df_model,
    include_agro=True,
    include_rgro=True,
    include_tcgro=True,
    include_ratios_assets=True,
    include_ratios_rev=True,
    include_ratios_totcap=True,
    mandatory_cols=['cid', 'date', 'binary_target_net_income']
)

In [9]:
df_all.head()

Unnamed: 0,cid,date,binary_target_net_income,E_G_ammor_intangibles_agro_1q,E_G_asset_writedown_agro_1q,E_G_assets_gro_five_agro_1q,E_G_capex_agro_1q,E_G_cash_acquisitions_agro_1q,E_G_cash_equi_agro_1q,E_G_cash_financing_agro_1q,...,E_G_total_assets_on_tot_cap_ratio,E_G_total_capital_on_tot_cap_ratio,E_G_total_debt_on_tot_cap_ratio,E_G_total_div_on_tot_cap_ratio,E_G_total_expenses_on_tot_cap_ratio,E_G_total_intangibles_on_tot_cap_ratio,E_G_total_liabilities_on_tot_cap_ratio,E_G_total_revenues_on_tot_cap_ratio,E_G_unusual_items_on_tot_cap_ratio,E_G_working_cap_on_tot_cap_ratio
110,MISSING,2002-02-11,0,,,,,,,,...,1.165218,1.0,0.00074,0.0,0.889496,0.667385,0.165957,0.277188,-6.882254,0.227458
126,MISSING,2002-02-14,0,-0.890015,-0.000209,0.0,0.103063,-0.029078,-0.044276,-0.022887,...,1.972073,1.0,0.018286,0.0,3.142851,0.341918,0.990359,1.833748,-1.185375,0.586973
307,MISSING,2002-03-12,0,0.0,0.0,0.0,0.0,0.0,-0.074434,0.036171,...,1.0114,1.0,0.0,-0.097703,0.001,0.055557,0.0114,0.053759,-0.192609,0.000515
360,MISSING,2002-04-04,0,0.051806,0.0,-0.015582,-0.028148,-0.134068,0.061842,-0.032078,...,1.35868,1.0,0.300623,-0.025311,0.378268,0.825734,0.659304,0.49656,-0.018392,0.194123
365,MISSING,2002-04-05,0,-0.056509,0.0,0.0,0.02099,0.121196,-0.059799,0.036784,...,1.106583,1.0,0.255268,-0.05791,0.327643,0.181478,0.361851,0.398861,-0.202194,-0.026916


In [7]:
# Définir la liste des colonnes "ratios" qu’on veut conserver
# ---------------------------
ratio_keywords = [
    '_on_assets_ratio',
    '_on_rev_ratio',
    '_on_tot_cap_ratio'
]

# On va chercher toutes les colonnes qui contiennent l’un de ces substrings
ratio_cols = [
    col for col in df_model.columns
    if any(rk in col for rk in ratio_keywords)
]

# ---------------------------
# Définir les colonnes minimales qu’on veut absolument
# ---------------------------
mandatory_cols = ['date', 'cid', 'binary_target_net_income']

# ---------------------------
# Construire la liste finale des colonnes à garder
# ---------------------------
columns_to_keep = mandatory_cols + ratio_cols

# ---------------------------
# Créer le df_model_final avec seulement ces colonnes
# ---------------------------
df_model_final = df_model[columns_to_keep].copy()

# ---------------------------
# (Optionnel) trier par (cid, date)
# ---------------------------
df_model_final.sort_values(by=['cid', 'date'], inplace=True)


In [34]:
# Compter le nombre total de NaN dans tout le DataFrame
total_nan = df_model_final.isna().sum().sum()
print(f"Nombre total de valeurs NaN dans df_model_final : {total_nan}")

# Retirer les lignes qui contiennent AU MOINS un NaN
df_model_final.dropna(inplace=True)

# Vérifier à nouveau qu’il n’y a plus de NaN
total_nan_apres = df_model_final.isna().sum().sum()
print(f"Nombre total de valeurs NaN après suppression : {total_nan_apres}")

Nombre total de valeurs NaN dans df_model_final : 443777
Nombre total de valeurs NaN après suppression : 0


In [35]:
# Compter le nombre total de NaN dans tout le DataFrame
total_nan = df_model_final.isna().sum().sum()
print(f"Nombre total de valeurs NaN dans df_model_final : {total_nan}")

# Retirer les lignes qui contiennent AU MOINS un NaN
df_model_final.dropna(inplace=True)

# Vérifier à nouveau qu’il n’y a plus de NaN
total_nan_apres = df_model_final.isna().sum().sum()
print(f"Nombre total de valeurs NaN après suppression : {total_nan_apres}")

Nombre total de valeurs NaN dans df_model_final : 0
Nombre total de valeurs NaN après suppression : 0


In [11]:
df_all.shape

(36154, 579)

In [36]:
df_model_final.shape

(37311, 195)

## DF Test (5 ans seulement), pour tester rapidement la fonciton

In [38]:
# Date la plus récente du DataFrame
max_date = df_model_final['date'].max()

# Date de coupure (5 ans avant)
cutoff_date = max_date - pd.DateOffset(years=5)

# Filtrer pour ne garder que les 5 dernières années
df_test = df_model_final[df_model_final['date'] >= cutoff_date].copy()

print(df_test['date'].min(), df_test['date'].max())
print(df_test.shape)

2019-12-12 00:00:00 2024-12-12 00:00:00
(7552, 195)


In [12]:
# Date la plus récente du DataFrame
max_date = df_all['date'].max()

# Date de coupure (5 ans avant)
cutoff_date = max_date - pd.DateOffset(years=5)

# Filtrer pour ne garder que les 5 dernières années
df_test = df_all[df_all['date'] >= cutoff_date].copy()

print(df_test['date'].min(), df_test['date'].max())
print(df_test.shape)

2018-12-12 00:00:00 2024-12-12 00:00:00
(8888, 579)


In [39]:
# Extraire l'année à partir de la colonne 'date'
df_test['year'] = df_test['date'].dt.year

In [40]:
# Voir les valeurs uniques et leur compte dans la colonne 'cid'
value_counts = df_test['year'].value_counts()
print(value_counts)

year
2021    1552
2020    1539
2022    1537
2023    1499
2024    1416
2019       9
Name: count, dtype: int64


## Fonction Principale avec AutoML et Rolling Window

In [41]:
def pipeline_rolling_windows(data, date_col, target_col, train_years, val_years, test_years, buffer_months=0):
    """
    Pipeline direct pour la rolling window avec AutoML et cross-validation personnalisée.
    Ajoute les périodes dans le DataFrame final pour validation.
    """
    # Conversion de la colonne date
    data[date_col] = pd.to_datetime(data[date_col])
    start_date = data[date_col].min()
    end_date = data[date_col].max()

    predictions_all = []  # Liste pour stocker toutes les prédictions

    while start_date + relativedelta(years=train_years + val_years + test_years) <= end_date:
        # Définir les périodes
        train_end = start_date + relativedelta(years=train_years) - pd.Timedelta(days=1)
        tampon_1_end = train_end + relativedelta(months=buffer_months)
        val_start = tampon_1_end + pd.Timedelta(days=1)
        val_end = val_start + relativedelta(years=val_years) - pd.Timedelta(days=1)
        tampon_2_end = val_end + relativedelta(months=buffer_months)
        test_start = tampon_2_end + pd.Timedelta(days=1)
        test_end = test_start + relativedelta(years=test_years) - pd.Timedelta(days=1)

        # Filtrer les données
        train_data = data.loc[(data[date_col] >= start_date) & (data[date_col] <= train_end)]
        val_data = data.loc[(data[date_col] >= val_start) & (data[date_col] <= val_end)]
        test_data = data.loc[(data[date_col] >= test_start) & (data[date_col] <= test_end)]

        if len(train_data) == 0 or len(val_data) == 0 or len(test_data) == 0:
            print(f"Fenêtre {start_date.year}-{test_end.year} : données insuffisantes, sautée.")
            start_date += relativedelta(years=1)
            continue

        # Configurer et entraîner AutoML
        print(f"Fenêtre {start_date.year}-{test_end.year} : entraînement de AutoML...")
        automl = AutoML(mode="Perform", algorithms=["Xgboost"], eval_metric="auc")
        custom_cv = [(train_data.index, val_data.index)]
        automl.fit(
            train_data.drop(columns=[target_col, date_col, 'cid']),
            train_data[target_col], cv=custom_cv
        )

        # Prédire sur le test set
        test_preds = test_data[[date_col, target_col]].copy()
        test_preds["predicted"] = automl.predict_proba(test_data.drop(columns=[target_col, date_col, 'cid']))
        test_preds["window"] = f"{start_date.year}-{test_end.year}"
        test_preds["cid"] = test_data["cid"].values

        # Ajouter les périodes pour validation
        #test_preds["train_start"] = start_date
        #test_preds["train_end"] = train_end
        #test_preds["tampon_1"] = tampon_1_end
        #test_preds["val_start"] = val_start
        #test_preds["val_end"] = val_end
        #test_preds["tampon_2"] = tampon_2_end
        #test_preds["test_start"] = test_start
        #test_preds["test_end"] = test_end

        # Sauvegarder les prédictions
        predictions_all.append(test_preds)

        # Avancer la fenêtre
        start_date += relativedelta(years=1)

    predictions_df = pd.concat(predictions_all, ignore_index=True)
    return predictions_df

## Appel de fonction

In [44]:
predictions_df = pipeline_rolling_windows(
    data=df_test, 
    date_col="date", 
    target_col="binary_target_net_income", 
    train_years=2, 
    val_years=1, 
    test_years=1, 
    buffer_months=1
)

Fenêtre 2019-2024 : entraînement de AutoML...
AutoML directory: AutoML_1
The task is binary_classification with evaluation metric auc
AutoML will use algorithms: ['Xgboost']
AutoML will ensemble available models
AutoML steps: ['simple_algorithms', 'default_algorithms', 'not_so_random', 'golden_features', 'insert_random_feature', 'features_selection', 'hill_climbing_1', 'hill_climbing_2', 'ensemble']
Skip simple_algorithms because no parameters were generated.
* Step default_algorithms will try to check up to 1 model
1_Default_Xgboost auc 0.870033 trained in 21.81 seconds (1-sample predict time 0.0358 seconds)
* Step not_so_random will try to check up to 4 models
2_Xgboost auc 0.864405 trained in 13.75 seconds (1-sample predict time 0.0363 seconds)
3_Xgboost auc 0.873812 trained in 16.42 seconds (1-sample predict time 0.0385 seconds)
4_Xgboost auc 0.844947 trained in 13.85 seconds (1-sample predict time 0.038 seconds)
5_Xgboost auc 0.805015 trained in 10.41 seconds (1-sample predict tim

ValueError: Expected a 1D array, got an array with shape (1496, 2)

In [14]:
predictions_df.to_csv("df_can_test.csv", index=False)

In [18]:
# Générer des valeurs aléatoires entre -1 et 1
np.random.seed(42)  # Fixer la graine pour la reproductibilité
predictions_df['predicted'] = np.random.uniform(-1, 1, size=len(predictions_df))  # Remplace 'n' par la taille désirée

In [17]:
# Assure que les colonnes 'date' sont au bon format datetime dans les deux DataFrames
predictions_df['date'] = pd.to_datetime(predictions_df['date'])
df_canada['date'] = pd.to_datetime(df_canada['date'])

# Faire la jointure sur 'cid' et 'date'
merged_df = predictions_df.merge(df_canada[['cid', 'date', 'return_1q']], on=['cid', 'date'], how='left')

In [18]:
merged_df

Unnamed: 0,date,binary_target_net_income,predicted,window,cid,train_start,train_end,tampon_1,val_start,val_end,tampon_2,test_start,test_end,return_1q
0,2022-03-04,0,0,2018-2023,MISSING,2018-12-12,2020-12-11,2021-01-11,2021-01-12,2022-01-11,2022-02-11,2022-02-12,2023-02-11,
1,2022-04-28,0,1,2018-2023,MISSING,2018-12-12,2020-12-11,2021-01-11,2021-01-12,2022-01-11,2022-02-11,2022-02-12,2023-02-11,
2,2022-10-27,0,0,2018-2023,MISSING,2018-12-12,2020-12-11,2021-01-11,2021-01-12,2022-01-11,2022-02-11,2022-02-12,2023-02-11,
3,2022-11-03,0,1,2018-2023,MISSING,2018-12-12,2020-12-11,2021-01-11,2021-01-12,2022-01-11,2022-02-11,2022-02-12,2023-02-11,
4,2022-02-24,0,0,2018-2023,SP-001096,2018-12-12,2020-12-11,2021-01-11,2021-01-12,2022-01-11,2022-02-11,2022-02-12,2023-02-11,0.042277
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4245,2024-03-07,0,0,2020-2025,SP-329579,2020-12-12,2022-12-11,2023-01-11,2023-01-12,2024-01-11,2024-02-11,2024-02-12,2025-02-11,-0.345906
4246,2024-05-15,0,0,2020-2025,SP-329579,2020-12-12,2022-12-11,2023-01-11,2023-01-12,2024-01-11,2024-02-11,2024-02-12,2025-02-11,0.194066
4247,2024-08-08,0,0,2020-2025,SP-329579,2020-12-12,2022-12-11,2023-01-11,2023-01-12,2024-01-11,2024-02-11,2024-02-12,2025-02-11,-0.133734
4248,2024-11-13,0,0,2020-2025,SP-329579,2020-12-12,2022-12-11,2023-01-11,2023-01-12,2024-01-11,2024-02-11,2024-02-12,2025-02-11,


In [19]:
# Extraire l'année à partir de la colonne 'date'
merged_df['year'] = merged_df['date'].dt.year

In [20]:
# Voir les valeurs uniques et leur compte dans la colonne 'cid'
value_counts = merged_df['year'].value_counts()
print(value_counts)

year
2023    1457
2022    1407
2024    1386
Name: count, dtype: int64


In [21]:
def create_weighted_portfolios(predictions_df, df_canada, return_col, lower_threshold=0.4, upper_threshold=0.6):
    """
    Crée un portefeuille pondéré basé sur les prédictions et calcule les rendements pondérés par année.
    
    Args:
    - predictions_df (pd.DataFrame): DataFrame contenant les prédictions et les identifiants 'cid'.
    - df_canada (pd.DataFrame): DataFrame contenant les rendements futurs et les identifiants 'cid'.
    - return_col (str): Nom de la colonne des rendements futurs dans df_canada.
    - lower_threshold (float): Seuil inférieur pour les positions courtes.
    - upper_threshold (float): Seuil supérieur pour les positions longues.
    
    Returns:
    - result_df (pd.DataFrame): DataFrame contenant les rendements pondérés des portefeuilles par année.
    """
    # Joindre les deux DataFrames sur 'cid' et 'date'
    merged_df = predictions_df.merge(df_canada[['cid', 'date', return_col]], on=['cid', 'date'], how='left')
    
    # Extraire l'année à partir de la colonne 'date'
    merged_df['year'] = merged_df['date'].dt.year
    
    # Retirer les lignes avec des valeurs manquantes
    merged_df.dropna(inplace=True)
    
    # Initialiser une liste pour stocker les résultats
    results = []

    # Grouper par année
    for year, group in merged_df.groupby('year'):
        # Sélectionner les actions longues et courtes selon les seuils
        selected = group[(group['predicted'] > upper_threshold) | (group['predicted'] < lower_threshold)]
        
        if len(selected) > 0:
            # Attribuer des poids égaux à chaque action sélectionnée
            selected['weight'] = 1 / len(selected)
            
            # Calculer le rendement pondéré du portefeuille
            weighted_return = (selected['weight'] * selected[return_col]).sum()
            
            # Ajouter le résultat à la liste
            results.append({'year': year, 'weighted_return': weighted_return})
        else:
            # Si aucune action ne respecte les seuils, le rendement est NaN
            results.append({'year': year, 'weighted_return': float('nan')})

    # Convertir les résultats en DataFrame
    result_df = pd.DataFrame(results)
    
    return result_df

In [22]:
test_returns = create_weighted_portfolios(predictions_df, df_canada, 'return_1q', lower_threshold=0.5, upper_threshold=0.5)

In [23]:
test_returns

Unnamed: 0,year,weighted_return
0,2022,-0.020318
1,2023,0.023368
2,2024,0.040647


In [24]:
merged_df.dropna(inplace=True)

result_1 = []

# Grouper par année
for year, group in merged_df.groupby('year'):
    # Sélectionner les actions longues et courtes selon les seuils
    selected = group[(group['predicted'] > 0.6) | (group['predicted'] < 0.4)]
    
    if len(selected) > 0:
        # Attribuer des poids égaux à chaque action sélectionnée
        selected['weight'] = 1 / len(selected)
        
        # Calculer le rendement pondéré du portefeuille
        weighted_return = (selected['weight'] * selected['return_1q']).sum()
        
        # Ajouter le résultat à la liste
        result_1.append({'year': year, 'weighted_return': weighted_return})
    else:
        # Si aucune action ne respecte les seuils, le rendement est NaN
        result_1.append({'year': year, 'weighted_return': float('nan')})

# Convertir les résultats en DataFrame
result_df1 = pd.DataFrame(result_1)

In [25]:
result_df1

Unnamed: 0,year,weighted_return
0,2022,-0.020318
1,2023,0.023368
2,2024,0.040647


In [26]:
selected

Unnamed: 0,date,binary_target_net_income,predicted,window,cid,train_start,train_end,tampon_1,val_start,val_end,tampon_2,test_start,test_end,return_1q,year,weight
1488,2024-01-15,0,1,2019-2024,SP-001096,2019-12-12,2021-12-11,2022-01-11,2022-01-12,2023-01-11,2023-02-11,2023-02-12,2024-02-11,0.115650,2024,0.000966
1501,2024-02-06,0,1,2019-2024,SP-001263,2019-12-12,2021-12-11,2022-01-11,2022-01-12,2023-01-11,2023-02-11,2023-02-12,2024-02-11,-0.169237,2024,0.000966
1514,2024-02-08,0,0,2019-2024,SP-002137,2019-12-12,2021-12-11,2022-01-11,2022-01-12,2023-01-11,2023-02-11,2023-02-12,2024-02-11,-0.128202,2024,0.000966
1518,2024-02-09,0,1,2019-2024,SP-002408,2019-12-12,2021-12-11,2022-01-11,2022-01-12,2023-01-11,2023-02-11,2023-02-12,2024-02-11,-0.110419,2024,0.000966
1530,2024-01-23,0,0,2019-2024,SP-002696,2019-12-12,2021-12-11,2022-01-11,2022-01-12,2023-01-11,2023-02-11,2023-02-12,2024-02-11,0.048465,2024,0.000966
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4242,2024-03-14,0,0,2020-2025,SP-278234,2020-12-12,2022-12-11,2023-01-11,2023-01-12,2024-01-11,2024-02-11,2024-02-12,2025-02-11,-0.092091,2024,0.000966
4243,2024-09-11,0,0,2020-2025,SP-278234,2020-12-12,2022-12-11,2023-01-11,2023-01-12,2024-01-11,2024-02-11,2024-02-12,2025-02-11,-0.066010,2024,0.000966
4245,2024-03-07,0,0,2020-2025,SP-329579,2020-12-12,2022-12-11,2023-01-11,2023-01-12,2024-01-11,2024-02-11,2024-02-12,2025-02-11,-0.345906,2024,0.000966
4246,2024-05-15,0,0,2020-2025,SP-329579,2020-12-12,2022-12-11,2023-01-11,2023-01-12,2024-01-11,2024-02-11,2024-02-12,2025-02-11,0.194066,2024,0.000966
