In [11]:
import pandas as pd
import numpy as np
from supervised.automl import AutoML
from dateutil.relativedelta import relativedelta

In [12]:
df_canada = pd.read_csv('canada_updated.csv')
df_canada.head()

Unnamed: 0,date,QUALITY_FLAG,cid,industry_raw,E_TTM_period_date,E_TTM_ammor_intangibles,E_TTM_asset_writedown,E_TTM_assets_gro_five,E_TTM_capex,E_TTM_cash_acquisitions,...,E_G_ebitda_cov,E_G_ret_on_asset,E_G_ret_on_inv_cap,E_G_net_to_cash,E_G_perm_assets_ratio,return_1q,target_net_income,target_cash_operations,binary_target_net_income,binary_target_cash_operations
0,2002-01-03,True,SP-065996,,2001-10-31,0.0,0.0,0.0,-12.738,-3.336,...,-165.453488,0.130018,0.101871,-0.068216,0.41423,,,,0,0
1,2002-01-08,True,SP-002396,,2001-09-30,3.078,0.0,0.0,-20.889,-68.22,...,-2.685925,0.071119,0.06743,-0.004881,0.595752,,,,0,0
2,2002-01-08,True,SP-006704,,2001-09-30,0.0,0.0,0.0,-17.971623,0.0,...,2.26246,-0.069781,-0.039238,-0.045993,0.775432,,,,0,0
3,2002-01-08,True,SP-008644,,2001-09-30,0.0,0.0,0.0,-34.7,0.0,...,-4.852273,-0.169833,-0.155712,-0.316372,0.773996,,,,0,0
4,2002-01-08,True,SP-013994,,2001-09-30,0.0,0.0,0.0,-1403.0,-133.0,...,-14.569697,0.109798,0.078497,-0.157934,0.921832,,,,0,0


In [13]:
df_model = df_canada.copy()

df_model['date'] = pd.to_datetime(df_model['date'], errors='coerce')
df_model.sort_values(by=['cid', 'date'], inplace=True)

# Retirer les lignes où Quality_Flag est False
df_model = df_model[df_model['QUALITY_FLAG'] == True]

# (FACULTATIF) Exclure les banques
# df_model = df_model[df_model['industry'] != 'Banks']

In [14]:
def select_features(
    df,
    include_agro=False,
    include_rgro=False,
    include_tcgro=False,
    include_ratios_assets=False,
    include_ratios_rev=False,
    include_ratios_totcap=False,
    mandatory_cols=None
):
    """
    Sélectionne dynamiquement les colonnes d'un DataFrame en fonction
    des familles de variables explicatives demandées,
    en plaçant d'abord les colonnes obligatoires (mandatory_cols).
    """
    
    # 1) Gérer la liste mandatory_cols (par défaut : vide ou ['cid','date'] selon besoin)
    if mandatory_cols is None:
        mandatory_cols = []
    
    # 2) Définir les "familles" de motifs
    family_patterns = {
        'agro': ['_agro_1q', '_agro_4q'],
        'rgro': ['_rgro_1q', '_rgro_4q'],
        'tcgro': ['_tcgro_1q', '_tcgro_4q'],
        'ratios_assets': ['_on_assets_ratio'],
        'ratios_rev': ['_on_rev_ratio'],
        'ratios_totcap': ['_on_tot_cap_ratio']
    }
    
    # 3) Construire la liste des motifs à inclure
    patterns_to_keep = []
    if include_agro:
        patterns_to_keep += family_patterns['agro']
    if include_rgro:
        patterns_to_keep += family_patterns['rgro']
    if include_tcgro:
        patterns_to_keep += family_patterns['tcgro']
    
    if include_ratios_assets:
        patterns_to_keep += family_patterns['ratios_assets']
    if include_ratios_rev:
        patterns_to_keep += family_patterns['ratios_rev']
    if include_ratios_totcap:
        patterns_to_keep += family_patterns['ratios_totcap']
    
    # 4) Retrouver toutes les colonnes du df qui matchent nos motifs
    matched_cols = set()
    for pat in patterns_to_keep:
        for col in df.columns:
            if pat in col:
                matched_cols.add(col)
    # => matched_cols est un set() de colonnes
    
    # 5) Conserver l'ordre original des colonnes matched, 
    #    en filtrant df.columns dans l'ordre d'origine
    matched_cols_in_order = [c for c in df.columns if c in matched_cols]
    
    # 6) Construire l'ordre final :
    #    - d'abord mandatory_cols (dans l'ordre donné),
    #    - puis les matched_cols (dans l'ordre d'origine)
    #    - attention aux colonnes obligatoires qui n'existent pas, 
    #      ou aux duplications
    #    - on fait donc une intersection + un set() pour éviter 
    #      les collisions.
    
    # Intersection pour ne pas inclure des mandatory inexistantes
    mandatory_cols_in_df = [c for c in mandatory_cols if c in df.columns]
    
    # Puis on concatène en évitant toute duplication
    columns_to_keep_ordered = mandatory_cols_in_df + [
        c for c in matched_cols_in_order if c not in mandatory_cols_in_df
    ]
    
    # 7) Extraire le sous-DataFrame
    df_filtered = df[columns_to_keep_ordered].copy()
    
    # 8) (Optionnel) trier par cid/date si elles sont présentes
    if 'cid' in df_filtered.columns and 'date' in df_filtered.columns:
        df_filtered.sort_values(by=['cid', 'date'], inplace=True)
    
    return df_filtered

In [15]:
df_model_final = select_features(
    df_model,
    include_agro=True,
    include_rgro=True,
    include_tcgro=True,
    include_ratios_assets=True,
    include_ratios_rev=True,
    include_ratios_totcap=True,
    mandatory_cols=['cid', 'date', 'binary_target_net_income']  # je garde la target
)

# Compter le nombre total de NaN dans tout le DataFrame
total_nan = df_model_final.isna().sum().sum()
print(f"Nombre total de valeurs NaN dans df_model_final : {total_nan}")

# Retirer les lignes qui contiennent AU MOINS un NaN
df_model_final.dropna(inplace=True)

# Vérifier à nouveau qu’il n’y a plus de NaN
total_nan_apres = df_model_final.isna().sum().sum()
print(f"Nombre total de valeurs NaN après suppression : {total_nan_apres}")

print(df_model_final.shape)

Nombre total de valeurs NaN dans df_model_final : 1360861
Nombre total de valeurs NaN après suppression : 0
(36154, 579)


In [16]:
# Date la plus récente du DataFrame
#max_date = df_model_final['date'].max()

# Date de coupure (5 ans avant)
#cutoff_date = max_date - pd.DateOffset(years=5)

# Filtrer pour ne garder que les 5 dernières années
#df_test = df_model_final[df_model_final['date'] >= cutoff_date].copy()

#print(df_test['date'].min(), df_test['date'].max())
#print(df_test.shape)

In [19]:
def pipeline_rolling_windows(data, date_col, target_col, train_years, val_years, test_years, buffer_months=0):
    """
    Pipeline direct pour la rolling window avec AutoML et cross-validation personnalisée.
    Ajoute les périodes dans le DataFrame final pour validation.
    """
    # Conversion de la colonne date
    data[date_col] = pd.to_datetime(data[date_col])
    start_date = data[date_col].min()
    end_date = data[date_col].max()

    predictions_all = []  # Liste pour stocker toutes les prédictions

    while start_date + relativedelta(years=train_years + val_years + test_years) <= end_date:
        # Définir les périodes
        train_end = start_date + relativedelta(years=train_years) - pd.Timedelta(days=1)
        tampon_1_end = train_end + relativedelta(months=buffer_months)
        val_start = tampon_1_end + pd.Timedelta(days=1)
        val_end = val_start + relativedelta(years=val_years) - pd.Timedelta(days=1)
        tampon_2_end = val_end + relativedelta(months=buffer_months)
        test_start = tampon_2_end + pd.Timedelta(days=1)
        test_end = test_start + relativedelta(years=test_years) - pd.Timedelta(days=1)

        # Filtrer les données
        train_data = data.loc[(data[date_col] >= start_date) & (data[date_col] <= train_end)]
        val_data = data.loc[(data[date_col] >= val_start) & (data[date_col] <= val_end)]
        test_data = data.loc[(data[date_col] >= test_start) & (data[date_col] <= test_end)]

        if len(train_data) == 0 or len(val_data) == 0 or len(test_data) == 0:
            print(f"Fenêtre {start_date.year}-{test_end.year} : données insuffisantes, sautée.")
            start_date += relativedelta(years=1)
            continue

        # Configurer et entraîner AutoML
        print(f"Fenêtre {start_date.year}-{test_end.year} : entraînement de AutoML...")
        automl = AutoML(results_path=f"AutoML_{start_date.year}-{test_end.year}", mode="Perform", algorithms=["Xgboost"], eval_metric="auc")
        custom_cv = [(train_data.index, val_data.index)]
        automl.fit(
            train_data.drop(columns=[target_col, date_col, 'cid']),
            train_data[target_col], cv=custom_cv
        )

        # Prédire sur le test set
        test_preds = test_data[[date_col, target_col]].copy()
        #test_preds["predicted"] = automl.predict_proba(test_data.drop(columns=[target_col, date_col, 'cid']))
        proba = automl.predict_proba(test_data.drop(columns=[target_col, date_col, 'cid']))
        test_preds["proba_class_0"] = proba[:, 0]  # Probabilité pour la classe 0 (diminution des bénéfices)
        test_preds["proba_class_1"] = proba[:, 1]  # Probabilité pour la classe 1 (augmentation des bénéfices)
        test_preds["margin_proba"] = test_preds["proba_class_1"] - test_preds["proba_class_0"]
        test_preds["window"] = f"{start_date.year}-{test_end.year}"
        test_preds["cid"] = test_data["cid"].values

        # Ajouter les périodes pour validation
        #test_preds["train_start"] = start_date
        #test_preds["train_end"] = train_end
        #test_preds["tampon_1"] = tampon_1_end
        #test_preds["val_start"] = val_start
        #test_preds["val_end"] = val_end
        #test_preds["tampon_2"] = tampon_2_end
        #test_preds["test_start"] = test_start
        #test_preds["test_end"] = test_end

        # Sauvegarder les prédictions
        predictions_all.append(test_preds)

        # Avancer la fenêtre
        start_date += relativedelta(years=1)

    predictions_df = pd.concat(predictions_all, ignore_index=True)
    return predictions_df

In [20]:
predictions_df = pipeline_rolling_windows(
    data=df_model_final, 
    date_col="date", 
    target_col="binary_target_net_income", 
    train_years=2, 
    val_years=1, 
    test_years=1, 
    buffer_months=1
)

Fenêtre 2002-2006 : entraînement de AutoML...
AutoML directory: AutoML_2002-2006
The task is binary_classification with evaluation metric auc
AutoML will use algorithms: ['Xgboost']
AutoML will ensemble available models
AutoML steps: ['simple_algorithms', 'default_algorithms', 'not_so_random', 'golden_features', 'insert_random_feature', 'features_selection', 'hill_climbing_1', 'hill_climbing_2', 'ensemble']
Skip simple_algorithms because no parameters were generated.
* Step default_algorithms will try to check up to 1 model
1_Default_Xgboost auc 0.876088 trained in 24.78 seconds (1-sample predict time 0.0965 seconds)
* Step not_so_random will try to check up to 4 models
2_Xgboost auc 0.873305 trained in 19.79 seconds (1-sample predict time 0.0963 seconds)
3_Xgboost auc 0.870281 trained in 21.21 seconds (1-sample predict time 0.0917 seconds)
4_Xgboost auc 0.857249 trained in 16.13 seconds (1-sample predict time 0.1141 seconds)
5_Xgboost auc 0.778136 trained in 10.52 seconds (1-sample pr



None 56
Add Golden Feature: E_G_net_income_inc_on_rev_ratio_sum_E_G_ebit_rgro_1q
Add Golden Feature: E_G_net_income_inc_on_rev_ratio_sum_E_G_cost_borrowing_tcgro_4q
Add Golden Feature: E_G_net_income_inc_on_rev_ratio_sum_E_G_net_inc_gro_five_on_rev_ratio
Add Golden Feature: E_G_net_income_inc_on_assets_ratio_sum_E_G_net_income_inc_agro_4q
Add Golden Feature: E_G_net_inc_gro_five_on_rev_ratio_sum_E_G_net_income_inc_rgro_4q
Add Golden Feature: E_G_net_income_inc_on_rev_ratio_sum_E_G_operating_income_rgro_1q
Add Golden Feature: E_G_gross_profit_on_rev_ratio_multiply_E_G_net_income_inc_on_assets_ratio
Add Golden Feature: E_G_net_income_inc_on_rev_ratio_multiply_E_G_gross_profit_on_assets_ratio
Add Golden Feature: E_G_asset_writedown_on_assets_ratio_sum_E_G_net_income_inc_rgro_4q
Add Golden Feature: E_G_unusual_items_on_tot_cap_ratio_sum_E_G_net_income_inc_on_assets_ratio
Add Golden Feature: E_G_net_income_inc_on_rev_ratio_sum_E_G_depre_amor_rgro_4q
Add Golden Feature: E_G_net_income_inc_on

In [11]:
predictions_df.to_csv("df_can_test.csv", index=False)

In [21]:
# Assure que les colonnes 'date' sont au bon format datetime dans les deux DataFrames
predictions_df['date'] = pd.to_datetime(predictions_df['date'])
df_canada['date'] = pd.to_datetime(df_canada['date'])

# Faire la jointure sur 'cid' et 'date'
merged_df = predictions_df.merge(df_canada[['cid', 'date', 'return_1q']], on=['cid', 'date'], how='left')

In [29]:
def create_weighted_portfolios(predictions_df, df_canada, proba_col, return_col, lower_threshold=0.4, upper_threshold=0.6):
    """
    Crée un portefeuille pondéré basé sur les prédictions et calcule les rendements pondérés par année,
    en attribuant des poids positifs aux positions longues et négatifs aux positions courtes, avec une somme neutre.
    
    Args:
    - predictions_df (pd.DataFrame): DataFrame contenant les prédictions et les identifiants 'cid'.
    - df_canada (pd.DataFrame): DataFrame contenant les rendements futurs et les identifiants 'cid'.
    - return_col (str): Nom de la colonne des rendements futurs dans df_canada.
    - lower_threshold (float): Seuil inférieur pour les positions courtes.
    - upper_threshold (float): Seuil supérieur pour les positions longues.
    
    Returns:
    - result_df (pd.DataFrame): DataFrame contenant les rendements pondérés des portefeuilles par année.
    """
    # Joindre les deux DataFrames sur 'cid' et 'date'
    merged_df = predictions_df.merge(df_canada[['cid', 'date', return_col]], on=['cid', 'date'], how='left')
    
    # Extraire l'année à partir de la colonne 'date'
    merged_df['year'] = merged_df['date'].dt.year
    
    # Retirer les lignes avec des valeurs manquantes
    merged_df.dropna(inplace=True)
    
    # Initialiser une liste pour stocker les résultats
    results = []

    # Grouper par année
    for year, group in merged_df.groupby('year'):
        # Sélectionner les positions longues et courtes selon les seuils
        selected_long = group[group[proba_col] > upper_threshold]  # Positions longues
        selected_short = group[group[proba_col] < lower_threshold]  # Positions courtes
        
        n_long = len(selected_long)
        n_short = len(selected_short)
        
        if n_long > 0 or n_short > 0:
            # Attribuer des poids aux positions longues et courtes
            if n_long > 0:
                selected_long.loc[:, 'weight'] = 1 / n_long  # Poids positifs pour les positions longues
            if n_short > 0:
                selected_short.loc[:, 'weight'] = -1 / n_short  # Poids négatifs pour les positions courtes
            
            # Combiner les deux DataFrames
            selected = pd.concat([selected_long, selected_short], ignore_index=True)
            
            # Calculer le rendement pondéré du portefeuille (somme des rendements pondérés)
            weighted_return = (selected['weight'] * selected[return_col] * 100).sum()
            
            # Ajouter le résultat à la liste
            results.append({'year': year, 'weighted_return': weighted_return})
        else:
            # Si aucune action ne respecte les seuils, le rendement est NaN
            results.append({'year': year, 'weighted_return': float('nan')})

    # Convertir les résultats en DataFrame
    result_df = pd.DataFrame(results)
    
    return result_df

In [33]:
test_returns_margin_proba = create_weighted_portfolios(predictions_df, df_canada, 'return_1q', lower_threshold=0.5, upper_threshold=0.5)
test_returns_margin_proba

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user

Unnamed: 0,year,weighted_return
0,2005,-2.474524
1,2006,-1.688775
2,2007,0.169548
3,2008,5.246891
4,2009,17.606542
5,2010,-4.897388
6,2011,-8.216942
7,2012,1.692117
8,2013,-7.424846
9,2014,1.90979


In [23]:
test_returns = create_weighted_portfolios(predictions_df, df_canada, 'return_1q', lower_threshold=0.4, upper_threshold=0.6)
test_returns

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user

Unnamed: 0,year,weighted_return
0,2005,-0.017009
1,2006,0.001678
2,2007,-0.019855
3,2008,0.087988
4,2009,0.157221
5,2010,-0.038352
6,2011,-0.076744
7,2012,0.00997
8,2013,-0.109282
9,2014,0.020942


In [24]:
test_returns["weighted_return"] = test_returns["weighted_return"] * 100

In [25]:
test_returns

Unnamed: 0,year,weighted_return
0,2005,-1.700894
1,2006,0.167756
2,2007,-1.985497
3,2008,8.798805
4,2009,15.722116
5,2010,-3.835199
6,2011,-7.67441
7,2012,0.996973
8,2013,-10.928237
9,2014,2.094222


In [26]:
test_returns.to_csv("test_returns_net_income_full_year_full_features.csv", index=False)

## Test fonction rendement

In [27]:
# Extraire l'année à partir de la colonne 'date'
merged_df['year'] = merged_df['date'].dt.year

In [28]:
# Voir les valeurs uniques et leur compte dans la colonne 'cid'
value_counts = merged_df['year'].value_counts()
print(value_counts)

year
2023    1396
2024    1056
Name: count, dtype: int64


In [77]:
selected_by_year = {}  # Dictionnaire pour stocker les DataFrames selected par année

for year, group in merged_df.groupby('year'):
    print(f"Année {year} : {group.shape[0]} lignes avant sélection")
    
    # Appliquer la formule de sélection
    selected_long = group[group['proba_class_1'] > 0.6]  # Positions longues
    selected_short = group[group['proba_class_1'] < 0.4]  # Positions courtes
    
    # Combiner les positions longues et courtes
    selected = pd.concat([selected_long, selected_short], ignore_index=True)
    
    print(f"Année {year} : {selected.shape[0]} lignes après sélection")
    
    if len(selected) > 0:
        # Calcul des poids pour les positions longues
        n_long = len(selected_long)
        n_short = len(selected_short)
        
        if n_long > 0:
            selected.loc[selected['proba_class_1'] > 0.6, 'weight'] = 1 / n_long
        if n_short > 0:
            selected.loc[selected['proba_class_1'] < 0.4, 'weight'] = -1 / n_short
        
        # Calculer le rendement pondéré pour cette année (pondérer return_1q par les poids)
        selected['weighted_return'] = selected['weight'] * selected['return_1q']
        
        # Vérifier la somme des poids (elle doit être proche de 0)
        total_weight = selected['weight'].sum()
        selected['weight_check'] = total_weight
        selected['weight_valid'] = abs(total_weight) < 1e-6  # Tolérance d'erreur
        
    else:
        # Si aucune action sélectionnée, ajouter les colonnes avec NaN
        selected['weight'] = np.nan
        selected['return_1q'] = np.nan
        selected['weighted_return'] = np.nan
        selected['weight_check'] = np.nan
        selected['weight_valid'] = False
    
    # Stocker le DataFrame dans le dictionnaire
    selected_by_year[year] = selected.copy()

# Exporter chaque DataFrame selected avec les vérifications dans des fichiers CSV
for year, df in selected_by_year.items():
    df.to_csv(f"selected_{year}.csv", index=False)

Année 2023 : 1396 lignes avant sélection
Année 2023 : 1197 lignes après sélection
Année 2024 : 1056 lignes avant sélection
Année 2024 : 878 lignes après sélection
