In [1]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
import pandas as pd

from sklearn.model_selection import cross_val_score
from tqdm import tqdm

import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
import numpy as np

# Machine Learning 
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, roc_curve, auc
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Models
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
#from lightgbm import LGBMClassifier
#from catboost import CatBoostClassifier
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier

import matplotlib.pyplot as plt
import os

# Directory holding the challenge CSV files. Set the CFM_DATA_DIR environment
# variable to run the notebook on another machine; the original location is
# kept as the default. os.path.join(..., "") guarantees a trailing separator,
# because the rest of the notebook builds file names with `path + "name"`.
path = os.path.join(os.environ.get("CFM_DATA_DIR", "/Users/julesmourgues/Documents/Programmation/CFM/"), "")

# Load the train/test inputs and the training labels
input_test = pd.read_csv(path + "input_test.csv")
input_training = pd.read_csv(path + "input_training.csv")
#output_test_random = pd.read_csv(path + "output_test_random.csv")
output_training = pd.read_csv(path + "output_training_gmEd6Zt.csv")


from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
from scipy.spatial.distance import squareform

def create_and_assign_clusters(df, num_clusters, column):
    """Cluster equities hierarchically and tag every row with its equity's cluster.

    A per-row statistic is computed over the 53 columns at positions 3..55
    (their sum for "returns", their standard deviation for "volatility").
    The statistic is pivoted into a day x equity table, equities are compared
    through the Pearson-correlation distance sqrt(0.5 * (1 - corr)), and the
    condensed distance matrix is fed to Ward linkage.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'day' and 'equity' columns and the 53 value columns at
        positions 3..55. The frame is not modified.
    num_clusters : int
        Maximum number of clusters (fcluster 'maxclust' criterion).
    column : str
        "returns" or "volatility"; also names the output column.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with a float column 'cluster_returns' or
        'cluster_volatility' appended (labels start at 1).

    Raises
    ------
    ValueError
        If ``column`` is neither "returns" nor "volatility".
    """
    window = df.iloc[:, 3:3 + 53]  # the 53 value columns
    if column == "returns":
        func_values = window.sum(axis=1)
    elif column == "volatility":
        func_values = window.std(axis=1)
    else:
        raise ValueError(f'column must be "returns" or "volatility", got {column!r}')

    # day x equity table of the statistic; missing (day, equity) pairs become 0
    long_format = pd.DataFrame({'day': df['day'], 'equity': df['equity'], 'func': func_values})
    func = long_format.pivot_table(index='day', columns='equity', values='func')
    func.columns.name = None
    func = func.fillna(0)

    # Pearson correlation converted to a distance
    distance_matrix = np.sqrt(0.5 * (1 - func.corr(method='pearson')))

    # Undefined correlations (e.g. constant series) give NaN distances; treat them as 0
    condensed = np.nan_to_num(squareform(distance_matrix, checks=False), nan=0, posinf=1e9, neginf=-1e9)
    linked = linkage(condensed, method='ward')
    cluster_labels = fcluster(linked, num_clusters, criterion='maxclust')

    # Map each equity id (the pivot's column labels) to its cluster. Matching on
    # the equity id itself, rather than on its position in the pivot, keeps the
    # assignment correct for any id numbering, contiguous or not.
    equity_to_cluster = dict(zip(func.columns, cluster_labels))

    result = df.copy()
    result[f'cluster_{column}'] = result['equity'].map(equity_to_cluster).astype(float)
    return result


def preprocess_data(input_training, input_test, outlier=0.05, groupby=False, IQRoutlier = True, variables = False, na_count = False, clusters = False, cluster_type = "returns", suppr = False, nbforlast = 5, output_training = None):
    """Clean the train/test inputs, optionally add features, and attach the labels.

    Both frames go through the same steps; the inputs are copied, never
    modified. The columns at positions 3..55 are treated as the 53 return
    columns, and 'ID', 'day' and 'equity' are accessed by name.

    Parameters
    ----------
    input_training, input_test : pandas.DataFrame
        Raw inputs.
    outlier : float or False
        Quantile level q used for outlier handling, or False to skip it. With
        R = Q(1-q) - Q(q), values below Q(q) - 1.5*R or above Q(1-q) + 1.5*R
        are outliers. Thresholds are computed separately on each frame.
    groupby : int or False
        False keeps the 5-minute columns. Otherwise they are renamed to clock
        labels starting at 09:35 and summed into buckets of ``groupby`` minutes.
    IQRoutlier : bool
        If not False, outliers are capped at the thresholds; otherwise they
        are replaced by the column median.
    variables : str or False
        "volatility" adds 'volatility', 'returns' and 'last_returns';
        "market+volatility" additionally adds group means and ratios ("beta")
        per day, per equity and, when clustering is on, per (cluster, day).
    na_count : bool
        If not False, adds an 'na_count' column with the number of missing
        values per row (counted before any filling).
    clusters : int or False
        Number of equity clusters to build, or False for no clustering.
    cluster_type : str
        "returns" or "volatility": statistic the clustering is based on.
    suppr : bool
        If True, drops the columns at positions 3..55 at the end.
    nbforlast : int
        Width of the trailing window used for 'last_returns'.
    output_training : pandas.DataFrame, optional
        Labels merged into the training frame on 'ID'. When omitted, the
        module-level ``output_training`` is used (previous behaviour).

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        The processed training frame (with labels merged) and test frame.
    """
    from datetime import datetime, timedelta

    if output_training is None:
        # Backward compatibility: earlier versions always read the notebook global.
        output_training = globals()['output_training']

    def clean(raw):
        """Copy, count NaNs (optional), neutralise inf/NaN, sort by equity then day."""
        df = raw.copy()
        if na_count is not False:
            df['na_count'] = df.isnull().sum(axis=1)
        # inf values are replaced by 60 regardless of na_count: they would
        # otherwise leak into the quantiles and the model input.
        df = df.replace([np.inf, -np.inf], 60)
        df = df.fillna(0)
        return df.sort_values(by=['equity', 'day'])

    def handle_outliers(df):
        """Cap or median-replace outliers in each return column, in place."""
        for col in df.columns[3:3 + 53]:
            low_q = df[col].quantile(outlier)
            high_q = df[col].quantile(1 - outlier)
            spread = high_q - low_q
            lower = low_q - 1.5 * spread
            upper = high_q + 1.5 * spread
            if IQRoutlier is not False:
                df.loc[df[col] < lower, col] = lower
                df.loc[df[col] > upper, col] = upper
            else:
                median = df[col].median()
                df.loc[(df[col] < lower) | (df[col] > upper), col] = median

    def add_row_statistics(df):
        """Add per-row 'volatility', 'returns' and 'last_returns', in place."""
        df['volatility'] = df.iloc[:, 3:3 + 53].std(axis=1)
        df['returns'] = df.iloc[:, 3:3 + 53].sum(axis=1)
        # NOTE(review): this keeps only the LAST column of the trailing window,
        # so nbforlast does not change the result. Presumably an aggregate over
        # the window was intended; confirm before changing, as it alters the
        # model features.
        df['last_returns'] = df.iloc[:, 3 + 53 - nbforlast:3 + 53].iloc[:, -1]

    def add_market_mean(df, group_cols, value_col, beta=True):
        """Add the group mean of value_col and, optionally, value/mean, in place."""
        keys = [group_cols] if isinstance(group_cols, str) else list(group_cols)
        suffix = "_".join(keys)
        mean_name = f'market_mean_{value_col}_{suffix}'
        df[mean_name] = df.groupby(keys)[value_col].transform('mean')
        if beta:
            df[f'{suffix}_{value_col}_beta'] = df[value_col] / df[mean_name]

    df_train = clean(input_training)
    df_test = clean(input_test)

    if outlier is not False:
        handle_outliers(df_train)
        handle_outliers(df_test)

    clustered = clusters is not False and cluster_type in ("returns", "volatility")
    if clustered:
        df_train = create_and_assign_clusters(df_train, clusters, cluster_type)
        df_test = create_and_assign_clusters(df_test, clusters, cluster_type)

    if variables in ("volatility", "market+volatility"):
        add_row_statistics(df_train)
        add_row_statistics(df_test)

    if variables == "market+volatility":
        # (grouping key(s), value column); the order fixes the feature column order
        specs = [('day', 'volatility'), ('equity', 'volatility'), ('day', 'last_returns')]
        if clustered:
            specs.append(([f'cluster_{cluster_type}', 'day'], 'volatility'))
        for group_cols, value_col in specs:
            add_market_mean(df_train, group_cols, value_col)
            add_market_mean(df_test, group_cols, value_col)

    if groupby is False:
        print("No groupby")
    else:
        # Clock label of each 5-minute return column: 09:35, 09:40, ...
        start = datetime(2000, 1, 1, 9, 35)
        stamps = [start + timedelta(minutes=5 * i) for i in range(53)]
        new_labels = [stamp.strftime('%H:%M') for stamp in stamps]
        col_rename_dict = {f'r{i}': new_labels[i] for i in range(53)}
        # Clock label -> bucket label (hour plus the fraction of the hour at which the bucket starts)
        interval_dict = {
            label: "{:.2f}".format(stamp.hour + (stamp.minute // groupby) / (60 / groupby))
            for label, stamp in zip(new_labels, stamps)
        }

        def aggregate_intervals(df):
            """Replace the 5-minute columns by their per-bucket sums."""
            df = df.rename(columns=col_rename_dict)
            labelled = [c for c in df.columns if c in interval_dict]
            # Group the labelled columns by bucket; done on the transpose
            # because DataFrame.groupby(axis=1) is deprecated.
            grouped = df[labelled].T.groupby(interval_dict).sum().T
            grouped['ID'] = df['ID']
            df = df.merge(grouped, on='ID', how='left')
            return df.drop(df.columns[3:3 + 53], axis=1)

        df_train = aggregate_intervals(df_train)
        df_test = aggregate_intervals(df_test)

    if suppr is True:
        df_train = df_train.drop(df_train.columns[3:3 + 53], axis=1)
        df_test = df_test.drop(df_test.columns[3:3 + 53], axis=1)

    df_train = df_train.merge(output_training, on='ID', how='left')

    return df_train, df_test

Tentative de Voting Classifier : le modèle 1 utilise des données basées sur les clusters de rendements, et le modèle 2 des données basées sur les clusters de volatilité

Modèle 1 : Preprocess et modèle

In [2]:
# Model 1: XGBoost on features built from return-based equity clusters.
cluster_type = "returns"

df_train, df_test = preprocess_data(input_training, input_test, outlier=0.05, groupby=False, IQRoutlier = False, variables="market+volatility", na_count=True, clusters = 12, cluster_type = cluster_type)

# Index both frames by sample ID
df_train = df_train.set_index('ID')
df_test = df_test.set_index('ID')

# Shift the target by +1 so that class indices start at 0 (0, 1, 2)
df_train['reod'] = df_train['reod'] + 1

# Chronological hold-out: days <= 350 for training, days >= 351 for validation
train = df_train[df_train['day'] <= 350]
test = df_train[df_train['day'] >= 351]

# Helper columns that are not used as model features
suppr = ['day', 'equity', 'volatility', f'cluster_{cluster_type}', 'returns']

df_test = df_test.drop(suppr, axis = 1)
df_train = df_train.drop(suppr, axis = 1)
test = test.drop(suppr, axis = 1)
train = train.drop(suppr, axis = 1)

# Features (X) and target (y) for the training and validation splits
X_train = train.drop('reod', axis=1)
y_train = train['reod']
X_test = test.drop('reod', axis=1)
y_test = test['reod']

from sklearn.metrics import mean_squared_error, accuracy_score, confusion_matrix
from xgboost import XGBClassifier

params = {"objective": "multi:softmax","num_class": 3, "tree_method": "hist",
            "learning_rate": 0.3, "max_depth": 6,
            "gamma": 0, "subsample": 1, "colsample_bytree": 1,
            "alpha": 0, "lambda": 1,"random_state": 0}
n = 3

model_XG1 = XGBClassifier(n_estimators=n, **params)

# The model is fitted on ALL labelled days, including the validation days, so
# the validation scores printed below are in-sample and therefore optimistic.
model_XG1.fit(df_train.drop('reod', axis=1), df_train['reod'], eval_set=[(X_test, y_test)], early_stopping_rounds=30, verbose=1)

preds_XG = model_XG1.predict(X_test)
# Class probabilities on the full training frame (used by the stacking cell)
preds_XG1 = model_XG1.predict_proba(df_train.drop('reod', axis=1))

# RMSE computed as sqrt(MSE): the `squared=False` shortcut was removed from
# recent scikit-learn releases.
rmse = np.sqrt(mean_squared_error(y_test, preds_XG))
accuracy = accuracy_score(y_test, preds_XG)

print(f"RMSE du modèle de base : {rmse:.3f}")
print(f"Accuracy du modèle de base : {accuracy:.3f}") 

# Pin the label order so the matrix is always 3x3, even if a class is never
# observed or predicted on the validation days.
conf_matrix = confusion_matrix(y_test, preds_XG, labels=[0, 1, 2])

# Share of correctly predicted samples per true class (index 2 -> 1, 1 -> 0, 0 -> -1)
ratio_1 = conf_matrix[2, 2] / conf_matrix[2].sum()
ratio_0 = conf_matrix[1, 1] / conf_matrix[1].sum()
ratio_minus_1 = conf_matrix[0, 0] / conf_matrix[0].sum()

print(f"Ratio de 1 bien prédit : {ratio_1}")
print(f"Ratio de 0 bien prédit : {ratio_0}")
print(f"Ratio de -1 bien prédit : {ratio_minus_1}")

# Class probabilities on the unlabelled test set (used by the ensembling cells)
probs_XG1 = model_XG1.predict_proba(df_test)


No groupby




[0]	validation_0-mlogloss:1.05413
[1]	validation_0-mlogloss:1.02933
[2]	validation_0-mlogloss:1.01409
RMSE du modèle de base : 0.963
Accuracy du modèle de base : 0.491
Ratio de 1 bien prédit : 0.2060262647282135
Ratio de 0 bien prédit : 0.7348989192386783
Ratio de -1 bien prédit : 0.4180658681085556


Modèle 2 : Preprocess et modèle

In [3]:
# Model 2: XGBoost on features built from volatility-based equity clusters.
cluster_type = "volatility"

df_train, df_test = preprocess_data(input_training, input_test, outlier=0.05, groupby=False, IQRoutlier = False, variables="market+volatility", na_count=True, clusters = 15, cluster_type = cluster_type)

# Index both frames by sample ID
df_train = df_train.set_index('ID')
df_test = df_test.set_index('ID')

# Shift the target by +1 so that class indices start at 0 (0, 1, 2)
df_train['reod'] = df_train['reod'] + 1

# Chronological hold-out: days <= 350 for training, days >= 351 for validation
train = df_train[df_train['day'] <= 350]
test = df_train[df_train['day'] >= 351]

# Helper columns that are not used as model features
suppr = ['day', 'equity', 'volatility', f'cluster_{cluster_type}', 'returns']

df_test = df_test.drop(suppr, axis = 1)
df_train = df_train.drop(suppr, axis = 1)
test = test.drop(suppr, axis = 1)
train = train.drop(suppr, axis = 1)

# Features (X) and target (y) for the training and validation splits
X_train = train.drop('reod', axis=1)
y_train = train['reod']
X_test = test.drop('reod', axis=1)
y_test = test['reod']

from sklearn.metrics import mean_squared_error, accuracy_score, confusion_matrix
from xgboost import XGBClassifier

params = {"objective": "multi:softmax","num_class": 3, "tree_method": "hist",
            "learning_rate": 0.3, "max_depth": 6,
            "gamma": 0, "subsample": 1, "colsample_bytree": 1,
            "alpha": 0, "lambda": 1,"random_state": 0}
n = 3

model_XG2 = XGBClassifier(n_estimators=n, **params)

# The model is fitted on ALL labelled days, including the validation days, so
# the validation scores printed below are in-sample and therefore optimistic.
model_XG2.fit(df_train.drop('reod', axis=1), df_train['reod'], eval_set=[(X_test, y_test)], early_stopping_rounds=30, verbose=1)

preds_XG = model_XG2.predict(X_test)
# Class probabilities on the full training frame (used by the stacking cell)
preds_XG2 = model_XG2.predict_proba(df_train.drop('reod', axis=1))

# RMSE computed as sqrt(MSE): the `squared=False` shortcut was removed from
# recent scikit-learn releases.
rmse = np.sqrt(mean_squared_error(y_test, preds_XG))
accuracy = accuracy_score(y_test, preds_XG)

print(f"RMSE du modèle de base : {rmse:.3f}")
print(f"Accuracy du modèle de base : {accuracy:.3f}") 

# Pin the label order so the matrix is always 3x3, even if a class is never
# observed or predicted on the validation days.
conf_matrix = confusion_matrix(y_test, preds_XG, labels=[0, 1, 2])

# Share of correctly predicted samples per true class (index 2 -> 1, 1 -> 0, 0 -> -1)
ratio_1 = conf_matrix[2, 2] / conf_matrix[2].sum()
ratio_0 = conf_matrix[1, 1] / conf_matrix[1].sum()
ratio_minus_1 = conf_matrix[0, 0] / conf_matrix[0].sum()

print(f"Ratio de 1 bien prédit : {ratio_1}")
print(f"Ratio de 0 bien prédit : {ratio_0}")
print(f"Ratio de -1 bien prédit : {ratio_minus_1}")

# Class probabilities on the unlabelled test set (used by the ensembling cells)
probs_XG2 = model_XG2.predict_proba(df_test)


No groupby




[0]	validation_0-mlogloss:1.05412
[1]	validation_0-mlogloss:1.02931
[2]	validation_0-mlogloss:1.01401
RMSE du modèle de base : 0.960
Accuracy du modèle de base : 0.491
Ratio de 1 bien prédit : 0.20021934141334607
Ratio de 0 bien prédit : 0.739321495976488
Ratio de -1 bien prédit : 0.41684992109280006


On fait le Voting Classifier des deux modèles

In [4]:
from sklearn.ensemble import VotingClassifier

# Estimators taking part in the vote
classifiers = [('xgb1', model_XG1), ('xgb2', model_XG2)]

voting_clf = VotingClassifier(estimators=classifiers, voting='hard')

# NOTE(review): VotingClassifier.fit() clones the estimators and refits the
# clones on the data passed here. model_XG1 and model_XG2 share the same
# hyper-parameters and seed, and X_train holds only the most recently built
# feature set, so both voters end up identical and the vote adds nothing.
# Combining the two cluster-specific models requires keeping each model's own
# feature frame and voting on their separate predictions.
voting_clf.fit(X_train, y_train)

# Predictions on the validation split
voting_preds = voting_clf.predict(X_test)

# RMSE computed as sqrt(MSE): the `squared=False` shortcut was removed from
# recent scikit-learn releases.
rmse = np.sqrt(mean_squared_error(y_test, voting_preds))
accuracy = accuracy_score(y_test, voting_preds)

print(f"RMSE du modèle de vote : {rmse:.3f}")
print(f"Accuracy du modèle de vote : {accuracy:.3f}") 

# Pin the label order so the matrix is always 3x3
conf_matrix = confusion_matrix(y_test, voting_preds, labels=[0, 1, 2])

# Share of correctly predicted samples per true class (index 2 -> 1, 1 -> 0, 0 -> -1)
ratio_1 = conf_matrix[2, 2] / conf_matrix[2].sum()
ratio_0 = conf_matrix[1, 1] / conf_matrix[1].sum()
ratio_minus_1 = conf_matrix[0, 0] / conf_matrix[0].sum()

print(f"Ratio de 1 bien prédit : {ratio_1}")
print(f"Ratio de 0 bien prédit : {ratio_0}")
print(f"Ratio de -1 bien prédit : {ratio_minus_1}")

# Raw class predictions on the unlabelled test set
voting_results = voting_clf.predict(df_test)

RMSE du modèle de vote : 1.022
Accuracy du modèle de vote : 0.467
Ratio de 1 bien prédit : 0.20412811788194932
Ratio de 0 bien prédit : 0.6943633273551865
Ratio de -1 bien prédit : 0.3964245983494166


Moyenne des scores puis Argmax

In [None]:
from sklearn.metrics import accuracy_score
import numpy as np

# Soft voting: average the two models' class probabilities ...
combined_probs = (probs_XG1 + probs_XG2) / 2

# ... and keep, for each sample, the class with the highest averaged probability
combined_preds = combined_probs.argmax(axis=1)

Max Voting

In [7]:
# Element-wise maximum of the two models' class probabilities
max_probs = np.maximum(probs_XG1, probs_XG2)

# For each sample, keep the class holding the largest of those maxima
max_voting_predictions = max_probs.argmax(axis=1)

Stacking

In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import numpy as np

# Stacking: model_XG1 and model_XG2 must already be fitted by the cells above.

# Meta-model training features: both base models' class probabilities on the
# training rows, placed side by side.
# NOTE(review): preds_XG1 / preds_XG2 are predicted on the same rows the base
# models were fitted on, so the meta-model learns from in-sample probabilities;
# out-of-fold predictions would give an unbiased stack.
meta_features = np.column_stack([preds_XG1, preds_XG2])

# Fit the meta-model on the stacked probabilities
meta_model = LogisticRegression().fit(meta_features, df_train['reod'])

# Same stacking for the test-set probabilities
test_meta_features = np.column_stack([probs_XG1, probs_XG2])

# Final class predictions from the meta-model
final_predictions = meta_model.predict(test_meta_features)

Save pred and model

In [6]:
# Build the submission from the max-voting predictions, one row per test ID
results = pd.DataFrame({'ID': df_test.index, 'reod': max_voting_predictions})
# Undo the +1 shift applied to the target before training (classes 0/1/2 -> -1/0/1)
results['reod'] = results['reod'] - 1
# Export to csv
results.to_csv(path+'Combined maxvoting 10 10.csv', index=False)


from joblib import dump

# Save the two fitted models behind the submission. The previous call dumped
# an undefined name `model`, and its file name described hyper-parameters
# that match neither model.
dump(model_XG1, path+'model_XG1_cluster_returns.joblib')
dump(model_XG2, path+'model_XG2_cluster_volatility.joblib')

NameError: name 'max_voting_predictions' is not defined