# Used libraries

In [None]:
import seaborn as sns
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

# Chargement des données

The train and test inputs are composed of 46 features.

The target of this challenge is `RET`, which indicates whether the **return is in the top 50% of highest stock returns**.

Since the median return is very close to 0, predicting this target is nearly equivalent to predicting the sign of the return.

In [None]:
# Read the training features and labels; both files are keyed by the `ID` column.
x_train = pd.read_csv('../x_train.csv', index_col='ID')
y_train = pd.read_csv('../y_train.csv', index_col='ID')

# Features of the held-out set that the submission is predicted on.
test = pd.read_csv('../x_test.csv', index_col='ID')

# Place features and target side by side, aligned on the shared ID index.
train = pd.concat((x_train, y_train), axis='columns')
train.head()

In [None]:
# Names of the lagged return / volume columns, ordered from the oldest
# lag (20) down to the most recent one (1).
ret = [f'RET_{k}' for k in reversed(range(1, 21))]
vol = [f'VOLUME_{k}' for k in reversed(range(1, 21))]

# Exploration des secteurs

Nombre de points par secteur :

In [None]:
# Number of distinct values in the SECTOR column (a missing value, if any,
# counts as one distinct value).
nb_sec = train['SECTOR'].nunique(dropna=False)

In [None]:
# Histogram of the SECTOR column, using as many bins as there are sectors.
train['SECTOR'].hist(bins=nb_sec)

In [None]:
# Number of rows with a non-missing DATE in each sector.
train.groupby('SECTOR')['DATE'].count()

# Cross validation

In [None]:
# K-fold cross-validation of one random forest per sector.
#
# For every fold: impute missing lagged returns/volumes with per-sector medians
# learned on the training part, standardise with a scaler fitted on the
# training part only, add group-median features, fit one model per sector and
# score the pooled predictions on the validation part.

# Parameters:
N = 4  # number of folds
nb_feat = 20  # number of most recent days used as base features
nb_shifts = 10  # number of additional group-median features (lags 1..nb_shifts)

lag_cols = ret + vol  # every lagged return / volume column
target = 'RET'
features_base = ret[-nb_feat:] + vol[-nb_feat:]

# Group-level features: for each lag, the statistic of RET_<lag> over all rows
# sharing the same INDUSTRY_GROUP and DATE.
shifts = range(1, nb_shifts + 1)
statistics = ['median']
gb_features = ['INDUSTRY_GROUP', 'DATE']
target_feature = 'RET'
tmp_name = '_'.join(gb_features)

# Hyper-parameters shared by every per-sector model (loop-invariant).
rf_params = {
    'n_estimators': 500,
    'max_depth': 2**3,
    'random_state': 0,
    'n_jobs': -1
}

# Split into N folds:
scores = []
kf = KFold(n_splits=N, shuffle=True, random_state=1)
for fold, (ind_train, ind_test) in enumerate(kf.split(train)):

    # KFold yields *positional* indices, so select with .iloc (the frame is
    # indexed by ID, not by position). Copy so the fold can be modified freely
    # without touching `train`.
    train_cl = train.iloc[ind_train].copy()
    test_cl = train.iloc[ind_test].copy()

    # Data cleaning: fill missing values with the median of the *same sector*,
    # computed on the training part only. Only rows of that sector are
    # touched, so one sector's median never leaks into another sector.
    sector_medians = train_cl.groupby('SECTOR')[lag_cols].median()
    global_medians = train_cl[lag_cols].median()
    for data in (train_cl, test_cl):
        for sect, medians in sector_medians.iterrows():
            in_sector = data['SECTOR'] == sect
            data.loc[in_sector, lag_cols] = data.loc[in_sector, lag_cols].fillna(medians)
        # Fallback for values still missing (sector unseen in the training
        # part, or a column that is entirely missing within a sector).
        data[lag_cols] = data[lag_cols].fillna(global_medians)

    # Scaling: fit on the training part only and reuse the same statistics on
    # the validation part, so both live on the same scale.
    scaler = StandardScaler()
    train_cl[lag_cols] = scaler.fit_transform(train_cl[lag_cols])
    test_cl[lag_cols] = scaler.transform(test_cl[lag_cols])

    # Feature selection / construction:
    new_features = []
    for shift in shifts:
        for stat in statistics:
            name = f'{target_feature}_{shift}_{tmp_name}_{stat}'
            feat = f'{target_feature}_{shift}'
            new_features.append(name)
            for data in (train_cl, test_cl):
                data[name] = data.groupby(gb_features)[feat].transform(stat)
    features = features_base + new_features

    # Model training: one random forest per sector present in the training
    # part, keyed by the actual sector label.
    models = {}
    for sect, sector_rows in train_cl.groupby('SECTOR'):
        model = RandomForestClassifier(**rf_params)
        model.fit(sector_rows[features], sector_rows[target])
        models[sect] = model
        print(f"Fold {fold+1} - Modèle {sect} entraîné")

    # Prediction: each validation row is scored by its own sector's model.
    # A sector without a trained model raises KeyError rather than being
    # silently skipped.
    fold_preds = []
    for sect, sector_rows in test_cl.groupby('SECTOR'):
        x_test = sector_rows[features]
        fold_preds.append(pd.Series(models[sect].predict(x_test), index=x_test.index))
    # Restore the original row order of the validation part before scoring.
    y_pred = pd.concat(fold_preds).loc[test_cl.index]
    score = accuracy_score(test_cl[target], y_pred)
    print(f"Score fold {fold+1} : {score* 100:.2f}")
    scores.append(score)

# Final results: mean accuracy over the folds with a +/- one standard
# deviation band, all expressed in percent.
mean = np.mean(scores)*100
std = np.std(scores)*100
u = (mean + std)
l = (mean - std)
print(f'Accuracy: {mean:.2f}% [{l:.2f} ; {u:.2f}] (+- {std:.2f})')

# Submit

In [None]:
# Train on the full training set and predict the held-out test set, using the
# same pipeline as the cross-validation: per-sector median imputation,
# standardisation, group-median features and one random forest per sector.

# Parameters:
nb_feat = 20  # number of most recent days used as base features
nb_shifts = 10  # number of additional group-median features (lags 1..nb_shifts)

lag_cols = ret + vol  # every lagged return / volume column

# Data cleaning: work on copies so `train` / `test` stay untouched.
train_cl = train.copy()
test_cl = test.copy()

# Fill missing values with the median of the *same sector*, learned on the
# training set. Only rows of that sector are touched, so one sector's median
# never leaks into another sector.
sector_medians = train_cl.groupby('SECTOR')[lag_cols].median()
global_medians = train_cl[lag_cols].median()
for data in (train_cl, test_cl):
    for sect, medians in sector_medians.iterrows():
        in_sector = data['SECTOR'] == sect
        data.loc[in_sector, lag_cols] = data.loc[in_sector, lag_cols].fillna(medians)
    # Fallback for values still missing (sector unseen in training, or a
    # column that is entirely missing within a sector).
    data[lag_cols] = data[lag_cols].fillna(global_medians)

# Scaling: fit on the training set only and apply the same statistics to the
# test set, so both live on the same scale.
scaler = StandardScaler()
train_cl[lag_cols] = scaler.fit_transform(train_cl[lag_cols])
test_cl[lag_cols] = scaler.transform(test_cl[lag_cols])

# Feature selection / construction: base lag columns plus, for each lag, the
# statistic of RET_<lag> over rows sharing the same INDUSTRY_GROUP and DATE.
features_base = ret[-nb_feat:] + vol[-nb_feat:]
target = 'RET'
new_features = []
shifts = range(1, nb_shifts + 1)
statistics = ['median']
gb_features = ['INDUSTRY_GROUP', 'DATE']
target_feature = 'RET'
tmp_name = '_'.join(gb_features)
for shift in shifts:
    for stat in statistics:
        name = f'{target_feature}_{shift}_{tmp_name}_{stat}'
        feat = f'{target_feature}_{shift}'
        new_features.append(name)
        for data in (train_cl, test_cl):
            data[name] = data.groupby(gb_features)[feat].transform(stat)
features = features_base + new_features

# Model training: one random forest per sector present in the training set,
# keyed by the actual sector label.
rf_params = {
    'n_estimators': 500,
    'max_depth': 2**3,
    'random_state': 0,
    'n_jobs': -1
}
models = {}
for sect, sector_rows in train_cl.groupby('SECTOR'):
    model = RandomForestClassifier(**rf_params)
    model.fit(sector_rows[features], sector_rows[target])
    models[sect] = model
    print(f"Modèle {sect} entraîné")

# Prediction: each test row is scored by its own sector's model. A sector
# without a trained model raises KeyError rather than being silently skipped.
sector_preds = []
for sect, sector_rows in test_cl.groupby('SECTOR'):
    x_test = sector_rows[features]
    sector_preds.append(pd.Series(models[sect].predict(x_test), index=x_test.index))
# Restore the original row order of the test set.
y_pred = pd.concat(sector_preds).loc[test_cl.index]

In [None]:
# Label the prediction series with the target column name, then write it to
# the submission file together with its index and a header row.
y_pred.name = target
y_pred.to_csv('./sub1110F2.csv', header=True, index=True)

Paramètres : nb_feat = 20, nb_shifts = 10

Public score : 50.51%