In [1]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import f1_score

from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin

from sklearn.preprocessing import QuantileTransformer, FunctionTransformer
from sklearn.impute import SimpleImputer, MissingIndicator

from category_encoders import TargetEncoder, CountEncoder

from lightgbm.sklearn import LGBMClassifier

import warnings

warnings.filterwarnings("ignore")


In [2]:
X = pd.read_csv("Data/amf_train_x.csv", index_col = 'Index')
y = pd.read_csv("Data/amf_train_y.csv")

In [3]:
y = X.merge(y, on = 'Trader')['type'].replace(['NON HFT', 'MIX', 'HFT'], [0, 1 , 2])
y.index = X.index

In [4]:
def na_counter(x):
    return x.isna().sum(axis = 1).values.reshape((-1, 1))


NACounter = FunctionTransformer(na_counter)
imputer = SimpleImputer(strategy = "constant", fill_value = -1)
na_indicator = MissingIndicator()

na_indicator.fit_transform(X)

array([[False, False, False, ...,  True,  True,  True],
       [False, False, False, ...,  True,  True,  True],
       [False, False, False, ...,  True,  True,  True],
       ...,
       [False,  True, False, ...,  True,  True,  True],
       [False,  True, False, ...,  True,  True,  True],
       [False,  True, False, ...,  True,  True,  True]])

In [5]:
class MultiClassTargetEncoder(BaseEstimator, TransformerMixin):
    def __init__(self):
        return None
    
    def fit(self, X, y):
        y = pd.get_dummies(y)
        self.encoders = [TargetEncoder().fit(X, y[y_i]) for y_i in y.columns]
        return self
    
    def transform(self, X):
        return pd.concat([encoder.transform(X) for encoder in self.encoders], axis = 1).values
    
    
        
mcte = MultiClassTargetEncoder()

mcte.fit_transform(X[['Day']], y)   

array([[0.20165686, 0.49596686, 0.30237628],
       [0.22485089, 0.47534791, 0.29980119],
       [0.20699831, 0.48482293, 0.30817875],
       ...,
       [0.22307848, 0.48042993, 0.29649158],
       [0.20173985, 0.48156587, 0.31669428],
       [0.19324821, 0.49880978, 0.307942  ]])

In [6]:
preprocessing = Pipeline([
    ("cleaning", FeatureUnion([
        ("na_counter", NACounter),
        ("impute", imputer),
        ("missing_indicator", na_indicator),
        ("category_encoding", ColumnTransformer([
            ('day_target_encoding', MultiClassTargetEncoder(), 'Day'),
            ('share_target_encoding', MultiClassTargetEncoder(), 'Share'),
            ('day_count_encoding', CountEncoder(handle_unknown = 0, min_group_size = 0, handle_missing = 0), 'Day'),
            ('share_count_encoding', CountEncoder(handle_unknown = 0, min_group_size = 0, handle_missing = 0), 'Share')],
            remainder = "drop"))
    ])),
    ("drop_trader_date_share", ColumnTransformer([
        ("drop_columns", "drop", [1, 2, 3])
    ], remainder = "passthrough")),
    ("scaling", QuantileTransformer())
])

In [7]:
_ = preprocessing.fit_transform(X, y)

In [8]:
pipeline = Pipeline([
    ("preprocessing", preprocessing),
    ("model", LGBMClassifier())
    
])

In [9]:
pipeline.fit(X,y)

Pipeline(steps=[('preprocessing',
                 Pipeline(steps=[('cleaning',
                                  FeatureUnion(transformer_list=[('na_counter',
                                                                  FunctionTransformer(func=<function na_counter at 0x000001A4E1902670>)),
                                                                 ('impute',
                                                                  SimpleImputer(fill_value=-1,
                                                                                strategy='constant')),
                                                                 ('missing_indicator',
                                                                  MissingIndicator()),
                                                                 ('category_encoding',
                                                                  ColumnTransformer(transformers=[('day_target_encoding'...
                                           

# Train Test Split

In [10]:
test_traders = X['Trader'].value_counts().sample(15)

In [11]:
X_train = X.reset_index().set_index("Trader").drop(test_traders.index).reset_index().set_index('Index')
X_test = X.reset_index().set_index("Trader").loc[test_traders.index].reset_index().set_index('Index')

y_train = y.loc[X_train.index]
y_test = y.loc[X_test.index]

# Test Pipeline

In [12]:
pipeline.fit(X_train, y_train)

y_pred = pipeline.predict(X_test)

f1_score(y_test, y_pred, average = 'weighted')

0.8458990520552311

In [15]:
y_train.value_counts()

1    35818
2    29680
0    17588
Name: type, dtype: int64

# Ré-équilibrage du dataset

In [16]:
from imblearn.over_sampling import RandomOverSampler 

ros = RandomOverSampler(random_state=42)

X_train_resampled, y_train_resampled = ros.fit_resample(X_train, y_train)

# Recherche d'hyperparamètres optimaux

In [27]:
import optuna
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score

def objective(trial):
    learning_rate = trial.suggest_loguniform("learning_rate", 1e-4, 10)
    n_estimators = trial.suggest_int("n_estimators", 1, 400)
    max_depth = trial.suggest_int("max_depth", 2, 10)
    min_split_gain = trial.suggest_uniform("min_split_gain", 0, 1)
    
    model = LGBMClassifier(learning_rate = learning_rate,
                           n_estimators = n_estimators,
                           max_depth = max_depth,
                           min_split_gain = min_split_gain,
                           device = 'gpu')
    
    pipeline = Pipeline([
    ("preprocessing", preprocessing),
    ("model", LGBMClassifier())   
    ])
    
    score = cross_val_score(pipeline, X_train_resampled, y_train_resampled, cv = 4).mean()

    return score

study = optuna.create_study(direction = 'maximize')
study.optimize(objective, n_trials=100)

study.best_params

[32m[I 2021-06-10 17:36:47,280][0m A new study created in memory with name: no-name-dcb96402-a1b1-42d6-a4d7-60b3d06ae301[0m
[32m[I 2021-06-10 17:37:04,382][0m Trial 0 finished with value: 0.9131811649178944 and parameters: {'learning_rate': 1.1360102452447238, 'n_estimators': 241, 'max_depth': 3, 'min_split_gain': 0.7883870425306815}. Best is trial 0 with value: 0.9131811649178944.[0m
[32m[I 2021-06-10 17:37:21,537][0m Trial 1 finished with value: 0.9131811649178944 and parameters: {'learning_rate': 0.5400805244627362, 'n_estimators': 124, 'max_depth': 3, 'min_split_gain': 0.6208330014976596}. Best is trial 0 with value: 0.9131811649178944.[0m
[32m[I 2021-06-10 17:37:38,572][0m Trial 2 finished with value: 0.9131811649178944 and parameters: {'learning_rate': 9.973919273634323, 'n_estimators': 307, 'max_depth': 9, 'min_split_gain': 0.9170029538893112}. Best is trial 0 with value: 0.9131811649178944.[0m
[32m[I 2021-06-10 17:37:55,567][0m Trial 3 finished with value: 0.91318

[32m[I 2021-06-10 17:45:24,101][0m Trial 30 finished with value: 0.9131811649178944 and parameters: {'learning_rate': 0.00017111830031796296, 'n_estimators': 294, 'max_depth': 6, 'min_split_gain': 0.9340579723351888}. Best is trial 0 with value: 0.9131811649178944.[0m
[32m[I 2021-06-10 17:45:41,203][0m Trial 31 finished with value: 0.9131811649178944 and parameters: {'learning_rate': 0.11436812198127232, 'n_estimators': 323, 'max_depth': 10, 'min_split_gain': 0.07702207212777973}. Best is trial 0 with value: 0.9131811649178944.[0m
[32m[I 2021-06-10 17:45:58,570][0m Trial 32 finished with value: 0.9131811649178944 and parameters: {'learning_rate': 1.0452264624897332, 'n_estimators': 368, 'max_depth': 10, 'min_split_gain': 0.5833554405298194}. Best is trial 0 with value: 0.9131811649178944.[0m
[32m[I 2021-06-10 17:46:16,168][0m Trial 33 finished with value: 0.9131811649178944 and parameters: {'learning_rate': 0.09513894770714443, 'n_estimators': 186, 'max_depth': 6, 'min_split

[32m[I 2021-06-10 17:54:09,534][0m Trial 61 finished with value: 0.9131811649178944 and parameters: {'learning_rate': 0.01858324721481531, 'n_estimators': 391, 'max_depth': 2, 'min_split_gain': 0.5461061257683217}. Best is trial 0 with value: 0.9131811649178944.[0m
[32m[I 2021-06-10 17:54:26,778][0m Trial 62 finished with value: 0.9131811649178944 and parameters: {'learning_rate': 0.000989954613856904, 'n_estimators': 260, 'max_depth': 3, 'min_split_gain': 0.2973558804088348}. Best is trial 0 with value: 0.9131811649178944.[0m
[32m[I 2021-06-10 17:54:44,798][0m Trial 63 finished with value: 0.9131811649178944 and parameters: {'learning_rate': 0.009358108320323771, 'n_estimators': 249, 'max_depth': 3, 'min_split_gain': 0.36079799569819326}. Best is trial 0 with value: 0.9131811649178944.[0m
[32m[I 2021-06-10 17:55:05,240][0m Trial 64 finished with value: 0.9131811649178944 and parameters: {'learning_rate': 2.0341484457875114, 'n_estimators': 212, 'max_depth': 4, 'min_split_ga

[32m[I 2021-06-10 18:03:56,420][0m Trial 92 finished with value: 0.9131811649178944 and parameters: {'learning_rate': 0.0008969709461545577, 'n_estimators': 207, 'max_depth': 4, 'min_split_gain': 0.22801075115944255}. Best is trial 0 with value: 0.9131811649178944.[0m
[32m[I 2021-06-10 18:04:17,097][0m Trial 93 finished with value: 0.9131811649178944 and parameters: {'learning_rate': 0.002532794631255194, 'n_estimators': 217, 'max_depth': 4, 'min_split_gain': 0.3111664631516505}. Best is trial 0 with value: 0.9131811649178944.[0m
[32m[I 2021-06-10 18:04:36,994][0m Trial 94 finished with value: 0.9131811649178944 and parameters: {'learning_rate': 0.0003646445032307404, 'n_estimators': 265, 'max_depth': 3, 'min_split_gain': 0.45505362825590134}. Best is trial 0 with value: 0.9131811649178944.[0m
[32m[I 2021-06-10 18:04:57,283][0m Trial 95 finished with value: 0.9131811649178944 and parameters: {'learning_rate': 0.0010526222123963319, 'n_estimators': 225, 'max_depth': 2, 'min_s

{'learning_rate': 1.1360102452447238,
 'n_estimators': 241,
 'max_depth': 3,
 'min_split_gain': 0.7883870425306815}

In [18]:
study.optimize(objective, n_trials=100)

In [13]:
from sklearn.model_selection import cross_val_score