In [1]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import f1_score

from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin

from sklearn.preprocessing import QuantileTransformer, FunctionTransformer
from sklearn.impute import SimpleImputer, MissingIndicator

from category_encoders import TargetEncoder, CountEncoder

from lightgbm.sklearn import LGBMClassifier

import warnings

warnings.filterwarnings("ignore")


In [2]:
X = pd.read_csv("Data/amf_train_x.csv", index_col = 'Index')
y = pd.read_csv("Data/amf_train_y.csv")

In [3]:
y = X.merge(y, on = 'Trader')['type'].replace(['NON HFT', 'MIX', 'HFT'], [0, 1 , 2])
y.index = X.index

In [4]:
def na_counter(x):
    return x.isna().sum(axis = 1).values.reshape((-1, 1))


NACounter = FunctionTransformer(na_counter)
imputer = SimpleImputer(strategy = "constant", fill_value = -1)
na_indicator = MissingIndicator()

na_indicator.fit_transform(X)

array([[False, False, False, ...,  True,  True,  True],
       [False, False, False, ...,  True,  True,  True],
       [False, False, False, ...,  True,  True,  True],
       ...,
       [False,  True, False, ...,  True,  True,  True],
       [False,  True, False, ...,  True,  True,  True],
       [False,  True, False, ...,  True,  True,  True]])

In [5]:
y

Index
1         0
2         0
3         0
4         0
5         0
         ..
105778    0
105779    0
105780    0
105781    0
105782    0
Name: type, Length: 105782, dtype: int64

In [6]:
class MultiClassTargetEncoder(BaseEstimator, TransformerMixin):
    def __init__(self):
        return None
    
    def fit(self, X, y):
        y = pd.get_dummies(y)
        self.encoders = [TargetEncoder().fit(X, y[y_i]) for y_i in y.columns]
        return self
    
    def transform(self, X):
        return pd.concat([encoder.transform(X) for encoder in self.encoders], axis = 1).values
    
    
        
mcte = MultiClassTargetEncoder()

mcte.fit_transform(X[['Day']], y)   

array([[0.20165686, 0.49596686, 0.30237628],
       [0.22485089, 0.47534791, 0.29980119],
       [0.20699831, 0.48482293, 0.30817875],
       ...,
       [0.22307848, 0.48042993, 0.29649158],
       [0.20173985, 0.48156587, 0.31669428],
       [0.19324821, 0.49880978, 0.307942  ]])

In [7]:
preprocessing = Pipeline([
    ("cleaning", FeatureUnion([
        ("na_counter", NACounter),
        ("impute", imputer),
        ("missing_indicator", na_indicator),
        ("category_encoding", ColumnTransformer([
            ('day_target_encoding', MultiClassTargetEncoder(), 'Day'),
            ('share_target_encoding', MultiClassTargetEncoder(), 'Share'),
            ('day_count_encoding', CountEncoder(handle_unknown = 0, min_group_size = 0, handle_missing = 0), 'Day'),
            ('share_count_encoding', CountEncoder(handle_unknown = 0, min_group_size = 0, handle_missing = 0), 'Share')],
            remainder = "drop"))
    ])),
    ("drop_trader_date_share", ColumnTransformer([
        ("drop_columns", "drop", [1, 2, 3])
    ], remainder = "passthrough")),
    ("scaling", QuantileTransformer())
])

In [8]:
_ = preprocessing.fit_transform(X, y)

In [9]:
pipeline = Pipeline([
    ("preprocessing", preprocessing),
    ("model", LGBMClassifier())
    
])

In [10]:
pipeline.fit(X,y)

Pipeline(steps=[('preprocessing',
                 Pipeline(steps=[('cleaning',
                                  FeatureUnion(transformer_list=[('na_counter',
                                                                  FunctionTransformer(func=<function na_counter at 0x0000024CECEC0670>)),
                                                                 ('impute',
                                                                  SimpleImputer(fill_value=-1,
                                                                                strategy='constant')),
                                                                 ('missing_indicator',
                                                                  MissingIndicator()),
                                                                 ('category_encoding',
                                                                  ColumnTransformer(transformers=[('day_target_encoding'...
                                           

# Train Test Split

In [11]:
test_traders = X['Trader'].value_counts().sample(15)

In [12]:
X_train = X.reset_index().set_index("Trader").drop(test_traders.index).reset_index().set_index('Index')
X_test = X.reset_index().set_index("Trader").loc[test_traders.index].reset_index().set_index('Index')

y_train = y.loc[X_train.index]
y_test = y.loc[X_test.index]

# Test Pipeline

In [13]:
pipeline.fit(X_train, y_train)

y_pred = pipeline.predict(X_test)

f1_score(y_test, y_pred, average = 'weighted')

0.9168661119240276