In [1]:
import datetime
import joblib
import itertools
import random
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm.notebook import tqdm

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import StandardScaler

from utils.metric_clf_hft import metric

import warnings
warnings.filterwarnings('ignore')

In [2]:
def seed_everything(seed):
    """Seed Python's `random` module and NumPy's global RNG with `seed`."""
    for seeder in (random.seed, np.random.seed):
        seeder(seed)


# Seed used for the global RNGs here and as `random_state` for the models below.
SEED = 13
seed_everything(seed=SEED)

# Data Preprocessing

In [3]:
# Columns to load from the CSV: the timestamp, the label, then the feature
# families, generated from their naming pattern in the original order.
_VENUES = ('perp', 'spot')
_TFI_PRET_TAGS = (100, 250, 500, 1000, 2000)    # numeric tag in tfi_*/pret_* names
_MEAN_DIV_TAGS = (5, 9, 19, 38, 75, 150, 300, 600)  # numeric tag in mean_div_*s names

cols = [
    'local_ts', 'target',
    *(f'imb_{side}_{venue}' for venue in _VENUES for side in ('bid', 'ask')),
    *(f'tfi_{tag}_{venue}' for venue in _VENUES for tag in _TFI_PRET_TAGS),
    *(f'pret_{tag}_{venue}' for venue in _VENUES for tag in _TFI_PRET_TAGS),
    *(f'mean_div_{tag}s' for tag in _MEAN_DIV_TAGS),
]

In [4]:
# Load only the selected columns. For a quicker partial load while iterating,
# add nrows=1_000_000 to the call.
df = pd.read_csv(
    'data/df.csv',
    usecols=cols,
)

In [5]:
# Parse the timestamp column and use it as a sorted index, so that row order
# follows `local_ts` before the positional time-series splits below.
df = (
    df.assign(local_ts=pd.to_datetime(df['local_ts']))
      .set_index('local_ts')
      .sort_index()
)

Drop all rows that contain NaN values.

In [6]:
# Keep only the rows that have no missing value in any loaded column.
df = df.dropna(axis=0)

In [7]:
# Split into the feature matrix and the label vector as NumPy arrays;
# the DataFrame name is dropped afterwards because it is no longer needed.
X, y = df.drop(columns='target').to_numpy(), df['target'].to_numpy()
del df

# Time Series Cross Validation

In [8]:
def performTimeSeriesCV(X, y, model, metric, preprocessor=None, n_splits=5, kwargs=None):
    """Evaluate `model` fold by fold using scikit-learn's `TimeSeriesSplit`.

    For every split the preprocessor (if any) is fit on the training rows only
    and then applied to the validation rows, the model is refit on the training
    rows, and `metric` is evaluated on the validation predictions.

    Parameters
    ----------
    X, y : array-like
        Features and labels. Both must support indexing with an integer index
        array (e.g. NumPy arrays), and rows must already be in time order.
    model : estimator
        Object exposing `fit(X, y, **kwargs)` and `predict(X)`. It is fitted
        in place; after the call it holds the fit of the last fold.
    metric : callable
        Called as `metric(y_val, y_pred, confusion=True)`; its return value is
        stored unchanged. The callers in this notebook read element 0 as the
        score and element 1 as the confusion matrix.
    preprocessor : transformer, optional
        Object exposing `fit_transform` / `transform`. It is refit on every
        fold, in place. `None` disables preprocessing.
    n_splits : int, default 5
        Number of splits passed to `TimeSeriesSplit`.
    kwargs : dict, optional
        Extra keyword arguments forwarded to `model.fit`. `None` (the default)
        means no extra arguments.

    Returns
    -------
    list
        One `metric(...)` result per fold, in fold order.
    """
    # A `{}` default would be a single dict shared between all calls.
    fit_kwargs = {} if kwargs is None else kwargs
    tscv = TimeSeriesSplit(n_splits=n_splits)

    metrics = []
    for train_index, val_index in tqdm(tscv.split(X), total=tscv.get_n_splits(),
                                       desc='TimeSeriesCV', leave=False):
        X_train, X_val = X[train_index], X[val_index]
        y_train, y_val = y[train_index], y[val_index]

        # Explicit None check: do not rely on the truthiness of an estimator object.
        if preprocessor is not None:
            # Fit on the training rows only so that no validation statistics leak in.
            X_train = preprocessor.fit_transform(X_train)
            X_val = preprocessor.transform(X_val)

        model.fit(X_train, y_train, **fit_kwargs)
        metrics.append(metric(y_val, model.predict(X_val), confusion=True))

    return metrics

# Training models

## Baseline: Logistic Regression

In [9]:
# Untuned reference model: default LogisticRegression on standardized features,
# evaluated with 4-fold time-series cross-validation.
preprocessor = StandardScaler()
logistic_regression = LogisticRegression(random_state=SEED)

metric_baseline_cv = performTimeSeriesCV(
    X, y,
    model=logistic_regression,
    metric=metric,
    preprocessor=preprocessor,
    n_splits=4,
)

baseline_mean_metric = round(np.mean([fold[0] for fold in metric_baseline_cv]), 2)
baseline_last_fold = metric_baseline_cv[-1]
print(f'Mean metric of baseline on cross validation: {baseline_mean_metric}')
print(f'\nconfusion matrix of last fold:\n{baseline_last_fold[1]}')

TimeSeriesCV:   0%|          | 0/4 [00:00<?, ?it/s]

Mean metric of baseline on cross validation: -390.2

confusion matrix of last fold:
[[ 262883 1789263  177482]
 [ 174429 2683674  166991]
 [ 198216 1805323  253069]]


### Hyperparameter tuning

Since we have a custom metric that pays more attention to classes 1 and -1, we want to take this into account during training, which can be achieved using **class_weight**. In addition, we will also search for the optimal **inverse regularization strength** (C; in scikit-learn, smaller values of C mean stronger regularization).

To speed up the search for optimal hyperparameters, we tune them on the first third of the data only.

In [10]:
# Tune on the first third of the rows only, to keep the grid search fast.
sample_size = len(X) // 3
X_sample, y_sample = X[:sample_size], y[:sample_size]

In [11]:
# Hyperparameter grid: every class-weight option (None, or increasingly heavy
# weights on classes 1 and -1) crossed with every value of C.
class_weights = [None, {1: 10, 0: 1, -1: 10}, {1: 100, 0: 1, -1: 100}, {1: 1000, 0: 1, -1: 1000}]
C_list = [10, 1, 0.1, 0.01]
gridsearch = list(itertools.product(class_weights, C_list))

# One (mean CV metric, last-fold confusion matrix) entry per grid point,
# stored in the same order as `gridsearch`.
gridsearch_results = []
preprocessor = StandardScaler()
for class_weight, C in tqdm(gridsearch):
    logistic_regression = LogisticRegression(random_state=SEED, class_weight=class_weight, C=C)
    # Dedicated name: reusing `metric_baseline_cv` here would overwrite the
    # baseline's cross-validation results with those of the last grid point.
    metric_gridsearch_cv = performTimeSeriesCV(X_sample, y_sample,
                                               logistic_regression, metric, preprocessor, n_splits=3)
    gridsearch_results.append((np.mean([x[0] for x in metric_gridsearch_cv]), metric_gridsearch_cv[-1][1]))

  0%|          | 0/16 [00:00<?, ?it/s]

TimeSeriesCV:   0%|          | 0/3 [00:00<?, ?it/s]

TimeSeriesCV:   0%|          | 0/3 [00:00<?, ?it/s]

TimeSeriesCV:   0%|          | 0/3 [00:00<?, ?it/s]

TimeSeriesCV:   0%|          | 0/3 [00:00<?, ?it/s]

TimeSeriesCV:   0%|          | 0/3 [00:00<?, ?it/s]

TimeSeriesCV:   0%|          | 0/3 [00:00<?, ?it/s]

TimeSeriesCV:   0%|          | 0/3 [00:00<?, ?it/s]

TimeSeriesCV:   0%|          | 0/3 [00:00<?, ?it/s]

TimeSeriesCV:   0%|          | 0/3 [00:00<?, ?it/s]

TimeSeriesCV:   0%|          | 0/3 [00:00<?, ?it/s]

TimeSeriesCV:   0%|          | 0/3 [00:00<?, ?it/s]

TimeSeriesCV:   0%|          | 0/3 [00:00<?, ?it/s]

TimeSeriesCV:   0%|          | 0/3 [00:00<?, ?it/s]

TimeSeriesCV:   0%|          | 0/3 [00:00<?, ?it/s]

TimeSeriesCV:   0%|          | 0/3 [00:00<?, ?it/s]

TimeSeriesCV:   0%|          | 0/3 [00:00<?, ?it/s]

In [12]:
# Select the grid point with the highest mean CV metric
# (np.argmax returns the first one in case of ties).
gridsearch_mean_metrics = [result[0] for result in gridsearch_results]
optim_index = np.argmax(gridsearch_mean_metrics)
optim_class_weight, optim_C = gridsearch[optim_index]
print(f'Optimal class weight: {optim_class_weight}, Optimal regularization strength: {optim_C}')

Optimal class weight: {1: 100, 0: 1, -1: 100}, Optimal regularization strength: 0.01


Let's evaluate the tuned model on the whole dataset and compare it with the baseline.

In [21]:
# Repeat the 4-fold time-series CV on the full data, this time with the
# hyperparameters selected by the grid search.
preprocessor = StandardScaler()
logistic_regression = LogisticRegression(
    C=optim_C,
    class_weight=optim_class_weight,
    random_state=SEED,
)

metric_baseline_cv_final = performTimeSeriesCV(
    X, y,
    model=logistic_regression,
    metric=metric,
    preprocessor=preprocessor,
    n_splits=4,
)

final_mean_metric = round(np.mean([fold[0] for fold in metric_baseline_cv_final]), 2)
final_last_fold = metric_baseline_cv_final[-1]
print('Mean metric of final baseline on cross validation: '
      f'{final_mean_metric}')
print(f'\nconfusion matrix of last fold:\n{final_last_fold[1]}')

Mean metric of final baseline on cross validation: 359.78

confusion matrix of last fold:
[[1192181       0 1037447]
 [1393439       0 1631655]
 [ 965093       0 1291515]]


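With the help of hyperparameter tuning, we managed to improve the mean cross-validation metric from **-390.2** to **359.78**. Note, however, that the middle column of the tuned model's last-fold confusion matrix is all zeros, which suggests that the tuned model no longer predicts the middle class at all; the improvement should be interpreted with this in mind.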

### Training on the whole dataset and saving the final model

In [27]:
preprocessor = StandardScaler()
logistic_regression = LogisticRegression(random_state=SEED, class_weight=optim_class_weight, C=optim_C)

# Keep the raw `X` untouched: overwriting it with the scaled data would make
# this cell non-idempotent (a re-run would fit the scaler on already
# standardized features) and would silently change `X` for every later cell.
X_scaled = preprocessor.fit_transform(X)
logistic_regression.fit(X_scaled, y)
del X_scaled  # only needed for fitting; free the extra copy

# The model was fit on standardized features, so the fitted scaler is persisted
# alongside it; without it the saved model cannot be applied to raw inputs.
joblib.dump(preprocessor, 'models/logistic_scaler.joblib')
joblib.dump(logistic_regression, 'models/logistic.joblib')

['models/logistic.joblib']

## Random Forest

In [None]:
pass  # TODO: placeholder, the Random Forest section is not implemented yet

## CatBoost

In [None]:
pass  # TODO: placeholder, the CatBoost section is not implemented yet

## Stacking

In [None]:
pass  # TODO: placeholder, the Stacking section is not implemented yet