In [1]:
import time
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import lightgbm as lgb
import matplotlib.pyplot as plt

from copy import deepcopy
from collections import Counter
from lightgbm import LGBMClassifier
from sklearn.metrics import f1_score
from imblearn.over_sampling import SMOTE
from sklearn.metrics import roc_auc_score
from bayes_opt import BayesianOptimization

In [2]:
sns.set()
%matplotlib inline
warnings.simplefilter("ignore")
warnings.filterwarnings("ignore")

In [3]:
%%time
df_input_data = pd.read_csv('data_for_boosting.csv')

CPU times: user 10.4 s, sys: 812 ms, total: 11.2 s
Wall time: 10.6 s


In [4]:
df_input_data.head(3)

Unnamed: 0,volatility_atr,volume_adi,momentum_tsi,momentum_rsi,momentum_stoch,momentum_wr,returns_lag_1,minute,"value__fft_coefficient__coeff_0__attr_""abs""",close_returns_log,...,"value__fft_coefficient__coeff_2__attr_""angle""","value__fft_coefficient__coeff_8__attr_""abs""",value__number_crossing_m__m_0,value__c3__lag_2,"value__fft_aggregated__aggtype_""variance""",returns_25,returns_16,returns_26,datetime,target
0,2.232631,-62.084025,1.334591,76.659155,96.774194,-3.225806,0.000282,11,0.000283,0.0,...,162.231277,0.000309,26.0,-1.148034e-15,205.975487,0.001553,0.00047,0.001506,2019-07-21 05:11:34.542501,2
1,2.108871,-91.595042,2.166042,73.108003,93.548387,-6.451613,0.0,11,0.000309,-5e-06,...,170.244199,0.000323,27.0,-1.255613e-15,210.047304,0.001742,0.000329,0.001506,2019-07-21 05:11:47.914567,2
2,1.993952,7.445769,2.978572,74.472467,96.774194,-3.225806,-4.7e-05,13,0.000324,5e-06,...,177.708002,0.000337,26.0,-1.263754e-15,208.077874,0.001553,0.000376,0.001789,2019-07-21 05:13:10.418727,2


In [5]:
print(sum(df_input_data['target'] == 0))
print(sum(df_input_data['target'] == 1))
print(sum(df_input_data['target'] == 2))

295451
167440
360531


In [6]:
# Класс 0 против остальных
data_0_vs_other = deepcopy(df_input_data)
data_0_vs_other['target'].replace([1, 2], -1, inplace=True)
data_0_vs_other['target'].replace(0, 1, inplace=True)
data_0_vs_other['target'].replace(-1, 0, inplace=True)

In [7]:
# Класс 1 против остальных
data_1_vs_other = deepcopy(df_input_data)
data_1_vs_other['target'].replace([0, 2], -1, inplace=True)
data_1_vs_other['target'].replace(-1, 0, inplace=True)

In [8]:
# Класс 2 против остальных
data_2_vs_other = deepcopy(df_input_data)
data_2_vs_other['target'].replace([0, 1], -1, inplace=True)
data_2_vs_other['target'].replace(2, 1, inplace=True)
data_2_vs_other['target'].replace(-1, 0, inplace=True)

In [9]:
# Проверка количество объектов классов
print(sum(data_2_vs_other['target'] == 1))
print(sum(data_2_vs_other['target'] == 0))

360531
462891


In [10]:
train_0 = data_0_vs_other[data_0_vs_other['datetime'] < '2019-09-25']
train_1 = data_1_vs_other[data_1_vs_other['datetime'] < '2019-09-25']
train_2 = data_2_vs_other[data_2_vs_other['datetime'] < '2019-09-25']

In [11]:
test_0 = data_0_vs_other[data_0_vs_other['datetime'] >= '2019-09-25']
test_1 = data_1_vs_other[data_1_vs_other['datetime'] >= '2019-09-25']
test_2 = data_2_vs_other[data_2_vs_other['datetime'] >= '2019-09-25']

In [12]:
X_0 = train_0.drop(['target', 'datetime'], axis=1)
X_1 = train_1.drop(['target', 'datetime'], axis=1)
X_2 = train_1.drop(['target', 'datetime'], axis=1)

y_0 = train_0['target']
y_1 = train_1['target']
y_2 = train_2['target']

In [13]:
X_0_test = test_0.drop(['target', 'datetime'], axis=1)
X_1_test = test_1.drop(['target', 'datetime'], axis=1)
X_2_test = test_2.drop(['target', 'datetime'], axis=1)

y_0_test = test_0['target']
y_1_test = test_1['target']
y_2_test = test_2['target']

In [14]:
def lgb_eval(max_depth, 
             num_leaves,
             reg_alpha,
             reg_lambda):
    
    params = {'objective':'binary',
              'num_iterations':4000, 
              'learning_rate':0.05, 
              'early_stopping_round':100, 
              'metric':'auc'}
    
    params['max_depth'] = round(max_depth)
    params['num_leaves'] = round(num_leaves)
    params['reg_alpha'] = max(reg_alpha, 0)
    params['reg_lambda'] = max(reg_lambda, 0)
    
    cv_result = lgb.cv(params, 
                       train_data, 
                       nfold=n_folds, 
                       seed=random_seed, 
                       stratified=True, 
                       verbose_eval =200, 
                       metrics=['auc'])
    
    return max(cv_result['auc-mean'])

In [15]:
lgbBO = BayesianOptimization(lgb_eval, {'max_depth': (5, 8.99),
                                        'num_leaves': (24, 45),                                        
                                        'reg_alpha': (0, 5),
                                        'reg_lambda': (0, 3)}, 
                             random_state=0)

In [16]:
def bayes_parameter_opt_lgb(X, 
                            y, 
                            init_round=15, 
                            opt_round=25, 
                            n_folds=5, 
                            random_seed=6, 
                            n_estimators=10000, 
                            learning_rate=0.05, 
                            output_process=False):
    # prepare data
    train_data = lgb.Dataset(data=X, 
                             label=y, 
                             free_raw_data=False)
    # parameters
    
    def lgb_eval(max_depth,
                 num_leaves,
                 reg_alpha,
                 reg_lambda):
        
        params = {'objective':'binary',
                  'num_iterations': n_estimators, 
                  'learning_rate':learning_rate, 
                  'early_stopping_round':100, 
                  'metric':'auc'}
        
        params['max_depth'] = int(round(max_depth))
        params['num_leaves'] = int(round(num_leaves))
        params['reg_alpha'] = max(reg_alpha, 0)
        params['reg_lambda'] = max(reg_lambda, 0)
        
        cv_result = lgb.cv(params, 
                           train_data, 
                           nfold=n_folds, 
                           seed=random_seed,
                           stratified=True, 
                           verbose_eval =200, 
                           metrics=['auc'])
        return max(cv_result['auc-mean'])
    
    # range
    
    lgbBO = BayesianOptimization(lgb_eval, {'max_depth': (5, 8.99),
                                            'num_leaves': (24, 45),
                                            'reg_alpha': (0, 5),
                                            'reg_lambda': (0, 3)}, 
                             random_state=0)
    
    # optimize
    lgbBO.maximize(init_points=init_round, n_iter=opt_round)
    
    if output_process==True: lgbBO.points_to_csv("bayes_opt_result.csv")
    
    return lgbBO

In [17]:
opt_params = bayes_parameter_opt_lgb(X_2, 
                                     y_2, 
                                     init_round=5, 
                                     opt_round=10, 
                                     n_folds=3, 
                                     random_seed=6, 
                                     n_estimators=100, 
                                     learning_rate=0.05)

|   iter    |  target   | max_depth | num_le... | reg_alpha | reg_la... |
-------------------------------------------------------------------------
| [0m 1       [0m | [0m 0.2132  [0m | [0m 7.19    [0m | [0m 39.02   [0m | [0m 3.014   [0m | [0m 1.635   [0m |
| [95m 2       [0m | [95m 0.2133  [0m | [95m 6.69    [0m | [95m 37.56   [0m | [95m 2.188   [0m | [95m 2.675   [0m |
| [95m 3       [0m | [95m 0.2136  [0m | [95m 8.845   [0m | [95m 32.05   [0m | [95m 3.959   [0m | [95m 1.587   [0m |
| [0m 4       [0m | [0m 0.2128  [0m | [0m 7.266   [0m | [0m 43.44   [0m | [0m 0.3552  [0m | [0m 0.2614  [0m |
| [95m 5       [0m | [95m 0.2146  [0m | [95m 5.081   [0m | [95m 41.49   [0m | [95m 3.891   [0m | [95m 2.61    [0m |
| [95m 6       [0m | [95m 0.215   [0m | [95m 5.07    [0m | [95m 24.01   [0m | [95m 4.957   [0m | [95m 0.1629  [0m |
| [0m 7       [0m | [0m 0.2149  [0m | [0m 5.056   [0m | [0m 24.01   [0m | [0m 0.5153  

In [18]:
opt_params.max['params']

{'max_depth': 5.077962147614763,
 'num_leaves': 24.086236609046164,
 'reg_alpha': 4.917677030984656,
 'reg_lambda': 2.7393537072172807}

In [38]:
model_params_0 = {'max_depth': int(8.979840829430485),
                  'num_leaves': int(44.618691354216),
                  'boosting_type': 'gbdt',
                  'reg_alpha': 0.2282418956372323,
                  'reg_lambda': 0.059783627142429685
                 }

model_params_1 = {'max_depth': int(8.98131008415871),
                  'num_leaves': int(44.63581240702489),
                  'boosting_type': 'gbdt',
                  'reg_alpha': 0.13232465491703205,
                  'reg_lambda': 0.3189983143889408
                 }

model_params_2 = {'max_depth': int(8.92232352329105),
                  'num_leaves': int(44.699027026452086),
                  'boosting_type': 'gbdt',
                  'reg_alpha': 0.029777838712649607,
                  'reg_lambda': 0.2984579663785003
                 }
                            
optim_params = {'num_boost_round': 200,
                'num_iterations':4000,
                'learning_rate':0.05
               }

task_params = {'objective': 'binary',
               'metric': 'f1'
              }
    
device_params = {'device': 'cpu',
                 'num_threads': 4
                }

In [28]:
sm = SMOTE(random_state=42)

In [31]:
%%time
X_res_0, y_res_0 = sm.fit_resample(X_0, y_0)
print('Resampled dataset shape %s' % Counter(y_res_0))

Resampled dataset shape Counter({0: 314190, 1: 314190})


In [32]:
%%time
X_res_1, y_res_1 = sm.fit_resample(X_1, y_1)
print('Resampled dataset shape %s' % Counter(y_res_1))

Resampled dataset shape Counter({0: 393051, 1: 393051})
Wall time: 21.3 s


In [33]:
%%time
X_res_2, y_res_2 = sm.fit_resample(X_2, y_2)
print('Resampled dataset shape %s' % Counter(y_res_2))

Resampled dataset shape Counter({1: 276079, 0: 276079})
Wall time: 1min


In [39]:
total_dict_0 = dict(**model_params_0, **task_params, **device_params, **optim_params)
total_dict_1 = dict(**model_params_1, **task_params, **device_params, **optim_params)
total_dict_2 = dict(**model_params_2, **task_params, **device_params, **optim_params)

In [42]:
%%time
clf_0 = LGBMClassifier(**total_dict_0)
clf_1 = LGBMClassifier(**total_dict_1)
clf_2 = LGBMClassifier(**total_dict_2)

Wall time: 0 ns


In [43]:
%%time
clf_0.fit(X_res_0, y_res_0)

Wall time: 4min 44s


LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
        device='cpu', importance_type='split', learning_rate=0.05,
        max_depth=8, metric='auc', min_child_samples=20,
        min_child_weight=0.001, min_split_gain=0.0, n_estimators=100,
        n_jobs=-1, num_boost_round=200, num_iterations=4000, num_leaves=44,
        num_threads=4, objective='binary', random_state=None,
        reg_alpha=0.2282418956372323, reg_lambda=0.059783627142429685,
        silent=True, subsample=1.0, subsample_for_bin=200000,
        subsample_freq=0)

In [45]:
%%time
clf_1.fit(X_res_1, y_res_1)

Wall time: 6min 6s


LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
        device='cpu', importance_type='split', learning_rate=0.05,
        max_depth=8, metric='auc', min_child_samples=20,
        min_child_weight=0.001, min_split_gain=0.0, n_estimators=100,
        n_jobs=-1, num_boost_round=200, num_iterations=4000, num_leaves=44,
        num_threads=4, objective='binary', random_state=None,
        reg_alpha=0.13232465491703205, reg_lambda=0.3189983143889408,
        silent=True, subsample=1.0, subsample_for_bin=200000,
        subsample_freq=0)

In [46]:
%%time
clf_2.fit(X_res_2, y_res_2)

Wall time: 4min 8s


LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
        device='cpu', importance_type='split', learning_rate=0.05,
        max_depth=8, metric='auc', min_child_samples=20,
        min_child_weight=0.001, min_split_gain=0.0, n_estimators=100,
        n_jobs=-1, num_boost_round=200, num_iterations=4000, num_leaves=44,
        num_threads=4, objective='binary', random_state=None,
        reg_alpha=0.029777838712649607, reg_lambda=0.2984579663785003,
        silent=True, subsample=1.0, subsample_for_bin=200000,
        subsample_freq=0)

In [52]:
%%time
y_pred_0 = clf_0.predict(X_0_test)
y_pred_1 = clf_1.predict(X_1_test)
y_pred_2 = clf_2.predict(X_2_test)

Wall time: 1min 8s


In [53]:
print(f1_score(y_0_test, y_pred_0))
print(f1_score(y_1_test, y_pred_1))
print(f1_score(y_2_test, y_pred_2))

0.40187012509620573
0.2967655341541279
0.5661416814697321


In [55]:
data_train = df_input_data[df_input_data['datetime'] < '2019-09-25']
data_test = df_input_data[df_input_data['datetime'] >= '2019-09-25']
X = data_train.drop(['target', 'datetime'], axis=1)
y = data_train['target']

In [56]:
%%time
X_res_1, y_res_1 = sm.fit_resample(X, y)

In [1]:
from mlxtend.classifier import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
import warnings

warnings.simplefilter('ignore')

lr = LogisticRegression()
sclf = StackingClassifier(classifiers=[clf_0, clf_1, clf_2], 
                          meta_classifier=lr)

print('3-fold cross validation:\n')

for clf, label in zip([clf_0, clf_1, clf_2, sclf], 
                      ['0', 
                       '1', 
                       '2',
                       'StackingClassifier']):

    scores = cross_val_score(clf, X_res_1, y_res_1, 
                                              cv=3, scoring='f1')
    print("Accuracy: %0.2f (+/- %0.2f) [%s]" 
          % (scores.mean(), scores.std(), label))