In [1]:
import gc
import copy
from tqdm import tqdm
import logging

import pandas as pd
import numpy as np

from optiver_trading_at_the_close.feature_engineering import FE
from optiver_trading_at_the_close.column_selector import ColumnSelector
from optiver_trading_at_the_close.memory_reduction import MemoryReduction
from optiver_trading_at_the_close.mean_regressor_ensemble import MeanRegressorEnsemble

from sklearn.inspection import permutation_importance
from sklearn.metrics import mean_absolute_error
from sklearn.pipeline import Pipeline

import lightgbm as lgb
import catboost as cb

## Hard-Coded Variables

In [2]:
DATA_PATH = './../data/train.csv'

HYPERPARAMETERS = {
    'objective': 'mae',
    
    'boosting_type': 'gbdt',
    'max_bin': 256,
    'learning_rate': 0.015,
    'max_depth': 12,
    'n_estimators': 10000, #1400,
    'num_leaves': 300,
    'reg_alpha': 0.005,
    'reg_lambda': 0.001,
    'colsample_bytree': 0.6,
    'subsample': 0.875,
    'min_child_samples': 128,
    
    'random_state': 42,
    'n_jobs':-1,
    'verbose':-1
}


## Read Data

In [3]:
# Read the whole training CSV at DATA_PATH into a single DataFrame.
df = pd.read_csv(DATA_PATH)

In [4]:
# Preview the first rows to check which columns were loaded.
df.head()

Unnamed: 0,stock_id,date_id,seconds_in_bucket,imbalance_size,imbalance_buy_sell_flag,reference_price,matched_size,far_price,near_price,bid_price,bid_size,ask_price,ask_size,wap,target,time_id,row_id
0,0,0,0,3180602.69,1,0.999812,13380276.64,,,0.999812,60651.5,1.000026,8493.03,1.0,-3.029704,0,0_0_0
1,1,0,0,166603.91,-1,0.999896,1642214.25,,,0.999896,3233.04,1.00066,20605.09,1.0,-5.519986,0,0_0_1
2,2,0,0,302879.87,-1,0.999561,1819368.03,,,0.999403,37956.0,1.000298,18995.0,1.0,-8.38995,0,0_0_2
3,3,0,0,11917682.27,-1,1.000171,18389745.62,,,0.999999,2324.9,1.000214,479032.4,1.0,-4.0102,0,0_0_3
4,4,0,0,447549.96,-1,0.999532,17860614.95,,,0.999394,16485.54,1.000016,434.1,1.0,-7.349849,0,0_0_4


## Feature Engineering

In [5]:
# Rows without a label can neither be fitted on nor scored.
df = df.dropna(subset=['target'], axis=0)

# Time-based hold-out: dates up to and including the cut-off form the training
# set, every later date is held out.
TRAIN_END_DATE_ID = 420

X_train = df.loc[df['date_id'] <= TRAIN_END_DATE_ID]
X_test = df.loc[df['date_id'] > TRAIN_END_DATE_ID]

# 'target' is deliberately left inside the feature frames at this point: the
# pipeline's ColumnSelector step is configured with 'target' in `cols_to_drop`.
y_train = X_train['target']
y_test = X_test['target']

In [6]:
# The raw frame is not referenced again after the split; remove the name and
# run a garbage-collection pass so its memory can be reclaimed.
del df 
gc.collect()

0

In [7]:
# Preprocessing chain (feature engineering -> column selection -> memory
# reduction) followed by a deliberately tiny LightGBM model (one tree, depth 2).
# NOTE(review): the final estimator looks like a placeholder whose only purpose
# is to let `pipeline.fit` fit the preceding steps -- the following cell applies
# only the steps before it. Confirm before relying on its predictions.
pipeline = Pipeline(steps=[
    ('fe', FE()),
    # Disabled experiment, kept for reference: nearest-neighbour based features.
    # ('nn_features', NearestNeighborsFeatures(
    #     features_to_use_for_distance_computation=['seconds_in_bucket', 'wap', 'bid_plus_ask_sizes', 'bid_ask_size_imb'],
    #     get_target=True,
    #     features_get=['wap', 'bid_ask_size_imb'],
    #     n_neighbors=[40],
    #     metrics=['l1'],
    #     n_jobs=-1
    # )),
    # Identifier columns and the label itself are listed for removal here.
    ('column_selector', ColumnSelector(cols_to_drop=['time_id', 'row_id', 'date_id', 'target'])),
    ('memore_reduction', MemoryReduction()),
    ('mean_regressor_ensemble', MeanRegressorEnsemble([lgb.LGBMRegressor(n_estimators=1, max_depth=2)]))
])

# Fits every step on the training frame (which still contains 'target').
pipeline.fit(X_train, y_train)

100%|██████████| 1/1 [06:14<00:00, 374.92s/it]


In [8]:
# Push both frames through every fitted preprocessing step, leaving out the
# pipeline's final estimator.
for _step_name, transformer in pipeline.steps[:-1]:
    X_train = transformer.transform(X_train)
    X_test = transformer.transform(X_test)

## Recursive Feature Elimination (RFE)

In [9]:
def create_model(model, hyperparamters):
    """Instantiate an unfitted gradient-boosting regressor.

    Parameters
    ----------
    model : str
        Name of the model family: 'lightgbm' or 'catboost'.
    hyperparamters : dict
        Keyword arguments forwarded unchanged to the regressor constructor.

    Returns
    -------
    lgb.LGBMRegressor or cb.CatBoostRegressor
        A new, unfitted estimator.

    Raises
    ------
    ValueError
        If `model` is not one of the supported names.
    """
    if model == 'lightgbm':
        return lgb.LGBMRegressor(**hyperparamters)
    if model == 'catboost':
        return cb.CatBoostRegressor(**hyperparamters)

    # List every supported name and echo the rejected value so that a typo is
    # obvious from the message alone.
    raise ValueError(
        f"Not valid model: {model!r}. Valid models are 'lightgbm' and 'catboost'."
    )

In [10]:
def get_booster_importance(lightgbm, gain_threshold=500, zero_importance_penalty=1000):
    """Rank features by averaging their 'gain' and 'split' importance ranks.

    For each importance type the features are ordered from most to least
    important and given ranks 1..n. Two adjustments are then applied:

    * features whose importance is exactly 0 receive rank
      ``n + zero_importance_penalty`` so they sink to the bottom;
    * for 'gain' only, features with gain >= ``gain_threshold`` receive rank 0
      so they rise to the top regardless of their ordinal position.

    Parameters
    ----------
    lightgbm : fitted estimator exposing ``booster_``
        The booster must provide ``feature_name()`` and
        ``feature_importance(importance_type=...)``.
    gain_threshold : float, default 500
        Gain at or above which a feature is pinned to rank 0.
    zero_importance_penalty : int, default 1000
        Offset added to ``n`` for the rank of zero-importance features.

    Returns
    -------
    pandas.DataFrame
        Indexed by feature name with columns 'gain_importance',
        'split_importance' and 'average_importance' (row-wise mean of the two),
        sorted by 'average_importance' ascending (best features first).
    """
    booster = lightgbm.booster_
    feature_names = booster.feature_name()
    n_features = len(feature_names)

    results = pd.DataFrame(index=feature_names)

    for importance_type in ('gain', 'split'):
        rank_column = f'{importance_type}_importance'

        scores = pd.DataFrame(
            {importance_type: booster.feature_importance(importance_type=importance_type)},
            index=feature_names,
        ).sort_values(by=importance_type, ascending=False)

        # Ordinal rank: 1 for the largest importance value.
        scores[rank_column] = np.arange(1, n_features + 1)
        # Never-used features are pushed far below every used feature.
        scores.loc[scores[importance_type] == 0, rank_column] = n_features + zero_importance_penalty

        if importance_type == 'gain':
            # High-gain features are all treated as equally (and maximally) good.
            scores.loc[scores[importance_type] >= gain_threshold, rank_column] = 0

        # Assignment aligns on the feature-name index, restoring original order.
        results[rank_column] = scores[rank_column]

    results['average_importance'] = results.mean(axis=1)

    return results.sort_values(by='average_importance', ascending=True)


def get_booster_importance_damped_mean(lightgbm, damping_factor=10):
    """Score features by their damped mean gain per split.

    The plain ratio gain / split is shrunk towards the global ratio
    (total gain / total splits) by adding ``damping_factor`` pseudo-splits that
    each carry the global mean gain. Features that were never split on get a
    score of 0.

    Parameters
    ----------
    lightgbm : fitted estimator exposing ``booster_``
        The booster must provide ``feature_name()`` and
        ``feature_importance(importance_type=...)``.
    damping_factor : number, default 10
        Number of pseudo-splits added to every feature.

    Returns
    -------
    pandas.DataFrame
        Indexed by feature name with columns 'gain', 'split', 'simple_mean'
        and 'damped_mean', sorted by 'damped_mean' descending.
    """
    fitted_booster = lightgbm.booster_

    total_gain = fitted_booster.feature_importance(importance_type='gain')
    n_splits = fitted_booster.feature_importance(importance_type='split')
    table = pd.DataFrame(
        {'gain': total_gain, 'split': n_splits},
        index=fitted_booster.feature_name(),
    )

    # Undamped ratio, reported alongside the damped score.
    table['simple_mean'] = table['gain'] / table['split']

    # Average gain of a single split across the whole model.
    prior_mean = table['gain'].sum() / table['split'].sum()
    shrunk = (table['gain'] + damping_factor * prior_mean) / (table['split'] + damping_factor)

    # A feature without any split would otherwise inherit the prior alone.
    table['damped_mean'] = shrunk.where(table['split'] != 0, 0)

    return table.sort_values(by='damped_mean', ascending=False)

In [11]:
def RFE(X_train, y_train, X_test, y_test, model, nr_feats_to_remove_per_iter=25):
    """Recursive feature elimination driven by damped-mean booster importance.

    Each iteration deep-copies `model`, fits the copy on the current feature
    set with early stopping on ``(X_test, y_test)``, logs the MAE on
    ``X_test`` and drops the `nr_feats_to_remove_per_iter` features with the
    lowest damped-mean importance. The loop runs
    ``n_features // nr_feats_to_remove_per_iter`` times.

    Notes
    -----
    * ``X_train`` and ``X_test`` are modified IN PLACE: the dropped columns
      are removed from the caller's frames. This avoids holding a second copy
      of the data; pass copies if the full frames are needed afterwards.
    * The reported MAE is measured on the same data that drives early
      stopping.
    * ``fit`` is called with LightGBM callbacks and ``eval_metric``, so
      `model` is presumably expected to be a LightGBM scikit-learn estimator.
    * Progress is appended to the file 'RFE_Results' through a dedicated
      logger; the root logger is left untouched.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Feature frames with identical columns (mutated in place).
    y_train, y_test : array-like
        Targets matching the rows of the corresponding frame.
    model : estimator
        Unfitted template; it is deep-copied and never fitted itself.
    nr_feats_to_remove_per_iter : int, default 25
        Number of features removed after every fit. Must be >= 1.

    Returns
    -------
    pandas.DataFrame
        One row per iteration with columns 'iteration', 'n_features',
        'test_mae' and 'dropped_features'.

    Raises
    ------
    ValueError
        If `nr_feats_to_remove_per_iter` is smaller than 1.
    """
    if nr_feats_to_remove_per_iter < 1:
        raise ValueError('nr_feats_to_remove_per_iter must be at least 1.')

    # logging.basicConfig does nothing when the root logger already has
    # handlers, and otherwise switches the whole process to DEBUG. Configure
    # only this logger, and only once per kernel.
    logger = logging.getLogger('RFE_Results')
    if not logger.handlers:
        file_handler = logging.FileHandler('RFE_Results', mode='a')
        file_handler.setFormatter(logging.Formatter(
            fmt='%(asctime)s,%(msecs)d %(name)s %(levelname)s %(message)s',
            datefmt='%H:%M:%S',
        ))
        logger.addHandler(file_handler)
        logger.setLevel(logging.DEBUG)
        # Keep the records in the file only, whatever the root logger does.
        logger.propagate = False

    history = []
    n_iterations = int(X_train.shape[1] // nr_feats_to_remove_per_iter)

    for iteration in tqdm(range(n_iterations)):
        logger.info(f'------------------------------ ITERATION {iteration} ------------------------------')
        iteration_model = copy.deepcopy(model)
        n_features = X_train.shape[1]

        logger.info(f'Training model with following {n_features} vars: {list(X_train.columns)}')
        iteration_model.fit(
            X_train,
            y_train,
            eval_set=[(X_test, y_test)],
            callbacks=[lgb.callback.early_stopping(stopping_rounds=250)],
            eval_metric='mae'
        )

        logger.info('Computing test mae')
        test_mae = mean_absolute_error(y_test, iteration_model.predict(X_test))
        logger.info(f'TEST MAE = {test_mae}')

        # No permutation importance is involved here: the ranking comes from
        # the booster's own gain/split statistics.
        logger.info('Computing damped-mean booster feature importance')
        result = get_booster_importance_damped_mean(iteration_model)

        # `result` is sorted best-first, so the tail holds the weakest features.
        feats_to_drop = list(result.index[-nr_feats_to_remove_per_iter:])

        logger.info(f'Dropping worst {nr_feats_to_remove_per_iter} features: {feats_to_drop}')

        history.append({
            'iteration': iteration,
            'n_features': n_features,
            'test_mae': test_mae,
            'dropped_features': feats_to_drop,
        })

        # In place on purpose (see Notes): avoids a second copy of the frames.
        X_train.drop(feats_to_drop, axis=1, inplace=True)
        X_test.drop(feats_to_drop, axis=1, inplace=True)

    return pd.DataFrame(
        history,
        columns=['iteration', 'n_features', 'test_mae', 'dropped_features'],
    )
        
    

In [12]:
# Build the LightGBM regressor from the shared settings, then run the
# elimination loop on the transformed train/test frames.
estimator = create_model('lightgbm', HYPERPARAMETERS)

RFE(
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    model=estimator,
)

  0%|          | 0/24 [00:00<?, ?it/s]

Training until validation scores don't improve for 250 rounds
Early stopping, best iteration is:
[1229]	valid_0's l1: 5.67766
48.81903678943885


  4%|▍         | 1/24 [1:54:09<43:45:29, 6849.12s/it]

Training until validation scores don't improve for 250 rounds
Early stopping, best iteration is:
[1473]	valid_0's l1: 5.67676
44.68129324850854


  8%|▊         | 2/24 [11:14:29<138:05:32, 22596.92s/it]

Training until validation scores don't improve for 250 rounds
Early stopping, best iteration is:
[1408]	valid_0's l1: 5.67765
45.73917579594461


 12%|█▎        | 3/24 [12:31:46<83:58:35, 14395.95s/it] 

Training until validation scores don't improve for 250 rounds
Early stopping, best iteration is:
[1464]	valid_0's l1: 5.677
44.754645376230265


 17%|█▋        | 4/24 [14:29:59<63:57:37, 11512.85s/it]

Training until validation scores don't improve for 250 rounds
Early stopping, best iteration is:
[1437]	valid_0's l1: 5.67691
45.30267655430358


 21%|██        | 5/24 [16:42:55<54:01:50, 10237.38s/it]

Training until validation scores don't improve for 250 rounds
Early stopping, best iteration is:
[1641]	valid_0's l1: 5.67677
42.35273588498378


 25%|██▌       | 6/24 [18:21:48<43:52:14, 8774.13s/it] 

Training until validation scores don't improve for 250 rounds
Early stopping, best iteration is:
[1542]	valid_0's l1: 5.67758
43.6545839336061


 29%|██▉       | 7/24 [19:34:20<34:36:21, 7328.30s/it]

Training until validation scores don't improve for 250 rounds
Early stopping, best iteration is:
[1165]	valid_0's l1: 5.67795
49.96417683737637


 33%|███▎      | 8/24 [20:16:20<25:45:59, 5797.49s/it]

Training until validation scores don't improve for 250 rounds
Early stopping, best iteration is:
[1588]	valid_0's l1: 5.67722
42.697855648668245


 38%|███▊      | 9/24 [21:11:17<20:53:59, 5015.95s/it]

Training until validation scores don't improve for 250 rounds
Early stopping, best iteration is:
[1748]	valid_0's l1: 5.67751
40.69955710459465


 42%|████▏     | 10/24 [22:10:12<17:43:40, 4558.62s/it]

Training until validation scores don't improve for 250 rounds
Early stopping, best iteration is:
[1567]	valid_0's l1: 5.67907
42.502440811822936


 46%|████▌     | 11/24 [22:57:15<14:32:38, 4027.55s/it]

Training until validation scores don't improve for 250 rounds
Early stopping, best iteration is:
[1618]	valid_0's l1: 5.67825
41.76796886062487


 50%|█████     | 12/24 [23:41:25<12:01:41, 3608.49s/it]

Training until validation scores don't improve for 250 rounds
Early stopping, best iteration is:
[1645]	valid_0's l1: 5.67862
41.13949511202073


 54%|█████▍    | 13/24 [24:20:52<9:52:36, 3232.43s/it] 

Training until validation scores don't improve for 250 rounds
Early stopping, best iteration is:
[1292]	valid_0's l1: 5.68004
45.884101232865625


 58%|█████▊    | 14/24 [24:51:14<7:47:42, 2806.29s/it]

Training until validation scores don't improve for 250 rounds
Early stopping, best iteration is:
[1829]	valid_0's l1: 5.68135
37.8020784823685


 62%|██████▎   | 15/24 [25:23:54<6:22:40, 2551.13s/it]

Training until validation scores don't improve for 250 rounds
Early stopping, best iteration is:
[1458]	valid_0's l1: 5.68245
42.35951670903061


 67%|██████▋   | 16/24 [25:49:33<4:59:31, 2246.45s/it]

Training until validation scores don't improve for 250 rounds
Early stopping, best iteration is:
[1658]	valid_0's l1: 5.68116
39.350650581313694


 71%|███████   | 17/24 [26:14:55<3:56:40, 2028.62s/it]

Training until validation scores don't improve for 250 rounds
Early stopping, best iteration is:
[1318]	valid_0's l1: 5.68197
43.87670894774814


 75%|███████▌  | 18/24 [26:34:20<2:56:55, 1769.21s/it]

Training until validation scores don't improve for 250 rounds
Early stopping, best iteration is:
[1360]	valid_0's l1: 5.68125
42.39959035668813


 79%|███████▉  | 19/24 [26:52:04<2:09:46, 1557.27s/it]

Training until validation scores don't improve for 250 rounds
Early stopping, best iteration is:
[1148]	valid_0's l1: 5.68148
46.10626842967198


 83%|████████▎ | 20/24 [27:06:13<1:29:39, 1344.84s/it]

Training until validation scores don't improve for 250 rounds
Early stopping, best iteration is:
[1619]	valid_0's l1: 5.68081
37.211004501799366


 88%|████████▊ | 21/24 [27:21:21<1:00:41, 1213.77s/it]

Training until validation scores don't improve for 250 rounds
Early stopping, best iteration is:
[1428]	valid_0's l1: 5.68158
39.098387966468216


 92%|█████████▏| 22/24 [27:33:13<35:26, 1063.12s/it]  

Training until validation scores don't improve for 250 rounds
Early stopping, best iteration is:
[1324]	valid_0's l1: 5.69459
37.347075128995186


 96%|█████████▌| 23/24 [27:42:23<15:08, 908.94s/it] 

Training until validation scores don't improve for 250 rounds
Early stopping, best iteration is:
[843]	valid_0's l1: 5.70317
45.40247081088157


100%|██████████| 24/24 [27:47:28<00:00, 4168.69s/it]
