In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm

In [2]:
import os
import re
stock_ids = sorted(
    [int(re.sub('stock_id=', '', x)) for x in os.listdir('../book_train.parquet')])

In [3]:
train_targets = pd.read_csv("../train.csv")

In [4]:
stock_id = stock_ids[0]
book = pd.read_parquet('../book_train.parquet/stock_id=' + str(stock_id))
trades = pd.read_parquet('../trade_train.parquet/stock_id=' + str(stock_id))

In [5]:
def rmspe(y_true, y_pred):
    return  (np.sqrt(np.mean(np.square((y_true - y_pred) / y_true))))

In [6]:
def add_wap1(book):
    book['wap1'] = (book['bid_price1'] * book['ask_size1'] + book['ask_price1'] * book['bid_size1']) / (
              book['ask_size1'] + book['bid_size1'])

def add_wap2(book):
    book['wap2'] = (book['bid_price2'] * book['ask_size2'] + book['ask_price2'] * book['bid_size2']) / (
              book['ask_size2'] + book['bid_size2'])

def add_log_return1(book):
    book['log_price1'] = np.log(book['wap1'])
    book['log_return1'] = book.groupby(['time_id'])['log_price1'].diff()
    
def add_log_return2(book):
    book['log_price2'] = np.log(book['wap2'])
    book['log_return2'] = book.groupby(['time_id'])['log_price2'].diff()
    
def get_vol1(book):
    return book.groupby(['time_id'])[['log_return1']].apply(lambda x: np.sum(x**2)**0.5).rename(
    {'log_return1': 'vol1'})
def get_vol2(book):
    return book.groupby(['time_id'])[['log_return2']].apply(lambda x: np.sum(x**2)**0.5).rename(
    {'log_return2': 'vol2'})

In [10]:
def collect_vols(stock_ids, folder):
    vols = []
    for stock_id in tqdm(stock_ids):
        book = pd.read_parquet(folder + 'stock_id=' + str(stock_id))
        book = book[book.seconds_in_bucket > 300]
        add_wap1(book)
        add_wap2(book)
        add_log_return1(book)
        add_log_return2(book)
        vols_temp = pd.concat([get_vol1(book), get_vol2(book)], axis=1).reset_index()
        vols_temp['stock_id'] = stock_id
        vols.append(vols_temp)
    vols = pd.concat(vols)
    return vols

In [11]:
trades['snb_diff'] = trades.groupby('time_id')['seconds_in_bucket'].diff()

In [12]:
def collect_trades_stats(stock_ids, folder):
    trades_stats = []
    for stock_id in tqdm(stock_ids):
        trades = pd.read_parquet(folder + 'stock_id=' + str(stock_id))
        trades['snb_diff'] = trades.groupby('time_id')['seconds_in_bucket'].diff()
        #trades = trades[trades['seconds_in_bucket'] > 300]
        stats = trades.groupby(['time_id']).agg({
            'order_count': sum, 'size': sum, 'snb_diff': np.mean}).reset_index()
        stats['stock_id'] = stock_id
        trades_stats.append(stats)
        
    return pd.concat(trades_stats)

In [13]:
vols = collect_vols(stock_ids, "../book_train.parquet/")

100%|██████████| 112/112 [10:08<00:00,  5.44s/it]


In [14]:
trades_stats = collect_trades_stats(stock_ids, '../trade_train.parquet/')

100%|██████████| 112/112 [01:09<00:00,  1.60it/s]


In [15]:
merged = pd.merge(vols, train_targets, on=['time_id', 'stock_id'], how='inner')
merged = pd.merge(merged, trades_stats, on=['time_id', 'stock_id'], how='left')

In [16]:
from sklearn.model_selection import train_test_split

In [101]:
cols = ['log_return1', 'log_return2', 'order_count','size', 'snb_diff', 'stock_id']
#X_train, X_test, y_train, y_test = train_test_split(merged[cols], merged['target'])
time_ids = merged['time_id'].unique()
test_size = 0.25
test_time_ids = np.random.choice(range(len(time_ids)), size=int(len(time_ids) * test_size))
train_time_ids = np.delete(range(len(time_ids)), test_time_ids)

X_train = merged.loc[merged['time_id'].apply(lambda x: x in train_time_ids), cols]
y_train = merged.loc[merged['time_id'].apply(lambda x: x in train_time_ids), 'target']

X_test = merged.loc[merged['time_id'].apply(lambda x: x in test_time_ids), cols]
y_test = merged.loc[merged['time_id'].apply(lambda x: x in test_time_ids), 'target']

In [102]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures

In [103]:
import lightgbm as lgb

In [104]:
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.metrics import make_scorer

my_scorer = make_scorer(rmspe, greater_is_better=False)
def lgb_scorer(y_true, y_pred, weights):
    return 'rmspe', rmspe(y_true, y_pred), False

In [105]:
reg = lgb.LGBMRegressor(n_estimators=1000, 
                        learning_rate=0.02, num_leaves=7)
reg.fit(X_train, y_train, 
        sample_weight=1/np.square(y_train), 
        eval_set=(X_test.values, y_test.values),
        eval_metric=lgb_scorer)

[1]	valid_0's l2: 1.74896e-05	valid_0's rmspe: 0.5606
[2]	valid_0's l2: 1.71117e-05	valid_0's rmspe: 0.552817
[3]	valid_0's l2: 1.67467e-05	valid_0's rmspe: 0.545194
[4]	valid_0's l2: 1.63935e-05	valid_0's rmspe: 0.537806
[5]	valid_0's l2: 1.60653e-05	valid_0's rmspe: 0.530547
[6]	valid_0's l2: 1.57229e-05	valid_0's rmspe: 0.523437
[7]	valid_0's l2: 1.54156e-05	valid_0's rmspe: 0.51653
[8]	valid_0's l2: 1.51057e-05	valid_0's rmspe: 0.509802
[9]	valid_0's l2: 1.47912e-05	valid_0's rmspe: 0.503296
[10]	valid_0's l2: 1.45126e-05	valid_0's rmspe: 0.496884
[11]	valid_0's l2: 1.42068e-05	valid_0's rmspe: 0.490566
[12]	valid_0's l2: 1.39454e-05	valid_0's rmspe: 0.484475
[13]	valid_0's l2: 1.36765e-05	valid_0's rmspe: 0.478494
[14]	valid_0's l2: 1.34143e-05	valid_0's rmspe: 0.472733
[15]	valid_0's l2: 1.31886e-05	valid_0's rmspe: 0.467045
[16]	valid_0's l2: 1.29583e-05	valid_0's rmspe: 0.461492
[17]	valid_0's l2: 1.27352e-05	valid_0's rmspe: 0.456092
[18]	valid_0's l2: 1.25024e-05	valid_0's rm

[262]	valid_0's l2: 3.27831e-06	valid_0's rmspe: 0.264311
[263]	valid_0's l2: 3.27805e-06	valid_0's rmspe: 0.264295
[264]	valid_0's l2: 3.27494e-06	valid_0's rmspe: 0.264248
[265]	valid_0's l2: 3.27374e-06	valid_0's rmspe: 0.264237
[266]	valid_0's l2: 3.27066e-06	valid_0's rmspe: 0.26422
[267]	valid_0's l2: 3.26809e-06	valid_0's rmspe: 0.264176
[268]	valid_0's l2: 3.265e-06	valid_0's rmspe: 0.264142
[269]	valid_0's l2: 3.26335e-06	valid_0's rmspe: 0.264121
[270]	valid_0's l2: 3.26049e-06	valid_0's rmspe: 0.264103
[271]	valid_0's l2: 3.25757e-06	valid_0's rmspe: 0.26406
[272]	valid_0's l2: 3.25568e-06	valid_0's rmspe: 0.264028
[273]	valid_0's l2: 3.25564e-06	valid_0's rmspe: 0.264031
[274]	valid_0's l2: 3.25304e-06	valid_0's rmspe: 0.264003
[275]	valid_0's l2: 3.25015e-06	valid_0's rmspe: 0.26398
[276]	valid_0's l2: 3.2466e-06	valid_0's rmspe: 0.263951
[277]	valid_0's l2: 3.24403e-06	valid_0's rmspe: 0.263923
[278]	valid_0's l2: 3.24399e-06	valid_0's rmspe: 0.263926
[279]	valid_0's l2: 

[406]	valid_0's l2: 3.12938e-06	valid_0's rmspe: 0.262304
[407]	valid_0's l2: 3.12844e-06	valid_0's rmspe: 0.262284
[408]	valid_0's l2: 3.12829e-06	valid_0's rmspe: 0.262275
[409]	valid_0's l2: 3.12892e-06	valid_0's rmspe: 0.262267
[410]	valid_0's l2: 3.12886e-06	valid_0's rmspe: 0.262249
[411]	valid_0's l2: 3.12861e-06	valid_0's rmspe: 0.262243
[412]	valid_0's l2: 3.12847e-06	valid_0's rmspe: 0.262231
[413]	valid_0's l2: 3.12707e-06	valid_0's rmspe: 0.262218
[414]	valid_0's l2: 3.12588e-06	valid_0's rmspe: 0.262212
[415]	valid_0's l2: 3.12609e-06	valid_0's rmspe: 0.262209
[416]	valid_0's l2: 3.12563e-06	valid_0's rmspe: 0.2622
[417]	valid_0's l2: 3.12549e-06	valid_0's rmspe: 0.262192
[418]	valid_0's l2: 3.12449e-06	valid_0's rmspe: 0.262183
[419]	valid_0's l2: 3.12275e-06	valid_0's rmspe: 0.262166
[420]	valid_0's l2: 3.12251e-06	valid_0's rmspe: 0.26216
[421]	valid_0's l2: 3.12237e-06	valid_0's rmspe: 0.262153
[422]	valid_0's l2: 3.12222e-06	valid_0's rmspe: 0.262138
[423]	valid_0's l

[563]	valid_0's l2: 3.07883e-06	valid_0's rmspe: 0.261238
[564]	valid_0's l2: 3.07769e-06	valid_0's rmspe: 0.26124
[565]	valid_0's l2: 3.07749e-06	valid_0's rmspe: 0.261231
[566]	valid_0's l2: 3.07748e-06	valid_0's rmspe: 0.261227
[567]	valid_0's l2: 3.07752e-06	valid_0's rmspe: 0.261223
[568]	valid_0's l2: 3.07747e-06	valid_0's rmspe: 0.261213
[569]	valid_0's l2: 3.0774e-06	valid_0's rmspe: 0.261209
[570]	valid_0's l2: 3.07734e-06	valid_0's rmspe: 0.261195
[571]	valid_0's l2: 3.07706e-06	valid_0's rmspe: 0.261186
[572]	valid_0's l2: 3.077e-06	valid_0's rmspe: 0.261176
[573]	valid_0's l2: 3.07647e-06	valid_0's rmspe: 0.261156
[574]	valid_0's l2: 3.07638e-06	valid_0's rmspe: 0.261158
[575]	valid_0's l2: 3.0752e-06	valid_0's rmspe: 0.261153
[576]	valid_0's l2: 3.07516e-06	valid_0's rmspe: 0.261145
[577]	valid_0's l2: 3.07501e-06	valid_0's rmspe: 0.261141
[578]	valid_0's l2: 3.07496e-06	valid_0's rmspe: 0.261137
[579]	valid_0's l2: 3.07492e-06	valid_0's rmspe: 0.261135
[580]	valid_0's l2:

[848]	valid_0's l2: 3.02573e-06	valid_0's rmspe: 0.259983
[849]	valid_0's l2: 3.02543e-06	valid_0's rmspe: 0.259978
[850]	valid_0's l2: 3.02516e-06	valid_0's rmspe: 0.259971
[851]	valid_0's l2: 3.02528e-06	valid_0's rmspe: 0.259956
[852]	valid_0's l2: 3.02496e-06	valid_0's rmspe: 0.259959
[853]	valid_0's l2: 3.02505e-06	valid_0's rmspe: 0.259958
[854]	valid_0's l2: 3.02505e-06	valid_0's rmspe: 0.259972
[855]	valid_0's l2: 3.02491e-06	valid_0's rmspe: 0.259963
[856]	valid_0's l2: 3.02498e-06	valid_0's rmspe: 0.259961
[857]	valid_0's l2: 3.02483e-06	valid_0's rmspe: 0.259955
[858]	valid_0's l2: 3.02465e-06	valid_0's rmspe: 0.259946
[859]	valid_0's l2: 3.02484e-06	valid_0's rmspe: 0.259944
[860]	valid_0's l2: 3.02477e-06	valid_0's rmspe: 0.259943
[861]	valid_0's l2: 3.02482e-06	valid_0's rmspe: 0.259939
[862]	valid_0's l2: 3.02484e-06	valid_0's rmspe: 0.259937
[863]	valid_0's l2: 3.0248e-06	valid_0's rmspe: 0.25993
[864]	valid_0's l2: 3.02461e-06	valid_0's rmspe: 0.259927
[865]	valid_0's 

LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
              importance_type='split', learning_rate=0.02, max_depth=-1,
              min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
              n_estimators=1000, n_jobs=-1, num_leaves=7, objective=None,
              random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
              subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [106]:
rmspe(y_test, reg.predict(X_test)), rmspe(y_train, reg.predict(X_train))

(0.2594512598589408, 0.24215446511212016)