In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm

In [2]:
import os
import re
stock_ids = sorted(
    [int(re.sub('stock_id=', '', x)) for x in os.listdir('../book_train.parquet')])

In [3]:
train_targets = pd.read_csv("../train.csv")

In [4]:
stock_id = stock_ids[0]
book = pd.read_parquet('../book_train.parquet/stock_id=' + str(stock_id))
trades = pd.read_parquet('../trade_train.parquet/stock_id=' + str(stock_id))

In [5]:
def rmspe(y_true, y_pred):
    return  (np.sqrt(np.mean(np.square((y_true - y_pred) / y_true))))

In [6]:
def add_wap1(book):
    book['wap1'] = (book['bid_price1'] * book['ask_size1'] + book['ask_price1'] * book['bid_size1']) / (
              book['ask_size1'] + book['bid_size1'])

def add_wap2(book):
    book['wap2'] = (book['bid_price2'] * book['ask_size2'] + book['ask_price2'] * book['bid_size2']) / (
              book['ask_size2'] + book['bid_size2'])

def add_log_return1(book):
    book['log_price1'] = np.log(book['wap1'])
    book['log_return1'] = book.groupby(['time_id'])['log_price1'].diff()
    
def add_log_return2(book):
    book['log_price2'] = np.log(book['wap2'])
    book['log_return2'] = book.groupby(['time_id'])['log_price2'].diff()
    
def get_vol1(book):
    return book.groupby(['time_id'])[['log_return1']].apply(lambda x: np.sum(x**2)**0.5).rename(
    {'log_return1': 'vol1'})
def get_vol2(book):
    return book.groupby(['time_id'])[['log_return2']].apply(lambda x: np.sum(x**2)**0.5).rename(
    {'log_return2': 'vol2'})

In [7]:
def collect_vols(stock_ids, folder):
    vols = []
    for stock_id in tqdm(stock_ids):
        book = pd.read_parquet(folder + 'stock_id=' + str(stock_id))
        book = book[book.seconds_in_bucket > 300]
        add_wap1(book)
        add_wap2(book)
        add_log_return1(book)
        add_log_return2(book)
        vols_temp = pd.concat([get_vol1(book), get_vol2(book)], axis=1).reset_index()
        vols_temp['stock_id'] = stock_id
        vols.append(vols_temp)
    vols = pd.concat(vols)
    return vols

In [8]:
trades['snb_diff'] = trades.groupby('time_id')['seconds_in_bucket'].diff()

In [9]:
def collect_trades_stats(stock_ids, folder):
    trades_stats = []
    for stock_id in tqdm(stock_ids):
        trades = pd.read_parquet(folder + 'stock_id=' + str(stock_id))
        trades['snb_diff'] = trades.groupby('time_id')['seconds_in_bucket'].diff()
        #trades = trades[trades['seconds_in_bucket'] > 300]
        stats = trades.groupby(['time_id']).agg({
            'order_count': sum, 'size': sum, 'snb_diff': np.mean}).reset_index()
        stats['stock_id'] = stock_id
        trades_stats.append(stats)
        
    return pd.concat(trades_stats)

In [10]:
vols = collect_vols(stock_ids, "../book_train.parquet/")

100%|██████████| 112/112 [11:42<00:00,  6.27s/it]


In [11]:
trades_stats = collect_trades_stats(stock_ids, '../trade_train.parquet/')

100%|██████████| 112/112 [01:16<00:00,  1.46it/s]


In [12]:
merged = pd.merge(vols, train_targets, on=['time_id', 'stock_id'], how='inner')
merged = pd.merge(merged, trades_stats, on=['time_id', 'stock_id'], how='left')

In [17]:
from sklearn.model_selection import train_test_split
import lightgbm as lgb

In [43]:
from sklearn.metrics import make_scorer

my_scorer = make_scorer(rmspe, greater_is_better=False)
def lgb_scorer(y_true, y_pred, weights):
    return 'rmspe', rmspe(y_true, y_pred), False

In [135]:
res = {}
for stock_id in tqdm(stock_ids):
    cols = ['log_return1', 'log_return2', 'order_count','size', 'snb_diff']
    X_train, X_test, y_train, y_test = train_test_split(
        merged[merged.stock_id == stock_id][cols], merged[merged.stock_id == stock_id]['target'])
    
    clf = lgb.LGBMRegressor(num_leaves=7, learning_rate=0.05, n_estimators=100)
    clf.fit(X_train, y_train, sample_weight=1 / np.square(y_train))
    
    res[stock_id] = ((clf, X_train, X_test, y_train, y_test))


  0%|          | 0/112 [00:00<?, ?it/s][A
  2%|▏         | 2/112 [00:00<00:08, 13.51it/s][A
  4%|▎         | 4/112 [00:00<00:07, 13.71it/s][A
  5%|▌         | 6/112 [00:00<00:07, 13.78it/s][A
  7%|▋         | 8/112 [00:00<00:07, 13.63it/s][A
  9%|▉         | 10/112 [00:00<00:08, 12.14it/s][A
 11%|█         | 12/112 [00:00<00:07, 12.50it/s][A
 12%|█▎        | 14/112 [00:01<00:08, 11.98it/s][A
 14%|█▍        | 16/112 [00:01<00:08, 11.21it/s][A
 16%|█▌        | 18/112 [00:01<00:08, 10.48it/s][A
 18%|█▊        | 20/112 [00:01<00:08, 11.22it/s][A
 20%|█▉        | 22/112 [00:01<00:07, 11.91it/s][A
 21%|██▏       | 24/112 [00:02<00:07, 11.28it/s][A
 23%|██▎       | 26/112 [00:02<00:07, 11.71it/s][A
 25%|██▌       | 28/112 [00:02<00:07, 11.90it/s][A
 27%|██▋       | 30/112 [00:02<00:06, 12.06it/s][A
 29%|██▊       | 32/112 [00:02<00:06, 12.35it/s][A
 30%|███       | 34/112 [00:02<00:06, 12.83it/s][A
 32%|███▏      | 36/112 [00:02<00:05, 12.92it/s][A
 34%|███▍      | 38/112 

In [140]:
test_answers = []
test_true = []
for stock_id in res:
    reg = res[stock_id][0]
    X_test = res[stock_id][2]
    y_test = res[stock_id][4]
    test_answers.append(reg.predict(X_test))
    test_true.append(y_test)

In [137]:
train_answers = []
train_true = []
for stock_id in res:
    reg = res[stock_id][0]
    X_train = res[stock_id][1]
    y_train = res[stock_id][3]
    test_answers.append(reg.predict(X_train))
    test_true.append(y_train)

In [141]:
y_true = np.concatenate(test_true)
y_pred = np.concatenate(test_answers)

In [142]:
rmspe(y_true, y_pred)

0.26197571718962925

In [143]:
from sklearn.ensemble import RandomForestRegressor

In [145]:
cols = ['log_return1', 'log_return2', 'order_count','size', 'snb_diff']
X_train, X_test, y_train, y_test = train_test_split(
    merged[merged.stock_id == stock_id][cols], merged[merged.stock_id == stock_id]['target'])

clf = RandomForestRegressor(n_estimators=20, max_depth=5)
clf.fit(X_train, y_train, sample_weight=1 / np.square(y_train))

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=5, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=20, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)

In [162]:
clf = RandomForestRegressor(n_estimators=100, max_depth=4)
clf.fit(X_train, y_train, sample_weight=1 / np.square(y_train))
rmspe(y_test, clf.predict(X_test))

0.25126617264369977

In [164]:
clf = lgb.LGBMRegressor(max_depth=3, learning_rate=0.05, n_estimators=300)
clf.fit(X_train, y_train, sample_weight=1 / np.square(y_train))
rmspe(y_test, clf.predict(X_test))

0.25208127188004

In [None]:
rmspe(X_train)