In [1]:
#optiver volatility prediction
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import glob
import warnings
import seaborn as sns
warnings.filterwarnings('ignore')
pd.set_option("display.max_columns", 50)

In [2]:
# Single place to point the notebook at the competition data; the four reads
# below previously each repeated the full absolute path.
DATA_DIR = '/Users/ericp/OneDrive/Documents/GitHub/Optiver'

train = pd.read_csv(f'{DATA_DIR}/train.csv')
test = pd.read_csv(f'{DATA_DIR}/test.csv')
# Only the stock_id=0 partitions are loaded here, for exploration.
order_book = pd.read_parquet(f'{DATA_DIR}/book_train.parquet/stock_id=0')
trade_book = pd.read_parquet(f'{DATA_DIR}/trade_train.parquet/stock_id=0')

In [3]:
# Preview the first ten rows of the training table.
train.head(10)

Unnamed: 0,stock_id,time_id,target
0,0,5,0.004136
1,0,11,0.001445
2,0,16,0.002168
3,0,31,0.002195
4,0,62,0.001747
5,0,72,0.004912
6,0,97,0.009388
7,0,103,0.00412
8,0,109,0.002182
9,0,123,0.002669


In [4]:
# Preview the first three rows of the stock_id=0 order book.
order_book.head(3)

Unnamed: 0,time_id,seconds_in_bucket,bid_price1,ask_price1,bid_price2,ask_price2,bid_size1,ask_size1,bid_size2,ask_size2
0,5,0,1.001422,1.002301,1.00137,1.002353,3,226,2,100
1,5,1,1.001422,1.002301,1.00137,1.002353,3,100,2,100
2,5,5,1.001422,1.002301,1.00137,1.002405,3,100,2,100


In [5]:
# Preview the first three rows of the stock_id=0 trade book.
trade_book.head(3)

Unnamed: 0,time_id,seconds_in_bucket,price,size,order_count
0,5,21,1.002301,326,12
1,5,46,1.002778,128,4
2,5,50,1.002818,55,1


In [6]:
#create log diffs
def logDiff(stock_prices):
    """Return the first difference of the natural log of ``stock_prices``.

    ``stock_prices`` must be a pandas object (the result of ``np.log`` needs
    a ``.diff()`` method). The first element of the result is NaN because it
    has no predecessor.
    """
    log_prices = np.log(stock_prices)
    return log_prices.diff()

In [7]:
#create realized vols for each time / stock price
def realized_vol(log_diffs):
    """Return the square root of the sum of squared ``log_diffs``."""
    squared_moves = log_diffs ** 2
    total_variance = np.sum(squared_moves)
    return np.sqrt(total_variance)

In [8]:
#process the order book file
def preprocess_order(orderPath):
    """Load one order-book partition and add row-level price/size features.

    Parameters
    ----------
    orderPath : str
        Path to a parquet partition whose name ends in ``stock_id=<id>``.

    Returns
    -------
    pandas.DataFrame
        The rows read from ``orderPath`` plus these columns:
        ``stock_id`` (the text after the last ``=`` in the path),
        ``wap`` / ``wap2`` (size-weighted prices for book levels 1 and 2),
        ``logDifferences`` / ``logDifferences2`` (log returns of ``wap`` /
        ``wap2`` computed within each ``time_id``; first row of each
        ``time_id`` is NaN),
        ``volume_imbalance1`` / ``volume_imbalance2`` (bid size / ask size),
        ``spread``, ``bid_spread`` and ``ask_spread``.

    Raises
    ------
    IndexError
        If ``orderPath`` contains no ``=``.
    """
    stock = pd.read_parquet(orderPath)
    # rsplit: take the text after the LAST '=', so an '=' earlier in the path
    # cannot be mistaken for the partition key.
    stock['stock_id'] = orderPath.rsplit('=', 1)[1]

    # Each side's price is weighted by the opposite side's size.
    stock['wap'] = (stock['bid_price1'] * stock['ask_size1'] + stock['ask_price1'] * stock['bid_size1']) / (stock['bid_size1'] + stock['ask_size1'])
    stock['wap2'] = (stock['bid_price2'] * stock['ask_size2'] + stock['ask_price2'] * stock['bid_size2']) / (stock['bid_size2'] + stock['ask_size2'])

    # transform() always returns a result aligned to the frame's own index.
    # groupby.apply() prepends the group key to the index on pandas >= 2.0,
    # which makes the column assignment fail.
    stock['logDifferences'] = stock.groupby(['time_id'])['wap'].transform(logDiff)
    stock['logDifferences2'] = stock.groupby(['time_id'])['wap2'].transform(logDiff)

    stock['volume_imbalance1'] = stock['bid_size1'] / stock['ask_size1']
    stock['volume_imbalance2'] = stock['bid_size2'] / stock['ask_size2']
    stock['spread'] = stock['ask_price1'] - stock['bid_price1']
    stock['bid_spread'] = stock['bid_price1'] - stock['bid_price2']
    stock['ask_spread'] = stock['ask_price2'] - stock['ask_price1']

    return stock
    

In [9]:
#glob the two directories that hold the per-stock order and trade files
# glob returns paths in arbitrary order; sort both lists so that position i
# refers to the same partition name in each and orderPath[0] is reproducible.
orderPath = sorted(glob.glob('/users/ericp/OneDrive/Documents/GitHub/Optiver/book_train.parquet/*'))
tradePath = sorted(glob.glob('/users/ericp/OneDrive/Documents/GitHub/Optiver/trade_train.parquet/*'))

In [10]:
# Build the row-level features for the first order-book partition in orderPath.
stock = preprocess_order(orderPath[0])

In [11]:
# Inspect the engineered columns added by preprocess_order.
stock.head()

Unnamed: 0,time_id,seconds_in_bucket,bid_price1,ask_price1,bid_price2,ask_price2,bid_size1,ask_size1,bid_size2,ask_size2,stock_id,wap,wap2,logDifferences,logDifferences2,volume_imbalance1,volume_imbalance2,spread,bid_spread,ask_spread
0,5,0,1.001422,1.002301,1.00137,1.002353,3,226,2,100,0,1.001434,1.00139,,,0.013274,0.02,0.000879,5.2e-05,5.2e-05
1,5,1,1.001422,1.002301,1.00137,1.002353,3,100,2,100,0,1.001448,1.00139,1.4e-05,0.0,0.03,0.02,0.000879,5.2e-05,5.2e-05
2,5,5,1.001422,1.002301,1.00137,1.002405,3,100,2,100,0,1.001448,1.001391,0.0,1e-06,0.03,0.02,0.000879,5.2e-05,0.000103
3,5,6,1.001422,1.002301,1.00137,1.002405,3,126,2,100,0,1.001443,1.001391,-5e-06,0.0,0.02381,0.02,0.000879,5.2e-05,0.000103
4,5,7,1.001422,1.002301,1.00137,1.002405,3,126,2,100,0,1.001443,1.001391,0.0,0.0,0.02381,0.02,0.000879,5.2e-05,0.000103


In [None]:
# Same preview as the earlier trade_book.head(3) cell.
trade_book.head(3)

In [None]:
#preprocess the orderbook with aggregate stats
def preprocess_order_agg(stk):
    """Aggregate row-level order-book features to one row per ``time_id``.

    Parameters
    ----------
    stk : pandas.DataFrame
        Frame with the columns named in ``agg_stats`` below plus ``time_id``
        and ``stock_id`` (the output of ``preprocess_order``).

    Returns
    -------
    pandas.DataFrame
        One row per ``time_id``. Columns are ``time_id_``, one
        ``<column>_<statistic>`` column per aggregation, and ``row_id``
        formatted as ``"<stock_id>-<time_id>"``.
    """
    agg_stats = {
        'logDifferences':[realized_vol],
        'logDifferences2':[realized_vol],
        'wap': [np.mean, np.std],
        'wap2':[np.mean, np.std],
        'volume_imbalance1':[np.mean, np.std],
        'spread':[np.mean, np.std, np.min, np.max],
        'bid_spread':[np.mean, np.std],
        'ask_spread':[np.mean, np.std]
    }

    by_time = stk.groupby(['time_id'])
    df_agg = by_time.agg(agg_stats).reset_index()
    # Flatten the (column, statistic) MultiIndex; 'time_id' becomes 'time_id_'.
    df_agg.columns = ['_'.join(col) for col in df_agg.columns]

    # Look the stock id up by time_id instead of adding the raw-row stock_id
    # Series to the aggregated frame: that addition aligns on index labels and
    # only works when stk happens to carry a default 0..n-1 index.
    stock_ids = by_time['stock_id'].first().astype(str)
    df_agg['row_id'] = df_agg['time_id_'].map(stock_ids).astype(str) + '-' + df_agg['time_id_'].astype(str)
    #df_agg = df_agg.drop(['time_id_'], axis = 1)

    return df_agg

In [None]:
# Per-time_id aggregates for the stock loaded above.
agg_stats = preprocess_order_agg(stock)

In [None]:
def preprocess_trade(tradePath):
    """Load one trade partition and aggregate it to one row per ``time_id``.

    Parameters
    ----------
    tradePath : str
        Path to a parquet partition whose name ends in ``stock_id=<id>``.

    Returns
    -------
    pandas.DataFrame
        One row per ``time_id``. Columns are ``time_id_``, the price
        statistics, ``size_sum``, ``order_count_sum`` and ``row_id``
        formatted as ``"<stock_id>-<time_id>"``.

    Raises
    ------
    IndexError
        If ``tradePath`` contains no ``=``.
    """
    stk = pd.read_parquet(tradePath)
    # Text after the last '=' in the path is the stock id.
    stock_id = tradePath.rsplit('=', 1)[1]

    agg_stats = {
        'price': [np.mean, np.std, np.min, np.max],
        'size':[np.sum],
        'order_count':[np.sum]
    }

    df_agg = stk.groupby(['time_id']).agg(agg_stats).reset_index()
    # Flatten the (column, statistic) MultiIndex; 'time_id' becomes 'time_id_'.
    df_agg.columns = ['_'.join(col) for col in df_agg.columns]

    # Build row_id from the AGGREGATED time ids. Using the raw per-trade
    # time_id column here would label aggregated row i with the time_id of
    # the i-th raw trade, which is the wrong bucket.
    df_agg['row_id'] = stock_id + '-' + df_agg['time_id_'].astype(str)
    #df_agg = df_agg.drop(['time_id_'], axis = 1)

    return df_agg

In [None]:
# Per-time_id trade aggregates for the first trade partition in tradePath.
agg_stats2 = preprocess_trade(tradePath[0])

In [None]:
#time stats

def time_stats(stk, time_in_seconds):
    """Aggregate order-book features over the tail of each time bucket.

    Parameters
    ----------
    stk : pandas.DataFrame
        Row-level frame with ``time_id``, ``seconds_in_bucket``, ``stock_id``
        and the columns named in ``agg_stats`` below (the output of
        ``preprocess_order``).
    time_in_seconds : int
        Only rows with ``seconds_in_bucket`` strictly greater than this value
        are aggregated.

    Returns
    -------
    pandas.DataFrame
        One row per ``time_id`` present in the filtered rows. Every
        aggregated column (including the time id, which becomes
        ``time_id__<time_in_seconds>``) carries the suffix
        ``_<time_in_seconds>``; ``row_id`` is un-suffixed and formatted as
        ``"<stock_id>-<time_id>"``.
    """
    agg_stats = {
        'logDifferences':[realized_vol],
        'logDifferences2':[realized_vol],
        'wap': [np.mean, np.std],
        'wap2':[np.mean, np.std],
        'volume_imbalance1':[np.mean, np.std],
        'spread':[np.mean, np.std, np.min, np.max],
        'bid_spread':[np.mean, np.std],
        'ask_spread':[np.mean, np.std]
    }

    window = stk.query(f'seconds_in_bucket > {time_in_seconds}')
    by_time = window.groupby(['time_id'])

    time_df = by_time.agg(agg_stats).reset_index()
    # Flatten the (column, statistic) MultiIndex, then tag every column with
    # the window start so frames for different windows can be merged.
    time_df.columns = ['_'.join(col) for col in time_df.columns]
    time_df = time_df.add_suffix('_' + str(time_in_seconds))
    #time_df = time_df.drop([f'time_id__{time_in_seconds}'], axis = 1)

    # Build row_id from the AGGREGATED time ids. Adding the raw-row
    # stock_id/time_id Series to the aggregated frame aligns on index labels
    # and would label row i with the time_id of the i-th raw book row.
    time_col = f'time_id__{time_in_seconds}'
    stock_ids = by_time['stock_id'].first().astype(str)
    time_df['row_id'] = time_df[time_col].map(stock_ids).astype(str) + '-' + time_df[time_col].astype(str)

    return time_df

In [None]:
# One feature frame per window start (seconds into the bucket).
time_stats_0, time_stats_150, time_stats_300, time_stats_450 = (
    time_stats(stock, time_in_seconds = start) for start in (0, 150, 300, 450)
)

In [None]:
#merge all dfs
time_stats_0 = time_stats_0.merge(time_stats_150, how = 'left', left_on = 'time_id__0', right_on = 'time_id__150')
time_stats_0 = time_stats_0.merge(time_stats_300, how = 'left', left_on = 'time_id__0', right_on = 'time_id__300')
time_stats_0 = time_stats_0.merge(time_stats_450, how = 'left', left_on = 'time_id__0', right_on = 'time_id__450')

In [None]:
time_stats_0.columns

In [None]:
time_stats_0.drop(['time_id__0','time_id__150', 'time_id__300', 'time_id__450', 'row_id_x', 'row_id_y', 'row_id_y'], axis = 1, inplace = True)

In [None]:
# Confirm which columns remain after the drop.
time_stats_0.columns

In [None]:
# (rows, columns) of the merged single-stock feature frame.
time_stats_0.shape

In [None]:
#loop through all stocks
#glob glob the two file paths with all the trade and order files
orderPath = sorted(glob.glob('/users/ericp/OneDrive/Documents/GitHub/Optiver/book_train.parquet/*'))
tradePath = sorted(glob.glob('/users/ericp/OneDrive/Documents/GitHub/Optiver/trade_train.parquet/*'))

# glob order is arbitrary, so pair order and trade files by stock id instead
# of relying on both lists coming back in the same order.
trade_by_stock = {path.rsplit('=', 1)[1]: path for path in tradePath}

# Collect one frame per stock and concatenate once after the loop;
# concatenating inside the loop re-copies everything on each iteration.
stock_frames = []

for i, order in enumerate(orderPath, start = 1):
    stock_id = order.rsplit('=', 1)[1]
    stock = preprocess_order(order)

    #merge all dfs
    # The helpers' own 'row_id' columns are dropped before every merge so the
    # merges never generate clashing 'row_id_x'/'row_id_y' columns; a single
    # row_id is rebuilt below from the stock id and the merged time id.
    df = time_stats(stock, time_in_seconds = 0).drop(columns = 'row_id')
    id_columns = ['time_id__0']
    for window in (150, 300, 450):
        window_stats = time_stats(stock, time_in_seconds = window).drop(columns = 'row_id')
        df = df.merge(window_stats, how = 'left', left_on = 'time_id__0', right_on = f'time_id__{window}')
        id_columns.append(f'time_id__{window}')

    trade = trade_by_stock.get(stock_id)
    if trade is not None:
        trade_agg = preprocess_trade(trade).drop(columns = 'row_id')
        df = df.merge(trade_agg, how = 'left', left_on = 'time_id__0', right_on = 'time_id_')
        id_columns.append('time_id_')
    # A stock with no trade file simply gets NaN trade features after concat.

    df['row_id'] = stock_id + '-' + df['time_id__0'].astype(str)
    df = df.drop(columns = id_columns)
    stock_frames.append(df)

    print (i)

# Raises ValueError if no order files were found, instead of silently
# producing an empty frame that fails later.
df_final = pd.concat(stock_frames, axis = 0, ignore_index = True)

In [None]:
# (rows, columns) of the combined feature frame for all stocks.
df_final.shape

In [None]:
# Preview the combined feature frame.
df_final.head()

In [None]:
#lgb model
import lightgbm as lgb
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import train_test_split

# NOTE(review): the target here is the realized volatility of the SAME window
# the features are computed from, and X still contains the *_150/_300/_450
# realized volatilities of that window. Confirm this is intended rather than
# joining the `target` column of train.csv on row_id.
X = df_final.drop(['logDifferences_realized_vol_0', 'row_id'], axis = 1)
y = df_final['logDifferences_realized_vol_0']

# Fixed seed so the hold-out split (and every score below) is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)
# Labels now match the object printed (the X_train/X_test labels were swapped).
print('Shape of X_train is {}'.format(X_train.shape))
print('Shape of X_test is {}'.format(X_test.shape))
print('Shape of y_test is {}'.format(y_test.shape))
print('Shape of y_train is {}'.format(y_train.shape))

In [None]:
# Baseline: LightGBM regressor with the library's default hyper-parameters.
model_lgb = lgb.LGBMRegressor()
model_lgb.fit(X_train, y_train)

In [None]:
# Predictions of the baseline model on the hold-out split.
ypreds = model_lgb.predict(X_test)

In [None]:
#create RMSPE metric
def RMSPE(vols, truth):
    """Root mean squared percentage error of ``vols`` against ``truth``.

    Parameters
    ----------
    vols : array-like
        Predicted values.
    truth : array-like
        Reference values, same length as ``vols``. A zero in ``truth`` makes
        the result inf or NaN because each error is divided by ``truth``.

    Returns
    -------
    numpy.float64
        ``sqrt(mean(((vols - truth) / truth) ** 2))``.
    """
    # Compare element-by-element by position. Converting to plain arrays
    # stops pandas from aligning two Series on their index labels, which
    # yields NaN when the indices differ, and lets plain lists be passed.
    predicted = np.asarray(vols, dtype = float)
    actual = np.asarray(truth, dtype = float)
    return np.sqrt(np.mean(np.square((predicted - actual) / actual)))

In [None]:
# Print explicitly: this is not the last expression in the cell, so the
# notebook would otherwise compute the score and discard it.
print(RMSPE(ypreds, y_test))

from sklearn.model_selection import GridSearchCV
params = {
    'num_leaves': [7, 14, 21, 28, 31, 50],
    'learning_rate': [0.1, 0.03, 0.003],
    'max_depth': [-1, 3, 5],
    'n_estimators': [50, 100, 200, 500],
}

# 6 * 3 * 3 * 4 = 216 candidates, each fitted on 5 folds.
# NOTE(review): candidates are ranked by R^2 while the notebook reports RMSPE;
# confirm that tuning on a different metric than the one reported is intended.
grid = GridSearchCV(model_lgb, params, scoring='r2', cv = 5)
grid.fit(X_train, y_train)



In [None]:
# Rebinds `params`: from here on it holds the best combination found by the
# grid search, no longer the search grid itself.
params = grid.best_params_

In [None]:
# Display the selected hyper-parameters.
params

In [None]:
# Refit on the training split using the hyper-parameters stored in `params`.
model_lgb = lgb.LGBMRegressor(**params)
model_lgb.fit(X_train, y_train)

In [None]:
# Predictions of the refitted model on the hold-out split.
ypreds = model_lgb.predict(X_test)

In [None]:
# Hold-out RMSPE of the refitted model.
RMSPE(ypreds, y_test)

In [None]:
#create preds for test set
#loop through all stocks
#glob glob the two file paths with all the trade and order files
orderTest = glob.glob('/users/ericp/OneDrive/Documents/GitHub/Optiver/book_test.parquet/*')
tradeTest = glob.glob('/users/ericp/OneDrive/Documents/GitHub/Optiver/trade_test.parquet/*')

f_final = pd.DataFrame()

for (order, trade) in zip(orderPath, tradePath):
    stock = preprocess_order(order)
    trade_agg = preprocess_trade(trade)
    time_stats_0 = time_stats(stock, time_in_seconds = 0)
    time_stats_150 = time_stats(stock, time_in_seconds = 150)
    time_stats_300 = time_stats(stock, time_in_seconds = 300)
    time_stats_450 = time_stats(stock, time_in_seconds = 450)
    
    #merge all dfs
    time_stats_0 = time_stats_0.merge(time_stats_150, how = 'left', left_on = 'time_id__0', right_on = 'time_id__150')
    time_stats_0 = time_stats_0.merge(time_stats_300, how = 'left', left_on = 'time_id__0', right_on = 'time_id__300')
    time_stats_0 = time_stats_0.merge(time_stats_450, how = 'left', left_on = 'time_id__0', right_on = 'time_id__450')
    
    df = time_stats_0.merge(trade_agg, how = 'left', left_on = 'time_id__0', right_on = 'time_id_')
    
    
    df.drop(['time_id__0','time_id__150', 'time_id__300', 'time_id__450', 'time_id_','row_id_x', 'row_id_y', 'row_id_y'], axis = 1, inplace = True)
    
    df_final = pd.concat([df, df_final], axis = 0)


In [None]:
X = df_final.drop(['logDifferences_realized_vol_0', 'row_id'], axis = 1)
y = df_final['logDifferences_realized_vol_0']

In [None]:
model_lgb = lgb.LGBMRegressor(**params)
model_lgb.fit(X, y)
ypreds = pd.Series(model_lgb.predict(X_test), names = 'target')

In [None]:
submission = [df_final['row_id'], preds]