In [1]:
#optiver volatility prediction
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import glob
import warnings
import seaborn as sns
# NOTE(review): this silences *every* warning for the whole kernel session,
# including pandas deprecation / chained-assignment warnings that can point at
# real bugs in the cells below. Consider narrowing it to a specific category.
warnings.filterwarnings('ignore')

In [2]:
# Root folder of the competition data. Kept in one place so the notebook can be
# pointed at another machine/location by editing a single line.
DATA_DIR = '/Users/ericp/OneDrive/Documents/GitHub/Optiver'

# Label files.
train = pd.read_csv(f'{DATA_DIR}/train.csv')
test = pd.read_csv(f'{DATA_DIR}/test.csv')

# Order-book and trade data for the stock_id=0 partition only.
order_book = pd.read_parquet(f'{DATA_DIR}/book_train.parquet/stock_id=0')
trade_book = pd.read_parquet(f'{DATA_DIR}/trade_train.parquet/stock_id=0')

In [3]:
# Preview the first rows of the training labels (stock_id, time_id, target).
train.head()

Unnamed: 0,stock_id,time_id,target
0,0,5,0.004136
1,0,11,0.001445
2,0,16,0.002168
3,0,31,0.002195
4,0,62,0.001747


In [4]:
# Preview the first rows of the stock_id=0 order book: two price levels
# (bid/ask price and size) per (time_id, seconds_in_bucket) row.
order_book.head()

Unnamed: 0,time_id,seconds_in_bucket,bid_price1,ask_price1,bid_price2,ask_price2,bid_size1,ask_size1,bid_size2,ask_size2
0,5,0,1.001422,1.002301,1.00137,1.002353,3,226,2,100
1,5,1,1.001422,1.002301,1.00137,1.002353,3,100,2,100
2,5,5,1.001422,1.002301,1.00137,1.002405,3,100,2,100
3,5,6,1.001422,1.002301,1.00137,1.002405,3,126,2,100
4,5,7,1.001422,1.002301,1.00137,1.002405,3,126,2,100


In [53]:
#preprocess a file
def preprocess(path, predictionColumn):
    """Build one row of order-book features per ``time_id`` for one stock.

    Parameters
    ----------
    path : str
        Path to a single order-book parquet partition whose name ends in
        ``stock_id=<id>`` (e.g. ``.../book_train.parquet/stock_id=0``).
    predictionColumn : str
        Name given to the realized-volatility column computed from the
        top-of-book weighted average price. The same statistic computed from
        the second book level is stored under ``predictionColumn + '2'``.

    Returns
    -------
    pandas.DataFrame
        One row per ``time_id`` with columns, in order: ``predictionColumn``,
        ``predictionColumn + '2'``, ``wt_avg1_mean``, ``wt_avg2_mean``,
        ``spread1_mean``, ``spread2_mean``, ``bid_spread_mean``,
        ``ask_spread_mean``, ``volume_imbalance_mean``,
        ``volume_imbalance2_mean``, ``total_volume_mean`` and ``row_id``
        (``"<stock_id>-<time_id>"``).

    Raises
    ------
    ValueError
        If ``path`` contains no ``=`` and therefore no stock id can be parsed.
    """
    # Take what follows the LAST '=' so that an '=' in a parent directory name
    # cannot be mistaken for the partition key.
    partition = str(path).rstrip('/\\')
    if '=' not in partition:
        raise ValueError(f"cannot parse stock_id from path: {path!r}")
    stock_id = partition.rsplit('=', 1)[-1]

    #read file
    file = pd.read_parquet(path)

    #create top of book wt avg (each price is weighted by the opposite side's size)
    file['wt_avg1'] = (
        file['bid_price1'] * file['ask_size1'] + file['ask_price1'] * file['bid_size1']
    ) / (file['bid_size1'] + file['ask_size1'])

    #create 2nd level wt avg
    file['wt_avg2'] = (
        file['bid_price2'] * file['ask_size2'] + file['ask_price2'] * file['bid_size2']
    ) / (file['bid_size2'] + file['ask_size2'])

    #create spread
    file['spread1'] = file['ask_price1'] - file['bid_price1']
    file['spread2'] = file['ask_price2'] - file['bid_price2']
    file['bid_spread'] = (file['bid_price1'] - file['bid_price2']).abs()
    file['ask_spread'] = (file['ask_price1'] - file['ask_price2']).abs()

    #total volume
    file['total_volume'] = (
        file['bid_size1'] + file['bid_size2'] + file['ask_size1'] + file['ask_size2']
    )

    #volume imbalances
    # NOTE(review): a zero ask size would give inf here and an inf mean below;
    # presumably book sizes are always positive - confirm against the data.
    file['volume_imbalance'] = file['bid_size1'] / file['ask_size1']
    file['volume_imbalance2'] = file['bid_size2'] / file['ask_size2']

    #create logdifferences
    # transform() returns a result aligned with the original row index, so it
    # can be assigned straight back as a column. groupby.apply() may instead
    # prepend the group key to the index (pandas >= 2.0), which breaks that.
    by_time = file.groupby('time_id')
    file['logDifferences'] = by_time['wt_avg1'].transform(logDiff)
    file['logDifferences2'] = by_time['wt_avg2'].transform(logDiff)

    # The first row of every time_id has no previous price, hence a NaN diff.
    file = file[file['logDifferences'].notnull()]

    # Named aggregation yields flat, explicitly named columns, so no positional
    # column renaming is needed and predictionColumn is actually honoured.
    df_agg = (
        file.groupby('time_id')
        .agg(**{
            predictionColumn: ('logDifferences', realized_vol),
            predictionColumn + '2': ('logDifferences2', realized_vol),
            'wt_avg1_mean': ('wt_avg1', 'mean'),
            'wt_avg2_mean': ('wt_avg2', 'mean'),
            'spread1_mean': ('spread1', 'mean'),
            'spread2_mean': ('spread2', 'mean'),
            'bid_spread_mean': ('bid_spread', 'mean'),
            'ask_spread_mean': ('ask_spread', 'mean'),
            'volume_imbalance_mean': ('volume_imbalance', 'mean'),
            'volume_imbalance2_mean': ('volume_imbalance2', 'mean'),
            'total_volume_mean': ('total_volume', 'mean'),
        })
        .reset_index()
    )

    #add row_id
    df_agg['row_id'] = stock_id + '-' + df_agg['time_id'].astype(str)

    # time_id is now encoded in row_id, so drop the standalone column.
    return df_agg.drop(columns=['time_id'])

In [6]:
#create log diffs
def logDiff(stock_prices):
    """Return the element-to-element change in the natural log of the prices.

    ``stock_prices`` must expose ``.diff()`` after ``np.log`` is applied
    (e.g. a pandas Series). The first element of the result has no
    predecessor and is therefore NaN.
    """
    log_prices = np.log(stock_prices)
    log_returns = log_prices.diff()
    return log_returns

In [7]:
#create realized vols for each time / stock price
def realized_vol(log_diffs):
    """Return the square root of the sum of squared ``log_diffs``.

    When ``log_diffs`` is a pandas Series, ``np.sum`` defers to ``Series.sum``,
    which skips NaN entries by default.
    """
    squared_returns = log_diffs ** 2
    sum_of_squares = np.sum(squared_returns)
    return np.sqrt(sum_of_squares)

In [54]:
# glob is already imported in the first cell, so it is not re-imported here.
# glob.glob returns matches in arbitrary order; sorting makes "the first
# partition" the same on every run and machine.
orderPath = sorted(glob.glob('/Users/ericp/OneDrive/Documents/GitHub/Optiver/book_train.parquet/*'))
if not orderPath:
    # Fail with a clear message rather than an IndexError on orderPath[0].
    raise FileNotFoundError('no order-book partitions found under book_train.parquet')

# Build the per-time_id feature table for the first partition only.
df = preprocess(orderPath[0], 'target')

In [55]:
# Preview the aggregated per-time_id features returned by preprocess().
df.head()

Unnamed: 0,target,target2,wt_avg1_mean,wt_avg2_mean,spread1_mean,spread2_mean,bid_spread_mean,ask_spread_mean,volume_imbalance_mean,volume_imbalance2_mean,total_volume_mean,row_id
0,0.004499,0.006999,1.003733,1.003668,0.000855,0.001182,0.000176,0.000151,12.256744,10.161549,323.471761,0-5
1,0.001204,0.002476,1.00024,1.000208,0.000393,0.00067,0.000142,0.000135,30.084782,7.376954,411.733668,0-11
2,0.002369,0.004801,0.999541,0.999682,0.000724,0.001118,0.000196,0.000198,3.572455,13.897751,416.31016,0-16
3,0.002574,0.003637,0.998819,0.998624,0.000861,0.001161,0.000191,0.000109,10.905228,1.06028,433.12605,0-31
4,0.001894,0.003257,0.999616,0.999624,0.000396,0.000695,0.00019,0.000109,21.260113,19.871256,344.457143,0-62
