In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import pyarrow as pa
import pyarrow.parquet as pq
import glob
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Read the labelled training rows and the test rows from CSV.
train, test = (
    pd.read_csv('/Users/ericp/OneDrive/Documents/GitHub/Optiver/train.csv'),
    pd.read_csv('/Users/ericp/OneDrive/Documents/GitHub/Optiver/test.csv'),
)

# Read the stock_id=0 partition of the order book and of the trade book, then
# attach the stock id as an explicit column on each frame.
order_book = (
    pq.read_table('/Users/ericp/OneDrive/Documents/GitHub/Optiver/book_train.parquet/stock_id=0/')
    .to_pandas()
    .assign(stock_id=0)
)
trade_book = (
    pq.read_table('/Users/ericp/OneDrive/Documents/GitHub/Optiver/trade_train.parquet/stock_id=0/')
    .to_pandas()
    .assign(stock_id=0)
)

In [None]:
# List every entry directly under the order-book and trade-book parquet folders.
orderPath, tradePath = (
    glob.glob('/Users/ericp/OneDrive/Documents/GitHub/Optiver/book_train.parquet/*'),
    glob.glob('/Users/ericp/OneDrive/Documents/GitHub/Optiver/trade_train.parquet/*'),
)

In [None]:
# Keep only the rows with time_id == 5 in both books.
# .copy() makes each result an independent frame: later cells add columns to
# these frames, and writing to a filtered slice is chained assignment.
order_book = order_book.loc[order_book['time_id'] == 5].copy()
trade_book = trade_book.loc[trade_book['time_id'] == 5].copy()

In [None]:
# Report (rows, columns) for each loaded frame; DataFrame.shape is a 2-tuple,
# so it can be unpacked straight into the two placeholders.
print('train has {} rows and {} columns'.format(*train.shape))
print('test has {} rows and {} columns'.format(*test.shape))
print('order_book has {} rows and {} columns'.format(*order_book.shape))
print('trade_book has {} rows and {} columns'.format(*trade_book.shape))

In [None]:
# Preview the first 25 rows of the training frame.
train.head(n=25)

In [None]:
# Preview the first rows of the test frame (5 is the pandas default).
test.head(n=5)

In [None]:
# Preview the first rows of the filtered order book (5 is the pandas default).
order_book.head(n=5)

In [None]:
# Preview the first rows of the filtered trade book (5 is the pandas default).
trade_book.head(n=5)

In [None]:
# Size-weighted average of the level-1 bid and ask prices: each price is
# weighted by the size quoted on the opposite side of the book.
order_book['wt_avg'] = (
    order_book['bid_price1'].mul(order_book['ask_size1'])
    .add(order_book['ask_price1'].mul(order_book['bid_size1']))
    .div(order_book['bid_size1'].add(order_book['ask_size1']))
)

In [None]:
# Plot the weighted average price of the order book for stock 0, time_id 5.
# The title used to say "Volatility", but the series drawn is a price level,
# so the title and axis labels now name what is actually plotted.
fig, axs = plt.subplots(1)
fig.suptitle('Stock = 0, Time_ID = 5, Weighted Average Price')
axs.plot(order_book['wt_avg'])
axs.set_xlabel('order_book row index')
axs.set_ylabel('wt_avg');

In [None]:
# Plot the traded price for stock 0, time_id 5.
# The title used to say "Volatility" (copied from the order-book plot), but the
# series drawn is the trade price, so title and axis labels now say so.
fig, axs = plt.subplots(1)
fig.suptitle('Stock = 0, Time_ID = 5, Trade Price')
axs.plot(trade_book['price'])
axs.set_xlabel('trade_book row index')
axs.set_ylabel('price');

In [None]:
def logDiff(stock_prices):
    """Return the first difference of the natural log of ``stock_prices``.

    ``stock_prices`` must support ``.diff()`` after ``np.log`` (e.g. a pandas
    Series). The first element of the result is NaN because it has no
    predecessor.
    """
    log_prices = np.log(stock_prices)
    return log_prices.diff()

In [None]:
# Log differences of the order book's weighted average price; rows where the
# difference is null (the first row has no predecessor) are dropped.
order_book['logDifferences'] = logDiff(order_book['wt_avg'])
order_book = order_book.dropna(subset=['logDifferences'])

# Same treatment for the trade book, based on the traded price.
trade_book['logDifferences'] = logDiff(trade_book['price'])
trade_book = trade_book.dropna(subset=['logDifferences'])

In [None]:
# Line plot of the order book's log differences on a single axes.
fig, axs = plt.subplots(nrows=1)
fig.suptitle('Log Differences')
axs.plot(order_book.loc[:, 'logDifferences'])

In [None]:
# Line plot of the trade book's log differences on a single axes.
fig, axs = plt.subplots(nrows=1)
fig.suptitle('Log Differences')
axs.plot(trade_book.loc[:, 'logDifferences'])

In [None]:
def realized_vol(log_diffs):
    """Return the square root of the sum of squared ``log_diffs``."""
    sum_of_squares = np.sum(log_diffs ** 2)
    return np.sqrt(sum_of_squares)

In [None]:
# Realized volatility of each book's log differences for stock 0, time_id 5.
order_book_vol = realized_vol(order_book['logDifferences'])
trade_book_vol = realized_vol(trade_book['logDifferences'])
print('Realized vol for Stock 0 over Time Period 5 order_book is: {}'.format(order_book_vol))
print('Realized vol for Stock 0 over Time Period 5 trade_book is: {}'.format(trade_book_vol))

In [None]:
def RMSPE(vols, truth):
    """Root mean squared percentage error of ``vols`` measured against ``truth``.

    The error of each element is expressed relative to ``truth``, squared,
    averaged, and the square root of that average is returned.
    """
    relative_error = (vols - truth) / truth
    mean_squared = np.mean(np.square(relative_error))
    return np.sqrt(np.sum(mean_squared))

In [None]:
# Name of the column that will hold the realized-volatility estimate.
predictionColumn = 'target'
# Every entry directly under the order-book parquet folder.
orderPath = glob.glob(pathname='/Users/ericp/OneDrive/Documents/GitHub/Optiver/book_train.parquet/*')

In [None]:
def orderBookVol(path, predictionColumn):
    """Compute realized volatility per time_id from one order-book partition.

    Parameters
    ----------
    path : str
        Path to a parquet partition whose name ends in ``=<integer stock id>``
        (a trailing path separator is tolerated).
    predictionColumn : str
        Name given to the realized-volatility column of the result.

    Returns
    -------
    pandas.DataFrame
        Two columns: ``row_id`` (``"<stock_id>-<time_id>"``) and
        ``predictionColumn``.

    Raises
    ------
    ValueError
        If the text after the last ``=`` in ``path`` is not an integer.
    """
    order = pq.read_table(path).to_pandas()

    # Parse the stock id from the text after the LAST '=', ignoring trailing
    # separators, so an '=' elsewhere in the path or a trailing slash is harmless.
    stock_id = int(path.rstrip('/\\').rsplit('=', 1)[-1])
    order['stock_id'] = str(stock_id)
    order['time_id'] = order['time_id'].astype(str)
    order['row_id'] = order['stock_id'] + '-' + order['time_id']

    # Size-weighted average of the level-1 bid and ask prices.
    order['wt_avg'] = (order['bid_price1'] * order['ask_size1'] + order['ask_price1'] *
                       order['bid_size1']) / (order['bid_size1'] + order['ask_size1'])

    # transform() returns a result aligned with the original row index. With
    # groupby.apply the group key can be prepended to the index (pandas >= 2.0),
    # which makes the column assignment fail.
    order['logDifferences'] = order.groupby('time_id')['wt_avg'].transform(logDiff)
    # The first row of each time_id has no predecessor, so its difference is null.
    order = order[order['logDifferences'].notnull()]

    realized_vols = (
        order.groupby('row_id')['logDifferences']
        .agg(realized_vol)
        .reset_index()
        .rename(columns={'logDifferences': predictionColumn})
    )
    # Select by the requested column name instead of a hard-coded 'target'.
    return realized_vols[['row_id', predictionColumn]]

In [None]:
#train['row_id'] = train['stock_id'].astype(str) + '-' + train['time_id'].astype(str)
#train = train[['row_id', 'target']]

In [None]:
# Loop through all of the books and compute realized volatility for each one.
# The per-book frames are collected in a list and concatenated once;
# concatenating inside the loop re-copies the accumulated frame on every pass.
per_book_vols = []
for file in orderPath:
    per_book_vols.append(orderBookVol(file, predictionColumn))

# pd.concat raises on an empty list, so fall back to an empty frame as before.
response = pd.concat(per_book_vols, axis = 0) if per_book_vols else pd.DataFrame()


In [None]:
# `response` is keyed by row_id ("<stock_id>-<time_id>"). Build the same key on
# `train` when it is absent, so the merge has a column to join on. The guard
# keeps this cell safe to re-run.
if 'row_id' not in train.columns:
    train = train.assign(
        row_id = train['stock_id'].astype(str) + '-' + train['time_id'].astype(str)
    )

# Left join: one output row per train row. Both frames carry a 'target' column,
# so pandas suffixes them as target_x (from train) and target_y (from response).
df_joined = train.merge(response, on = ['row_id'], how = 'left')

In [None]:
# Preview the first rows of the merged frame (5 is the pandas default).
df_joined.head(n=5)

In [None]:
# target_x comes from train (left side of the merge) and target_y from the
# computed realized volatility in `response`. RMSPE(vols, truth) divides by its
# second argument, so the computed estimate goes first and the train value second.
print('The error rate for the naive model is {}'.format(RMSPE(df_joined['target_y'], df_joined['target_x'])))

In [None]:
# Reduce the merged frame to the row key and the computed estimate, exposing
# the estimate under the name 'target', then write it out without the index.
df_joined = (
    df_joined[['row_id', 'target_y']]
    .rename(columns = {'target_y': 'target'})
)
df_joined.to_csv('submission.csv', index = False)