In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import pyarrow as pa
import pyarrow.parquet as pq
import glob
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the train/test CSV tables, plus the stock_id=0 file of the test
# order-book and test trade parquet datasets, each as a pandas DataFrame.
# NOTE(review): these are absolute paths into one user's OneDrive folder; the
# notebook cannot run on another machine without editing them.
train = pd.read_csv('/Users/ericp/OneDrive/Documents/GitHub/Optiver/train.csv')
test = pd.read_csv('/Users/ericp/OneDrive/Documents/GitHub/Optiver/test.csv')
book_test = pq.read_table('/Users/ericp/OneDrive/Documents/GitHub/Optiver/book_test.parquet/stock_id=0/stock_zero.parquet').to_pandas()
trade_test = pq.read_table('/Users/ericp/OneDrive/Documents/GitHub/Optiver/trade_test.parquet/stock_id=0/stock_zero.parquet').to_pandas()

In [3]:
# List every entry directly under the training order-book and trade parquet
# directories (presumably one `stock_id=<n>` partition per entry - confirm
# against the data layout). Each name holds a list of path strings.
# NOTE(review): absolute per-user paths, same portability concern as the
# other data-loading paths in this notebook.
order_book_train = glob.glob('/Users/ericp/OneDrive/Documents/GitHub/Optiver/book_train.parquet/*')
trade_book_train = glob.glob('/Users/ericp/OneDrive/Documents/GitHub/Optiver/trade_train.parquet/*')

In [None]:
# Read every training order-book partition and stack them into one DataFrame,
# tagging each row with the stock id parsed from the partition path.
order_book_frames = []
for partition_path in order_book_train:
    partition = pq.read_table(partition_path).to_pandas()
    # The id is whatever follows the LAST '=' in the path; rsplit keeps this
    # correct even when a parent directory name happens to contain '='.
    # Storing it as an int up front avoids an object-dtype column and a
    # string-parsing pass over the whole concatenated frame afterwards.
    partition['stock_id'] = int(partition_path.rsplit('=', 1)[-1])
    order_book_frames.append(partition)
# ignore_index=False: each partition keeps its own row index, as before.
order_book = pd.concat(order_book_frames, ignore_index=False)

In [None]:
# Read every training trade partition and stack them into one DataFrame,
# tagging each row with the stock id parsed from the partition path.
trade_book_frames = []
for partition_path in trade_book_train:
    partition = pq.read_table(partition_path).to_pandas()
    # The id is whatever follows the LAST '=' in the path; rsplit keeps this
    # correct even when a parent directory name happens to contain '='.
    # Storing it as an int up front avoids an object-dtype column and a
    # string-parsing pass over the whole concatenated frame afterwards.
    partition['stock_id'] = int(partition_path.rsplit('=', 1)[-1])
    trade_book_frames.append(partition)
# ignore_index=False: each partition keeps its own row index, as before.
trade_book = pd.concat(trade_book_frames, ignore_index=False)

In [None]:
# Report the row/column count of every loaded table.
# Each frame supplies BOTH of its own dimensions; the previous version printed
# order_book's column count on the trade_book line.
for label, frame in [
    ('train', train),
    ('test', test),
    ('order_book', order_book),
    ('trade_book', trade_book),
    ('book_test', book_test),
    ('trade_test', trade_test),
]:
    n_rows, n_cols = frame.shape
    print('{} has {} rows and {} columns'.format(label, n_rows, n_cols))

In [None]:
# Preview the first rows of the train table.
train.head()

In [None]:
# Size-weighted average of the level-1 quotes: each side's price is weighted
# by the OPPOSITE side's size, then normalised by the combined level-1 size.
order_book['wt_avg'] = (
    order_book['bid_price1'].mul(order_book['ask_size1'])
    .add(order_book['ask_price1'].mul(order_book['bid_size1']))
    .div(order_book['bid_size1'].add(order_book['ask_size1']))
)

In [None]:
# Mean weighted-average price per (time_id, stock_id), kept as a one-column
# DataFrame indexed by that pair.
mean_price = (
    order_book
    .groupby(['time_id', 'stock_id'])['wt_avg']
    .mean()
    .to_frame()
)

In [None]:
# Preview the per-(time_id, stock_id) mean of wt_avg.
mean_price.head()

In [None]:
# Preview the first rows of the combined training trade table.
trade_book.head()

In [None]:
# Mean traded price per (time_id, stock_id), kept as a one-column DataFrame
# indexed by that pair.
mean_ex = (
    trade_book
    .groupby(['time_id', 'stock_id'])['price']
    .mean()
    .to_frame()
)

In [None]:
# Preview the first rows of the test table.
test.head()

In [None]:
# Preview the combined training order book (includes the stock_id and wt_avg
# columns added in earlier cells).
order_book.head()

In [None]:
# Pull out a single (stock_id = 0, time_id = 5) bucket from both the order
# book and the trade book to use as a worked example.
# The trade selection previously filtered on time_id twice and never on
# stock_id, so it contained every stock traded in time_id 5.
# .copy() gives independent frames so later cells can add columns to them
# without writing into a slice of the full tables.
train_df = order_book[
    (order_book['time_id'] == 5) & (order_book['stock_id'] == 0)
].copy()
train_exe_df = trade_book[
    (trade_book['time_id'] == 5) & (trade_book['stock_id'] == 0)
].copy()

In [None]:
# Plot the weighted-average order-book price of the sample bucket against the
# frame's row index. The title names the plotted quantity (it previously said
# "Volatility" although the series is a price).
fig, axs = plt.subplots(1)
fig.suptitle('Stock = 0, Time_ID = 5, Weighted Average Price')
axs.set_xlabel('Row index')
axs.set_ylabel('wt_avg')
axs.plot(train_df['wt_avg'])

In [None]:
# Plot the traded price of the sample bucket against the frame's row index.
# The title names the plotted quantity; it used to be a copy of the
# order-book figure's title, which made the two figures indistinguishable.
fig, axs = plt.subplots(1)
fig.suptitle('Stock = 0, Time_ID = 5, Trade Price')
axs.set_xlabel('Row index')
axs.set_ylabel('price')
axs.plot(train_exe_df['price'])

In [None]:
# Preview the order-book rows selected for stock_id 0 at time_id 5.
train_df.head()

In [None]:
def vol_calc_order(stock_prices):
    """Return the change in log price between consecutive observations.

    Parameters
    ----------
    stock_prices : pandas object (e.g. Series)
        Prices in row order. ``np.log`` is applied element-wise and the
        result must expose ``.diff()``.

    Returns
    -------
    Same type and index as the input. The first entry is NaN because it has
    no predecessor to difference against.
    """
    log_prices = np.log(stock_prices)
    return log_prices.diff()

In [None]:
# Attach log returns to both sample frames, then drop the rows where the
# return is missing (the first row of each series has no predecessor).
train_df = (
    train_df
    .assign(log_diff=vol_calc_order(train_df['wt_avg']))
    .dropna(subset=['log_diff'])
)
train_exe_df = (
    train_exe_df
    .assign(log_diff=vol_calc_order(train_exe_df['price']))
    .dropna(subset=['log_diff'])
)

In [None]:
# Plot the log returns of the weighted-average order-book price against the
# frame's row index. The title states which series is shown so this figure
# can be told apart from the trade-price one.
fig, axs = plt.subplots(1)
fig.suptitle('Log Differences of Weighted Average Price')
axs.set_xlabel('Row index')
axs.set_ylabel('log_diff')
axs.plot(train_df['log_diff'])

In [None]:
# Plot the log returns of the traded price against the frame's row index.
# The title states which series is shown so this figure can be told apart
# from the order-book one.
fig, axs = plt.subplots(1)
fig.suptitle('Log Differences of Trade Price')
axs.set_xlabel('Row index')
axs.set_ylabel('log_diff')
axs.plot(train_exe_df['log_diff'])

In [None]:
def realized_vol(log_diffs):
    """Return the square root of the sum of squared log returns.

    Parameters
    ----------
    log_diffs : array-like supporting ``** 2`` (e.g. Series or ndarray)
        Log returns for one bucket.

    Returns
    -------
    A non-negative float: ``sqrt(sum(r_i ** 2))``.
    """
    squared_returns = log_diffs ** 2
    total = np.sum(squared_returns)
    return np.sqrt(total)

In [None]:
# Report realized volatility for the sample bucket, once from order-book log
# returns and once from trade ("exe") log returns.
# The second format string used to end in '{}0', which appended a literal 0
# to the printed number.
print('Realized vol for Stock 0 over Time Period 5 is: {}'.format(realized_vol(train_df['log_diff'])))
print('Realized vol for Stock 0 over Time Period 5 exe is: {}'.format(realized_vol(train_exe_df['log_diff'])))

In [None]:
def RMSPE(vols):
    """Root mean squared percentage error between two columns of ``vols``.

    Parameters
    ----------
    vols : pandas.DataFrame
        Column 0 is treated as the actual value and column 1 as the
        prediction. NOTE(review): the column order is an assumption - confirm
        that callers pass the target first.

    Returns
    -------
    float
        ``sqrt(sum(((actual - predicted) / actual) ** 2) / len(vols))``.
        An actual value of 0 gives an infinite (or NaN) result.

    The previous body omitted the division by the actual value, so it
    returned the plain RMSE rather than a percentage error.
    """
    actual = vols.iloc[:, 0]
    predicted = vols.iloc[:, 1]
    pct_error = (actual - predicted) / actual
    return np.sqrt(np.sum(pct_error ** 2) / len(vols))