In [33]:
import pandas as pd
import numpy as np
import plotly.express as pe
import glob
from sklearn.metrics import r2_score

In [34]:
def train_test(mode):
    """Read one of the competition's top-level CSV files into a DataFrame.

    Parameters
    ----------
    mode : str
        Base name of the CSV file; the notebook calls this with "train" or
        "test".

    Returns
    -------
    pandas.DataFrame
        The parsed contents of ``<mode>.csv`` from the competition input
        directory.
    """
    input_dir = '../input/optiver-realized-volatility-prediction/'
    return pd.read_csv(input_dir + mode + '.csv')

In [35]:
# Read train.csv through train_test and preview its first rows.
train = train_test("train")
train.head()

Unnamed: 0,stock_id,time_id,target
0,0,5,0.004136
1,0,11,0.001445
2,0,16,0.002168
3,0,31,0.002195
4,0,62,0.001747


In [36]:
# Read test.csv through train_test and preview its first rows.
test = train_test("test")
test.head()

Unnamed: 0,stock_id,time_id,row_id
0,0,4,0-4
1,0,32,0-32
2,0,34,0-34


In [37]:
# Collect every path directly under the book_train.parquet directory.
list_order_book_file_train = glob.glob('../input/optiver-realized-volatility-prediction/book_train.parquet/*')

In [38]:
# aggregate function
def wap2vol(price_serise):
    """Collapse a price series into one realized-volatility value.

    The prices are log-transformed and differenced to obtain consecutive log
    returns; the result is the square root of the sum of the squared returns.

    Parameters
    ----------
    price_serise : pandas.Series
        Prices in time order (must support ``.diff()`` after ``np.log``).

    Returns
    -------
    float
        Square root of the summed squared log returns.
    """
    log_prices = np.log(price_serise)
    # The first difference has no predecessor and is therefore NaN.
    squared_returns = log_prices.diff() ** 2
    return np.sqrt(np.sum(squared_returns))

In [39]:
def rel_vol_time_id(path):
    """Compute realized volatility per time_id for one order-book parquet path.

    Parameters
    ----------
    path : str
        Location handed to ``pd.read_parquet``; the loaded table must provide
        the columns ``time_id``, ``bid_price1``, ``ask_price1``, ``bid_size1``
        and ``ask_size1``.

    Returns
    -------
    pandas.Series
        One value per ``time_id`` (the index), obtained by applying
        ``wap2vol`` to the WAP values of that group.
    """
    book = pd.read_parquet(path)

    bid_price, ask_price = book["bid_price1"], book["ask_price1"]
    bid_size, ask_size = book["bid_size1"], book["ask_size1"]

    # Weighted average price: each side's price is weighted by the size
    # quoted on the opposite side.
    wap = (bid_price * ask_size + ask_price * bid_size) / (bid_size + ask_size)

    # Reduce the WAP series of every time_id to a single volatility figure.
    return book.assign(WAP=wap).groupby("time_id")["WAP"].agg(wap2vol)

In [40]:
%%time
# Build three row-aligned flat lists (one entry per stock/time_id pair) so a
# DataFrame can later be assembled from them in a single step.
stock_id, time_id, relvol = [], [], []
for book_path in list_order_book_file_train:
    # The stock id is the integer that follows the first '=' in the path.
    current_stock = int(book_path.split('=')[1])
    vol_by_time = rel_vol_time_id(book_path)
    # Repeat the stock id once per time_id so the three lists stay aligned.
    stock_id.extend([current_stock] * len(vol_by_time))
    time_id.extend(vol_by_time.index)
    relvol.extend(vol_by_time)

CPU times: user 3min 56s, sys: 11.6 s, total: 4min 8s
Wall time: 4min 16s


In [41]:
# Historical data: the realized volatility computed from the order books,
# one row per (stock_id, time_id).
past_volatility = pd.DataFrame(
    {"stock_id": stock_id, "time_id": time_id, "volatility": relvol}
)

# Left join so that every row of `train` is kept alongside its computed volatility.
joined = pd.merge(train, past_volatility, how='left', on=['stock_id', 'time_id'])

# Score the naive approach: use the computed volatility directly as the prediction.
naive_r2 = r2_score(y_true=joined['target'], y_pred=joined['volatility'])
R2 = round(naive_r2, 3)
print(f'The R2 score of the naive prediction for training set is {R2}')

The R2 score of the naive prediction for training set is 0.628


In [42]:
# Spot-check the joined rows for stock_id 0: target next to the computed volatility.
joined[joined['stock_id']==0].head()

Unnamed: 0,stock_id,time_id,target,volatility
0,0,5,0.004136,0.004499
1,0,11,0.001445,0.001204
2,0,16,0.002168,0.002369
3,0,31,0.002195,0.002574
4,0,62,0.001747,0.001894


In [43]:
def rmspe(y_true, y_pred):
    """Root mean squared percentage error between targets and predictions.

    Parameters
    ----------
    y_true : array-like supporting element-wise arithmetic
        Reference values; each error is divided by the matching entry.
    y_pred : array-like supporting element-wise arithmetic
        Predicted values, aligned with ``y_true``.

    Returns
    -------
    float
        ``sqrt(mean(((y_true - y_pred) / y_true) ** 2))``.
    """
    relative_error = (y_true - y_pred) / y_true
    mean_squared_error = np.mean(relative_error ** 2)
    return np.sqrt(mean_squared_error)

# Keep the score under its own name. Binding it to `rmspe` would replace the
# metric function with a float, so this cell would fail when run a second time
# and the function could no longer be called anywhere below.
rmspe_score = rmspe(joined["target"], joined["volatility"])
print(f'The RMSPE score of the native prediciton for the training set is {rmspe_score}')

The RMSPE score of the native prediciton for the training set is 0.34135452432884295


In [48]:
# key point
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures

def linear_training(X,y,degree):
    """Fit a weighted polynomial regression of ``y`` on a single feature ``X``.

    Parameters
    ----------
    X : array-like
        Values of the single input feature; reshaped into one column.
    y : array-like supporting ``np.square``
        Regression targets, aligned with ``X``.
    degree : int
        Degree handed to ``PolynomialFeatures``.

    Returns
    -------
    LinearRegression
        The fitted estimator. The target is passed as a column vector, so
        ``predict`` returns a 2-D array.
    """
    feature_column = np.array(X).reshape(-1, 1)
    design_matrix = PolynomialFeatures(degree=degree).fit_transform(feature_column)
    target_column = np.array(y).reshape(-1, 1)

    # Weighting each squared residual by 1/y^2 turns the least-squares
    # objective into a sum of squared relative errors.
    weights = 1 / np.square(y)

    regressor = LinearRegression()
    return regressor.fit(design_matrix, target_column, sample_weight=weights)


In [49]:
# Polynomial degree shared by training and inference.
degree = 2

# Fit one regression per stock, mapping the computed volatility to the target.
stock_id_train = train.stock_id.unique()
models = {}
for stock in stock_id_train:
    stock_rows = joined.loc[joined['stock_id'] == stock]
    models[stock] = linear_training(
        stock_rows['volatility'], stock_rows['target'], degree
    )


In [54]:
%%time 
order_book_test = glob.glob('../input/optiver-realized-volatility-prediction/book_test.parquet/*')

# Use test-specific accumulator names. Reusing `stock_id`, `time_id` and
# `relvol` would overwrite the training lists of the same name, and re-running
# the cell that builds `past_volatility` afterwards would then silently use
# test data.
test_stock_ids = []
test_time_ids = []
test_relvols = []
for book_path in order_book_test:
    # The stock id is the integer that follows the first '=' in the path,
    # e.g. ".../book_test.parquet/stock_id=0" -> 0.
    current_stock = int(book_path.split("=")[1])
    # Realized volatility for every time_id of this stock.
    vol_by_time = rel_vol_time_id(book_path)
    # Repeat the stock id once per time_id so the three lists stay aligned.
    test_stock_ids.extend([current_stock] * len(vol_by_time))
    test_time_ids.extend(vol_by_time.index)
    test_relvols.extend(vol_by_time)

past_test_volatility = pd.DataFrame(
    {"stock_id": test_stock_ids, "time_id": test_time_ids, "volatility": test_relvols}
)

CPU times: user 20.8 ms, sys: 60.6 ms, total: 81.4 ms
Wall time: 267 ms


In [55]:
# Inspect the volatility computed from the test order books.
past_test_volatility

Unnamed: 0,stock_id,time_id,volatility
0,0,4,0.000294


In [56]:
# Show which test order-book paths the glob found.
order_book_test

['../input/optiver-realized-volatility-prediction/book_test.parquet/stock_id=0']

In [65]:
def linear_inference(models, stock_id, past_volatility, degree):
    """Predict the target for one observation with that stock's fitted model.

    Parameters
    ----------
    models : mapping
        Fitted estimators keyed by stock id.
    stock_id : hashable
        Key used to look up the estimator in ``models``.
    past_volatility : float
        The single feature value to predict from.
    degree : int
        Polynomial degree for the feature expansion; must match the degree
        the estimator was trained with.

    Returns
    -------
    float
        The first entry of the estimator's 2-D prediction array.
    """
    stock_model = models[stock_id]
    # Expand the lone value into the same polynomial terms used for training.
    expander = PolynomialFeatures(degree=degree)
    design_row = expander.fit_transform([[past_volatility]])
    prediction = stock_model.predict(design_row)
    return prediction[0][0]

In [66]:
# Start from an empty two-column frame; the column assignments below fill it
# from the rows of past_test_volatility.
submission = pd.DataFrame({"row_id" : [], "target" : []})

# row_id is "<stock_id>-<time_id>", with both ids rendered as integers.
submission["row_id"] = past_test_volatility.apply(
    lambda row: f"{int(row.stock_id)}-{int(row.time_id)}",
    axis=1,
)

# Predict the target for each test row with the model of the matching stock.
submission["target"] = past_test_volatility.apply(
    lambda row: linear_inference(models, row.stock_id, row.volatility, degree),
    axis=1,
)

In [58]:
# Inspect the test volatility frame that feeds the submission.
past_test_volatility

Unnamed: 0,stock_id,time_id,volatility
0,0,4,0.000294


In [60]:
# Inspect the submission frame before writing it out.
submission

Unnamed: 0,row_id,target
0,0-4,


In [67]:
# Write the submission to submission.csv without the index column.
submission.to_csv('submission.csv',index = False)

Unnamed: 0,row_id,target
0,0-4,0.00068
