In [1]:
from random import gauss
import os 
import sys
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import yfinance as yf
sys.path.insert(2,'..')
import functions
import xgboost as xgb
from sklearn import preprocessing
import annualized_rv as arv
from sklearn.model_selection import TimeSeriesSplit

In [2]:
aaplHistIV = pd.read_pickle('historicImpliedVolData/aapl_mean_iv_2017_2022.pkl')
googHistIV = pd.read_pickle('historicImpliedVolData/goog_mean_iv_2017_2022.pkl')
msftHistIV = pd.read_pickle('historicImpliedVolData/msft_mean_iv_2017_2022.pkl')
ndxHistIV = pd.read_pickle('historicImpliedVolData/ndx_mean_iv_2017_2022.pkl')
spyHistIV = pd.read_pickle('historicImpliedVolData/spc_mean_iv_2017_2022.pkl')


In [3]:
spyHistIV.rename(columns = {'date':'Date'},inplace = True) #renaming date column to Date for consistency
spyHistIV.set_index('Date',inplace = True,drop = True) #setting index to date

In [4]:
#grab spy  from yfinance

spyHistory = yf.download('^GSPC', start='2016-01-01', end='2023-12-31')
#calculate realised vol
window =21 #realisedVol window size
spyHistory['Daily Return'] = spyHistory['Adj Close'].pct_change()
spyHistory['21dRealisedVol'] = spyHistory['Daily Return'].rolling(window=window).std() * np.sqrt(252)


[*********************100%%**********************]  1 of 1 completed


In [5]:
historicIVSeries = spyHistIV['average_iv']
historicVolumeSeries = spyHistory['Volume'].rolling(21).mean()['2017':'2021']
dailyReturnSeries= spyHistory['Daily Return']['2017':'2021']
dailyRealisedVolSeries = spyHistory['21dRealisedVol']['2017':'2021']
df_combined = pd.concat([historicIVSeries,dailyReturnSeries,dailyRealisedVolSeries,historicVolumeSeries], axis=1)

# Scale the data
scaler = preprocessing.StandardScaler().fit(df_combined)
scaled_data = scaler.transform(df_combined)

In [6]:
data = scaled_data
seq_length = 60
n_features = data.shape[1]


In [7]:
def create_sequences(data, seq_length):
    xs, ys = [], []
    for i in range(len(data) - seq_length - 3):
        x = data[i:(i + seq_length)]
        y = data[(i + seq_length), 0]  # Next 3-day IV
        xs.append(x)
        ys.append(y)
    return np.array(xs), np.array(ys)

In [8]:
X, y = create_sequences(data, seq_length)

tscv = TimeSeriesSplit(n_splits=5)

for train_index, test_index in tscv.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]



In [9]:
#reshape data for xgboost
X_train = X_train.reshape(X_train.shape[0],X_train.shape[1]*X_train.shape[2])
X_test = X_test.reshape(X_test.shape[0],X_test.shape[1]*X_test.shape[2])

In [10]:
model = xgb.XGBRegressor(objective ='reg:squarederror',n_estimators=1000)
model.fit(X_train, y_train)

In [12]:
y_pred = model.predict(X_test)

In [13]:
y_pred

array([ 0.5499564 ,  0.30972934,  0.5436868 ,  0.30560675,  0.22969171,
        0.40880948,  0.36249977,  0.23899283,  0.1741341 ,  0.31478414,
        0.32234573,  0.25511393, -0.03033435,  0.0953008 ,  0.15132159,
       -0.01984316,  0.09859911, -0.10329391,  0.12494064,  0.2165239 ,
        0.33171245, -0.04288733, -0.16012287,  0.20149723,  0.40275946,
        0.12044147,  0.41858822,  0.03710732,  0.24019009,  0.04285027,
        0.08807779, -0.00777063,  0.14660257,  0.04028238,  0.3158875 ,
        0.07501244,  0.07531745, -0.31077853,  0.20320861,  0.35192817,
        0.67703456,  0.61485887,  0.14757743,  0.21285968,  0.42685384,
        0.37977162,  0.11758885,  0.03909099, -0.01749022,  0.13482945,
       -0.11454149, -0.30734783, -0.2870999 ,  0.05806417, -0.22208479,
       -0.09457771, -0.23742753, -0.2465754 , -0.21144281, -0.18820117,
       -0.31305948, -0.34878802, -0.24473625, -0.21787585, -0.18826783,
       -0.33375916, -0.32139912, -0.0765521 , -0.17119212, -0.32