<font size = 6>Lévy - ARMA - GARCH Risk Model Implementation and Its Evaluation - Prepare Data</font>
<font size = 4><div style="text-align: right"> Contributor: Haochen Jiang</div></font>
<font size = 4><div style="text-align: right"> May 3, 2022</div></font>

In [1]:
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

In [2]:
import math
import nlopt
import scipy.stats

import datetime as dt
import numpy as np
import pandas as pd
import yfinance as yf
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import pandas_datareader.data as web

from scipy import integrate
from statsmodels.tsa.stattools import adfuller, kpss

%matplotlib inline

In [3]:
def adf_test(timeseries):
    """Run the augmented Dickey-Fuller test and print a labelled summary.

    The series is passed to ``adfuller`` with ``autolag='AIC'``. The first
    four entries of the result and every critical value are printed as a
    single pandas Series.

    Parameters
    ----------
    timeseries : array-like
        Observations handed unchanged to ``adfuller``.

    Returns
    -------
    None
        The summary is only printed.
    """
    print ('Results of Dickey-Fuller Test:')
    outcome = adfuller(timeseries, autolag='AIC')
    labels = ['Test Statistic','p-value','#Lags Used','Number of Observations Used']
    values = list(outcome[0:4])
    # outcome[4] maps a significance level to its critical value
    for level, threshold in outcome[4].items():
        labels.append('Critical Value (%s)'%level)
        values.append(threshold)
    print (pd.Series(values, index=labels))
    
def kpss_test(timeseries):
    """Run the KPSS test and print a labelled summary.

    The series is passed to ``kpss`` with ``regression='c'`` and
    ``nlags="auto"``. The first three entries of the result and every
    critical value are printed as a single pandas Series.

    Parameters
    ----------
    timeseries : array-like
        Observations handed unchanged to ``kpss``.

    Returns
    -------
    None
        The summary is only printed.
    """
    print ('Results of KPSS Test:')
    outcome = kpss(timeseries, regression='c', nlags="auto")
    labels = ['Test Statistic','p-value','#Lags Used']
    values = list(outcome[0:3])
    # outcome[3] maps a significance level to its critical value
    for level, threshold in outcome[3].items():
        labels.append('Critical Value (%s)'%level)
        values.append(threshold)
    print (pd.Series(values, index=labels))

In [5]:
# Root folder of the project; the cells below append Windows-style relative
# paths ("Data\\...") to it by plain string concatenation.
path = 'D:\\Work\\'

# Date bounds. `end` is applied as an exclusive upper bound when the saved
# price files are loaded (rows are kept where index < end).
start = dt.datetime(year=2014, month=1, day=1)
end = dt.datetime(year=2021, month=12, day=31)

# Data

## Load Stock Information Data 

<strong>Note:</strong><br>
<code>_snp_tickers_sectors.csv</code><br>
Source: https://en.wikipedia.org/wiki/List_of_S%26P_500_companies;<br>
last changed on April 21, 2021.

In [6]:
# Load S&P 500 Sector Information (first CSV column becomes the index)
sector_csv = path + "Data\\stocks\\" + "_snp_tickers_sectors.csv"
sector = pd.read_csv(sector_csv, index_col = 0)
print("Number of stocks in S&P500: %d" % sector.shape[0])
# Re-spell the two dotted tickers with a dash
# (presumably to match the symbols expected by the Yahoo download - confirm)
dotted_to_dashed = {"BRK.B": "BRK-B", "BF.B": "BF-B"}
sector = sector.rename(index = dotted_to_dashed)

Number of stocks in S&P500: 504


## Download Data

In [None]:
# (ticker, destination CSV) pairs for the market and volatility indices.
# No start/end is given, so the date range is whatever yf.download defaults to.
index_targets = [
    ('^GSPC', path + "Data\\" + "stocks\\" + "_SNP500.csv"),
    ('^VIX', path + "Data\\" + "_VIX.csv"),
    ('^VVIX', path + "Data\\" + "_VVIX.csv"),
]
for ticker, csv_file in index_targets:
    yf.download(ticker, progress=False).to_csv(csv_file)

In [None]:
%%time
# Download Stocks Data
# For every ticker in `sector`, fetch the Yahoo history from 2012 onwards.
# Tickers whose first row is not 2012-01-03 are treated as too short: they are
# reported and removed from `sector`. Tickers whose download raises are
# reported and collected in `bad_names` (they stay in `sector`).
history_start = dt.date(2012, 1, 1)
required_first_day = dt.datetime(2012, 1, 3)  # presumably the first trading day of 2012
bad_names = []
not_enough_data_names = []
# Iterate over a snapshot of the index because `sector` shrinks inside the loop.
for stock in list(sector.index):
    try:
        stock_df = web.DataReader(stock, 'yahoo', start = history_start)
        if stock_df.index[0] != required_first_day:
            not_enough_data_names.append(stock)
            print('Not enough data length in stock of %s.' % (stock))
            sector = sector.drop(index = stock)
            print('Remove %s info from sector.' % (stock))
        else:
            stock_df['Name'] = stock
            stock_df.to_csv(path + "Data\\stocks\\" + stock + ".csv")
    except Exception:
        # `Exception` (not a bare except) so KeyboardInterrupt / SystemExit still
        # stop this long-running loop; an empty frame (IndexError on index[0])
        # is still reported here as before.
        bad_names.append(stock)
        print('Data Not Found in Yahoo: %s.' % (stock))

print("Stocks Data Downloading Ends.")
print("Failed stocks:")
print(bad_names)
print("Not enough Data stocks:")
print(not_enough_data_names)
print("Number of stocks now in S&P500: %d" % sector.shape[0])

## Load Data

In [7]:
# The loading cells keep the last sample_size + 1 rows dated before `end`;
# computing returns and calling dropna() then removes at least the first of them.
sample_size = 2000

### Market Data

In [8]:
def _load_return_frame(csv_file, name, end_date, n_obs, price_col='Adj Close'):
    """Read a saved price CSV and return its trailing window with return columns.

    Parameters
    ----------
    csv_file : str
        Path of the CSV; its first column is parsed as a date index.
    name : str
        Value written to the ``Name`` column of every row.
    end_date : datetime-like
        Exclusive upper bound; only rows with ``index < end_date`` are kept.
    n_obs : int
        The last ``n_obs + 1`` remaining rows are kept before returns are computed.
    price_col : str, default 'Adj Close'
        Column used to compute the returns.

    Returns
    -------
    pandas.DataFrame
        The window with ``Return`` (simple return) and ``Log Return``
        (``log(1 + Return)``) added, after dropping every row that contains a NaN.
    """
    frame = pd.read_csv(csv_file, index_col = 0, parse_dates = True)
    frame["Name"] = name
    frame = frame.loc[frame.index < end_date].iloc[-(n_obs + 1):]
    simple_ret = frame[price_col].pct_change()
    # assign() builds a new frame, so no column is ever set on a filtered slice
    frame = frame.assign(**{'Return': simple_ret, 'Log Return': np.log(1 + simple_ret)})
    return frame.dropna()

# Load S&P 500 Data
market = _load_return_frame(path + "Data\\stocks\\" + "_SNP500.csv", "^GSPC", end, sample_size)
# Load VIX Data
vix = _load_return_frame(path + "Data\\" + "_VIX.csv", "^VIX", end, sample_size)
# Load VVIX Data
vvix = _load_return_frame(path + "Data\\" + "_VVIX.csv", "^VVIX", end, sample_size)

### Stocks Data

In [9]:
# Load Stocks prices Data
# Build {ticker: DataFrame} from the per-stock CSV files. Loading stays
# best-effort: a ticker that cannot be loaded is skipped, but the failure is
# recorded in `load_failures` and summarised instead of being silently dropped.
stock_data = {}
load_failures = {}
for stock in list(sector.index):
    try:
        stock_df = pd.read_csv(path + "Data\\stocks\\" + stock + ".csv", index_col = 0, parse_dates = True)
        # rows strictly before `end`, then the trailing sample_size + 1 of them
        stock_df = stock_df.loc[stock_df.index < end].iloc[-(sample_size+1):]
        simple_ret = stock_df['Adj Close'].pct_change()
        # add return and log return columns (assign() returns a new frame)
        stock_df = stock_df.assign(**{'Return': simple_ret, 'Log Return': np.log(1 + simple_ret)})
        stock_data[stock] = stock_df.dropna()
    except Exception as exc:
        # `Exception` rather than a bare except: KeyboardInterrupt / SystemExit propagate
        load_failures[stock] = exc

if load_failures:
    print("Skipped %d stocks that could not be loaded: %s" % (len(load_failures), sorted(load_failures)))

### Volatility Indices Data

In [10]:
# Build {index name: DataFrame} for the volatility indices stored under Data\.
name_list = ["VXAPL", "VXAZN", "VXD", "VXGOG", "VXGS", "VXIBM"]
vxidx_data = {}
for idx in name_list:
    idx_df = pd.read_csv(path + "Data\\" + idx + ".csv", index_col = 0, parse_dates = True)
    idx_df["Name"] = idx
    # rows strictly before `end`, then the trailing sample_size + 1 of them
    idx_df = idx_df.loc[idx_df.index < end].iloc[-(sample_size+1):]
    # simple return from the CLOSE column, and its log counterpart
    close_ret = idx_df['CLOSE'].pct_change()
    idx_df = idx_df.assign(**{'Return': close_ret, 'Log Return': np.log(1 + close_ret)})
    vxidx_data[idx] = idx_df.dropna()