In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import statsmodels.graphics.tsaplots as sgt
import statsmodels.tsa.stattools as sts
from statsmodels.tsa.arima_model import ARIMA
from scipy.stats.distributions import chi2
import yfinance

from math import sqrt
# Apply seaborn's default styling so that figures drawn later in the notebook share one look.
sns.set()

### Loading the data

In [3]:
# Download daily, auto-adjusted price data for the S&P 500, FTSE 100, Nikkei 225 and DAX,
# with the resulting columns grouped by ticker.
# NOTE: the keyword is `threads` (concurrent download); the previous spelling `treads`
# was not a recognised argument.
raw_data = yfinance.download(tickers="^GSPC ^FTSE ^N225 ^GDAXI", start="1994-01-07", end="2018-01-25",
                             interval="1d", group_by="ticker", auto_adjust=True, threads=True)

[*********************100%***********************]  4 of 4 completed


In [4]:
# Work on an independent (deep) copy so the downloaded frame stays untouched.
df_comp = raw_data.copy(deep=True)

In [5]:
# Preview the first five rows of the downloaded data.
df_comp.head(n=5)

Unnamed: 0_level_0,^GSPC,^GSPC,^GSPC,^GSPC,^GSPC,^N225,^N225,^N225,^N225,^N225,^FTSE,^FTSE,^FTSE,^FTSE,^FTSE,^GDAXI,^GDAXI,^GDAXI,^GDAXI,^GDAXI
Unnamed: 0_level_1,Open,High,Low,Close,Volume,Open,High,Low,Close,Volume,Open,High,Low,Close,Volume,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2
1994-01-06,467.549988,469.0,467.019989,467.119995,365960000.0,,,,,,3355.300049,3407.699951,3355.300049,3403.0,0.0,2228.090088,2228.48999,2206.459961,2220.629883,0.0
1994-01-07,467.089996,470.26001,467.029999,469.899994,324920000.0,17842.980469,18131.410156,17787.480469,18124.009766,0.0,3401.399902,3446.800049,3398.699951,3446.0,0.0,2218.959961,2227.639893,2201.820068,2224.949951,0.0
1994-01-10,469.899994,475.269989,469.549988,475.269989,319490000.0,18186.519531,18567.060547,18186.519531,18443.439453,0.0,3465.699951,3468.100098,3430.0,3440.600098,0.0,2231.840088,2238.01001,2222.0,2225.0,0.0
1994-01-11,475.269989,475.279999,473.269989,474.130005,305490000.0,18481.849609,18671.669922,18373.039062,18485.25,0.0,3442.5,3442.5,3413.5,3413.800049,0.0,2225.429932,2235.610107,2225.179932,2228.100098,0.0
1994-01-12,474.130005,475.059998,472.140015,474.170013,310690000.0,18447.339844,18807.080078,18301.929688,18793.880859,0.0,3394.800049,3402.399902,3372.0,3372.0,0.0,2227.120117,2227.790039,2182.060059,2182.060059,0.0


In [8]:
# Pull each index's closing price out of the (ticker, field) column hierarchy
# into a short, single-name column.
df_comp['spx'] = df_comp[('^GSPC', 'Close')]
df_comp['dax'] = df_comp[('^GDAXI', 'Close')]
df_comp['ftse'] = df_comp[('^FTSE', 'Close')]
df_comp['nikkei'] = df_comp[('^N225', 'Close')]


In [11]:
# Drop the first downloaded row (in the preview above it is 1994-01-06, where ^N225 is missing).
# NOTE: this cell reassigns `df_comp`, so re-running it without re-running the cells above
# removes another row and fails on the already-dropped ticker columns.
df_comp = df_comp.iloc[1:]

# Remove every original per-ticker column (Open/High/Low/Close/Volume under each ticker label);
# only the close-price columns created earlier are kept.
df_comp = df_comp.drop(columns=['^GSPC', '^GDAXI', '^FTSE', '^N225'], level=0)

# Conform the index to business-day frequency and carry the last known value forward into
# any gaps. `.ffill()` replaces the deprecated `fillna(method='ffill')`, and 'B' is the
# canonical business-day alias.
df_comp = df_comp.asfreq('B')
df_comp = df_comp.ffill()



### Creating Returns

In [13]:
# One-period percentage returns for each index (simple return scaled by 100).
df_comp['ret_spx'] = df_comp['spx'].pct_change(periods=1).mul(100)
df_comp['ret_ftse'] = df_comp['ftse'].pct_change(periods=1).mul(100)
df_comp['ret_dax'] = df_comp['dax'].pct_change(periods=1).mul(100)
df_comp['ret_nikkei'] = df_comp['nikkei'].pct_change(periods=1).mul(100)


### Splitting the data

In [14]:
# Chronological split: the first 80% of rows form the training set, the remainder the test set.
size = int(df_comp.shape[0] * 0.8)
df, df_test = df_comp.iloc[:size], df_comp.iloc[size:]

### Fitting the Model

In [15]:
from pmdarima.arima import auto_arima

In [17]:
## The defaults chosen by the model package are not always optimal or reasonable.

In [16]:
# Fit auto_arima with its default settings on the FTSE returns, skipping the first
# observation (the first return is undefined because it has no prior price).
ftse_returns = df_comp['ret_ftse'].iloc[1:]
model_auto = auto_arima(ftse_returns)

In [18]:
# Display the fitted model object returned by auto_arima.
model_auto



In [19]:
# Show the summary tables (coefficients, information criteria, diagnostics) for the selected model.
model_auto.summary()

0,1,2,3
Dep. Variable:,y,No. Observations:,6273.0
Model:,"SARIMAX(4, 0, 5)",Log Likelihood,-9553.097
Date:,"Wed, 20 Jan 2021",AIC,19128.193
Time:,14:22:59,BIC,19202.377
Sample:,0,HQIC,19153.897
,- 6273,,
Covariance Type:,opg,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
intercept,0.0270,0.020,1.347,0.178,-0.012,0.066
ar.L1,-0.0124,0.078,-0.159,0.874,-0.165,0.140
ar.L2,-0.5613,0.077,-7.271,0.000,-0.713,-0.410
ar.L3,-0.2105,0.069,-3.058,0.002,-0.345,-0.076
ar.L4,0.2798,0.075,3.741,0.000,0.133,0.426
ma.L1,-0.0063,0.078,-0.081,0.935,-0.159,0.146
ma.L2,0.5129,0.078,6.600,0.000,0.361,0.665
ma.L3,0.1320,0.067,1.966,0.049,0.000,0.264
ma.L4,-0.2819,0.074,-3.826,0.000,-0.426,-0.137

0,1,2,3
Ljung-Box (Q):,70.57,Jarque-Bera (JB):,8753.92
Prob(Q):,0.0,Prob(JB):,0.0
Heteroskedasticity (H):,0.87,Skew:,-0.18
Prob(H) (two-sided):,0.0,Kurtosis:,8.78


In [20]:
## This suggests an ARMA(4,5) model.

## The sample runs from 0 to 6273, so we are using all of the observations.

## AUTO_ARIMA automatically splits the data, validate a model choice and then return the results 

In [21]:
## comments:

## While performing the manual analysis, ARMA(4,5) was not our preferred model -- we disregarded it because some of its
## coefficients were not significant, as the table above also shows.

## auto_arima considers only a single criterion, the AIC: the lower the AIC, the better the fit, regardless of the
## significance of individual coefficients -- this is a drawback of auto_arima.

## However, empirical research has sometimes shown that omitting certain lags can be beneficial for model estimation
## when clustering is apparent.

## We could easily have overfitted while going through the models in the previous section's example,
## so we should be open-minded about the results of this method.

## The default arguments of the method restrict the number of AR and MA components we are willing to include --
## we might simply be excluding the model we considered best when fitting the models in the previous section.



6274

In [None]:
# Wider auto_arima search on the FTSE returns:
#   * X                      -- the other three indices' returns as exogenous regressors
#                               (`X` replaces the deprecated `exogenous` keyword)
#   * m=5                    -- seasonal period of 5 (presumably one business week)
#   * max_p/q/d, max_P/Q/D   -- upper bounds on the non-seasonal / seasonal orders;
#                               max_order=None lifts the cap on their total
#   * trend='ct'             -- constant plus linear trend
#   * information_criterion='oob' with out_of_sample_size -- candidates are scored on the
#                               last 20% of the observations, held out of fitting
# The first row is skipped because the first return is undefined.
model_auto = auto_arima(df_comp.ret_ftse[1:], X=df_comp[['ret_spx', 'ret_dax', 'ret_nikkei']][1:], m=5,
                        max_order=None, max_p=7, max_q=7, max_d=2, max_P=4, max_Q=4, max_D=2, maxiter=50,
                        alpha=0.05, n_jobs=-1, trend='ct', information_criterion='oob',
                        out_of_sample_size=int(len(df_comp)*0.2))

In [None]:
## Fewer non-seasonal lags than the default counterpart.
## Contains exogenous variables and a seasonal component, so it differs from the original best fit.
## Drift coefficient -- linear trend coefficient + intercept --> 'ct'.
## The AIC value has also gone down.