In [48]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tslearn.clustering import TimeSeriesKMeans
import os
# Load the raw panel from the CSV located in the current working directory.
root = os.getcwd()
File = '/newData_Feb11.csv'
# The 'Date' column becomes the row index.
df = pd.read_csv(root+File).set_index('Date')
# Trading is only studied through the end of 2021, so later rows are dropped;
# remaining gaps are then filled with the most recent earlier observation.
df = df.loc[df.index < '2022-01-01'].ffill()

display(df)

Unnamed: 0_level_0,RAY,VIX,QQQ,T10Y3M,CL1,HG1,GC1,XAG,VNQ,EURtoUSD,VXUS,EEM
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1990-01-02,200.241,17.24,,0.11,,,,,,1.2146,,
1990-01-03,200.116,18.19,,0.10,,,,,,1.2096,,
1990-01-04,198.646,19.22,,0.14,,,,,,1.2287,,
1990-01-05,196.850,20.11,,0.20,,,,,,1.2358,,
1990-01-08,197.525,20.26,,0.23,,,,,,1.2452,,
...,...,...,...,...,...,...,...,...,...,...,...,...
2021-12-27,2803.742,17.68,403.48,1.42,75.57,446.50,1808.1,23.0676,114.00,1.1328,63.70,48.94
2021-12-28,2798.353,17.54,401.61,1.43,75.98,442.00,1810.2,23.0200,114.42,1.1310,63.63,48.78
2021-12-29,2801.873,16.95,401.55,1.50,76.56,440.20,1805.1,22.8331,115.19,1.1349,63.57,48.56
2021-12-30,2795.908,17.33,400.35,1.47,76.99,439.25,1814.1,23.0426,115.69,1.1325,63.66,49.09


In [2]:
# Train/test split: every row dated strictly before split_date is training data.
split_date = '2018-01-01'
# Independent copy so later work on the training frame cannot touch df.
df_train = df.loc[df.index < split_date].copy()


In [3]:
#ffd code
def getWeights(d,lags):
    """Coefficients of the series expansion of the differencing operator.

    Parameters
    ----------
    d : real
        Differencing order (may be fractional).
    lags : int
        Number of coefficients requested. The leading coefficient (1) is
        always present, so at least one coefficient is returned.

    Returns
    -------
    numpy.ndarray
        Column vector (one coefficient per row), starting with the lag-0 weight.
    """
    coeffs = [1]
    prev = 1
    for step in range(1, lags):
        # each coefficient is derived from the one before it
        prev = -prev * (d - step + 1) / step
        coeffs.append(prev)
    return np.array(coeffs).reshape(-1, 1)

    
def cutoff_find(order,cutoff,start_lags):
    """Find how many lags of the differencing expansion are needed for a tolerance.

    The expansion length is grown one lag at a time, beginning at
    ``start_lags``, until the last coefficient of an expansion of that length
    has absolute value no greater than ``cutoff``.

    The coefficients are advanced with the same recurrence ``getWeights``
    uses, but incrementally: the previous implementation rebuilt the whole
    coefficient vector on every iteration only to read its final entry.
    The returned value is unchanged.

    Parameters
    ----------
    order : real
        Differencing order d.
    cutoff : real
        Magnitude threshold for the last retained coefficient.
    start_lags : int
        Expansion length at which the search starts; a larger value skips
        the early iterations.

    Returns
    -------
    int
        One more than the first tested length whose last coefficient is
        within ``cutoff`` (the counter is advanced after every test,
        including the successful one).

    Notes
    -----
    The loop only ends once a coefficient magnitude reaches ``cutoff``; it
    does not terminate for inputs where that never happens (for example a
    negative ``cutoff``).
    """
    lags = start_lags
    val = np.inf      # guarantees at least one pass for any finite cutoff
    weight = 1        # coefficient at index `idx` of the expansion
    idx = 0
    while abs(val) > cutoff:
        # an expansion of length `lags` ends at index lags-1; a length below 1
        # still contains the leading coefficient
        last = max(lags, 1) - 1
        while idx < last:
            idx += 1
            weight = -weight * (order - idx + 1) / idx
        val = weight
        lags += 1
    return lags


def ts_differencing_tau(series, order, tau):
    """Fractionally difference ``series`` with a tolerance-driven lag window.

    The number of lags comes from ``cutoff_find(order, tau, 1)`` and the
    matching coefficients from ``getWeights``. Each lagged copy of the series
    has its missing values replaced by 0 before being weighted and summed.

    Returns
    -------
    list
        ``[differenced, n_lags]`` where ``differenced`` is the weighted sum
        with its first ``n_lags`` rows removed (positional slice) and
        ``n_lags`` is the window length that was used.
    """
    n_lags = cutoff_find(order, tau, 1)  # window length implied by tau
    coeffs = getWeights(order, n_lags)
    total = 0
    for shift_by in range(n_lags):
        lagged = series.shift(shift_by).fillna(0)
        total = total + coeffs[shift_by] * lagged
    # leading rows were built from an incomplete window, so they are cut off
    return [total[n_lags:], n_lags]

In [4]:
import statsmodels
from statsmodels.tsa.stattools import adfuller
# Collect the training columns whose raw values already reject the ADF
# unit-root null at the 5% level.
stat_col = []
for col in df_train.columns:
    column = df_train[col]
    # test each column only from its first non-missing observation onward
    observed = column[df_train.index >= column.first_valid_index()]
    p_value = adfuller(observed)[1]
    if p_value < 0.05:
        stat_col.append(col)

In [5]:
# For every training column that is not already stationary, scan the candidate
# orders d in increasing order and keep the first one whose fractionally
# differenced series passes the ADF test at the 5% level.
# The five result lists are parallel: entry k of each describes column name[k].
best_series = []
best_d = []
lags = []
name = []
adf_stat_holder = []  # holds the ADF p-value of each accepted series
possible_d=np.divide(range(1,100),100)
tau=1e-4
#remove already stationary col from training
for col in df_train.drop(columns=stat_col).columns:
    print('processing column ',col)
    first_valid = df_train[col].first_valid_index()
    # the slice does not depend on d, so it is taken once per column
    col_history = df_train[df_train.index>=first_valid][col]
    for d in possible_d:
        pd_series, lag=ts_differencing_tau(col_history,d,tau)
        ad_pvalue = adfuller(pd_series)[1]
        if ad_pvalue <= 0.05:
            print('d',d)
            # record the column only once a passing d exists; appending the
            # name up front would leave `name` longer than best_d/lags and
            # mis-pair columns with parameters wherever they are indexed together
            name.append(col)
            best_d.append(d)
            best_series.append(pd_series)
            lags.append(lag)
            adf_stat_holder.append(ad_pvalue)
            break
    else:
        # no candidate order passed the test: leave the column out entirely
        print('no candidate d passed the ADF test, skipping column ',col)


processing column  RAY
d 0.48
processing column  QQQ
d 0.44
processing column  T10Y3M
d 0.11
processing column  CL1
d 0.18
processing column  HG1
d 0.27
processing column  GC1
d 0.44
processing column  XAG
d 0.26
processing column  VNQ
d 0.2
processing column  EURtoUSD
d 0.1
processing column  VXUS
d 0.19
processing column  EEM
d 0.05


In [6]:
#transform trainset
# Assemble the training panel in one concat instead of re-concatenating (and
# re-copying) the growing frame once per column. The leading empty frame
# carries the full training index, so every row of df_train is kept; the
# already-stationary columns are appended unchanged at the end.
df_train_ffd = pd.concat(
    [pd.DataFrame(index=df_train.index), *best_series, df_train[stat_col]],
    axis=1,
)

In [7]:
import statsmodels
from statsmodels.tsa.stattools import adfuller

#run adf test for initial time series from first valid
def adf_table(dataframe):
    """Summarise an Augmented Dickey-Fuller test for every column of a frame.

    Each column is tested from its first non-missing observation onward.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        One series per column.

    Returns
    -------
    pandas.DataFrame
        Indexed by column name, with the ADF statistic and p-value (both
        rounded to 4 decimals) and a "Yes"/"No" flag telling whether the
        p-value is below 0.05.

    The table is built once from the collected rows rather than by enlarging
    a frame row by row through ``.loc`` and then promoting a placeholder
    ``None`` column to the index; the numeric columns therefore keep a
    numeric dtype.
    """
    labels = []
    rows = []
    for col in dataframe.columns:
        column = dataframe[col]
        #first obs date
        observed = column[dataframe.index >= column.first_valid_index()]
        adf_stat, p_value = adfuller(observed)[:2]
        reject = "Yes" if p_value < 0.05 else "No"
        labels.append(col)
        rows.append([round(adf_stat,4),round(p_value,4),reject])
    return pd.DataFrame(
        rows,
        index=labels,
        columns=['ADF Statistic','ADF p-value','Reject H0 at 5%?'],
    )
# Render the ADF summary of the transformed training panel.
train_adf_summary = adf_table(df_train_ffd)
display(train_adf_summary)

Unnamed: 0,ADF Statistic,ADF p-value,Reject H0 at 5%?
RAY,-2.8715,0.0488,Yes
QQQ,-2.884,0.0472,Yes
T10Y3M,-2.9108,0.0441,Yes
CL1,-2.9012,0.0452,Yes
HG1,-2.9332,0.0416,Yes
GC1,-2.9055,0.0447,Yes
XAG,-2.8912,0.0464,Yes
VNQ,-2.8841,0.0472,Yes
EURtoUSD,-2.9667,0.0381,Yes
VXUS,-2.9178,0.0433,Yes


In [8]:
def ts_differencing(series, order, lag_cutoff):
    """Fractionally difference ``series`` using a fixed number of lags.

    Parameters
    ----------
    series : pandas.Series
        Input series.
    order : real
        Differencing order.
    lag_cutoff : int
        Number of expansion coefficients (lags) to apply.

    Returns
    -------
    pandas.Series
        Weighted sum of the lagged series with its first ``lag_cutoff`` rows
        removed (positional slice). Missing values in each lagged copy are
        treated as 0.
    """
    coeffs = getWeights(order, lag_cutoff)
    total = 0
    for shift_by in range(lag_cutoff):
        lagged = series.shift(shift_by).fillna(0)
        total = total + coeffs[shift_by] * lagged
    return total[lag_cutoff:]

In [67]:
## SPECIFY HOW OFTEN WE RETRAIN THIS
# end_date is the exclusive upper bound of the current test window.
end_date = '2018-06-01'
# NOTE: this narrows `df` itself, so moving end_date to a later date requires
# re-running the cell that loads `df` first.
df=df[df.index < end_date]
#transform test set
# Row position of the first observation on/after split_date. It is the same
# for every column, so it is located once instead of being recomputed (with
# two reset_index() copies of the frame) on every loop iteration.
first_test_pos = (df.index >= split_date).nonzero()[0][0]
test_series = []
for i in range(len(name)):
    # start lags[i] rows before the test window so that, after ts_differencing
    # drops its first lags[i] rows, the output begins at the first test row
    start_index = first_test_pos-lags[i]
    # clamp at 0: with too little history the whole frame is used
    history = df.iloc[max(start_index,0):][name[i]]
    pd_series=ts_differencing(history,best_d[i],lags[i])
    test_series.append(pd_series)

In [68]:
#transform testset
# Join every differenced test column and the untouched stationary columns in a
# single concat rather than re-concatenating the growing frame per column.
test_stationary = df[df.index >= split_date][stat_col]
df_test_ffd = pd.concat([*test_series, test_stationary],axis=1)

In [69]:
# Stack the transformed training rows on top of the transformed test rows.
df_transform = pd.concat(objs=[df_train_ffd, df_test_ffd], axis='index')

In [70]:
# ADF summary of df_transform; left as the cell's last bare expression so the notebook renders the returned table.
adf_table(df_transform)

Unnamed: 0,ADF Statistic,ADF p-value,Reject H0 at 5%?
RAY,-2.9094,0.0443,Yes
QQQ,-2.4256,0.1346,No
T10Y3M,-2.9376,0.0411,Yes
CL1,-2.9182,0.0433,Yes
HG1,-3.0516,0.0303,Yes
GC1,-2.9756,0.0372,Yes
XAG,-2.9188,0.0432,Yes
VNQ,-2.9869,0.0361,Yes
EURtoUSD,-2.992,0.0356,Yes
VXUS,-3.1525,0.0229,Yes
