# Vector AutoRegressive Model

## Import libraries

In [1]:
import pandas as pd
import numpy as np

## Plotting
import matplotlib.pyplot as plt
import seaborn as sns

## sklearn and statsmodels
from sklearn.metrics import mean_squared_error
from statsmodels.tsa.api import VAR
from statsmodels.tsa.stattools import acf, pacf, grangercausalitytests
from sklearn.pipeline import Pipeline
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.stattools import kpss

## Import TimeSeriesSplit
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import StandardScaler

## Import yahoo finance
import yfinance as yf

## Import random
import random

## Read file

In [2]:
df = pd.read_csv('../Data/dataset_others.csv')
# Extract _change_in_price columns
change_in_price_cols = [col for col in df.columns if '_change_in_price' in col]
df_change = df[change_in_price_cols].copy()
df_change

Unnamed: 0,CADUSD=X_change_in_price,GM_change_in_price,JCI_change_in_price,TM_change_in_price,TRYUSD=X_change_in_price,^IXIC_change_in_price,F_change_in_price
0,-0.000418,0.099998,0.330002,-0.569992,-0.000812,15.439453,0.010000
1,0.001932,0.230000,0.160000,-0.309998,-0.000532,57.250000,0.090000
2,-0.000607,-0.139999,-0.279999,-1.449997,0.000039,-39.709961,-0.040000
3,-0.000238,-0.119999,-0.560001,0.059998,0.000314,-97.479492,-0.050000
4,-0.001081,-0.350002,-0.480000,0.459991,0.000223,-47.350586,-0.120000
...,...,...,...,...,...,...,...
1232,-0.001776,4.799999,-0.529999,0.759995,0.000026,39.759766,0.230000
1233,0.000905,-0.810001,-0.159996,1.400009,-0.000006,-303.119141,-0.049999
1234,-0.001072,-0.199997,0.379997,-0.900009,-0.000013,138.839844,0.179999
1235,-0.000866,-0.650002,-0.120003,1.570007,-0.000016,103.119141,-0.170000


## Create ticker list

In [3]:
tickers = []
for col in df.columns:
    if '_RSI' in col:
        tickers.append(col.split('_')[0])

tickers.pop()

'F'

## Train-test set

In [4]:
# Train set for cross-validation
train = df_change.iloc[:-5].copy()
# Test set is saved for the best model
test = df_change.iloc[-5:].copy()

In [5]:
train

Unnamed: 0,CADUSD=X_change_in_price,GM_change_in_price,JCI_change_in_price,TM_change_in_price,TRYUSD=X_change_in_price,^IXIC_change_in_price,F_change_in_price
0,-0.000418,0.099998,0.330002,-0.569992,-0.000812,15.439453,0.010000
1,0.001932,0.230000,0.160000,-0.309998,-0.000532,57.250000,0.090000
2,-0.000607,-0.139999,-0.279999,-1.449997,0.000039,-39.709961,-0.040000
3,-0.000238,-0.119999,-0.560001,0.059998,0.000314,-97.479492,-0.050000
4,-0.001081,-0.350002,-0.480000,0.459991,0.000223,-47.350586,-0.120000
...,...,...,...,...,...,...,...
1227,-0.001141,-0.780003,-1.080002,-5.550003,0.000020,-187.099609,-0.080000
1228,0.001146,1.160000,-0.209999,1.949997,0.000049,51.490234,0.190000
1229,0.001377,0.370003,0.860001,-0.300003,0.000026,6.529297,0.040000
1230,-0.002145,-0.200001,0.190002,0.330002,-0.000024,115.941406,0.030001


In [6]:
def optimal_order(forecast_length = 5):
    
    MSE = {}
    acc = {}
    MSE_mean = {}
    acc_mean = {}
    MSE_order = {}
    acc_order = {}
    
    cv = TimeSeriesSplit(10, test_size=forecast_length)
    
    for i in range(len(change_in_price_cols) - 1):
        tr_set = train[[change_in_price_cols[-1], change_in_price_cols[i]]].copy()
        MSE[change_in_price_cols[i]] = [[0 for col in range(40)] for row in range(10)]
        acc[change_in_price_cols[i]] = [[0 for col in range(40)] for row in range(10)]
        scaler = StandardScaler()
        scaled_train = pd.DataFrame(scaler.fit_transform(tr_set), columns = tr_set.columns)
    
        k = 0
        for train_index, test_index in cv.split(train):
            t_tt = scaled_train.iloc[train_index]
            t_ho = scaled_train.iloc[test_index]
    
            model = VAR(t_tt)
            for j in range(1, 41):
                var_model = model.fit(j)
                pred = var_model.forecast(t_tt[-j:].values, steps=forecast_length)
                MSE[change_in_price_cols[i]][k][j - 1] = np.sqrt(mean_squared_error(t_ho['F_change_in_price'].values, pred[:, 0]))
                acc[change_in_price_cols[i]][k][j - 1] = sum(pred[:, 0]*t_ho['F_change_in_price'] > 0)/len(t_ho)
            k += 1

        MSE_mean[change_in_price_cols[i]] = []
        acc_mean[change_in_price_cols[i]] = []
        MSE_mean[change_in_price_cols[i]].append(np.mean(MSE[change_in_price_cols[i]], axis=0))
        acc_mean[change_in_price_cols[i]].append(np.mean(acc[change_in_price_cols[i]], axis=0))
        MSE_order[change_in_price_cols[i]] = (np.argmin(np.mean(MSE[change_in_price_cols[i]], axis=0)) + 1)
        acc_order[change_in_price_cols[i]] = (np.argmax(np.mean(acc[change_in_price_cols[i]], axis=0)) + 1)

    return MSE, acc, MSE_mean, MSE_order, acc_mean, acc_order

In [7]:
MSE_1, acc_1, MSE_mean_1, MSE_order_1, acc_mean_1, acc_order_1 = optimal_order(1)
MSE_2, acc_2, MSE_mean_2, MSE_order_2, acc_mean_2, acc_order_2 = optimal_order(2)
MSE_3, acc_3, MSE_mean_3, MSE_order_3, acc_mean_3, acc_order_3 = optimal_order(3)
MSE_4, acc_4, MSE_mean_4, MSE_order_4, acc_mean_4, acc_order_4 = optimal_order(4)
MSE_5, acc_5, MSE_mean_5, MSE_order_5, acc_mean_5, acc_order_5 = optimal_order(5)

In [8]:
c = ['Ticker']
for i in range(1, 6):
    s = 'n = ' + str(i)
    c.append(s)
print(c)

['Ticker', 'n = 1', 'n = 2', 'n = 3', 'n = 4', 'n = 5']


In [9]:
optimal_order = pd.DataFrame(columns=c)
for i, t in enumerate(tickers):
    optimal_order.loc[i, 'Ticker'] = t.split('_')[0]
optimal_order

Unnamed: 0,Ticker,n = 1,n = 2,n = 3,n = 4,n = 5
0,CADUSD=X,,,,,
1,GM,,,,,
2,JCI,,,,,
3,TM,,,,,
4,TRYUSD=X,,,,,
5,^IXIC,,,,,


In [10]:
for i in range(len(change_in_price_cols) - 1):
    optimal_order.loc[i, 'n = 1'] = (max(MSE_order_1[change_in_price_cols[i]], acc_order_1[change_in_price_cols[i]]))
    optimal_order.loc[i, 'n = 2'] = (max(MSE_order_2[change_in_price_cols[i]], acc_order_2[change_in_price_cols[i]]))
    optimal_order.loc[i, 'n = 3'] = (max(MSE_order_3[change_in_price_cols[i]], acc_order_3[change_in_price_cols[i]]))
    optimal_order.loc[i, 'n = 4'] = (max(MSE_order_4[change_in_price_cols[i]], acc_order_4[change_in_price_cols[i]]))
    optimal_order.loc[i, 'n = 5'] = (max(MSE_order_5[change_in_price_cols[i]], acc_order_5[change_in_price_cols[i]]))

In [11]:
optimal_order

Unnamed: 0,Ticker,n = 1,n = 2,n = 3,n = 4,n = 5
0,CADUSD=X,2,12,5,5,5
1,GM,35,36,30,6,6
2,JCI,30,29,35,35,33
3,TM,8,26,27,21,5
4,TRYUSD=X,11,39,3,3,17
5,^IXIC,26,24,3,19,35


## Apply VAR to test set

In [12]:
df_close = df[['F_Close_pred']].copy()
prediction_cols = []
for t in tickers:
    prediction_cols.append(t + '_VAR')

predictions = pd.DataFrame(columns=prediction_cols, index=test.index)
close_predictions = predictions.copy()

In [13]:
predictions

Unnamed: 0,CADUSD=X_VAR,GM_VAR,JCI_VAR,TM_VAR,TRYUSD=X_VAR,^IXIC_VAR
1232,,,,,,
1233,,,,,,
1234,,,,,,
1235,,,,,,
1236,,,,,,


In [26]:
optimal_order.loc[0, optimal_order.columns[1]]

np.int64(2)

In [46]:
train.loc[-3:].values

array([[-4.18305397e-04,  9.99984741e-02,  3.30001831e-01, ...,
        -8.12456012e-04,  1.54394531e+01,  1.00002289e-02],
       [ 1.93178654e-03,  2.29999542e-01,  1.59999847e-01, ...,
        -5.32239676e-04,  5.72500000e+01,  9.00001526e-02],
       [-6.07252121e-04, -1.39999390e-01, -2.79998779e-01, ...,
         3.92049551e-05, -3.97099609e+01, -3.99999619e-02],
       ...,
       [ 1.37692690e-03,  3.70002747e-01,  8.60000610e-01, ...,
         2.58404762e-05,  6.52929688e+00,  3.99999619e-02],
       [-2.14487314e-03, -2.00000763e-01,  1.90002441e-01, ...,
        -2.36257911e-05,  1.15941406e+02,  3.00006866e-02],
       [-2.04801559e-04, -2.50000000e-01, -2.70004272e-01, ...,
        -7.15889037e-05,  5.04589844e+01, -2.20000267e-01]])

In [43]:
pd.concat([train.loc[-1:], test.loc[1232:1234]])

Unnamed: 0,CADUSD=X_change_in_price,GM_change_in_price,JCI_change_in_price,TM_change_in_price,TRYUSD=X_change_in_price,^IXIC_change_in_price,F_change_in_price
0,-0.000418,0.099998,0.330002,-0.569992,-0.000812,15.439453,0.010000
1,0.001932,0.230000,0.160000,-0.309998,-0.000532,57.250000,0.090000
2,-0.000607,-0.139999,-0.279999,-1.449997,0.000039,-39.709961,-0.040000
3,-0.000238,-0.119999,-0.560001,0.059998,0.000314,-97.479492,-0.050000
4,-0.001081,-0.350002,-0.480000,0.459991,0.000223,-47.350586,-0.120000
...,...,...,...,...,...,...,...
1230,-0.002145,-0.200001,0.190002,0.330002,-0.000024,115.941406,0.030001
1231,-0.000205,-0.250000,-0.270004,-2.360001,-0.000072,50.458984,-0.220000
1232,-0.001776,4.799999,-0.529999,0.759995,0.000026,39.759766,0.230000
1233,0.000905,-0.810001,-0.159996,1.400009,-0.000006,-303.119141,-0.049999


In [150]:
acc = {}
res = {}
forecast_length = 4
#for i in range(1):
for i in range(len(change_in_price_cols) - 1):   
    train_final = train[[change_in_price_cols[-1], change_in_price_cols[i]]].copy()
    test_final = test[[change_in_price_cols[-1], change_in_price_cols[i]]].copy()
    scaler = StandardScaler()
    scaled_train = pd.DataFrame(scaler.fit_transform(train_final), columns = train_final.columns, index = train_final.index)
    scaled_test = pd.DataFrame(scaler.transform(test_final), columns = test_final.columns, index = test_final.index)
    model = VAR(scaled_train)
    result = model.fit(optimal_order.loc[i, optimal_order.columns[forecast_length]])
    k = result.k_ar
    first_test_index = scaled_test.index[0]
    
    input_scaled = scaled_train[-k:]
    pred_evaluate = []
    j = 0
    print(input_scaled)
    while len(pred_evaluate) < 5:
        pred_evaluate = np.concatenate((pred_evaluate, result.forecast(input_scaled[-k:].values, steps=forecast_length)[:, 0]))
        input_scaled = pd.concat([scaled_train, scaled_test.loc[first_test_index: first_test_index + (j + 1)*forecast_length - 1]])
        j += 1
        
    # If len(pred_evaluate) > 5, delete the last values until len(pred_evaluate) == 5
    while len(pred_evaluate) > 5:
        pred_evaluate = np.delete(pred_evaluate, -1)
    
    predictions.loc[:, prediction_cols[i]] = pred_evaluate
    #close_predictions.loc[:, prediction_cols[i]] = df_close.iloc[-6]['F_Close_pred'] + predictions[prediction_cols[i]].cumsum()
    acc[change_in_price_cols[i]] = sum(pred_evaluate*scaled_test['F_change_in_price'] > 0)/len(test_final)
    res[change_in_price_cols[i]] = result

[]
      F_change_in_price  CADUSD=X_change_in_price
1227          -0.238040                 -0.341182
1228           0.550306                  0.356382
1229           0.112336                  0.426796
1230           0.083141                 -0.647439
1231          -0.646813                 -0.055670
[ 0.02002761 -0.01344574 -0.04673383  0.02505045]
[ 0.02002761 -0.01344574 -0.04673383  0.02505045 -0.04465597  0.04582575
 -0.02364045  0.00545143]
[]
      F_change_in_price  GM_change_in_price
1226           0.579504            0.729006
1227          -0.238040           -0.769184
1228           0.550306            1.118143
1229           0.112336            0.349596
1230           0.083141           -0.204930
1231          -0.646813           -0.253572
[-0.09344347 -0.00196111  0.00085182 -0.03128262]
[-0.09344347 -0.00196111  0.00085182 -0.03128262 -0.45110956 -0.06793721
 -0.11540554  0.02892272]
[]
      F_change_in_price  JCI_change_in_price
1197           0.229128             1.73

In [149]:
acc

{'CADUSD=X_change_in_price': 0.4}

In [151]:
predictions

Unnamed: 0,CADUSD=X_VAR,GM_VAR,JCI_VAR,TM_VAR,TRYUSD=X_VAR,^IXIC_VAR
1232,0.020028,-0.093443,-0.135766,0.1257,-0.028086,0.1731
1233,-0.013446,-0.001961,-0.386989,0.214455,-0.011472,-0.060164
1234,-0.046734,0.000852,0.063272,0.057589,-0.016701,0.168694
1235,0.02505,-0.031283,-0.06288,-0.211292,-0.002453,0.010572
1236,-0.044656,-0.45111,0.35858,0.058109,-0.022613,0.009824


In [134]:
pred_evaluate

array([-0.10295985, -0.11957369, -0.21235282,  0.01207674, -0.09505734,
        0.00603343])

In [140]:
np.delete(pred_evaluate, 0, -1)

array([-0.11957369, -0.21235282,  0.01207674, -0.09505734,  0.00603343])

In [104]:
pred_evaluate[:, 0]

array([ 0.02002761, -0.01344574, -0.04673383,  0.02505045,  0.00435667])

In [120]:
kate_test = []
kate_test

[]

In [121]:
kate_test += [0.43234]
kate_test

[0.43234]

In [119]:
kate_test = [0.43234]
kate_test1 = [23412]
kate_test = kate_test + kate_test1
kate_test

[0.43234, 23412]

In [99]:
kate_test + pred_evaluate[:, 0]

array([ 0.04005522, -0.02689148, -0.09346766,  0.05010089,  0.00871334])

In [100]:
kate_test

array([ 0.02002761, -0.01344574, -0.04673383,  0.02505045,  0.00435667])

In [56]:
#scaled_test.loc[first_test_index: first_test_index + (2 + 1)*1 - 1]

Unnamed: 0,F_change_in_price,CADUSD=X_change_in_price
1232,0.667098,-0.534881
1233,-0.150443,0.282876
1234,0.521107,-0.320274


In [None]:
def VAR_test(forecast_length=5):
    acc = {}
    res = {}
    for i in range(len(change_in_price_cols) - 1):   
        train_final = train[[change_in_price_cols[-1], change_in_price_cols[i]]].copy()
        test_final = test[[change_in_price_cols[-1], change_in_price_cols[i]]].copy()
        scaler = StandardScaler()
        scaled_train = pd.DataFrame(scaler.fit_transform(train_final), columns = train_final.columns, index = train_final.index)
        scaled_test = pd.DataFrame(scaler.transform(test_final), columns = test_final.columns, index = test_final.index)
        model = VAR(scaled_train)
        result = model.fit(optimal_order.loc[i, 'Optimal order'])
        k = result.k_ar
        pred = result.forecast(scaled_train[-k:].values, steps=forecast_length)
        predictions.loc[:, prediction_cols[i]] = pred[:, 0]
        close_predictions.loc[:, prediction_cols[i]] = df_close.iloc[-6]['F_Close_pred'] + predictions[prediction_cols[i]].cumsum()
        acc[change_in_price_cols[i]] = sum(pred[:, 0]*scaled_test['F_change_in_price'] > 0)/len(test_final)
        res[change_in_price_cols[i]] = result
    return predictions, close_predictions, acc, res