In [23]:
import pandas as pd
import numpy as np
from numpy import concatenate
import itertools
import re
from math import sqrt
import os
import warnings
import pickle

import matplotlib.pyplot as plt
from matplotlib import pyplot
import matplotlib

import statsmodels.api as sm
from statsmodels.tsa.stattools import adfuller,grangercausalitytests

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler

import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense, LSTM,Dropout, RNN
from keras_tuner import BayesianOptimization, RandomSearch, GridSearch
import keras

In [2]:
def series_to_supervised(data, n_in=1, n_out=1, dropnan=False):
    n_vars = 1 if type(data) is list else data.shape[1]
    df = pd.DataFrame(data)
    cols, names = list(), list()
    # input sequence (t-n, ... t-1)
    for i in range(n_in, 0, -1):
        cols.append(df.shift(i))
        names += [('var%d(t-%d)' % (j+1, i)) for j in range(n_vars)]
    # forecast sequence (t, t+1, ... t+n)
    for i in range(0, n_out):
        cols.append(df.shift(-i))
        if i == 0:
            names += [('var%d(t)' % (j+1)) for j in range(n_vars)]
        else:
            names += [('var%d(t+%d)' % (j+1, i)) for j in range(n_vars)]
    # put it all together
    agg = pd.concat(cols, axis=1)
    agg.columns = names
    # drop rows with NaN values
    if dropnan:
        agg.dropna(inplace=True)
    return agg

def adf_test(df):
    result = adfuller(df.values)
    print('ADF Statistics: %f' % result[0])
    print('p-value: %f' % result[1])
    print('Critical values:')
    for key, value in result[4].items():
        print('\t%s: %.3f' % (key, value))
    return result[1]

def check_stationarity(dataset):
    stationary_fields=[]
    non_stationary_fields=[]
    for col in dataset.columns:
        print('______%s_______'%col)
        p_value=adf_test(dataset[col])
        if p_value<=0.05:
            stationary_fields.append(col)
        else:
            non_stationary_fields.append(col)

    print('Number of stationary fields :%s'%len(stationary_fields))
    print('Number of non stationary fields :%s'%len(non_stationary_fields))
    
    return stationary_fields,non_stationary_fields

def feature_selection(reframed_dataset,number_of_lags,target_index, keep_granger_causality,keep_all_lags):
    col_pattern=r't-%s'%(number_of_lags)
    target_features=['var%s(t-%s)'%(target_index,i) for i in range(1,number_of_lags)]
    target=['var%s(t)'%(target_index)]
    
    if keep_all_lags==False:
        tlag_col=[col for col in reframed_dataset.columns if re.search(col_pattern,col,re.I)!=None]
        reframed_dataset_final=reframed_dataset[tlag_col+target_features+target]
    else:
        tlag_col=[col for col in reframed_dataset.columns if re.search(r'\(t\)',col,re.I)==None]
        reframed_dataset_final=reframed_dataset[tlag_col+target]
    
    if keep_granger_causality==True:
        reframed_dataset_final_transformed = reframed_dataset_final.dropna()
        i=1
        stationary_fields,non_stationary_fields=check_stationarity(reframed_dataset_final_transformed)
        
        while(len(non_stationary_fields)>0):
            print('Taking %sth difference'%(i))
            reframed_dataset_final_transformed = reframed_dataset_final_transformed.diff().dropna()
            stationary_fields,non_stationary_fields=check_stationarity(reframed_dataset_final_transformed)
            i=i+1
            

        correlated_col=[]
        for col in [col for col in reframed_dataset_final_transformed.columns if col!='var%s(t)'%(target_index)]:
            granger_ = grangercausalitytests(reframed_dataset_final_transformed[['var%s(t)'%(target_index), col]], 4)
            if (granger_[1][0]['ssr_chi2test'][1]<=0.05)|(granger_[2][0]['ssr_chi2test'][1]<=0.05)|(granger_[3][0]['ssr_chi2test'][1]<=0.05)|(granger_[4][0]['ssr_chi2test'][1]<=0.05):
                correlated_col.append(col)
    else:
        correlated_col=[col for col in reframed_dataset_final.columns if col!='var%s(t)'%(target_index)]
    
    reframed_dataset_final_features=reframed_dataset_final[correlated_col+target].copy()
    reframed_dataset_final_features.dropna(inplace=True)
    return reframed_dataset_final_features


def scale_and_split(reframed_dataset_final_features):
    scaler=MinMaxScaler()
    reframed_dataset_final_scaled=scaler.fit_transform(reframed_dataset_final_features)
    reframed_dataset_final_scaled=pd.DataFrame(reframed_dataset_final_scaled,columns=reframed_dataset_final_features.columns)

    reframed_dataset_final_scaled.index=reframed_dataset_final_features.index
    
    reframed_dataset_final_scaled_values=reframed_dataset_final_scaled.values
    train_limit=reframed_dataset_final_scaled_values.shape[0]-12

    train_X=reframed_dataset_final_scaled_values[0:train_limit,:-1]
    train_Y=reframed_dataset_final_scaled_values[0:train_limit,-1]

    test_X=reframed_dataset_final_scaled_values[train_limit:,:-1]
    test_Y=reframed_dataset_final_scaled_values[train_limit:,-1]
    
    # reshape input to be 3D [samples, timesteps, features]
    train_X = train_X.reshape((train_X.shape[0], 1, train_X.shape[1]))
    test_X = test_X.reshape((test_X.shape[0], 1, test_X.shape[1]))
    print(train_X.shape, train_Y.shape, test_X.shape, test_Y.shape)
    
    return train_X, train_Y, test_X, test_Y,scaler

def hyperparameter_tuning(train_X, train_Y, test_X, test_Y, country):
    
    def build_model(hp):
        model = Sequential()

        model.add(LSTM(hp.Int('input_unit',min_value=10,max_value=500,step=1),return_sequences=True, 
                       input_shape=(train_X.shape[1],train_X.shape[2])))
        for i in range(hp.Int('n_layers', 1, 4)):
            model.add(LSTM(hp.Int(f'lstm_{i}_units',min_value=10,max_value=500,step=1),return_sequences=True))
        model.add(LSTM(hp.Int('layer_2_neurons',min_value=10,max_value=500,step=1)))

        model.add(Dropout(hp.Float('Dropout_rate',min_value=0,max_value=0.99,step=0.1)))

        model.add(Dense(1, activation=hp.Choice('dense_activation',values=['sigmoid','relu','tanh','linear','selu','elu'],
                                                default='relu')))

        model.compile(loss='mean_squared_error', 
                      optimizer=keras.optimizers.Adam(hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])),metrics =['mse'])
        return model
    
    tuner= GridSearch(
        build_model,
        objective='mse',
        max_trials=10,
        executions_per_trial=1,
        overwrite=True,
        project_name='keras_tuning_%s_LSTM'%(country)
        )

    tuner.search(
            x=train_X,
            y=train_Y,
            epochs=100,
            batch_size=72,
            validation_data=(test_X,test_Y),
    )
    
    best_model = tuner.get_best_models(num_models=1)[0]
    
    return best_model

def rmse_calculator(best_model,test_X, test_Y,scaler):
    
    # make a prediction
    yhat = best_model.predict(test_X)

    # invert scaling for forecast
    test_X_reshaped = test_X.reshape((test_X.shape[0], test_X.shape[2]))
    inv_yhat = concatenate((test_X_reshaped, yhat), axis=1)
    inv_yhat = scaler.inverse_transform(inv_yhat)
    inv_yhat = inv_yhat[:,-1]

    # invert scaling for actual
    test_Y_reshaped = test_Y.reshape((len(test_Y), 1))
    inv_y = concatenate((test_X_reshaped,test_Y_reshaped), axis=1)
    inv_y = scaler.inverse_transform(inv_y)
    inv_y = inv_y[:,-1]
    
    rmse = sqrt(mean_squared_error(inv_y, inv_yhat))
    print('Original Values: %s'%inv_y)
    print('Predicted Values: %s'%inv_yhat)
    
    return rmse

In [3]:
def main(mrd_directory, number_of_lags, target_index, country, keep_granger_causality, keep_all_lags):
    df_MRD=pd.read_excel(mrd_directory)
    df_MRD['Time']=pd.to_datetime(df_MRD['Time'],format='%Y-%m')
    df_MRD=df_MRD.set_index('Time')

    reframed_dataset = series_to_supervised(df_MRD, number_of_lags, 1)
    
    reframed_dataset_final_features=feature_selection(reframed_dataset,number_of_lags,target_index,keep_granger_causality,keep_all_lags)
    
    train_X, train_Y, test_X, test_Y,scaler=scale_and_split(reframed_dataset_final_features)
    
    best_model=hyperparameter_tuning(train_X, train_Y, test_X, test_Y, country)
    
    rmse=rmse_calculator(best_model,test_X, test_Y,scaler)
    
    return best_model,scaler,rmse,str(reframed_dataset.shape),str(reframed_dataset_final_features.shape)

# EXPERIMENT-1

In [4]:
canada_model, canada_scaler, canada_test_rmse, canada_pre_shape, canada_post_shape=main(r'Datasets\Canada_LSTM_MRD.xlsx',5,1,'Canada',True,False)

print(canada_test_rmse)

Trial 10 Complete [00h 00m 30s]
mse: 0.0013423407217487693

Best mse So Far: 0.0008916739607229829
Total elapsed time: 00h 06m 42s
INFO:tensorflow:Oracle triggered exit
Original Values: [5.3 5.2 4.9 4.9 5.3 5.2 5.2 5.1 5.  5.  5.  5. ]
Predicted Values: [5.49403079 5.46846077 5.48195317 5.3382588  5.37924559 5.84874704
 5.77051603 5.80913804 5.68498523 5.61344274 5.63813771 5.66663379]
0.5471985008357692


In [5]:
france_model, france_scaler, france_test_rmse, france_pre_shape, france_post_shape=main(r'Datasets\France_LSTM_MRD.xlsx',5,1,'France',True,False)

print(france_test_rmse)

Trial 10 Complete [00h 00m 35s]
mse: 0.0006314384518191218

Best mse So Far: 0.0006314384518191218
Total elapsed time: 00h 05m 30s
INFO:tensorflow:Oracle triggered exit
Original Values: [7.4 7.5 7.5 7.5 7.3 7.2 7.1 7.2 7.2 7.2 7.1 7. ]
Predicted Values: [7.33389376 7.40188015 7.4877393  7.54327795 7.55448675 7.55587878
 7.51877462 7.4807671  7.56534937 7.59535189 7.63327245 7.57986614]
0.33813753900368465


In [6]:
germany_model, germany_scaler, germany_test_rmse, germany_pre_shape, germany_post_shape=main(r'Datasets\Germany_LSTM_MRD.xlsx',5,1,'Germany',True,False)

print(germany_test_rmse)

Trial 10 Complete [00h 00m 34s]
mse: 0.00024674442829564214

Best mse So Far: 0.0002304707159055397
Total elapsed time: 00h 06m 57s
INFO:tensorflow:Oracle triggered exit
Original Values: [3.  3.  3.  3.  3.1 3.1 3.1 3.1 3.  3.  2.9 2.9]
Predicted Values: [3.00688552 2.9441796  2.9        2.9        2.9        2.9
 2.9        2.9        2.9        2.9        2.9        2.9       ]
0.13011639886904694


In [7]:
italy_model, italy_scaler, italy_test_rmse, italy_pre_shape, italy_post_shape=main(r'Datasets\Italy_LSTM_MRD.xlsx',5,1,'Italy',True,False)

print(italy_test_rmse)

Trial 10 Complete [00h 00m 29s]
mse: 0.001212011557072401

Best mse So Far: 0.001164554967544973
Total elapsed time: 00h 06m 21s
INFO:tensorflow:Oracle triggered exit
Original Values: [8.3 8.2 8.1 8.  8.  8.1 8.  7.9 7.9 7.9 8.  8. ]
Predicted Values: [8.44104999 8.29410549 8.24400452 8.21089192 8.06123775 8.00884373
 8.05245563 7.99464988 7.95993867 7.91107864 7.90162231 7.8941192 ]
0.10891038360968584


In [8]:
uk_model, uk_scaler, uk_test_rmse, uk_pre_shape, uk_post_shape=main(r'Datasets\UK_LSTM_MRD.xlsx',5,1,'UK',True,False)

print(uk_test_rmse)

Trial 10 Complete [00h 00m 27s]
mse: 0.0005045221769250929

Best mse So Far: 0.0005045221769250929
Total elapsed time: 00h 06m 38s
INFO:tensorflow:Oracle triggered exit
Original Values: [3.8 3.7 3.8 3.8 3.8 3.6 3.5 3.6 3.7 3.7 3.7 3.7]
Predicted Values: [4.02222247 3.93212927 3.88217597 3.88866465 3.90214238 3.81525025
 3.70331729 3.61036413 3.62926253 3.67095043 3.72269431 3.77864207]
0.13803365471676862


In [9]:
us_model, us_scaler, us_test_rmse, us_pre_shape, us_post_shape=main(r'Datasets\US_LSTM_MRD.xlsx',5,1,'US',True,False)

print(us_test_rmse)

Trial 10 Complete [00h 00m 35s]
mse: 0.0035264750476926565

Best mse So Far: 0.003472226671874523
Total elapsed time: 00h 06m 09s
INFO:tensorflow:Oracle triggered exit
Original Values: [3.6 3.6 3.6 3.5 3.7 3.5 3.7 3.6 3.5 3.4 3.6 3.5]
Predicted Values: [4.33160041 4.62155    4.39613931 4.08394837 3.62891416 3.78196191
 3.59702017 3.73032752 3.66425997 3.5456735  3.45406563 4.06448804]
0.5044740326766116


In [35]:
japan_model, japan_scaler, japan_test_rmse, japan_pre_shape, japan_post_shape=main(r'Datasets\Japan_LSTM_MRD.xlsx',5,5,'Japan',True,False)

print(japan_test_rmse)

Trial 10 Complete [00h 00m 38s]
mse: 0.0012412657961249352

Best mse So Far: 0.0012412657961249352
Total elapsed time: 00h 07m 29s
INFO:tensorflow:Oracle triggered exit
Original Values: [2.6 2.6 2.6 2.6 2.6 2.5 2.6 2.6 2.5 2.5 2.4 2.6]
Predicted Values: [2.64932756 2.59365514 2.56721132 2.55342952 2.54355308 2.54355308
 2.48283333 2.50122999 2.52231394 2.45943856 2.44363935 2.39521947]
0.08178561493853004


# SAVING MODELS

In [26]:
with open(r'Models/US_LSTM_Model.sav','wb') as f:
    pickle.dump(us_model,f)

In [27]:
with open(r'Models/UK_LSTM_Model.sav','wb') as f:
    pickle.dump(uk_model,f)

In [28]:
with open(r'Models/Italy_LSTM_Model.sav','wb') as f:
    pickle.dump(italy_model,f)

In [29]:
with open(r'Models/Germany_LSTM_Model.sav','wb') as f:
    pickle.dump(germany_model,f)

In [30]:
with open(r'Models/France_LSTM_Model.sav','wb') as f:
    pickle.dump(france_model,f)

In [31]:
with open(r'Models/Canada_LSTM_Model.sav','wb') as f:
    pickle.dump(canada_model,f)

In [36]:
with open(r'Models/Japan_LSTM_Model.sav','wb') as f:
    pickle.dump(japan_model,f)