In [None]:
import os
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
import matplotlib.pyplot as plt
from tensorflow.keras import models, optimizers
from keras.callbacks import EarlyStopping, ReduceLROnPlateau
from datetime import datetime as dt
from sklearn.model_selection import KFold
import import_ipynb
from Model_dev import compress_to_2d, get_model, LSTM_model, Attention, C_LSTM, data_handling, model_development, rmse, draw_graph
import copy

In [None]:
# --- Run configuration ------------------------------------------------------
# Country whose data file and models the rest of the notebook works with.
country = 'Mexico'

# Data, model and log folders, each resolved against the current working directory.
datadir, modeldir, logdir = (
    os.path.join(os.getcwd(), folder_name)
    for folder_name in ("Data", "Models", "Log")
)

# Input CSV of the selected country.
filename = "Full_{}.csv".format(country)
filepath = os.path.join(datadir, filename)

# Named groups of columns in the input table.
social_media = ['like_index', 'retweet_index']
covid_cases = ['ConfirmedCases', 'ConfirmedDeaths', 'Daily_cases']
general_info = ['CountryCode_x', 'CountryName_x', 'Jurisdiction', 'Date']
num_variable = [
    'E3_Fiscal measures',
    'E4_International support',
    'H5_Investment in vaccines',
    'H4_Emergency investment in healthcare',
]

# Window sizes handed to data_handling / model_development
# (presumably input-window length and forecast horizon in days -- confirm in Model_dev).
required_days = 14
pred_days = 7

In [None]:
# Make sure every folder the notebook reads from or writes to exists.
# os.makedirs(..., exist_ok=True) also creates missing parent folders and is a
# no-op when the folder is already there, so this cell is safe to re-run and
# has no check-then-create race.
for _folder in (modeldir, logdir, datadir, os.path.join(logdir, country)):
    os.makedirs(_folder, exist_ok=True)

In [None]:
# Read the country table (first CSV column is consumed as the initial index),
# then re-index the frame by its 'Date' column.
df = pd.read_csv(filepath, index_col=0)
df.set_index('Date', inplace=True)

# Every column that belongs to none of the social-media, general-info or
# case-count groups is treated as a policy feature (original column order kept).
columns = df.columns
non_policy_names = set(social_media) | set(general_info) | set(covid_cases)
policy = [column for column in columns if column not in non_policy_names]

# Policy columns that are not listed in num_variable are cast to the
# pandas "category" dtype.
categorical_variable = list(set(policy).difference(num_variable))
df[categorical_variable] = df[categorical_variable].astype("category")

In [None]:
# Experiment tagged "P": only the policy columns are used as features.
features = policy
x_train, x_test, y_train, y_test = data_handling(
    df, features, required_days, pred_days
)
# Hand the split to model_development together with the tag and output folders.
model_development(
    x_train, x_test, y_train, y_test,
    "P", country, modeldir, logdir,
    required_days, pred_days,
)

In [None]:
# Experiment tagged "PC": policy columns plus the COVID case-count columns.
features = policy + covid_cases
x_train, x_test, y_train, y_test = data_handling(
    df, features, required_days, pred_days
)
# Hand the split to model_development together with the tag and output folders.
model_development(
    x_train, x_test, y_train, y_test,
    "PC", country, modeldir, logdir,
    required_days, pred_days,
)

In [None]:
# Experiment tagged "PCS": policy, case-count and social-media columns together.
features = policy + covid_cases + social_media
x_train, x_test, y_train, y_test = data_handling(
    df, features, required_days, pred_days
)
# Hand the split to model_development together with the tag and output folders.
model_development(
    x_train, x_test, y_train, y_test,
    "PCS", country, modeldir, logdir,
    required_days, pred_days,
)

In [None]:
### Best model for the country
# Look up this country's entry in the model log and read the three fields
# needed to reload the model and rebuild its input data.
log_file = os.path.join(logdir, "Models.csv")
best_result = get_model(log_file, country)

features_used, algo, best_model_filepath = (
    best_result[field].values[0]
    for field in ('Features', 'Algorithm', 'Model_path')
)

# Reload the saved model (rmse is registered as a custom object) and keep its
# config so the same architecture can be rebuilt later.
best_model = models.load_model(best_model_filepath, custom_objects={'rmse': rmse})
config = best_model.get_config()

# Translate the logged feature tag back into a column list; any tag other than
# "P" or "PC" falls back to the full policy + cases + social-media set.
features = {
    "P": policy,
    "PC": policy + covid_cases,
}.get(features_used, policy + covid_cases + social_media)

# Rebuild the split for that feature set and collect predictions / targets,
# each passed through compress_to_2d.
x_train, x_test, y_train, y_test = data_handling(df, features, required_days, pred_days)
y_pred_train, y_pred_test = (
    compress_to_2d(best_model.predict(inputs)) for inputs in (x_train, x_test)
)
y_true_train, y_true_test = (
    compress_to_2d(targets) for targets in (y_train, y_test)
)

# Baseline mean-squared errors of the best model on the untouched data.
mse = tf.keras.losses.MeanSquaredError()
loss = mse(y_true_train, y_pred_train).numpy()
val_loss = mse(y_true_test, y_pred_test).numpy()

In [None]:
# Training callbacks and optimizer used when models are re-fitted further down.
# Stop once val_loss has not improved by at least 0.01 for 100 epochs.
early_stopping = EarlyStopping(monitor='val_loss', min_delta=0.01, patience=100, mode='min')
# Multiply the learning rate by 0.4 after 10 epochs without val_loss improvement,
# but never go below 1e-5. The lower bound must be passed as `min_lr`; the
# previous `min=` keyword is not a ReduceLROnPlateau parameter, so it was not
# recognised as the learning-rate floor.
reduce_lr = ReduceLROnPlateau(monitor='val_loss', patience=10, factor=0.4, min_lr=0.00001)
opt = optimizers.Adam(learning_rate=0.01)

In [None]:
### Feature importance for the country
# For every policy column: rebuild the data set with `swap_col` set to that
# column, score the already-trained best_model on it and compare the resulting
# MSE with the baseline `loss` / `val_loss` computed in the best-model cell.
# (What exactly data_handling does with `swap_col` is defined in Model_dev.)
result = {}
importance_columns = [
    'Old loss', 'Old Validation loss', 'New Loss', 'New Validation Loss',
    'Changes in loss', 'Changes in validation loss',
    'Ratio of new-old loss', 'Ratio of new-old validation loss',
]

for target in policy:

    x_train, x_test, y_train, y_test = data_handling(df, features, required_days, pred_days, swap_col=target)

    exp_train = compress_to_2d(best_model.predict(x_train))
    exp_test = compress_to_2d(best_model.predict(x_test))

    draw_graph(exp_train, exp_test, y_true_train, y_true_test, required_days, pred_days, country, features_used, x_train, algo)

    # Errors are measured against the targets of the untouched data set.
    _loss = mse(y_true_train, exp_train).numpy()
    _val_loss = mse(y_true_test, exp_test).numpy()

    print("######", target, "#####")
    print("Changes in loss: {}".format(_loss-loss))
    print("Changes in Validation loss: {}".format(_val_loss-val_loss))

    result[target] = [loss, val_loss, _loss, _val_loss, _loss-loss, _val_loss-val_loss, _loss/loss, _val_loss/val_loss]

Changes = pd.DataFrame.from_dict(result, orient='index', columns=importance_columns)
# Join the path component-wise instead of embedding a "/" in the file name,
# so the separator is always the platform's own.
Changes.to_csv(os.path.join(logdir, country, "{}_feat_importance.csv".format(country)))

In [None]:
### Feature importance for the country each month
# Repeats the swap-column importance analysis on growing prefixes of the data:
# for each listed month, only rows dated before the first day of the following
# month of 2021 are used, with test_size=30.
#
# All per-month intermediates use `month_`-prefixed names so that the
# full-data baseline (x_train, y_true_train, y_true_test, loss, val_loss, ...)
# computed in the best-model cell is NOT overwritten; code that runs after this
# loop still relies on that baseline.
months = ['Jan', 'Feb', 'March', 'April', 'May', 'June', 'July', 'August', 'Sept']

# Loop-invariant work: parse the index once and build the loss object once.
index_dates = pd.to_datetime(df.index, dayfirst=True)
mse = tf.keras.losses.MeanSquaredError()
importance_columns = [
    'Old loss', 'Old Validation loss', 'New Loss', 'New Validation Loss',
    'Changes in loss', 'Changes in validation loss',
    'Ratio of new-old loss', 'Ratio of new-old validation loss',
]

for month_number, month_name in enumerate(months, start=1):

    # Everything strictly before the 1st of the next month.
    month_df = df[index_dates < dt(2021, month_number + 1, 1)]

    # Baseline of the best model on this month's (unswapped) subset.
    month_x_train, month_x_test, month_y_train, month_y_test = data_handling(
        month_df, features, required_days, pred_days, test_size=30)
    month_pred_train = compress_to_2d(best_model.predict(month_x_train))
    month_pred_test = compress_to_2d(best_model.predict(month_x_test))
    month_true_train = compress_to_2d(month_y_train)
    month_true_test = compress_to_2d(month_y_test)

    month_loss = mse(month_true_train, month_pred_train).numpy()
    month_val_loss = mse(month_true_test, month_pred_test).numpy()

    result2 = {}

    print("######", month_name, "#####")

    for target in policy:

        xx_train, xx_test, yy_train, yy_test = data_handling(
            month_df, features, required_days, pred_days, swap_col=target, test_size=30)

        exp_train = compress_to_2d(best_model.predict(xx_train))
        exp_test = compress_to_2d(best_model.predict(xx_test))

        draw_graph(exp_train, exp_test, month_true_train, month_true_test, required_days, pred_days, country, features_used, xx_train, algo)

        # Errors are measured against this month's unswapped targets.
        _loss = mse(month_true_train, exp_train).numpy()
        _val_loss = mse(month_true_test, exp_test).numpy()

        print("######", target, "#####")
        print("Changes in loss: {}".format(_loss-month_loss))
        print("Changes in Validation loss: {}".format(_val_loss-month_val_loss))

        result2[target] = [month_loss, month_val_loss, _loss, _val_loss,
                           _loss-month_loss, _val_loss-month_val_loss,
                           _loss/month_loss, _val_loss/month_val_loss]

    Changes = pd.DataFrame.from_dict(result2, orient='index', columns=importance_columns)
    # Path joined component-wise rather than with a hard-coded "/".
    Changes.to_csv(os.path.join(logdir, country, "{}_feat_importance_{}.csv".format(country, month_name)))

### Feature importance for the country
# Retraining variant: for every policy column, rebuild the data with `swap_col`
# set to that column, re-create the best model's architecture from `config`,
# fit it on the swapped data and compare its MSE with the best model's baseline.
result2 = {}
importance_columns = [
    'Old loss', 'Old Validation loss', 'New Loss', 'New Validation Loss',
    'Changes in loss', 'Changes in validation loss',
    'Ratio of new-old loss', 'Ratio of new-old validation loss',
]

# Recompute the baseline on the untouched full data set right here. Earlier
# code in the notebook may have rebound loss / val_loss / y_true_* to a
# different subset (the per-month loop uses truncated data with test_size=30),
# which would make the comparisons below meaningless or shape-mismatched.
base_x_train, base_x_test, base_y_train, base_y_test = data_handling(df, features, required_days, pred_days)
y_true_train = compress_to_2d(base_y_train)
y_true_test = compress_to_2d(base_y_test)
loss = mse(y_true_train, compress_to_2d(best_model.predict(base_x_train))).numpy()
val_loss = mse(y_true_test, compress_to_2d(best_model.predict(base_x_test))).numpy()

kf = KFold()

for target in policy:

    x_train, x_test, y_train, y_test = data_handling(df, features, required_days, pred_days, swap_col=target)

    if algo == 'Attention':
        model = keras.Model.from_config(config)
    else:
        model = keras.Sequential.from_config(config)

    # Give every rebuilt model its own optimizer, cloned from the configured
    # `opt`. Sharing one optimizer instance across models carries over its
    # internal state and the learning rate already lowered by ReduceLROnPlateau.
    model.compile(optimizer=opt.__class__.from_config(opt.get_config()), loss=rmse)
    model.summary()

    # NOTE(review): the same model instance keeps training across all folds
    # (it is not re-initialised per fold), so by the end it has seen every
    # training sample -- confirm this is the intended use of KFold.
    for train_index, test_index in kf.split(x_train):
        xx_train, xx_test = x_train[train_index], x_train[test_index]
        yy_train, yy_test = y_train[train_index], y_train[test_index]

        history = model.fit(xx_train, yy_train, shuffle=True,
                 validation_data=(xx_test, yy_test),
                 epochs=1000,
                 callbacks=[early_stopping, reduce_lr],
                 verbose=0)

    exp_train = compress_to_2d(model.predict(x_train))
    exp_test = compress_to_2d(model.predict(x_test))

    # Errors are measured against the targets of the untouched data set.
    _loss = mse(y_true_train, exp_train).numpy()
    _val_loss = mse(y_true_test, exp_test).numpy()

    print("######", target, "#####")
    print("Changes in loss: {}".format(_loss-loss))
    print("Changes in Validation loss: {}".format(_val_loss-val_loss))

    result2[target] = [loss, val_loss, _loss, _val_loss, _loss-loss, _val_loss-val_loss, _loss/loss, _val_loss/val_loss]

Changes = pd.DataFrame.from_dict(result2, orient='index', columns=importance_columns)
# Written directly into logdir (not the per-country sub-folder).
Changes.to_csv(os.path.join(logdir, "{}_feat_importance.csv".format(country)))