# Processing pipeline

In [1]:
import sys
sys.path.insert(1, '/Users/jakoliendenhollander/capstone/capstone')
import os

import warnings

import pandas as pd
import numpy as np

import tidy_functions.load_data
import tidy_functions.clean_data
import tidy_functions.merge_data
import tidy_functions.feature_engineering

warnings.filterwarnings(action='ignore')
pd.set_option('display.max_columns', None) # To display all columns

from sklearn.preprocessing import MinMaxScaler

import math 

import tensorflow as tf
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense
from keras.layers import Dropout

from sklearn.metrics import mean_absolute_error

## Read in data

In [2]:
# Reading in survey data from csv into a dictionary of dataframes.
dfs_country = tidy_functions.load_data.load_survey_data("/Users/jakoliendenhollander/capstone/capstone/data/CMU_Global_data/Full_Survey_Data/country/smooth/", "country")

# Concatenating individuals dataframes from the dictionary into one dataframe for regions.
survey_data = pd.concat(dfs_country, ignore_index=True)

# Corona stats
covid_cases = pd.read_csv("/Users/jakoliendenhollander/capstone/capstone/data/Corona_stats/owid-covid-data.csv")
print('Read in covid data completed.')

# Mask wearing requirements
mask_wearing_requirements = pd.read_csv("/Users/jakoliendenhollander/capstone/capstone/data/data-nbhtq.csv")
print('Read in mask wearing requirements data completed.')

Read in survey data completed.
Read in covid data completed.
Read in mask wearing requirements data completed.


## Cleaning data

In [3]:
# Survey data
survey_data = tidy_functions.clean_data.delete_other_gender(survey_data)
survey_data = tidy_functions.clean_data.deal_with_NaNs_masks(survey_data)

# Corona stats
covid_cases = tidy_functions.clean_data.deal_with_NaNs_corona_stats(covid_cases)

# Mask wearing requirements
mask_wearing_requirements = tidy_functions.clean_data.prepare_mask_req(mask_wearing_requirements)
mask_wearing_requirements = tidy_functions.clean_data.dummies_mask_req(mask_wearing_requirements)
mask_wearing_requirements = tidy_functions.clean_data.dummies_public_mask_req(mask_wearing_requirements)
mask_wearing_requirements = tidy_functions.clean_data.dummies_indoors_mask_req(mask_wearing_requirements)
mask_wearing_requirements = tidy_functions.clean_data.dummies_transport_mask_req(mask_wearing_requirements)
mask_wearing_requirements = tidy_functions.clean_data.data_types_mask_req(mask_wearing_requirements)

# HDI
hdi_data = tidy_functions.clean_data.rename_hdi_countries("/Users/jakoliendenhollander/capstone/capstone/data/","hdro_statistical_data_tables_1_15_d1_d5.xlsx")
dict_hdi = tidy_functions.clean_data.create_hdi_dict(hdi_data)
dict_hdi_levels = tidy_functions.clean_data.create_hdi_levels_dict(hdi_data)

NaNs before update: 152923
NaNs after update: 0
Updated NaNs in wear_mask_all_time.
NaNs removed.
Step 1 of cleaning requirements completed.
Step 2 of cleaning requirements completed.
Step 3 of cleaning requirements completed.
Step 4 of cleaning requirements completed.
Step 5 of cleaning requirements completed.
Step 6 of cleaning requirements completed.
Creating dictionaries for hdi completed.
Creating dictionaries for hdi-levels completed.


## Merging data

In [4]:
covid_merge = tidy_functions.merge_data.merge_corona_stats(survey_data,covid_cases)
requirements_merge = tidy_functions.merge_data.merge_mask_req(covid_merge,mask_wearing_requirements)
hdi_merge = tidy_functions.merge_data.create_hdi_columns(requirements_merge, dict_hdi, dict_hdi_levels)

Merging corona stats completed.
Merging mask wearing requirements completed.
Creating hdi list completed.
Creating hdi-level list completed.


## Feature engineering

In [5]:
date_fixed = tidy_functions.feature_engineering.insert_month(hdi_merge)
requirement_date = tidy_functions.feature_engineering.add_requirement_by_date(date_fixed)

Month column created.
Feature engineering completed.


In [6]:
# Make a copy of the dataframe
df = requirement_date.copy()

In [7]:
# Select overall rows
df = df[df["age_bucket"]=="overall"]
df = df[df["gender"]=="overall"]

In [8]:
date = ["date"]

columns_general = ["iso_code", "hdi", "median_age"]

columns_general_no_iso = ["hdi", "median_age"]

columns_social_distancing = ["smoothed_pct_worked_outside_home_weighted", "smoothed_pct_grocery_outside_home_weighted", "smoothed_pct_ate_outside_home_weighted", 
                             "smoothed_pct_attended_public_event_weighted", "smoothed_pct_used_public_transit_weighted", 
                             "smoothed_pct_direct_contact_with_non_hh_weighted", "smoothed_pct_no_public_weighted"]

columns_mask_wearing = ["smoothed_pct_wear_mask_all_time_weighted", "smoothed_pct_wear_mask_most_time_weighted"]

columns_mask_req = ["cur_mask_recommended", "cur_mask_not_required", "cur_mask_not_required_recommended", "cur_mask_not_required_universal", 
                    "cur_mask_required_part_country", "cur_mask_everywhere_in_public", "cur_mask_public_indoors", "cur_mask_public_transport"]

columns_pred = ["total_cases_per_million"]

columns_interest = date + columns_general + columns_social_distancing + columns_mask_wearing + columns_mask_req + columns_pred

columns_rev_scale = columns_general_no_iso + columns_social_distancing + columns_mask_wearing + columns_mask_req + columns_pred

In [9]:
df_select = df[columns_interest]

In [10]:
df_select = df_select.sort_values('date')

In [11]:
df_select = df_select.drop("iso_code", axis=1)

In [27]:
#divide the data into train and test data
train_size = int(len(df_select) * 0.80)
test_size = len(df_select) - train_size
train, test = df_select[0:train_size], df_select[train_size:len(df_select)]

In [28]:
to_scale = ["median_age", "smoothed_pct_worked_outside_home_weighted", "smoothed_pct_grocery_outside_home_weighted", 
            "smoothed_pct_ate_outside_home_weighted", "smoothed_pct_attended_public_event_weighted", 
            "smoothed_pct_used_public_transit_weighted", "smoothed_pct_direct_contact_with_non_hh_weighted", 
            "smoothed_pct_no_public_weighted", "smoothed_pct_wear_mask_all_time_weighted", 
            "smoothed_pct_wear_mask_most_time_weighted", "total_cases_per_million"]

In [29]:
#scale the values
scaler = MinMaxScaler()
train[to_scale] = scaler.fit_transform(train[to_scale])
test[to_scale] = scaler.transform(test[to_scale])

In [32]:
# Get a list of all features grouped by date.
train = train.groupby('date').agg(list)
test = test.groupby('date').agg(list)

In [33]:
# Add -1s to ensure the same length of all cells.
for r in range(len(train)):
    for c in range(len(train.columns)):
        if len(train.iloc[r,c]) < 117:
            train.iloc[r,c] = train.iloc[r,c] + [-1]*(117 - len(train.iloc[r,c]))

In [34]:
# Add -1s to ensure the same length of all cells.
for r in range(len(test)):
    for c in range(len(test.columns)):
        if len(test.iloc[r,c]) < 117:
            test.iloc[r,c] = test.iloc[r,c] + [-1]*(117 - len(test.iloc[r,c]))

In [41]:
#index the data into dependent and independent variables
train_X, train_y = train.drop("total_cases_per_million", axis=1), train["total_cases_per_million"]
test_X, test_y =  test.drop("total_cases_per_million", axis=1), test["total_cases_per_million"]
print(train_X.shape, train_y.shape, test_X.shape, test_y.shape)

(152, 19) (152,) (39, 19) (39,)


In [43]:
# Create a numpy array
train_X = train_X.reset_index()
train_X = train_X.to_numpy()
test_X = test_X.reset_index()
test_X = test_X.to_numpy()

train_y = train_y.values
test_y = test_y.values

In [45]:
#convert data into suitable dimension for using it as input in LSTM network
train_X = train_X.reshape((train_X.shape[0], 1, train_X.shape[1]))
test_X = test_X.reshape((test_X.shape[0], 1, test_X.shape[1]))
print(train_X.shape, train_y.shape, test_X.shape, test_y.shape)

(152, 1, 20) (152,) (39, 1, 20) (39,)


In [46]:
model = Sequential()
model.add(LSTM(13, input_shape=(train_X.shape[1], train_X.shape[2])))
model.add(Dropout(0.2))
model.add(Dense(1))
model.compile(loss='mae', optimizer='adam')

history = model.fit(train_X, train_y, epochs=13, batch_size=40, validation_data=(test_X, test_y), verbose=2, shuffle=False)

plt.plot(history.history['loss'], label='train')
plt.plot(history.history['val_loss'], label='test')
plt.legend()
plt.show()

ValueError: Failed to convert a NumPy array to a Tensor (Unsupported object type Timestamp).

### Create a list with multiple entries of features per date

In [12]:
df_for_dates = df_select.copy()

In [13]:
# Find the number of entries per feature for the date with the largest number of features.
df_for_dates['date'].value_counts().max()

117

In [14]:
# Get a list of all features grouped by date.
grouped_by_date = df_for_dates.groupby('date').agg(list)

In [16]:
# Add -1s to ensure the same length of all cells.
for r in range(len(grouped_by_date)):
    for c in range(len(grouped_by_date.columns)):
        if len(grouped_by_date.iloc[r,c]) < 117:
            grouped_by_date.iloc[r,c] = grouped_by_date.iloc[r,c] + [-1]*(117 - len(grouped_by_date.iloc[r,c]))

In [23]:
# Create a numpy array
date_groups = grouped_by_date.reset_index()
date_groups = date_groups.to_numpy()