# Processing pipeline

In [1]:
import sys
sys.path.insert(1, '/Users/jakoliendenhollander/capstone/capstone')
import warnings

import pandas as pd
import numpy as np
import datetime

from datetime import timedelta
from sklearn.preprocessing import MinMaxScaler
from sklearn.compose import TransformedTargetRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics

import tidy_functions.load_data
import tidy_functions.clean_data
import tidy_functions.merge_data
import tidy_functions.feature_engineering

# Silence every warning for the rest of the session. Note that this also hides
# deprecation and chained-assignment notices raised by the cells further down.
warnings.filterwarnings(action='ignore')
pd.set_option('display.max_columns', None) # To display all columns

## Read in data

In [2]:
# Root folder of the raw input files (trailing slash included). Every path below is built
# from this one constant, so running the notebook elsewhere needs a single edit.
DATA_DIR = "/Users/jakoliendenhollander/capstone/capstone/data/"

# Reading in survey data from csv into a dictionary of dataframes.
dfs_country = tidy_functions.load_data.load_survey_data(
    DATA_DIR + "CMU_Global_data/Full_Survey_Data/country/smooth/",
    "country",
)

# Concatenating the individual dataframes from the dictionary into one dataframe.
survey_data = pd.concat(dfs_country, ignore_index=True)

# Corona stats
covid_cases = pd.read_csv(DATA_DIR + "Corona_stats/owid-covid-data.csv")
print('Read in covid data completed.')

# Mask wearing requirements
mask_wearing_requirements = pd.read_csv(DATA_DIR + "data-nbhtq.csv")
print('Read in mask wearing requirements data completed.')

Read in survey data completed.
Read in covid data completed.
Read in mask wearing requirements data completed.


## Cleaning data

In [3]:
# --- Survey data ---
survey_data = tidy_functions.clean_data.delete_other_gender(survey_data)
survey_data = tidy_functions.clean_data.deal_with_NaNs_masks(survey_data)

# --- Corona stats ---
covid_cases = tidy_functions.clean_data.deal_with_NaNs_corona_stats(covid_cases)

# --- Mask wearing requirements ---
# The cleaning steps are applied strictly in this order; each one receives the
# frame returned by the previous step.
for cleaning_step in (
    tidy_functions.clean_data.prepare_mask_req,
    tidy_functions.clean_data.dummies_mask_req,
    tidy_functions.clean_data.dummies_public_mask_req,
    tidy_functions.clean_data.dummies_indoors_mask_req,
    tidy_functions.clean_data.dummies_transport_mask_req,
    tidy_functions.clean_data.data_types_mask_req,
):
    mask_wearing_requirements = cleaning_step(mask_wearing_requirements)

# --- HDI ---
# The HDI workbook is read by the helper itself (folder and file name are passed separately);
# two lookup objects are then derived from it.
hdi_data = tidy_functions.clean_data.rename_hdi_countries(
    "/Users/jakoliendenhollander/capstone/capstone/data/",
    "hdro_statistical_data_tables_1_15_d1_d5.xlsx",
)
dict_hdi = tidy_functions.clean_data.create_hdi_dict(hdi_data)
dict_hdi_levels = tidy_functions.clean_data.create_hdi_levels_dict(hdi_data)

NaNs before update: 152923
NaNs after update: 0
Updated NaNs in wear_mask_all_time.
NaNs removed.
Step 1 of cleaning requirements completed.
Step 2 of cleaning requirements completed.
Step 3 of cleaning requirements completed.
Step 4 of cleaning requirements completed.
Step 5 of cleaning requirements completed.
Step 6 of cleaning requirements completed.
Creating dictionaries for hdi completed.
Creating dictionaries for hdi-levels completed.


## Merging data

In [4]:
# Build the modelling table step by step: each merge helper receives the frame
# produced by the previous one.
# NOTE(review): join keys and join type live in tidy_functions.merge_data — confirm there.
covid_merge = tidy_functions.merge_data.merge_corona_stats(survey_data,covid_cases)
requirements_merge = tidy_functions.merge_data.merge_mask_req(covid_merge,mask_wearing_requirements)
# Adds the HDI information using the two HDI dictionaries created during cleaning.
hdi_merge = tidy_functions.merge_data.create_hdi_columns(requirements_merge, dict_hdi, dict_hdi_levels)

Merging corona stats completed.
Merging mask wearing requirements completed.
Creating hdi list completed.
Creating hdi-level list completed.


## Feature engineering

In [5]:
# Two feature-engineering helpers applied in sequence on the merged frame.
# NOTE(review): judging by its name and log message, insert_month adds a month column — confirm in tidy_functions.
date_fixed = tidy_functions.feature_engineering.insert_month(hdi_merge)
# NOTE(review): presumably derives the mask requirement in force on each row's date — confirm in tidy_functions.
requirement_date = tidy_functions.feature_engineering.add_requirement_by_date(date_fixed)

Month column created.
Feature engineering completed.


In [6]:
# Work on a copy so that `requirement_date` keeps the full engineered data set.
df = requirement_date.copy()

In [7]:
# Keep only the rows whose age_bucket and gender are both "overall".
df = df.loc[(df["age_bucket"] == "overall") & (df["gender"] == "overall")]

In [8]:
# Column groups used to select the modelling columns.
date = ["date"]

columns_general = ["iso_code", "hdi", "median_age"]

# The general columns without the country identifier.
columns_general_no_iso = [name for name in columns_general if name != "iso_code"]

columns_social_distancing = [
    "smoothed_pct_worked_outside_home_weighted",
    "smoothed_pct_grocery_outside_home_weighted",
    "smoothed_pct_ate_outside_home_weighted",
    "smoothed_pct_attended_public_event_weighted",
    "smoothed_pct_used_public_transit_weighted",
    "smoothed_pct_direct_contact_with_non_hh_weighted",
    "smoothed_pct_no_public_weighted",
]

columns_mask_wearing = [
    "smoothed_pct_wear_mask_all_time_weighted",
    "smoothed_pct_wear_mask_most_time_weighted",
]

columns_mask_req = [
    "cur_mask_recommended",
    "cur_mask_not_required",
    "cur_mask_not_required_recommended",
    "cur_mask_not_required_universal",
    "cur_mask_required_part_country",
    "cur_mask_everywhere_in_public",
    "cur_mask_public_indoors",
    "cur_mask_public_transport",
]

# Prediction target.
columns_pred = ["total_cases_per_million"]

# Everything selected from the full frame: date, identifiers, features and target.
columns_interest = [
    *date,
    *columns_general,
    *columns_social_distancing,
    *columns_mask_wearing,
    *columns_mask_req,
    *columns_pred,
]

# The same selection without the date and iso_code columns.
columns_rev_scale = [
    *columns_general_no_iso,
    *columns_social_distancing,
    *columns_mask_wearing,
    *columns_mask_req,
    *columns_pred,
]

In [9]:
# Select the modelling columns. An explicit copy is taken because new columns are
# added to df_select afterwards; writing into a bare selection of `df` is ambiguous
# (view vs. copy) and triggers pandas' chained-assignment warning.
df_select = df[columns_interest].copy()

In [10]:
# one day ago
# For every row, look up total_cases_per_million of the same iso_code exactly one day
# earlier; rows without a matching earlier date get NaN.
# This is done with one left join instead of scanning the whole frame for every row
# (which was quadratic in the number of rows and stored missing values as the string 'NaN').
lag_source = (
    df_select[["iso_code", "date", "total_cases_per_million"]]
    # Repeated (iso_code, date) pairs would multiply rows in the join; keep the first.
    .drop_duplicates(subset=["iso_code", "date"])
    # Moving the source dates forward by the lag lines each value up with the row that receives it.
    .assign(date=lambda frame: frame["date"] + timedelta(days=1))
    .rename(columns={"total_cases_per_million": "previous_day"})
)

# A left join keeps one output row per df_select row, in the original order.
previous_day = (
    df_select[["iso_code", "date"]]
    .merge(lag_source, how="left", on=["iso_code", "date"])["previous_day"]
    .to_numpy()
)

# assign() returns a new frame, so nothing is written into a possible view of `df`.
df_select = df_select.assign(previous_day=previous_day)

In [11]:
# one week ago
# For every row, look up total_cases_per_million of the same iso_code exactly seven days
# earlier; rows without a matching earlier date get NaN.
# This is done with one left join instead of scanning the whole frame for every row
# (which was quadratic in the number of rows and stored missing values as the string 'NaN').
lag_source = (
    df_select[["iso_code", "date", "total_cases_per_million"]]
    # Repeated (iso_code, date) pairs would multiply rows in the join; keep the first.
    .drop_duplicates(subset=["iso_code", "date"])
    # Moving the source dates forward by the lag lines each value up with the row that receives it.
    .assign(date=lambda frame: frame["date"] + timedelta(days=7))
    .rename(columns={"total_cases_per_million": "previous_7days"})
)

# A left join keeps one output row per df_select row, in the original order.
previous_7days = (
    df_select[["iso_code", "date"]]
    .merge(lag_source, how="left", on=["iso_code", "date"])["previous_7days"]
    .to_numpy()
)

# assign() returns a new frame, so nothing is written into a possible view of `df`.
df_select = df_select.assign(previous_7days=previous_7days)

In [12]:
# one month
# For every row, look up total_cases_per_million of the same iso_code exactly thirty days
# earlier; rows without a matching earlier date get NaN.
# This is done with one left join instead of scanning the whole frame for every row
# (which was quadratic in the number of rows and stored missing values as the string 'NaN').
lag_source = (
    df_select[["iso_code", "date", "total_cases_per_million"]]
    # Repeated (iso_code, date) pairs would multiply rows in the join; keep the first.
    .drop_duplicates(subset=["iso_code", "date"])
    # Moving the source dates forward by the lag lines each value up with the row that receives it.
    .assign(date=lambda frame: frame["date"] + timedelta(days=30))
    .rename(columns={"total_cases_per_million": "previous_30days"})
)

# A left join keeps one output row per df_select row, in the original order.
previous_30days = (
    df_select[["iso_code", "date"]]
    .merge(lag_source, how="left", on=["iso_code", "date"])["previous_30days"]
    .to_numpy()
)

# assign() returns a new frame, so nothing is written into a possible view of `df`.
df_select = df_select.assign(previous_30days=previous_30days)

In [13]:
# Continue on a copy so that df_select (including the lag columns) stays available unchanged.
df_time = df_select.copy()

In [14]:
# Force the three lag columns to a numeric dtype; values that cannot be parsed become NaN.
for lag_column in ("previous_day", "previous_7days", "previous_30days"):
    df_time[lag_column] = pd.to_numeric(df_time[lag_column], errors="coerce")

In [15]:
# Drop every row that has a missing value in any column (this includes rows without a lag value).
df_time = df_time.dropna()

In [16]:
# Order the rows by date; the train/test split further down slices this frame by position.
# NOTE(review): the default sort algorithm is not stable, so the relative order of rows
# that share a date is not guaranteed — pass kind="stable" if that matters for the split.
df_time = df_time.sort_values('date')

In [17]:
# Remove the identifier columns in two steps, keeping the intermediate frame around.
df_no_iso = df_time.drop(columns="iso_code")
df_no_date = df_no_iso.drop(columns="date")

In [18]:
# Positional hold-out split: the first 80 % of the rows form the training set,
# the remaining rows form the test set.
train_size = int(len(df_no_date) * 0.80)
test_size = len(df_no_date) - train_size
train = df_no_date.iloc[:train_size]
test = df_no_date.iloc[train_size:]

In [19]:
# Separate the predictors (X) from the target (y) for both partitions.
train_X = train.drop(columns="total_cases_per_million")
train_y = train["total_cases_per_million"]
test_X = test.drop(columns="total_cases_per_million")
test_y = test["total_cases_per_million"]
print(train_X.shape, train_y.shape, test_X.shape, test_y.shape)

(14140, 22) (14140,) (3535, 22) (3535,)


In [20]:
#to_scale = ["median_age", "smoothed_pct_worked_outside_home_weighted", "smoothed_pct_grocery_outside_home_weighted", 
#            "smoothed_pct_ate_outside_home_weighted", "smoothed_pct_attended_public_event_weighted", 
#            "smoothed_pct_used_public_transit_weighted", "smoothed_pct_direct_contact_with_non_hh_weighted", 
#            "smoothed_pct_no_public_weighted", "smoothed_pct_wear_mask_all_time_weighted", 
#            "smoothed_pct_wear_mask_most_time_weighted", "previous_day","previous_7days","previous_30days"]

In [21]:
# Scale the feature values (disabled; kept for reference only).
#scaler_X = MinMaxScaler()
#train_X[to_scale] = scaler_X.fit_transform(train_X[to_scale])
#test_X[to_scale] = scaler_X.transform(test_X[to_scale])

In [22]:
#scaler_y = MinMaxScaler()
#scaler_y.fit(train_y)
#train_y = scaler_y.fit_transform(train_y)
#test_y = scaler_y.transform(test_y)

In [23]:
# Random forest whose splits are chosen by mean absolute error, the same metric that is
# reported after prediction.
# A fixed random_state makes the bootstrap samples and feature draws — and therefore the
# reported score — reproducible across kernel restarts.
# NOTE: scikit-learn 1.0 renamed this criterion to 'absolute_error' and release 1.2
# removed the 'mae' spelling; switch the string when upgrading scikit-learn.
model = RandomForestRegressor(criterion='mae', random_state=42)

In [24]:
# Wrap the regressor so that the target is min-max scaled for fitting and
# predictions are mapped back to the original target scale.
wrapped_model = TransformedTargetRegressor(
    regressor=model,
    transformer=MinMaxScaler(),
)

# Fit on the training partition, then predict the held-out partition.
wrapped_model.fit(train_X, train_y)
yhat = wrapped_model.predict(test_X)

In [25]:
# Mean absolute error of the predictions on the test partition, in the units of
# total_cases_per_million.
metrics.mean_absolute_error(test_y, yhat)

207.24249086421474

In [26]:
# NOTE(review): undocumented back-of-the-envelope calculation. 207 looks like the rounded
# MAE from the previous cell; the meaning of 83 and 13600 is not stated anywhere in the
# notebook — TODO name these constants and explain what the resulting ratio represents.
(207*83)/13600

1.2633088235294119