In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
import os
import urllib.request
import pickle

import pandas as pd
mpl.rcParams['figure.dpi']= 150


pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

In [3]:
import numpy as np 
import matplotlib
import seaborn as sns
import pandas as pd

from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor

plt.style.use('ggplot')

import ipywidgets as widgets
from ipywidgets import interact, fixed

In [5]:
HYPOTHETICAL_SUBMISSION_DATE = np.datetime64("2020-09-30")

In [6]:
# Main source for the training data
DATA_URL = 'https://raw.githubusercontent.com/OxCGRT/covid-policy-tracker/master/data/OxCGRT_latest.csv'
# Local file
DATA_FILE = 'data/OxCGRT_latest.csv'

def download_historical(url=DATA_URL, file=DATA_FILE):
    """Download data located at 'url' and save them at location 'file'
    >>> from transat.data.load import download_historical
    >>> download_historical()
    """
    if not os.path.exists('data'):
        os.mkdir('data')
    urllib.request.urlretrieve(url, file)

def load_historical(file=DATA_FILE):
    """Load historical data located at 'file'
    >>> from transat.data.load import load_historical
    >>> df = load_historical()
    """
    # Load historical data from local file
    df = pd.read_csv(file,
        parse_dates=['Date'],
        encoding="ISO-8859-1",
        dtype={"RegionName": str,
               "RegionCode": str},
        error_bad_lines=False)
    return df

In [7]:
download_historical()
df = load_historical()

In [9]:
df.rename(columns=lambda x: x.replace(' ','_').replace('/','_'), inplace=True)

In [10]:
#Assignate the correct type
df.Date = pd.to_datetime(df['Date'], format='%Y%m%d')
df = df.astype({'C1_School_closing':'category', 'C2_Workplace_closing':'category', 
           'C3_Cancel_public_events':'category','C4_Restrictions_on_gatherings':'category',
           'C5_Close_public_transport':'category', 'C6_Stay_at_home_requirements':'category', 
           'C7_Restrictions_on_internal_movement':'category', 'C8_International_travel_controls':'category', 
           'E1_Income_support':'category', 'E2_Debt_contract_relief':'category', 
           'H1_Public_information_campaigns':'category', 'H2_Testing_policy':'category', 
           'H3_Contact_tracing':'category',  'CountryCode':'category',
           'H6_Facial_Coverings':'category', 'H7_Vaccination_policy':'category', 'M1_Wildcard':'category'})

"df = df.astype({'C1_School_closing':'category', 'C2_Workplace_closing':'category', \n           'C3_Cancel_public_events':'category','C4_Restrictions_on_gatherings':'category',\n           'C5_Close_public_transport':'category', 'C6_Stay_at_home_requirements':'category', \n           'C7_Restrictions_on_internal_movement':'category', 'C8_International_travel_controls':'category', \n           'E1_Income_support':'category', 'E2_Debt_contract_relief':'category', \n           'H1_Public_information_campaigns':'category', 'H2_Testing_policy':'category', \n           'H3_Contact_tracing':'category',  'CountryCode':'category',\n           'H6_Facial_Coverings':'category', 'H7_Vaccination_policy':'category', 'M1_Wildcard':'category'})"

In [11]:
#Jurisdiction, M1 and all the indexes can be dropped
df.drop(['Jurisdiction','M1_Wildcard', 'StringencyIndex', 'StringencyIndexForDisplay', 'StringencyLegacyIndex', 
        'StringencyLegacyIndexForDisplay', 'GovernmentResponseIndex', 'GovernmentResponseIndexForDisplay',
        'ContainmentHealthIndex', 'ContainmentHealthIndexForDisplay', 'EconomicSupportIndex', 
        'EconomicSupportIndexForDisplay'], axis=1, inplace=True)

In [12]:
#Create feature for CountryName and Region
df['CountryRegion'] = df.RegionName
df.CountryRegion = df.CountryRegion.fillna(df.CountryName)

In [13]:
#Delete USA, UK only country data
indexes = list(df[((df.CountryRegion=='United States') | (df.CountryRegion=='United Kingdom'))].index)

#Delete Brazil and Canada regions
indexes.extend(df[((df.CountryName=='Brazil') & (df.RegionName.isna()==False))].index)
indexes.extend(df[((df.CountryName=='Canada') & (df.RegionName.isna()==False))].index)

#Delete Turkmenistan, Tonga and Malta
indexes.extend(df[((df.CountryName=='Tonga') | (df.CountryName=='Turkmenistan')) | (df.CountryName=='Malta')].index)

df.drop(labels = indexes, axis=0, inplace=True)

In [14]:
#Create new variables for day, month and year
df['Month'] = df.Date.apply(lambda x: x.month)
df['Year'] = df.Date.apply(lambda x: x.year)
df['Day'] = df.Date.apply(lambda x: x.day)

In [15]:
#Initialized values to 0 in categorical variables to then fill na with the previous value
df.loc[(df['Date']=='2020-01-01'), 'C1_School_closing'] = 0
df.loc[(df['Date']=='2020-01-01'), 'C2_Workplace_closing'] = 0
df.loc[(df['Date']=='2020-01-01'), 'C3_Cancel_public_events'] = 0
df.loc[(df['Date']=='2020-01-01'), 'C4_Restrictions_on_gatherings'] = 0
df.loc[(df['Date']=='2020-01-01'), 'C5_Close_public_transport'] = 0
df.loc[(df['Date']=='2020-01-01'), 'C6_Stay_at_home_requirements'] = 0
df.loc[(df['Date']=='2020-01-01'), 'C7_Restrictions_on_internal_movement'] = 0
df.loc[(df['Date']=='2020-01-01'), 'C8_International_travel_controls'] = 0
df.loc[(df['Date']=='2020-01-01'), 'E1_Income_support'] = 0
df.loc[(df['Date']=='2020-01-01'), 'E2_Debt_contract_relief'] = 0
df.loc[(df['Date']=='2020-01-01'), 'H1_Public_information_campaigns'] = 0
df.loc[(df['Date']=='2020-01-01'), 'H2_Testing_policy'] = 0
df.loc[(df['Date']=='2020-01-01'), 'H3_Contact_tracing'] = 0
df.loc[(df['Date']=='2020-01-01'), 'H6_Facial_Coverings'] = 0
df.loc[(df['Date']=='2020-01-01'), 'H7_Vaccination_policy'] = 0

In [16]:
#Fill NaN values by 0
df.C1_School_closing.fillna(method='ffill', inplace=True)
df.C2_Workplace_closing.fillna(method='ffill', inplace=True)
df.C3_Cancel_public_events.fillna(method='ffill', inplace=True)
df.C4_Restrictions_on_gatherings.fillna(method='ffill', inplace=True)
df.C5_Close_public_transport.fillna(method='ffill', inplace=True)
df.C6_Stay_at_home_requirements.fillna(method='ffill', inplace=True)
df.C7_Restrictions_on_internal_movement.fillna(method='ffill', inplace=True)
df.C8_International_travel_controls.fillna(method='ffill', inplace=True)
df.E1_Income_support.fillna(method='ffill', inplace=True)
df.E2_Debt_contract_relief.fillna(method='ffill', inplace=True)
df.E3_Fiscal_measures.fillna(0, inplace=True)
df.E4_International_support.fillna(0, inplace=True)
df.H1_Public_information_campaigns.fillna(method='ffill', inplace=True)
df.H2_Testing_policy.fillna(method='ffill', inplace=True)
df.H3_Contact_tracing.fillna(method='ffill', inplace=True)
df.H4_Emergency_investment_in_healthcare.fillna(0, inplace=True)
df.H5_Investment_in_vaccines.fillna(0, inplace=True)
df.H6_Facial_Coverings.fillna(method='ffill', inplace=True)
df.H7_Vaccination_policy.fillna(method='ffill', inplace=True)

In [17]:
flags = ['C1_Flag', 'C2_Flag', 'C3_Flag', 'C4_Flag', 'C5_Flag', 'C6_Flag', 'C7_Flag', 'E1_Flag', 'H1_Flag', 'H6_Flag', 'H7_Flag']
for col in flags:
    df[col].fillna(2, inplace=True)
    df[col] = df[col].apply(lambda x : 1 if x==1 else (0.5 if x==0 else 0))

In [18]:
#Confirmed cases fill na with previous values
df.loc[(df['Date']=='2020-01-01'), 'ConfirmedCases'] = 0
df.ConfirmedCases.fillna(method='ffill', inplace=True)

In [19]:
#Confirmed deaths fill na with previous values
df.loc[(df['Date']=='2020-01-01'), 'ConfirmedDeaths'] = 0
df.ConfirmedDeaths.fillna(method='ffill', inplace=True)

In [20]:
df.columns

Index(['CountryName', 'CountryCode', 'RegionName', 'RegionCode', 'Date', 'C1_School_closing', 'C1_Flag', 'C2_Workplace_closing', 'C2_Flag', 'C3_Cancel_public_events', 'C3_Flag', 'C4_Restrictions_on_gatherings', 'C4_Flag', 'C5_Close_public_transport', 'C5_Flag', 'C6_Stay_at_home_requirements', 'C6_Flag', 'C7_Restrictions_on_internal_movement', 'C7_Flag', 'C8_International_travel_controls', 'E1_Income_support', 'E1_Flag', 'E2_Debt_contract_relief', 'E3_Fiscal_measures', 'E4_International_support', 'H1_Public_information_campaigns', 'H1_Flag', 'H2_Testing_policy', 'H3_Contact_tracing', 'H4_Emergency_investment_in_healthcare', 'H5_Investment_in_vaccines', 'H6_Facial_Coverings', 'H6_Flag', 'H7_Vaccination_policy', 'H7_Flag', 'ConfirmedCases', 'ConfirmedDeaths', 'CountryRegion', 'Month', 'Year', 'Day'], dtype='object')

#Create Cateorical variable to countries. from 1 to 182.
df['CountryCodeCat'] = df.CountryCode.cat.codes

In [23]:
#Number of nan
for col in df.columns:
    print(col, df[col].isnull().sum())

CountryName 0
CountryCode 0
RegionName 64977
RegionCode 64977
Date 0
C1_School_closing 0
C1_Flag 0
C2_Workplace_closing 0
C2_Flag 0
C3_Cancel_public_events 0
C3_Flag 0
C4_Restrictions_on_gatherings 0
C4_Flag 0
C5_Close_public_transport 0
C5_Flag 0
C6_Stay_at_home_requirements 0
C6_Flag 0
C7_Restrictions_on_internal_movement 0
C7_Flag 0
C8_International_travel_controls 0
E1_Income_support 0
E1_Flag 0
E2_Debt_contract_relief 0
E3_Fiscal_measures 0
E4_International_support 0
H1_Public_information_campaigns 0
H1_Flag 0
H2_Testing_policy 0
H3_Contact_tracing 0
H4_Emergency_investment_in_healthcare 0
H5_Investment_in_vaccines 0
H6_Facial_Coverings 0
H6_Flag 0
H7_Vaccination_policy 0
H7_Flag 0
ConfirmedCases 0
ConfirmedDeaths 0
CountryRegion 0
Month 0
Year 0
Day 0


# Model

In [24]:
def split_historical(df, split_date):
    df_train = df[df.Date <= split_date]
    df_test = df[df.Date > split_date]
    return df_train, df_test

In [25]:
print("Spliting at : ", HYPOTHETICAL_SUBMISSION_DATE)
df_train, df_test = split_historical(df, HYPOTHETICAL_SUBMISSION_DATE)

Spliting at :  2020-09-30


In [26]:
df_train.columns

Index(['CountryName', 'CountryCode', 'RegionName', 'RegionCode', 'Date', 'C1_School_closing', 'C1_Flag', 'C2_Workplace_closing', 'C2_Flag', 'C3_Cancel_public_events', 'C3_Flag', 'C4_Restrictions_on_gatherings', 'C4_Flag', 'C5_Close_public_transport', 'C5_Flag', 'C6_Stay_at_home_requirements', 'C6_Flag', 'C7_Restrictions_on_internal_movement', 'C7_Flag', 'C8_International_travel_controls', 'E1_Income_support', 'E1_Flag', 'E2_Debt_contract_relief', 'E3_Fiscal_measures', 'E4_International_support', 'H1_Public_information_campaigns', 'H1_Flag', 'H2_Testing_policy', 'H3_Contact_tracing', 'H4_Emergency_investment_in_healthcare', 'H5_Investment_in_vaccines', 'H6_Facial_Coverings', 'H6_Flag', 'H7_Vaccination_policy', 'H7_Flag', 'ConfirmedCases', 'ConfirmedDeaths', 'CountryRegion', 'Month', 'Year', 'Day'], dtype='object')

'ConfirmedCases', 'Date', 'CountryName', 'CountryCode', 'RegionName', 'RegionCode',
                         'CountryRegion','C1_School_closing', 'C2_Workplace_closing', 'C3_Cancel_public_events', 
                         'C4_Restrictions_on_gatherings', 'C5_Close_public_transport', 
                         'C6_Stay_at_home_requirements', 'C7_Restrictions_on_internal_movement', 
                         'C8_International_travel_controls', 'E1_Income_support', 'E2_Debt_contract_relief',
                         'E3_Fiscal_measures', 'E4_International_support', 'H1_Public_information_campaigns',
                         'H2_Testing_policy', 'H3_Contact_tracing', 'H4_Emergency_investment_in_healthcare', 
                         'H5_Investment_in_vaccines', 'H6_Facial_Coverings', 'H7_Vaccination_policy', 
                         'ConfirmedDeaths'

In [27]:
X_train = df_train.drop(['ConfirmedCases', 'Date', 'CountryName', 'CountryCode', 'RegionName', 'RegionCode',
                         'CountryRegion', 'ConfirmedDeaths'], axis=1)
y_train = df_train.ConfirmedCases 
X_test = df_test.drop(['ConfirmedCases', 'Date', 'CountryName', 'CountryCode', 'RegionName', 'RegionCode',
                         'CountryRegion', 'ConfirmedDeaths'], axis=1)
y_test = df_test.ConfirmedCases 

ada = AdaBoostRegressor(DecisionTreeRegressor(max_depth=6), n_estimators=500)
ada.fit(X_train, y_train)

AdaBoostRegressor(base_estimator=DecisionTreeRegressor(max_depth=6),
                  n_estimators=500)

with open('ada_boost.pkl', 'wb') as model_file:
            pickle.dump(ada, model_file)

with open('ada_boost.pkl', 'rb') as model_file:
            ada_pickle = pickle.load(model_file)

In [28]:
# Predict
y_ada = ada.predict(X_test)

In [29]:
def mae(pred, true):
        """
            Compute Mean Average Error between predictions and groundtruth
        """
        return np.mean(np.abs(pred - true))

In [30]:
mae(y_ada, y_test)

189450.5140816878

In [36]:
mae(y_ada, y_test)

177242.67775544536

In [28]:
mae(y_ada, y_test)

254792.04155696748