In [51]:
import pandas as pd
import numpy as np 
import sklearn
import geopandas as gpd
from scipy.stats import norm
import time
import os

import xgboost

from sklearn.model_selection import train_test_split, TimeSeriesSplit, RandomizedSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error

import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation and convergence warnings from
# pandas / sklearn / xgboost -- consider narrowing the filter.
warnings.filterwarnings('ignore')
# Let pandas print up to 200 rows before truncating a displayed frame.
pd.set_option('display.max_rows', 200)


In [5]:
# Setting output directory
# NOTE(review): absolute, machine-specific path -- the notebook will not run
# elsewhere without editing this line.
output_dir = '/home/dante/SpatialData/spatial_project/data/output/'


'/home/dante/SpatialData/spatial_project/scripts'

In [11]:
# Read the processed master table and keep every column except the last one.
# NOTE(review): the reason for dropping the final column is not visible here --
# confirm against the data-preparation step.
df = pd.read_csv(
    '/home/dante/SpatialData/spatial_project/data/processed/master_data.csv'
).iloc[:, :-1]

In [12]:
# Display the column labels of the loaded master table.
df.columns

Index(['province', 'Y-W', 'newcases', 'newcases_tminus1', 'newcases_tminus2',
       'newcases_tminus3', 'newcases_tminus4', 'newcases_tplus1',
       'newcases_tplus2', 'newcases_tplus4', 'spc_tminus1', 'spc_tminus2',
       'spc_tminus3', 'spc_tminus4', 'temp_min_tminus1', 'temp_min_tminus2',
       'temp_min_tminus3', 'temp_min_tminus4', 'temp_max_tminus1',
       'temp_max_tminus2', 'temp_max_tminus3', 'temp_max_tminus4',
       'retail_tminus1', 'retail_tminus2', 'retail_tminus3', 'retail_tminus4',
       'grocery_tminus1', 'grocery_tminus2', 'grocery_tminus3',
       'grocery_tminus4', 'parks_tminus1', 'parks_tminus2', 'parks_tminus3',
       'parks_tminus4', 'transit_tminus1', 'transit_tminus2',
       'transit_tminus3', 'transit_tminus4', 'workplaces_tminus1',
       'workplaces_tminus2', 'workplaces_tminus3', 'workplaces_tminus4',
       'residential_tminus1', 'residential_tminus2', 'residential_tminus3',
       'residential_tminus4', '2019employment_rate',
       '2019intermu

In [68]:
def choosehorizon(dataframe,pred : int,lags):
    """
    Build the modelling table for one prediction horizon and a set of lags.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Master table. It must contain the 'Y-W' and 'province' columns, the
        target column ``newcases_tplus{pred}`` and the lagged columns, whose
        names end in ``tminus{lag}``. Columns whose name contains '2019' are
        carried along unchanged. 'Y-W' is compared as a string, so it is
        assumed to hold zero-padded 'YYYY-WW' labels -- TODO confirm against
        the data-preparation step.
    pred : int
        Prediction horizon in weeks ahead; only 1, 2 and 4 are available.
    lags : iterable of int
        Lags (in weeks) to include; each must lie between 1 and 4.
        Repeated values are ignored.

    Returns
    -------
    pandas.DataFrame or None
        The rows between the first and the last usable week, re-indexed from 0,
        with columns ordered as
        ``['Y-W', 'province', target, <sorted lag columns>, <'2019' columns>]``.
        ``None`` (after printing a message) when ``pred`` or ``lags`` is invalid.
    """
    if pred not in [1,2,4]:
        print(f"Sorry, but the only available prediction horizons are 1,2 and 4 weeks ahead!")
        return None

    lags = list(lags)
    # Test for emptiness first: max()/min() raise ValueError on an empty sequence.
    if len(lags) == 0:
        print("You have specified zero lags! Nonsense!")
        return None
    if (max(lags) > 4) or (min(lags) < 1):
        print("Your lags are out of bounds...")
        return None

    # Match on the exact 'tminus<lag>' suffix rather than on any occurrence of
    # the digit in the name; building the suffixes from a set drops repeated lags.
    lag_suffixes = tuple(f"tminus{lag}" for lag in set(lags))
    lagschosen = sorted(col for col in dataframe.columns if col.endswith(lag_suffixes))

    sociocols = [col for col in dataframe.columns if '2019' in col]
    indices = ['Y-W','province']
    y = f"newcases_tplus{pred}"

    allcols = indices + [y] + lagschosen + sociocols

    # Given lags, take out data that contains NaNs due to data unavailability.
    # First week when we have data is 2020-10, so add up weeks according to lags.
    first_week = f"2020-{10+max(lags)}"

    # Predicting `pred` weeks ahead leaves the last `pred` weeks without a target,
    # so they are cut off. The week number is zero-padded so that the label
    # compares correctly, as a string, with the other week labels.
    last_week = f"2022-{10-pred:02d}"

    # Google Mobility data is only available up to 2021-52.
    last_week_google = "2021-52"

    # The earlier of the two limits is the binding one.
    last_effective_week = min(last_week, last_week_google)

    in_window = (dataframe['Y-W'] >= first_week) & (dataframe['Y-W'] <= last_effective_week)
    # .loc returns a new frame, so the caller's dataframe is left untouched.
    return dataframe.loc[in_window, allcols].reset_index(drop=True)

In [74]:
# Two-weeks-ahead target, using all four available lags.
data = choosehorizon(df, pred=2, lags=[1, 2, 3, 4])

In [76]:
# Display (rows, columns) of the table returned by choosehorizon.
data.shape

(1656, 48)

## Training setup

In [72]:
training_size = 50  # weeks in the initial training window
testing_size = 1  # weeks in each test window
num_counties = data["province"].nunique()  # distinct provinces in the table
time_steps = 14  # iterations of the training loop

## Model tracking

In [None]:
# Empty containers for the XGBoost run: training scores, test errors and the
# tuned hyperparameters (presumably filled per time step later in the notebook).
train_r2_xgb, train_rmse_xgb, train_mae_xgb = {}, {}, {}
test_rmse_xgb, test_mae_xgb = {}, {}
tuned_params_xgb = {}

## Model grid setup

In [None]:
# Hyperparameter search space for XGBoost. Please refer to the SI for more information.
xgb_params = {
    "learning_rate": np.arange(0.05, 0.3, 0.05),
    "n_estimators": np.arange(100, 1000, 100),
    "gamma": np.arange(1, 10, 1),
    "subsample": np.arange(0.1, 0.5, 0.05),
    # Plain Python ints (not numpy integers) for the tree depth.
    "max_depth": list(range(1, 10)),
}

## Model training

In [None]:
for i in range(time_steps):
    
    training_df = data.iloc[:(i+training_size)*num_counties,:]
    testing_df = data.iloc[(i+training_size)*num_counties:(i+training_size+testing_size)*num_counties,:]
    
    