In [51]:
import pandas as pd
import numpy as np 
import sklearn
import geopandas as gpd
from scipy.stats import norm
import time
import os

import xgboost

from sklearn.model_selection import train_test_split, TimeSeriesSplit, RandomizedSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error

import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_rows', 200)


In [5]:
# Setting output dictionary
output_dir = '/home/dante/SpatialData/spatial_project/data/output/'


'/home/dante/SpatialData/spatial_project/scripts'

In [11]:
# Loading final data
df = pd.read_csv('/home/dante/SpatialData/spatial_project/data/processed/master_data.csv').iloc[:,0:-1]

In [12]:
df.columns

Index(['province', 'Y-W', 'newcases', 'newcases_tminus1', 'newcases_tminus2',
       'newcases_tminus3', 'newcases_tminus4', 'newcases_tplus1',
       'newcases_tplus2', 'newcases_tplus4', 'spc_tminus1', 'spc_tminus2',
       'spc_tminus3', 'spc_tminus4', 'temp_min_tminus1', 'temp_min_tminus2',
       'temp_min_tminus3', 'temp_min_tminus4', 'temp_max_tminus1',
       'temp_max_tminus2', 'temp_max_tminus3', 'temp_max_tminus4',
       'retail_tminus1', 'retail_tminus2', 'retail_tminus3', 'retail_tminus4',
       'grocery_tminus1', 'grocery_tminus2', 'grocery_tminus3',
       'grocery_tminus4', 'parks_tminus1', 'parks_tminus2', 'parks_tminus3',
       'parks_tminus4', 'transit_tminus1', 'transit_tminus2',
       'transit_tminus3', 'transit_tminus4', 'workplaces_tminus1',
       'workplaces_tminus2', 'workplaces_tminus3', 'workplaces_tminus4',
       'residential_tminus1', 'residential_tminus2', 'residential_tminus3',
       'residential_tminus4', '2019employment_rate',
       '2019intermu

In [93]:
def choosehorizon(dataframe,pred : int,lags):
    """
    This function takes in the master dataframe and spits out the lags that you want to include,
    and then sorts the data so that it's easier to look at.
    """
    if pred not in [1,2,4]:
        print(f"Sorry, but the only available prediction horizons are 1,2 and 4 weeks ahead!")
        return None
    elif (max(lags) > 4) or (min(lags) < 1):
        if len(lags) == 0:
            print("You have specified zero lags! Nonsense!")
        else: print("Your lags are out of bounds...")
    
    lagcols = [col for col in list(dataframe.columns) if 'tminus' in col]
    
    lagschosen = []
    for lag in lags:
        lagschosen += [col for col in lagcols if str(lag) in col]
        
    lagschosen.sort()
    sociocols = [col for col in list(dataframe.columns) if '2019' in col]
    indices = ['Y-W','province']
    y = f"newcases_tplus{pred}"
    
    allcols = indices + [y] + lagschosen + sociocols
    
    # Given lags, take out data that contains NaNs due to data unavailability
    # First week when we have data is 2020-10, so add up weeks according to lags..
    max_lag = max(lags)
    first_week = f"2020-{10+max_lag}"
    
    # Given that we are predicting into the future, we need to take off as many weeks as we
    # are predicting into the future for, because otherwise we have NaNs..
    last_week = f"2022-{10-pred}"
    
    # We also need to take care of the fact that we only have Google Mobility data up to 2021-52
    last_week_google = f"2021-52"
    
    last_effective_week = [last_week,last_week_google]
    last_effective_week.sort()
    dataframe = dataframe[(dataframe['Y-W'] >= first_week) & (dataframe['Y-W'] <= last_effective_week[0])]
    dataframe.reset_index(drop=True,inplace=True)
    return dataframe[allcols],y

In [94]:
data,depvarname = choosehorizon(df,2,[1,2,3,4])

In [95]:
data.head()

Unnamed: 0,Y-W,province,newcases_tplus2,grocery_tminus1,grocery_tminus2,grocery_tminus3,grocery_tminus4,newcases_tminus1,newcases_tminus2,newcases_tminus3,...,transit_tminus4,workplaces_tminus1,workplaces_tminus2,workplaces_tminus3,workplaces_tminus4,2019employment_rate,2019intermunicipal_migration_rate,2019annual_contrib_margin,2019share_age_64,2019number_workplaces
0,2020-14,South Karelia,0.0,-19.571429,-13.142857,3.857143,4.0,0.0,0.0,0.0,...,-2.0,-39.571429,-33.428571,-7.857143,-0.714286,67.4,-488,387.3,27.4,47995
1,2020-14,Southern Ostrobothnia,0.0,-17.285714,-13.0,2.0,2.714286,0.0,0.0,0.0,...,-1.428571,-32.571429,-27.857143,-5.285714,-1.285714,73.8,-935,123.6,25.4,77419
2,2020-14,Southern Savonia,0.0,-15.142857,-9.0,6.142857,4.571429,0.561653,0.0,0.0,...,-4.285714,-35.571429,-31.285714,-6.285714,-1.285714,68.9,-1228,143.5,30.9,50145
3,2020-14,Kainuu,0.0,-17.428571,-7.285714,7.857143,11.428571,0.749679,0.0,0.0,...,-2.142857,-35.428571,-29.857143,-6.571429,-20.857143,68.3,-462,857.2,28.5,27218
4,2020-14,Tavastia Proper,0.0,-17.428571,-11.857143,2.285714,2.714286,0.0,0.0,0.25706,...,-3.857143,-36.285714,-31.714286,-6.571429,0.857143,73.1,-69,197.5,25.3,64199


## Training setup

In [96]:
training_size = 30 # week
testing_size = 1 # week
num_counties = len(data.province.value_counts().index)
time_steps = 5

## Model tracking

In [97]:
train_r2_xgb = dict()
train_rmse_xgb = dict()
train_mae_xgb = dict()
test_rmse_xgb = dict()
test_mae_xgb = dict()
tuned_params_xgb = dict()

## Model grid setup

In [98]:
# Setting Hyperparameters. Please refer to the SI for more information
xgb_params = dict(learning_rate=np.arange(0.05,0.3,0.05), 
                     n_estimators=np.arange(100,1000,100), 
                     gamma = np.arange(1,10,1),
                     subsample = np.arange(0.1,0.5,0.05),
                     max_depth=[int(i) for i in np.arange(1,10,1)]) 

## Model training

In [102]:
for i in range(time_steps):
    
    training_df = data.iloc[:(i+training_size)*num_counties,:]
    testing_df = data.iloc[(i+training_size)*num_counties:(i+training_size+testing_size)*num_counties,:]
    
#     start_time = time.time()

    # in the 2-week prediction model, the target variable is LOG_DELTA_INC_RATE_T_14
    X_train = training_df.iloc[:,3:]
    y_train = training_df[depvarname]
    X_test = testing_df.iloc[:,3:]
    y_test = testing_df[depvarname]
    
    print(X_train.shape)
    print(y_train.shape)
    
    print(X_test.shape)
    print(y_test.shape)
    
    #scaling X
    scaler = MinMaxScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    
    
    #inititalization
    xgb_model = xgboost.XGBRegressor(seed=42, verbosity=3)
    
    #cross validation
    xgb_cv = RandomizedSearchCV(xgb_model, xgb_params, random_state=21, 
                                    scoring='neg_root_mean_squared_error', n_jobs=-1)
    
    xgb_optimized = xgb_cv.fit(X_train, y_train)
    best_xgb = xgb_optimized.best_estimator_
    tuned_params_xgb['whole', i] = xgb_optimized.best_params_
    
    # model evaluation for training set
    r2_train_xgb = round(best_xgb.score(X_train, y_train),2)
    train_r2_xgb['whole', i] = r2_train_xgb
    
#     y_train_predicted_xgb = best_xgb.predict(X_train)
#     rmse_train_xgb = (np.sqrt(mean_squared_error(y_train, y_train_predicted_xgb)))
#     train_rmse_xgb_14[model, i] = rmse_train_xgb
#     train_mae_xgb_14[model, i] =  mean_absolute_error(y_train, y_train_predicted_xgb)

    print(f"Finished {i}th round...")

(540, 45)
(540,)
(18, 45)
(18,)
[19:18:45] DEBUG: ../src/gbm/gbtree.cc:155: Using tree method: 2
[19:18:46] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 6 extra nodes, 0 pruned nodes, max_depth=2
[19:18:47] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 4 extra nodes, 2 pruned nodes, max_depth=2
[19:18:47] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 2 extra nodes, 4 pruned nodes, max_depth=1
[19:18:48] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 4 extra nodes, 2 pruned nodes, max_depth=2
[19:18:49] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 2 extra nodes, 4 pruned nodes, max_depth=1
[19:18:50] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 4 extra nodes, 2 pruned nodes, max_depth=2
[19:18:51] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 4 extra nodes, 2 pruned nodes, max_depth=2
[19:18:51] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 2 extra nodes, 4 pruned nodes, max_depth=1
[19:18:

[19:18:45] DEBUG: ../src/gbm/gbtree.cc:155: Using tree method: 2
[19:18:46] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 4 extra nodes, 2 pruned nodes, max_depth=2
[19:18:47] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 4 extra nodes, 2 pruned nodes, max_depth=2
[19:18:48] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 2 extra nodes, 4 pruned nodes, max_depth=1
[19:18:49] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 4 extra nodes, 2 pruned nodes, max_depth=2
[19:18:51] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 4 extra nodes, 2 pruned nodes, max_depth=2
[19:18:52] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 4 extra nodes, 2 pruned nodes, max_depth=2
[19:18:53] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 2 extra nodes, 4 pruned nodes, max_depth=1
[19:18:54] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 2 extra nodes, 4 pruned nodes, max_depth=1
[19:18:55] INFO: ../src/tree/updater_pr

[19:19:49] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 0 extra nodes, 6 pruned nodes, max_depth=0
[19:19:50] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 0 extra nodes, 6 pruned nodes, max_depth=0
[19:19:50] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 0 extra nodes, 6 pruned nodes, max_depth=0
[19:19:51] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 0 extra nodes, 6 pruned nodes, max_depth=0
[19:19:52] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 0 extra nodes, 6 pruned nodes, max_depth=0
[19:19:53] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 0 extra nodes, 6 pruned nodes, max_depth=0
[19:19:54] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 0 extra nodes, 6 pruned nodes, max_depth=0
[19:19:55] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 0 extra nodes, 6 pruned nodes, max_depth=0
[19:19:56] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 0 extra nodes, 6 pruned nodes, max_

[19:18:45] DEBUG: ../src/gbm/gbtree.cc:155: Using tree method: 2
[19:18:46] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 0 extra nodes, 4 pruned nodes, max_depth=0
[19:18:47] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 0 extra nodes, 8 pruned nodes, max_depth=0
[19:18:49] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 0 extra nodes, 20 pruned nodes, max_depth=0
[19:18:51] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 0 extra nodes, 16 pruned nodes, max_depth=0
[19:18:53] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 0 extra nodes, 20 pruned nodes, max_depth=0
[19:18:55] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 0 extra nodes, 24 pruned nodes, max_depth=0
[19:18:57] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 0 extra nodes, 22 pruned nodes, max_depth=0
[19:18:59] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 0 extra nodes, 14 pruned nodes, max_depth=0
[19:19:01] INFO: ../src/tree/upda

[19:18:45] DEBUG: ../src/gbm/gbtree.cc:155: Using tree method: 2
[19:18:47] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 4 extra nodes, 16 pruned nodes, max_depth=2
[19:18:49] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 0 extra nodes, 24 pruned nodes, max_depth=0
[19:18:52] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 0 extra nodes, 30 pruned nodes, max_depth=0
[19:18:54] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 0 extra nodes, 22 pruned nodes, max_depth=0
[19:18:56] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 0 extra nodes, 24 pruned nodes, max_depth=0
[19:18:59] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 2 extra nodes, 34 pruned nodes, max_depth=1
[19:19:01] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 0 extra nodes, 30 pruned nodes, max_depth=0
[19:19:04] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 0 extra nodes, 18 pruned nodes, max_depth=0
[19:19:06] INFO: ../src/tree/up

[19:18:45] DEBUG: ../src/gbm/gbtree.cc:155: Using tree method: 2
[19:18:47] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 2 extra nodes, 22 pruned nodes, max_depth=1
[19:18:50] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 0 extra nodes, 24 pruned nodes, max_depth=0
[19:18:52] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 0 extra nodes, 32 pruned nodes, max_depth=0
[19:18:55] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 0 extra nodes, 30 pruned nodes, max_depth=0
[19:18:58] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 0 extra nodes, 28 pruned nodes, max_depth=0
[19:19:00] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 0 extra nodes, 36 pruned nodes, max_depth=0
[19:19:03] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 0 extra nodes, 30 pruned nodes, max_depth=0
[19:19:05] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 0 extra nodes, 20 pruned nodes, max_depth=0
[19:19:08] INFO: ../src/tree/up

[19:20:54] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 0 extra nodes, 6 pruned nodes, max_depth=0
[19:20:54] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 0 extra nodes, 6 pruned nodes, max_depth=0
[19:20:55] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 0 extra nodes, 6 pruned nodes, max_depth=0
[19:20:56] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 0 extra nodes, 6 pruned nodes, max_depth=0
[19:20:57] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 0 extra nodes, 6 pruned nodes, max_depth=0
[19:20:57] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 0 extra nodes, 6 pruned nodes, max_depth=0
[19:20:58] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 0 extra nodes, 6 pruned nodes, max_depth=0
[19:20:59] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 0 extra nodes, 6 pruned nodes, max_depth=0
[19:21:00] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 0 extra nodes, 6 pruned nodes, max_

[19:21:54] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 0 extra nodes, 6 pruned nodes, max_depth=0
[19:21:55] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 0 extra nodes, 6 pruned nodes, max_depth=0
[19:21:56] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 0 extra nodes, 6 pruned nodes, max_depth=0
[19:21:57] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 0 extra nodes, 6 pruned nodes, max_depth=0
[19:21:58] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 0 extra nodes, 6 pruned nodes, max_depth=0
[19:21:59] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 0 extra nodes, 6 pruned nodes, max_depth=0
[19:22:00] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 0 extra nodes, 6 pruned nodes, max_depth=0
[19:22:01] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 0 extra nodes, 4 pruned nodes, max_depth=0
[19:22:01] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 0 extra nodes, 6 pruned nodes, max_

[19:22:02] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 0 extra nodes, 6 pruned nodes, max_depth=0
[19:22:03] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 0 extra nodes, 6 pruned nodes, max_depth=0
[19:22:04] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 0 extra nodes, 6 pruned nodes, max_depth=0
[19:22:05] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 0 extra nodes, 6 pruned nodes, max_depth=0
[19:22:06] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 0 extra nodes, 6 pruned nodes, max_depth=0
[19:22:07] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 0 extra nodes, 6 pruned nodes, max_depth=0
[19:22:08] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 0 extra nodes, 6 pruned nodes, max_depth=0
[19:22:09] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 0 extra nodes, 6 pruned nodes, max_depth=0
[19:22:09] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 0 extra nodes, 6 pruned nodes, max_

Exception ignored on calling ctypes callback function: <function _log_callback at 0x7fe214cd60d0>
Traceback (most recent call last):
  File "/home/dante/.local/lib/python3.8/site-packages/xgboost/core.py", line 143, in _log_callback
    def _log_callback(msg: bytes) -> None:
KeyboardInterrupt: 
Exception ignored on calling ctypes callback function: <function _log_callback at 0x7faec4f8d0d0>
Traceback (most recent call last):
  File "/home/dante/.local/lib/python3.8/site-packages/xgboost/core.py", line 143, in _log_callback
    def _log_callback(msg: bytes) -> None:
KeyboardInterrupt: 


KeyboardInterrupt: 