In [1]:
# Import necessary tools
import pandas as pd
import numpy as np
import sklearn
from scipy.stats import norm
import time
import os
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error
import warnings
warnings.filterwarnings('ignore')
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import AdaBoostRegressor

In [2]:
# Load the data into a pandas dataframe. You may have to change the file path to fit your computer
dataDF = pd.read_csv('/content/all_features_v1_0.csv')

In [3]:
dataDF.head()

Unnamed: 0.1,Unnamed: 0,GEOID,NAME,State_Name,STATEFP,COUNTYFP,date_end_period,date_start_period,date_end_lag,date_start_lag,...,pct_delivery_behavior_devices_slope_T_4,distance_traveled_from_home_slope_T_4,median_home_dwell_time_slope_T_4,target_date_2wk,LOG_DELTA_INC_RATE_T_14,target_date_3wk,LOG_DELTA_INC_RATE_T_21,target_date_4wk,LOG_DELTA_INC_RATE_T_28,PCT_65_OVE
0,133429,1001,Autauga,Alabama,1,1,2020-04-04,2020-03-29,2020-03-28,2020-03-01,...,-0.161042,-0.521814,0.835106,2020-04-11,1.051994,2020-04-18,1.461548,2020-04-25,1.782225,14.583333
1,133430,1003,Baldwin,Alabama,1,3,2020-04-04,2020-03-29,2020-03-28,2020-03-01,...,0.32433,0.938332,0.376323,2020-04-11,1.157806,2020-04-18,1.643584,2020-04-25,1.951729,19.540429
2,133431,1005,Barbour,Alabama,1,5,2020-04-04,2020-03-29,2020-03-28,2020-03-01,...,-0.520546,-0.987679,0.917133,2020-04-11,1.202133,2020-04-18,1.798781,2020-04-25,2.477857,17.97378
3,133432,1007,Bibb,Alabama,1,7,2020-04-04,2020-03-29,2020-03-28,2020-03-01,...,-0.872921,-0.980646,0.951871,2020-04-11,1.569648,2020-04-18,2.264319,2020-04-25,2.665892,16.251609
4,133433,1009,Blount,Alabama,1,9,2020-04-04,2020-03-29,2020-03-28,2020-03-01,...,0.604771,-0.977453,0.983202,2020-04-11,1.015063,2020-04-18,1.360854,2020-04-25,1.771914,17.751756


In [4]:
dataDF.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Unnamed: 0,136532.0,68265.500000,39413.537814,0.000000,34132.750000,68265.500000,102398.250000,136531.000000
GEOID,136532.0,30666.717370,14991.885999,1001.000000,19041.000000,29209.000000,46011.000000,56045.000000
STATEFP,136532.0,30.563326,14.972619,1.000000,19.000000,29.000000,46.000000,56.000000
COUNTYFP,136532.0,103.391557,107.885166,1.000000,35.000000,79.000000,133.000000,840.000000
LOG_DELTA_INC_RATE_T,136532.0,2.091471,1.293111,0.000000,1.000221,2.150752,3.174741,6.695893
...,...,...,...,...,...,...,...,...
median_home_dwell_time_slope_T_4,136532.0,0.049920,0.779986,-0.999803,-0.821886,0.178972,0.861735,0.999741
LOG_DELTA_INC_RATE_T_14,136532.0,2.758683,1.446826,0.000000,1.662610,2.926603,3.961312,7.230831
LOG_DELTA_INC_RATE_T_21,136532.0,3.182626,1.496465,0.000000,2.127386,3.407263,4.406482,7.365045
LOG_DELTA_INC_RATE_T_28,136532.0,3.502590,1.509591,0.000000,2.495811,3.759050,4.719045,7.568401


In [5]:
dataDF.isnull().sum().sum()

0

In [6]:
# Replace infinite values with null values. Replacing infinite values from here to next markdown was inspired by code out of Spatiotemporal prediction of COVID-19 cases using inter- and intra-county proxies of humans
# interactions paper.
dataDF = dataDF.replace([np.inf,-np.inf], np.NaN)

In [7]:
dataDF.isna().sum().sum()

176

In [8]:
for col in dataDF.columns[dataDF.isna().any()].tolist():
    dataDF[col] = dataDF.groupby(['date_start_period','STATEFP'])[col].transform(lambda x: x.fillna(x.mean()))

In [9]:
dataDF.isna().sum().sum()

0

We will now move on to splitting up the data for the 4 week prediction differences

In [10]:
# Drop non numerical data from our input features
X = dataDF.drop(columns=["NAME","State_Name","STATEFP","COUNTYFP","GEOID","Unnamed: 0","target_date_4wk",
                         "target_date_3wk","target_date_2wk","date_end_period","date_start_period","date_end_lag","date_start_lag"])

In [11]:
y = dataDF[['LOG_DELTA_INC_RATE_T','LOG_DELTA_INC_RATE_T_14','LOG_DELTA_INC_RATE_T_21','LOG_DELTA_INC_RATE_T_28']]

In [12]:
y.head()

Unnamed: 0,LOG_DELTA_INC_RATE_T,LOG_DELTA_INC_RATE_T_14,LOG_DELTA_INC_RATE_T_21,LOG_DELTA_INC_RATE_T_28
0,0.684831,1.051994,1.461548,1.782225
1,0.608396,1.157806,1.643584,1.951729
2,0.244551,1.202133,1.798781,2.477857
3,0.73144,1.569648,2.264319,2.665892
4,0.611175,1.015063,1.360854,1.771914


In [13]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=24)

In [14]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(109225, 143)
(27307, 143)
(109225, 4)
(27307, 4)


In [15]:
# Make seperate train and tests where the target variables get dropped. For example if I am doing one week prediction I don't want to have as
# inputs week 1, week 2, week 3 or week 4 data. But if I am doing week 3 predictions I do want to be able to see as input features week 1 and week 2 but not
# weeks 3 and 4. This pattern is the same for week 2 and 4
X_train_wk1Pred = X_train.drop(columns=['LOG_DELTA_INC_RATE_T','LOG_DELTA_INC_RATE_T_14','LOG_DELTA_INC_RATE_T_21','LOG_DELTA_INC_RATE_T_28'])
X_test_wk1Pred = X_test.drop(columns=['LOG_DELTA_INC_RATE_T','LOG_DELTA_INC_RATE_T_14','LOG_DELTA_INC_RATE_T_21','LOG_DELTA_INC_RATE_T_28'])
X_train_wk2Pred = X_train.drop(columns=['LOG_DELTA_INC_RATE_T_14','LOG_DELTA_INC_RATE_T_21','LOG_DELTA_INC_RATE_T_28'])
X_test_wk2Pred = X_test.drop(columns=['LOG_DELTA_INC_RATE_T_14','LOG_DELTA_INC_RATE_T_21','LOG_DELTA_INC_RATE_T_28'])
X_train_wk3Pred = X_train.drop(columns=['LOG_DELTA_INC_RATE_T_21','LOG_DELTA_INC_RATE_T_28'])
X_test_wk3Pred = X_test.drop(columns=['LOG_DELTA_INC_RATE_T_21','LOG_DELTA_INC_RATE_T_28'])
X_train_wk4Pred = X_train.drop(columns=['LOG_DELTA_INC_RATE_T_28'])
X_test_wk4Pred = X_test.drop(columns=['LOG_DELTA_INC_RATE_T_28'])

In [16]:
print(X_train_wk1Pred.shape)
print(X_test_wk1Pred.shape)
print(X_train_wk2Pred.shape)
print(X_test_wk2Pred.shape)
print(X_train_wk3Pred.shape)
print(X_test_wk3Pred.shape)
print(X_train_wk4Pred.shape)
print(X_test_wk4Pred.shape)

(109225, 139)
(27307, 139)
(109225, 140)
(27307, 140)
(109225, 141)
(27307, 141)
(109225, 142)
(27307, 142)


In [17]:
# Scale each prediction set
scalerWk1 = StandardScaler()
scalerWk2 = StandardScaler()
scalerWk3 = StandardScaler()
scalerWk4 = StandardScaler()

X_train_wk1Pred_std = scalerWk1.fit_transform(X_train_wk1Pred)
X_test_wk1Pred_std = scalerWk1.transform(X_test_wk1Pred)

X_train_wk2Pred_std = scalerWk2.fit_transform(X_train_wk2Pred)
X_test_wk2Pred_std = scalerWk2.transform(X_test_wk2Pred)

X_train_wk3Pred_std = scalerWk3.fit_transform(X_train_wk3Pred)
X_test_wk3Pred_std = scalerWk3.transform(X_test_wk3Pred)

X_train_wk4Pred_std = scalerWk4.fit_transform(X_train_wk4Pred)
X_test_wk4Pred_std = scalerWk4.transform(X_test_wk4Pred)

In [18]:
print(np.isinf(X_train_wk1Pred_std).sum())
print(np.isinf(X_test_wk1Pred_std).sum())

print(np.isinf(X_train_wk2Pred_std).sum())
print(np.isinf(X_test_wk2Pred_std).sum())

print(np.isinf(X_train_wk3Pred_std).sum())
print(np.isinf(X_test_wk3Pred_std).sum())

print(np.isinf(X_train_wk4Pred_std).sum())
print(np.isinf(X_test_wk4Pred_std).sum())

0
0
0
0
0
0
0
0


In [19]:
# Set up y aka the predicted feature for each week. It should drop the other weeks
y_train_wk1Pred = y_train.drop(columns=['LOG_DELTA_INC_RATE_T_14','LOG_DELTA_INC_RATE_T_21','LOG_DELTA_INC_RATE_T_28'])
y_test_wk1Pred = y_test.drop(columns=['LOG_DELTA_INC_RATE_T_14','LOG_DELTA_INC_RATE_T_21','LOG_DELTA_INC_RATE_T_28'])

y_train_wk2Pred = y_train.drop(columns=['LOG_DELTA_INC_RATE_T','LOG_DELTA_INC_RATE_T_21','LOG_DELTA_INC_RATE_T_28'])
y_test_wk2Pred = y_test.drop(columns=['LOG_DELTA_INC_RATE_T','LOG_DELTA_INC_RATE_T_21','LOG_DELTA_INC_RATE_T_28'])

y_train_wk3Pred = y_train.drop(columns=['LOG_DELTA_INC_RATE_T_14','LOG_DELTA_INC_RATE_T','LOG_DELTA_INC_RATE_T_28'])
y_test_wk3Pred = y_test.drop(columns=['LOG_DELTA_INC_RATE_T_14','LOG_DELTA_INC_RATE_T','LOG_DELTA_INC_RATE_T_28'])

y_train_wk4Pred = y_train.drop(columns=['LOG_DELTA_INC_RATE_T_14','LOG_DELTA_INC_RATE_T_21','LOG_DELTA_INC_RATE_T'])
y_test_wk4Pred = y_test.drop(columns=['LOG_DELTA_INC_RATE_T_14','LOG_DELTA_INC_RATE_T_21','LOG_DELTA_INC_RATE_T'])

In [20]:
# Check to make sure that each one has the correct column left
print(y_train_wk1Pred.columns)
print(y_test_wk1Pred.columns)
print(y_train_wk2Pred.columns)
print(y_test_wk2Pred.columns)
print(y_train_wk3Pred.columns)
print(y_test_wk3Pred.columns)
print(y_train_wk4Pred.columns)
print(y_test_wk4Pred.columns)

Index(['LOG_DELTA_INC_RATE_T'], dtype='object')
Index(['LOG_DELTA_INC_RATE_T'], dtype='object')
Index(['LOG_DELTA_INC_RATE_T_14'], dtype='object')
Index(['LOG_DELTA_INC_RATE_T_14'], dtype='object')
Index(['LOG_DELTA_INC_RATE_T_21'], dtype='object')
Index(['LOG_DELTA_INC_RATE_T_21'], dtype='object')
Index(['LOG_DELTA_INC_RATE_T_28'], dtype='object')
Index(['LOG_DELTA_INC_RATE_T_28'], dtype='object')


In [21]:
# Set up a list that pairs the scaled training input data with the training output as each category of list
trainingList = [(1,X_train_wk1Pred_std,y_train_wk1Pred),(2,X_train_wk2Pred_std,y_train_wk2Pred),(3,X_train_wk3Pred_std,y_train_wk3Pred),(4,X_train_wk4Pred_std,y_train_wk4Pred)]

In [22]:
"""
# This cell helps to figure out what the best hyperparameters for each week predictions Random Forest are. I have commented it out
# because it takes a long time to run (1 hr 42 mins). The best parameters are used below but if you want to double check remove the quotes and run this cell
for w,x,y in trainingList:
  rf_regressor = RandomForestRegressor(random_state=24)
  rf_params = {'max_depth':np.arange(1,15,1),
             'min_samples_split':np.arange(2,50,1),
             'min_samples_leaf':np.arange(2,50,1),
             'max_features':['sqrt', 'log2','mse','mae']}
  rf_regressor_cv = RandomizedSearchCV(rf_regressor,rf_params,cv=5,scoring='neg_root_mean_squared_error',random_state=24)
  rf_regressor_cv.fit(x,y)
  print("Week: "+str(w))
  print(rf_regressor_cv.best_params_)
  """

'\n# This cell helps to figure out what the best hyperparameters for each week predictions Random Forest are. I have commented it out\n# because it takes a long time to run (1 hr 42 mins). The best parameters are used below but if you want to double check remove the quotes and run this cell\nfor w,x,y in trainingList:\n  rf_regressor = RandomForestRegressor(random_state=24)\n  rf_params = {\'max_depth\':np.arange(1,15,1),\n             \'min_samples_split\':np.arange(2,50,1),\n             \'min_samples_leaf\':np.arange(2,50,1),\n             \'max_features\':[\'sqrt\', \'log2\',\'mse\',\'mae\']}\n  rf_regressor_cv = RandomizedSearchCV(rf_regressor,rf_params,cv=5,scoring=\'neg_root_mean_squared_error\',random_state=24)\n  rf_regressor_cv.fit(x,y)\n  print("Week: "+str(w))\n  print(rf_regressor_cv.best_params_)\n  '

In [22]:
# Fit the Random Forest Regressor's to each set of training data with the best parameters.
np.random.seed(24)
best_rf_regressor1 = RandomForestRegressor(max_depth=13,min_samples_split=36,min_samples_leaf=10,max_features='sqrt')
best_rf_regressor1.fit(X_train_wk1Pred_std,y_train_wk1Pred)

best_rf_regressor2 = RandomForestRegressor(max_depth=13,min_samples_split=36,min_samples_leaf=10,max_features='sqrt')
best_rf_regressor2.fit(X_train_wk2Pred_std,y_train_wk2Pred)

best_rf_regressor3 = RandomForestRegressor(max_depth=13,min_samples_split=36,min_samples_leaf=10,max_features='sqrt')
best_rf_regressor3.fit(X_train_wk3Pred_std,y_train_wk3Pred)

best_rf_regressor4 = RandomForestRegressor(max_depth=13,min_samples_split=36,min_samples_leaf=10,max_features='sqrt')
best_rf_regressor4.fit(X_train_wk4Pred_std,y_train_wk4Pred)

In [23]:
# This prints out all the evaluation information for Random Forest's using mean absolute error and root mean squared error
print("For 1 week predictions: ")
rf_y_train_predictedWeek1 = best_rf_regressor1.predict(X_train_wk1Pred_std)
rmse_train_rf1 = (np.sqrt(mean_squared_error(y_train_wk1Pred, rf_y_train_predictedWeek1)))
print('RMSE on the training set for the 1 week prediction Random Forest model is: '+str(rmse_train_rf1))
mae_train_rf1 = mean_absolute_error(y_train_wk1Pred,rf_y_train_predictedWeek1)
print('MAE on the training set for the 1 week prediction Random Forest Model is: '+str(mae_train_rf1))
rf_y_test_predictedWeek1 = best_rf_regressor1.predict(X_test_wk1Pred_std)
rmse_test_rf1 = (np.sqrt(mean_squared_error(y_test_wk1Pred, rf_y_test_predictedWeek1)))
print('RMSE on the testing set for the 1 week prediction Random Forest model is: '+str(rmse_test_rf1))
mae_test_rf1 = mean_absolute_error(y_test_wk1Pred,rf_y_test_predictedWeek1)
print('MAE on the testing set for the 1 week prediction Random Forest Model is: '+str(mae_test_rf1))

print("For 2 week predictions: ")
rf_y_train_predictedWeek2 = best_rf_regressor2.predict(X_train_wk2Pred_std)
rmse_train_rf2 = (np.sqrt(mean_squared_error(y_train_wk2Pred, rf_y_train_predictedWeek2)))
print('RMSE on the training set for the 2 week prediction Random Forest model is: '+str(rmse_train_rf2))
mae_train_rf2 = mean_absolute_error(y_train_wk2Pred,rf_y_train_predictedWeek2)
print('MAE on the training set for the 2 week prediction Random Forest Model is: '+str(mae_train_rf2))
rf_y_test_predictedWeek2 = best_rf_regressor2.predict(X_test_wk2Pred_std)
rmse_test_rf2 = (np.sqrt(mean_squared_error(y_test_wk2Pred, rf_y_test_predictedWeek2)))
print('RMSE on the testing set for the 2 week prediction Random Forest model is: '+str(rmse_test_rf2))
mae_test_rf2 = mean_absolute_error(y_test_wk2Pred,rf_y_test_predictedWeek2)
print('MAE on the testing set for the 2 week prediction Random Forest Model is: '+str(mae_test_rf2))

print("For 3 week predictions: ")
rf_y_train_predictedWeek3 = best_rf_regressor3.predict(X_train_wk3Pred_std)
rmse_train_rf3 = (np.sqrt(mean_squared_error(y_train_wk3Pred, rf_y_train_predictedWeek3)))
print('RMSE on the training set for the 3 week prediction Random Forest model is: '+str(rmse_train_rf3))
mae_train_rf3 = mean_absolute_error(y_train_wk3Pred,rf_y_train_predictedWeek3)
print('MAE on the training set for the 3 week prediction Random Forest Model is: '+str(mae_train_rf3))
rf_y_test_predictedWeek3 = best_rf_regressor3.predict(X_test_wk3Pred_std)
rmse_test_rf3 = (np.sqrt(mean_squared_error(y_test_wk3Pred, rf_y_test_predictedWeek3)))
print('RMSE on the testing set for the 3 week prediction Random Forest model is: '+str(rmse_test_rf3))
mae_test_rf3 = mean_absolute_error(y_test_wk3Pred,rf_y_test_predictedWeek3)
print('MAE on the testing set for the 3 week prediction Random Forest Model is: '+str(mae_test_rf3))

print("For 4 week predictions: ")
rf_y_train_predictedWeek4 = best_rf_regressor4.predict(X_train_wk4Pred_std)
rmse_train_rf4 = (np.sqrt(mean_squared_error(y_train_wk4Pred, rf_y_train_predictedWeek4)))
print('RMSE on the training set for the 4 week prediction Random Forest model is: '+str(rmse_train_rf4))
mae_train_rf4 = mean_absolute_error(y_train_wk4Pred,rf_y_train_predictedWeek4)
print('MAE on the training set for the 4 week prediction Random Forest Model is: '+str(mae_train_rf4))
rf_y_test_predictedWeek4 = best_rf_regressor4.predict(X_test_wk4Pred_std)
rmse_test_rf4 = (np.sqrt(mean_squared_error(y_test_wk4Pred, rf_y_test_predictedWeek4)))
print('RMSE on the testing set for the 4 week prediction Random Forest model is: '+str(rmse_test_rf4))
mae_test_rf4 = mean_absolute_error(y_test_wk4Pred,rf_y_test_predictedWeek4)
print('MAE on the testing set for the 4 week prediction Random Forest Model is: '+str(mae_test_rf4))

For 1 week predictions: 
RMSE on the training set for the 1 week prediction Random Forest model is: 0.4208792716349861
MAE on the training set for the 1 week prediction Random Forest Model is: 0.30123659389849894
RMSE on the testing set for the 1 week prediction Random Forest model is: 0.48156363537116015
MAE on the testing set for the 1 week prediction Random Forest Model is: 0.34007149142438403
For 2 week predictions: 
RMSE on the training set for the 2 week prediction Random Forest model is: 0.3542538789683164
MAE on the training set for the 2 week prediction Random Forest Model is: 0.2460956801001333
RMSE on the testing set for the 2 week prediction Random Forest model is: 0.4110796631574586
MAE on the testing set for the 2 week prediction Random Forest Model is: 0.28187564034198276
For 3 week predictions: 
RMSE on the training set for the 3 week prediction Random Forest model is: 0.31509793412539483
MAE on the training set for the 3 week prediction Random Forest Model is: 0.215805

Stochastic Gradient Boosting

In [29]:
'''
# This cell helps to figure out what the best hyperparameters for each week predictions Stochastic Gradient Boosting are. I have commented it out
# because it takes a long time to run. The best parameters are used below but if you want to double check remove the quotes and run this cell
for w,x,y in trainingList:
  sgb_regressor = GradientBoostingRegressor(random_state=24)
  sgb_params = dict(learning_rate=np.arange(0.05,0.3,0.05),
                      n_estimators=np.arange(100,1000,100),
                      subsample = np.arange(0.1,0.9,0.05),
                      max_depth=[int(i) for i in np.arange(1,10,1)],
                      max_features = ['sqrt', 'log2'])

    sgb_cv = RandomizedSearchCV(sgb_regressor, sgb_params, random_state=24, cv=5, scoring='neg_root_mean_squared_error')
    sgb_optimized = sgb_cv.fit(x, y)

    print("Week: "+str(w))
    print(sgb_optimized.best_params_)
'''

Skipping
Week: 2
{'subsample': 0.8500000000000002, 'n_estimators': 200, 'max_features': 'log2', 'max_depth': 8, 'learning_rate': 0.15000000000000002}
Week: 3
{'subsample': 0.8500000000000002, 'n_estimators': 200, 'max_features': 'log2', 'max_depth': 8, 'learning_rate': 0.15000000000000002}
Week: 4
{'subsample': 0.6500000000000001, 'n_estimators': 200, 'max_features': 'sqrt', 'max_depth': 8, 'learning_rate': 0.25}


In [30]:
# Fit the Gradient Boosting Regressor's to each set of training data with the best parameters.
np.random.seed(24)
best_sgb_regressor1 = GradientBoostingRegressor(subsample=0.8500000000000002,n_estimators=200,max_features='log2',max_depth=8,learning_rate=0.15000000000000002)
best_sgb_regressor1.fit(X_train_wk1Pred_std,y_train_wk1Pred)

best_sgb_regressor2 = GradientBoostingRegressor(subsample=0.8500000000000002,n_estimators=200,max_features='log2',max_depth=8,learning_rate=0.15000000000000002)
best_sgb_regressor2.fit(X_train_wk2Pred_std,y_train_wk2Pred)

best_sgb_regressor3 = GradientBoostingRegressor(subsample=0.8500000000000002,n_estimators=200,max_features='log2',max_depth=8,learning_rate=0.15000000000000002)
best_sgb_regressor3.fit(X_train_wk3Pred_std,y_train_wk3Pred)

best_sgb_regressor4 = GradientBoostingRegressor(subsample=0.6500000000000001,n_estimators=200,max_features='sqrt',max_depth=8,learning_rate=0.25)
best_sgb_regressor4.fit(X_train_wk4Pred_std,y_train_wk4Pred)

In [31]:
# This prints out all the evaluation information for Gradient Boosting's using mean absolute error and root mean squared error
print("For 1 week predictions: ")
sgb_y_train_predictedWeek1 = best_sgb_regressor1.predict(X_train_wk1Pred_std)
rmse_train_sgb1 = (np.sqrt(mean_squared_error(y_train_wk1Pred, sgb_y_train_predictedWeek1)))
print('RMSE on the training set for the 1 week prediction Stochastic Gradient Boosting model is: '+str(rmse_train_sgb1))
mae_train_sgb1 = mean_absolute_error(y_train_wk1Pred,sgb_y_train_predictedWeek1)
print('MAE on the training set for the 1 week prediction Stochastic Gradient Boosting Model is: '+str(mae_train_sgb1))
sgb_y_test_predictedWeek1 = best_sgb_regressor1.predict(X_test_wk1Pred_std)
rmse_test_sgb1 = (np.sqrt(mean_squared_error(y_test_wk1Pred, sgb_y_test_predictedWeek1)))
print('RMSE on the testing set for the 1 week prediction Stochastic Gradient Boosting model is: '+str(rmse_test_sgb1))
mae_test_sgb1 = mean_absolute_error(y_test_wk1Pred,sgb_y_test_predictedWeek1)
print('MAE on the testing set for the 1 week prediction Stochastic Gradient Boosting Model is: '+str(mae_test_sgb1))

print("For 2 week predictions: ")
sgb_y_train_predictedWeek2 = best_sgb_regressor2.predict(X_train_wk2Pred_std)
rmse_train_sgb2 = (np.sqrt(mean_squared_error(y_train_wk2Pred, sgb_y_train_predictedWeek2)))
print('RMSE on the training set for the 2 week prediction Stochastic Gradient Boosting model is: '+str(rmse_train_sgb2))
mae_train_sgb2 = mean_absolute_error(y_train_wk2Pred,sgb_y_train_predictedWeek2)
print('MAE on the training set for the 2 week prediction Stochastic Gradient Boosting Model is: '+str(mae_train_sgb2))
sgb_y_test_predictedWeek2 = best_sgb_regressor2.predict(X_test_wk2Pred_std)
rmse_test_sgb2 = (np.sqrt(mean_squared_error(y_test_wk2Pred, sgb_y_test_predictedWeek2)))
print('RMSE on the testing set for the 2 week prediction Stochastic Gradient Boosting model is: '+str(rmse_test_sgb2))
mae_test_sgb2 = mean_absolute_error(y_test_wk2Pred,sgb_y_test_predictedWeek2)
print('MAE on the testing set for the 2 week prediction Stochastic Gradient Boosting Model is: '+str(mae_test_sgb2))

print("For 3 week predictions: ")
sgb_y_train_predictedWeek3 = best_sgb_regressor3.predict(X_train_wk3Pred_std)
rmse_train_sgb3 = (np.sqrt(mean_squared_error(y_train_wk3Pred, sgb_y_train_predictedWeek3)))
print('RMSE on the training set for the 3 week prediction Stochastic Gradient Boosting model is: '+str(rmse_train_sgb3))
mae_train_sgb3 = mean_absolute_error(y_train_wk3Pred,sgb_y_train_predictedWeek3)
print('MAE on the training set for the 3 week prediction Stochastic Gradient Boosting Model is: '+str(mae_train_sgb3))
sgb_y_test_predictedWeek3 = best_sgb_regressor3.predict(X_test_wk3Pred_std)
rmse_test_sgb3 = (np.sqrt(mean_squared_error(y_test_wk3Pred, sgb_y_test_predictedWeek3)))
print('RMSE on the testing set for the 3 week prediction Stochastic Gradient Boosting model is: '+str(rmse_test_sgb3))
mae_test_sgb3 = mean_absolute_error(y_test_wk3Pred,sgb_y_test_predictedWeek3)
print('MAE on the testing set for the 3 week prediction Stochastic Gradient Boosting Model is: '+str(mae_test_sgb3))

print("For 4 week predictions: ")
sgb_y_train_predictedWeek4 = best_sgb_regressor4.predict(X_train_wk4Pred_std)
rmse_train_sgb4 = (np.sqrt(mean_squared_error(y_train_wk4Pred, sgb_y_train_predictedWeek4)))
print('RMSE on the training set for the 4 week prediction Stochastic Gradient Boosting model is: '+str(rmse_train_sgb4))
mae_train_sgb4 = mean_absolute_error(y_train_wk4Pred,sgb_y_train_predictedWeek4)
print('MAE on the training set for the 4 week prediction Stochastic Gradient Boosting Model is: '+str(mae_train_sgb4))
sgb_y_test_predictedWeek4 = best_sgb_regressor4.predict(X_test_wk4Pred_std)
rmse_test_sgb4 = (np.sqrt(mean_squared_error(y_test_wk4Pred, sgb_y_test_predictedWeek4)))
print('RMSE on the testing set for the 4 week prediction Stochastic Gradient Boosting model is: '+str(rmse_test_sgb4))
mae_test_sgb4 = mean_absolute_error(y_test_wk4Pred,sgb_y_test_predictedWeek4)
print('MAE on the testing set for the 4 week prediction Stochastic Gradient Boosting Model is: '+str(mae_test_sgb4))

For 1 week predictions: 
RMSE on the training set for the 1 week prediction Stochastic Gradient Boosting model is: 0.30419854062499907
MAE on the training set for the 1 week prediction Stochastic Gradient Boosting Model is: 0.22618300737896452
RMSE on the testing set for the 1 week prediction Stochastic Gradient Boosting model is: 0.44920285986004427
MAE on the testing set for the 1 week prediction Stochastic Gradient Boosting Model is: 0.3112230886358157
For 2 week predictions: 
RMSE on the training set for the 2 week prediction Stochastic Gradient Boosting model is: 0.20824962067042901
MAE on the training set for the 2 week prediction Stochastic Gradient Boosting Model is: 0.15437970689477215
RMSE on the testing set for the 2 week prediction Stochastic Gradient Boosting model is: 0.3321360519863089
MAE on the testing set for the 2 week prediction Stochastic Gradient Boosting Model is: 0.22517139044679188
For 3 week predictions: 
RMSE on the training set for the 3 week prediction Stoc

AdaBoost

In [33]:
'''
# This cell helps to figure out what the best hyperparameters for each week predictions AdaBoost are. I have commented it out
# because it takes a long time to run. The best parameters are used below but if you want to double check remove the quotes and run this cell
for w,x,y in trainingList:
  ab_regressor = AdaBoostRegressor(random_state=24)
  ab_params = dict(learning_rate=np.arange(0.01,1.0,0.05),
                  n_estimators=np.arange(10,200,10),
                  loss = ['linear', 'square', 'exponential'])
    ab_cv = RandomizedSearchCV(ab_regressor,ab_params,random_state=24,cv=5,scoring='neg_root_mean_squared_error')
    ab_optimized = ab_cv.fit(x,y)

    print("Week: "+str(w))
    print(ab_optimized.best_params_)
'''

'\n# This cell helps to figure out what the best hyperparameters for each week predictions AdaBoost are. I have commented it out\n# because it takes a long time to run. The best parameters are used below but if you want to double check remove the quotes and run this cell\nfor w,x,y in trainingList:\n  ab_regressor = AdaBoostRegressor(random_state=24)\n  ab_params = dict(learning_rate=np.arange(0.01,1.0,0.05),\n                  n_estimators=np.arange(10,200,10),\n                  loss = [\'linear\', \'square\', \'exponential\'])\n    ab_cv = RandomizedSearchCV(ab_regressor,ab_params,random_state=24,cv=5,scoring=\'neg_root_mean_squared_error\')\n    ab_optimized = ab_cv.fit(x,y)\n\n    print("Week: "+str(w))\n    print(ab_optimized.best_params_)\n'

In [26]:
# Fit the AdaBoost Regressor's to each set of training data with the best parameters.
np.random.seed(24)
best_ab_regressor1 = AdaBoostRegressor(random_state=24,learning_rate=0.31000000000000005,loss='linear',n_estimators=20)
best_ab_regressor1.fit(X_train_wk1Pred_std,y_train_wk1Pred)

best_ab_regressor2 = AdaBoostRegressor(random_state=24,learning_rate=0.31000000000000005,loss='linear',n_estimators=20)
best_ab_regressor2.fit(X_train_wk2Pred_std,y_train_wk2Pred)

best_ab_regressor3 = AdaBoostRegressor(random_state=24,learning_rate=0.31000000000000005,loss='linear',n_estimators=20)
best_ab_regressor3.fit(X_train_wk3Pred_std,y_train_wk3Pred)

best_ab_regressor4 = AdaBoostRegressor(random_state=24,learning_rate=0.31000000000000005,loss='linear',n_estimators=20)
best_ab_regressor4.fit(X_train_wk4Pred_std,y_train_wk4Pred)

In [27]:
# This prints out all the evaluation information for AdaBoost's using mean absolute error and root mean squared error
print("For 1 week predictions: ")
ab_y_train_predictedWeek1 = best_ab_regressor1.predict(X_train_wk1Pred_std)
rmse_train_ab1 = (np.sqrt(mean_squared_error(y_train_wk1Pred, ab_y_train_predictedWeek1)))
print('RMSE on the training set for the 1 week prediction AdaBoost model is: '+str(rmse_train_ab1))
mae_train_ab1 = mean_absolute_error(y_train_wk1Pred,ab_y_train_predictedWeek1)
print('MAE on the training set for the 1 week prediction AdaBoost Model is: '+str(mae_train_ab1))
ab_y_test_predictedWeek1 = best_ab_regressor1.predict(X_test_wk1Pred_std)
rmse_test_ab1 = (np.sqrt(mean_squared_error(y_test_wk1Pred, ab_y_test_predictedWeek1)))
print('RMSE on the testing set for the 1 week prediction AdaBoost model is: '+str(rmse_test_ab1))
mae_test_ab1 = mean_absolute_error(y_test_wk1Pred,ab_y_test_predictedWeek1)
print('MAE on the testing set for the 1 week prediction AdaBoost Model is: '+str(mae_test_ab1))

print("For 2 week predictions: ")
ab_y_train_predictedWeek2 = best_ab_regressor2.predict(X_train_wk2Pred_std)
rmse_train_ab2 = (np.sqrt(mean_squared_error(y_train_wk2Pred, ab_y_train_predictedWeek2)))
print('RMSE on the training set for the 2 week prediction AdaBoost model is: '+str(rmse_train_ab2))
mae_train_ab2 = mean_absolute_error(y_train_wk2Pred, ab_y_train_predictedWeek2)
print('MAE on the training set for the 2 week prediction AdaBoost Model is: '+str(mae_train_ab2))
ab_y_test_predictedWeek2 = best_ab_regressor2.predict(X_test_wk2Pred_std)
rmse_test_ab2 = (np.sqrt(mean_squared_error(y_test_wk2Pred, ab_y_test_predictedWeek2)))
print('RMSE on the testing set for the 2 week prediction AdaBoost model is: '+str(rmse_test_ab2))
mae_test_ab2 = mean_absolute_error(y_test_wk2Pred, ab_y_test_predictedWeek2)
print('MAE on the testing set for the 2 week prediction AdaBoost Model is: '+str(mae_test_ab2))

print("For 3 week predictions: ")
ab_y_train_predictedWeek3 = best_ab_regressor3.predict(X_train_wk3Pred_std)
rmse_train_ab3 = (np.sqrt(mean_squared_error(y_train_wk3Pred, ab_y_train_predictedWeek3)))
print('RMSE on the training set for the 3 week prediction AdaBoost model is: '+str(rmse_train_ab3))
mae_train_ab3 = mean_absolute_error(y_train_wk3Pred, ab_y_train_predictedWeek3)
print('MAE on the training set for the 3 week prediction AdaBoost Model is: '+str(mae_train_ab3))
ab_y_test_predictedWeek3 = best_ab_regressor3.predict(X_test_wk3Pred_std)
rmse_test_ab3 = (np.sqrt(mean_squared_error(y_test_wk3Pred, ab_y_test_predictedWeek3)))
print('RMSE on the testing set for the 3 week prediction AdaBoost model is: '+str(rmse_test_ab3))
mae_test_ab3 = mean_absolute_error(y_test_wk3Pred, ab_y_test_predictedWeek3)
print('MAE on the testing set for the 3 week prediction AdaBoost Model is: '+str(mae_test_ab3))

print("For 4 week predictions: ")
ab_y_train_predictedWeek4 = best_ab_regressor4.predict(X_train_wk4Pred_std)
rmse_train_ab4 = (np.sqrt(mean_squared_error(y_train_wk4Pred, ab_y_train_predictedWeek4)))
print('RMSE on the training set for the 4 week prediction AdaBoost model is: '+str(rmse_train_ab4))
mae_train_ab4 = mean_absolute_error(y_train_wk4Pred, ab_y_train_predictedWeek4)
print('MAE on the training set for the 4 week prediction AdaBoost Model is: '+str(mae_train_ab4))
ab_y_test_predictedWeek4 = best_ab_regressor4.predict(X_test_wk4Pred_std)
rmse_test_ab4 = (np.sqrt(mean_squared_error(y_test_wk4Pred, ab_y_test_predictedWeek4)))
print('RMSE on the testing set for the 4 week prediction AdaBoost model is: '+str(rmse_test_ab4))
mae_test_ab4 = mean_absolute_error(y_test_wk4Pred, ab_y_test_predictedWeek4)
print('MAE on the testing set for the 4 week prediction AdaBoost Model is: '+str(mae_test_ab4))

For 1 week predictions: 
RMSE on the training set for the 1 week prediction AdaBoost model is: 0.49981586283770024
MAE on the training set for the 1 week prediction AdaBoost Model is: 0.3770700708397977
RMSE on the testing set for the 1 week prediction AdaBoost model is: 0.5071273983662828
MAE on the testing set for the 1 week prediction AdaBoost Model is: 0.38028318732276145
For 2 week predictions: 
RMSE on the training set for the 2 week prediction AdaBoost model is: 0.36111134993584715
MAE on the training set for the 2 week prediction AdaBoost Model is: 0.26318778115255576
RMSE on the testing set for the 2 week prediction AdaBoost model is: 0.370915427477803
MAE on the testing set for the 2 week prediction AdaBoost Model is: 0.2679157056735575
For 3 week predictions: 
RMSE on the training set for the 3 week prediction AdaBoost model is: 0.32026652642805115
MAE on the training set for the 3 week prediction AdaBoost Model is: 0.22651384004989894
RMSE on the testing set for the 3 week 