In [1]:
%matplotlib inline
    
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor 

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [3]:
# Load the modelling table, discard any row with a missing value, then remove
# the two helper columns. `week` is kept: it is used to build the YearWeek key.
df1 = (
    pd.read_csv("ML_model_wk40_to_20.csv")
    .dropna(how="any")
    .drop(columns=["ILI_weeks", "Unnamed: 0"])
)
df1.head(2)

Unnamed: 0,Year,week,a_influenza,acute_bronchitis,body_temperature,braun_thermoscan,break_a_fever,bronchitis,chest_cold,cold_and_flu,...,walking_pneumonia,what_to_do_if_you_have_the_flu,Flu_Visit_Count,ILI_Visit_Count,Unspecified,CDC_Unweighted_ILI,ILI_lagwk1,ILI_lagwk2,ILI_lagwk3,ILI_lagwk4
0,2009,40,44,33,81,69,22,40,35,36,...,47,56,0.01338,0.01763,0.03074,5.66087,6.81522,7.61889,7.38836,6.33927
1,2009,41,51,51,77,46,24,43,35,43,...,53,58,0.0162,0.02103,0.03554,6.81522,7.61889,7.38836,6.33927,4.94349


In [4]:
#  combining year and week column 
def combine_year_week(row):
    """Return the year and week of ``row`` fused into one integer, YYYYWW.

    ``row`` must support lookup by the keys ``"Year"`` and ``"week"``. Both
    values are converted with ``int()`` before being combined, so a year of
    2009 and a week of 40 give 200940.
    """
    year = int(row["Year"])
    week = int(row["week"])
    return year * 100 + week

In [5]:
# Build the YYYYWW key with vectorised column arithmetic rather than a
# row-by-row apply. The values equal combine_year_week applied to each row,
# and the expression also works when the frame is empty.
df1["YearWeek"] = df1["Year"].astype(int) * 100 + df1["week"].astype(int)

In [6]:
# Use the YearWeek key as the row index; the YearWeek column itself is kept
df1 = df1.set_index("YearWeek", drop=False)

In [7]:
# Preview the first five rows of df1, now indexed by YearWeek
df1.head()

Unnamed: 0_level_0,Year,week,a_influenza,acute_bronchitis,body_temperature,braun_thermoscan,break_a_fever,bronchitis,chest_cold,cold_and_flu,...,what_to_do_if_you_have_the_flu,Flu_Visit_Count,ILI_Visit_Count,Unspecified,CDC_Unweighted_ILI,ILI_lagwk1,ILI_lagwk2,ILI_lagwk3,ILI_lagwk4,YearWeek
YearWeek,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
200940,2009,40,44,33,81,69,22,40,35,36,...,56,0.01338,0.01763,0.03074,5.66087,6.81522,7.61889,7.38836,6.33927,200940
200941,2009,41,51,51,77,46,24,43,35,43,...,58,0.0162,0.02103,0.03554,6.81522,7.61889,7.38836,6.33927,4.94349,200941
200942,2009,42,63,38,88,76,30,48,34,45,...,82,0.02078,0.02626,0.04458,7.61889,7.38836,6.33927,4.94349,3.80996,200942
200943,2009,43,70,51,100,88,29,54,39,55,...,54,0.02862,0.035,0.05885,7.38836,6.33927,4.94349,3.80996,3.44106,200943
200944,2009,44,53,52,96,44,30,55,44,53,...,77,0.02927,0.03515,0.05945,6.33927,4.94349,3.80996,3.44106,2.66773,200944


In [8]:
# Modelling table: drop the calendar columns (YearWeek remains as the index)
model_df = df1.drop(columns=["Year", "week", "YearWeek"])
model_df.head(2)

Unnamed: 0_level_0,a_influenza,acute_bronchitis,body_temperature,braun_thermoscan,break_a_fever,bronchitis,chest_cold,cold_and_flu,cold_or_flu,cold_versus_flu,...,walking_pneumonia,what_to_do_if_you_have_the_flu,Flu_Visit_Count,ILI_Visit_Count,Unspecified,CDC_Unweighted_ILI,ILI_lagwk1,ILI_lagwk2,ILI_lagwk3,ILI_lagwk4
YearWeek,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
200940,44,33,81,69,22,40,35,36,37,30,...,47,56,0.01338,0.01763,0.03074,5.66087,6.81522,7.61889,7.38836,6.33927
200941,51,51,77,46,24,43,35,43,49,41,...,53,58,0.0162,0.02103,0.03554,6.81522,7.61889,7.38836,6.33927,4.94349


In [9]:
# Columns kept for modelling: 20 predictors followed by the 4 target columns.
# NOTE(review): type_a_influenza and influenza_type_a show identical values in
# the previewed rows - confirm they are not duplicates of each other.
model_20important_features_df = model_df[
    [
        # predictors
        "CDC_Unweighted_ILI",
        "type_a_influenza",
        "influenza_type_a",
        "Flu_Visit_Count",
        "ILI_Visit_Count",
        "how_to_treat_flu",
        "flu_remedy",
        "exposed_to_flu",
        "get_over_the_flu",
        "sinus",
        "cure_the_flu",
        "incubation_period_for_the_flu",
        "treat_the_flu",
        "how_to_treat_the_flu",
        "low_body",
        "influenza_treatment",
        "oscillococcinum",
        "flu_contagious_period",
        "tussionex",
        "flu_care",
        # targets
        "ILI_lagwk1",
        "ILI_lagwk2",
        "ILI_lagwk3",
        "ILI_lagwk4",
    ]
]

In [10]:
# Preview the first five rows of the reduced modelling table
model_20important_features_df.head()

Unnamed: 0_level_0,CDC_Unweighted_ILI,type_a_influenza,influenza_type_a,Flu_Visit_Count,ILI_Visit_Count,how_to_treat_flu,flu_remedy,exposed_to_flu,get_over_the_flu,sinus,...,low_body,influenza_treatment,oscillococcinum,flu_contagious_period,tussionex,flu_care,ILI_lagwk1,ILI_lagwk2,ILI_lagwk3,ILI_lagwk4
YearWeek,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
200940,5.66087,78,78,0.01338,0.01763,50,31,59,28,58,...,84,36,69,45,67,63,6.81522,7.61889,7.38836,6.33927
200941,6.81522,77,77,0.0162,0.02103,32,47,78,44,60,...,85,40,67,56,64,75,7.61889,7.38836,6.33927,4.94349
200942,7.61889,94,94,0.02078,0.02626,66,58,90,40,60,...,89,44,60,64,63,92,7.38836,6.33927,4.94349,3.80996
200943,7.38836,100,100,0.02862,0.035,60,49,99,58,62,...,99,68,95,98,62,100,6.33927,4.94349,3.80996,3.44106
200944,6.33927,77,77,0.02927,0.03515,80,64,99,47,59,...,97,46,84,100,66,97,4.94349,3.80996,3.44106,2.66773


# Independent variables
        # 3 independent variables from athena EHR 
            # [(flu visit counts)/ (total patient visit counts) 
            # (ILI visit counts)/ (total patient visit counts)
            # (unspecified viral or ILI visit counts)/ (total patient visit counts)]
        # CDC historical CDC_Unweighted_ILI values: collected from 2009 to 2016 (week 40 to 20)
        # 74 google search terms related to flu
        
      ======> 3 + 1 + 74
      
# Dependent variables 
        # 4 ILI targets (ILI_lagwk1 to ILI_lagwk4): the CDC_Unweighted_ILI value 1, 2, 3 and 4 weeks ahead of the current row

# Split df into the training dataset: rows before 2015 week 40 are used for training

In [11]:
# Training rows: every week strictly before 2015 week 40 (index < 201540)
train = model_20important_features_df.loc[
    model_20important_features_df.index < 201540
]


# defining  targets/labels to  y axis
                y_train = ILI lag week 1

                yy_train = ILI lag week 2

                yyy_train = ILI lag week 3

                yyyy_train = ILI lag week 4

In [12]:
# Training target for the first model: the ILI_lagwk1 column of the training rows
y_train = train["ILI_lagwk1"]

In [13]:
# Training target for the second model: the ILI_lagwk2 column
yy_train = train["ILI_lagwk2"]

In [14]:
# Training target for the third model: the ILI_lagwk3 column
yyy_train = train["ILI_lagwk3"]

In [15]:
# Training target for the fourth model: the ILI_lagwk4 column
yyyy_train =train["ILI_lagwk4"]

In [16]:
# Training predictors: every column except the four target columns
X_train = train.drop(
    columns=["ILI_lagwk1", "ILI_lagwk2", "ILI_lagwk3", "ILI_lagwk4"]
)

# Split df into the test dataset: rows from 2015 week 40 onward are used for testing

In [17]:
# Test rows: 2015 week 40 and every later week (index >= 201540)
test = model_20important_features_df.loc[
    model_20important_features_df.index >= 201540
]

In [18]:
# Held-out target for the first model: the ILI_lagwk1 column of the test rows
y_test = test["ILI_lagwk1"]

In [19]:
# Held-out target for the second model: the ILI_lagwk2 column
yy_test = test["ILI_lagwk2"]

In [20]:
# Held-out target for the third model: the ILI_lagwk3 column
yyy_test = test["ILI_lagwk3"]

In [21]:
# Held-out target for the fourth model: the ILI_lagwk4 column
yyyy_test = test["ILI_lagwk4"]

In [22]:
# Test predictors: every column except the four target columns
X_test = test.drop(
    columns=["ILI_lagwk1", "ILI_lagwk2", "ILI_lagwk3", "ILI_lagwk4"]
)

# Reference https://shankarmsy.github.io/stories/gbrt-sklearn.html#

##https://www.youtube.com/watch?v=IXZKgIsZRm0

# building a model for y_test = test["ILI_lagwk1"]

In [23]:
# Random forest for the ILI_lagwk1 target; random_state is fixed so that
# repeated fits give the same model
rreg1 = RandomForestRegressor(
    n_estimators=300,
    max_depth=5,
    min_samples_leaf=8,
    random_state=0,
)

In [24]:
# Fit the first forest on the training predictors and the ILI_lagwk1 target
rreg1.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=5,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=8, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=300, n_jobs=None,
           oob_score=False, random_state=0, verbose=0, warm_start=False)

In [25]:
# ILI_lagwk1 predictions for the test weeks
y_pred = rreg1.predict(X_test)

In [26]:
# R-squared of the ILI_lagwk1 predictions on the test weeks
r2_score(y_test, y_pred)

0.8182288538035303

In [27]:
# Mean absolute error of the ILI_lagwk1 predictions on the test weeks
mean_absolute_error(y_test, y_pred)

0.18742898515842132

In [28]:
# Train vs. test R-squared for the ILI_lagwk1 model, rounded to two decimals
print("R-squared for ILI_lagwk1 Train: %.2f" %rreg1.score(X_train, y_train)) 
print("R-squared for ILI_lagwk1 Test : %.2f" %rreg1.score(X_test, y_test)) 

R-squared for ILI_lagwk1 Train: 0.90
R-squared for ILI_lagwk1 Test : 0.82


In [29]:
# Same call as the earlier y_pred cell; this name is the one used when the
# prediction table pred_df is built
predictions1 = rreg1.predict(X_test) 

In [30]:
# Repeats the rreg1 prediction under the name `predictions`; no later live
# cell reads it for this model (predictions1 is used instead)
predictions = rreg1.predict(X_test)
# Observed ILI_lagwk1 test values as a flat NumPy array
y_test_unraveled = y_test.values.ravel()
# Show the YearWeek labels covered by the test set
y_test.index

Int64Index([201540, 201541, 201542, 201543, 201544, 201545, 201546, 201547,
            201548, 201549, 201550, 201551, 201552, 201601, 201602, 201603,
            201604, 201605, 201606, 201607, 201608, 201609, 201610, 201611,
            201612, 201613, 201614, 201615, 201616, 201617, 201618, 201619,
            201620],
           dtype='int64', name='YearWeek')

In [31]:
# Side-by-side table of ILI_lagwk1 predictions and observed values, indexed
# by the test weeks
pred_df = pd.DataFrame(
    {"Prediction_ILI_lagwk1": predictions1, "Actual": y_test_unraveled},
    index=y_test.index,
)
pred_df.head()

Unnamed: 0_level_0,Prediction_ILI_lagwk1,Actual
YearWeek,Unnamed: 1_level_1,Unnamed: 2_level_1
201540,1.333547,1.33029
201541,1.369184,1.41368
201542,1.454505,1.36882
201543,1.409391,1.48309
201544,1.461788,1.54494


In [32]:
# pred_df.to_csv("Prediction_ILI_lagwk1.csv")
# pred_df.head() 

# # Create the GridSearchCV model

In [33]:
# from sklearn.model_selection import GridSearchCV
# param_grid = {'learning_rate':[ 0.02, 0.01, 0.1, 0.05], 
#             'max_depth':[1, 3, 5 ,7], 
#             'min_samples_leaf':[3, 5, 7 ,9], 
# #             'max_features':[0.1,0.3,1.0],
#             'n_estimators': [300, 500, 1000, 2000]
#              } 

# est = GradientBoostingRegressor()
# gs_cv = GridSearchCV(est, param_grid).fit(X_train, y_train)

# # best hyperparameter setting

# gs_cv.best_est 

# selecting important features  ["ILI_lagwk1"]

In [34]:
# Feature importances of the fitted random forest rreg1 (the ILI_lagwk1 model)
importances = rreg1.feature_importances_

In [35]:
# Sanity check: the importances should add up to 1 (up to floating-point error)
rreg1.feature_importances_.sum()

1.0000000000000002

In [36]:
# Same assignment as the previous importances cell, repeated here so the
# array is displayed
importances = rreg1.feature_importances_
importances

array([0.89743096, 0.0181352 , 0.03260328, 0.00797241, 0.00360066,
       0.00339507, 0.0035753 , 0.00286087, 0.00304767, 0.00427837,
       0.00268555, 0.00329982, 0.00282389, 0.00279063, 0.00306754,
       0.00141192, 0.00188408, 0.00137639, 0.00182723, 0.00193316])

In [37]:
# Pair each importance with its column name and order the pairs from most to
# least important
sorted_features = sorted(zip(importances, X_train.columns), reverse=True)
sorted_features

[(0.8974309613919637, 'CDC_Unweighted_ILI'),
 (0.0326032834138166, 'influenza_type_a'),
 (0.018135197614427487, 'type_a_influenza'),
 (0.007972408761806646, 'Flu_Visit_Count'),
 (0.004278369930675321, 'sinus'),
 (0.0036006580096126064, 'ILI_Visit_Count'),
 (0.003575299987868601, 'flu_remedy'),
 (0.003395071375456052, 'how_to_treat_flu'),
 (0.0032998246305314103, 'incubation_period_for_the_flu'),
 (0.003067537269765613, 'low_body'),
 (0.0030476706892958684, 'get_over_the_flu'),
 (0.002860866695377072, 'exposed_to_flu'),
 (0.0028238851948482703, 'treat_the_flu'),
 (0.0027906329778441927, 'how_to_treat_the_flu'),
 (0.0026855493207926643, 'cure_the_flu'),
 (0.0019331638074375575, 'flu_care'),
 (0.001884081880453607, 'oscillococcinum'),
 (0.0018272253786829036, 'tussionex'),
 (0.0014119243160230872, 'influenza_treatment'),
 (0.001376387353320961, 'flu_contagious_period')]

In [38]:
# Collect the names of every feature whose importance exceeds 0.001, ordered
# from most to least important
important_columns = [
    column
    for weight, column in sorted(zip(importances, X_train.columns), reverse=True)
    if weight > 0.001
]

In [39]:
# Second name for the same list object (no copy is made); displayed below
important_feature_columns = important_columns
important_feature_columns

['CDC_Unweighted_ILI',
 'influenza_type_a',
 'type_a_influenza',
 'Flu_Visit_Count',
 'sinus',
 'ILI_Visit_Count',
 'flu_remedy',
 'how_to_treat_flu',
 'incubation_period_for_the_flu',
 'low_body',
 'get_over_the_flu',
 'exposed_to_flu',
 'treat_the_flu',
 'how_to_treat_the_flu',
 'cure_the_flu',
 'flu_care',
 'oscillococcinum',
 'tussionex',
 'influenza_treatment',
 'flu_contagious_period']

In [40]:
# Predictor columns of model_df reordered by importance; the target columns
# are not part of important_feature_columns and so are not included
important_features20_df = model_df[important_feature_columns]
important_features20_df.head()

Unnamed: 0_level_0,CDC_Unweighted_ILI,influenza_type_a,type_a_influenza,Flu_Visit_Count,sinus,ILI_Visit_Count,flu_remedy,how_to_treat_flu,incubation_period_for_the_flu,low_body,get_over_the_flu,exposed_to_flu,treat_the_flu,how_to_treat_the_flu,cure_the_flu,flu_care,oscillococcinum,tussionex,influenza_treatment,flu_contagious_period
YearWeek,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
200940,5.66087,78,78,0.01338,58,0.01763,31,50,40,84,28,59,28,33,10,63,69,67,36,45
200941,6.81522,77,77,0.0162,60,0.02103,47,32,38,85,44,78,44,39,10,75,67,64,40,56
200942,7.61889,94,94,0.02078,60,0.02626,58,66,52,89,40,90,55,60,17,92,60,63,44,64
200943,7.38836,100,100,0.02862,62,0.035,49,60,71,99,58,99,53,50,20,100,95,62,68,98
200944,6.33927,77,77,0.02927,59,0.03515,64,80,70,97,47,99,68,65,20,97,84,66,46,100


In [41]:
# important_features74_df.to_csv("ML_model_wk40_to_20_with_74_important_features.csv")
# important_features74_df.head()

# defining important feature X train and X test dataset

In [42]:
# X_trainim = important_features_df[important_features_df.index < 201540]
# # trainim
# # .head(2)

In [43]:
# X_testim = important_features_df[important_features_df.index >= 201540]
# # testim
# # .head()

In [44]:
# gbrtim_ili_lag1 = GradientBoostingRegressor(n_estimators = 500, max_depth = 5) # number of sequential trees to be modeled

In [45]:
# gbrtim_ili_lag1.fit(X_trainim, y_train) 

In [46]:
# from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [47]:
# r2_score(y_test, y_pred)

In [48]:
# mean_absolute_error(y_test, y_pred)

In [49]:
# # GBR with important features for ili_lag1 week 1
# print("R-squared for Train gbrtim_ili_lag1: %.2f" %gbrtim_ili_lag1.score(X_trainim, y_train)) 
# print("R-squared for Test gbrtim_ili_lag1: %.2f" %gbrtim_ili_lag1.score(X_testim, y_test)) 

In [50]:
# # predict the values of y with important feature extraction for ili week 1
# predictions = gbrtim_ili_lag1.predict(X_testim)
# y_test_unraveled = y_test.values.ravel()
# y_test.index

In [51]:
# Make predictions using the X_test and y_test data
# Print at least 10 predictions vs their actual labels
# predictions = gbrtim_ili_lag1.predict(X_testim)
# print(f"First 10 Predictions: {predictions[:10]}")
# print(f"First 10 Actual labels: {y_test_unraveled[:10]}")

In [52]:
# # Print predictions vs their actual labels
# pred_df = pd.DataFrame({"Prediction": predictions, "Actual": y_test_unraveled}).reset_index(drop=True)
# pred_df.index = y_test.index
# pred_df.head()

# building a model for y_test = test["ILI_lagwk2"]

In [53]:
# Random forest for the ILI_lagwk2 target, same hyperparameters as rreg1
rreg2 = RandomForestRegressor(
    n_estimators=300,
    max_depth=5,
    min_samples_leaf=8,
    random_state=0,
)

In [54]:
# Fit the second forest on the training predictors and the ILI_lagwk2 target
rreg2.fit(X_train, yy_train) 

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=5,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=8, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=300, n_jobs=None,
           oob_score=False, random_state=0, verbose=0, warm_start=False)

In [55]:
# ILI_lagwk2 predictions for the test weeks
yy_pred = rreg2.predict(X_test)

In [56]:
# R-squared of the ILI_lagwk2 predictions on the test weeks
r2_score(yy_test, yy_pred)

0.5653113068269501

In [57]:
# Mean absolute error of the ILI_lagwk2 predictions on the test weeks
mean_absolute_error(yy_test, yy_pred)

0.3038711543069374

In [58]:
# Train vs. test R-squared for the ILI_lagwk2 model, rounded to two decimals
print("R-squared for ILI_lagwk2 Train: %.2f" %rreg2.score(X_train, yy_train)) 
print("R-squared for ILI_lagwk2 Test: %.2f" %rreg2.score(X_test, yy_test)) 

R-squared for ILI_lagwk2 Train: 0.84
R-squared for ILI_lagwk2 Test: 0.57


In [59]:
# Same call as the earlier yy_pred cell; this name is the one used when the
# prediction table pred_df is built
predictions2 = rreg2.predict(X_test) 

In [60]:
# Observed ILI_lagwk2 test values as a flat NumPy array (no prediction is
# made in this cell)
yy_test_unraveled = yy_test.values.ravel()
# Show the YearWeek labels covered by the test set
yy_test.index

Int64Index([201540, 201541, 201542, 201543, 201544, 201545, 201546, 201547,
            201548, 201549, 201550, 201551, 201552, 201601, 201602, 201603,
            201604, 201605, 201606, 201607, 201608, 201609, 201610, 201611,
            201612, 201613, 201614, 201615, 201616, 201617, 201618, 201619,
            201620],
           dtype='int64', name='YearWeek')

In [61]:
# Side-by-side table of ILI_lagwk2 predictions and observed values, indexed
# by the test weeks
pred_df = pd.DataFrame(
    {"Prediction_ILI_lagwk2": predictions2, "Actual": yy_test_unraveled},
    index=yy_test.index,
)
pred_df.head()

Unnamed: 0_level_0,Prediction_ILI_lagwk2,Actual
YearWeek,Unnamed: 1_level_1,Unnamed: 2_level_1
201540,1.37742,1.41368
201541,1.444279,1.36882
201542,1.50078,1.48309
201543,1.487003,1.54494
201544,1.498026,1.62532


In [62]:
# # Create the GridSearchCV model

# from sklearn.model_selection import GridSearchCV
# param_grid = {'learning_rate':[ 0.02, 0.01, 0.1, 0.05], 
#             'max_depth':[1, 3, 5 ,7], 
#             'min_samples_leaf':[3, 5, 7 ,9], 
# #             'max_features':[0.1,0.3,1.0],
#             'n_estimators': [300, 500, 1000, 2000]
#              } 

# est = GradientBoostingRegressor()
# gs_cv = GridSearchCV(est, param_grid).fit(X_train, y_train)

# # best hyperparameter setting

# gs_cv.best_est 

# selecting important features  ["ILI_lagwk2"]

# defining important feature X train and X test dataset

# building a model for y_test = test["ILI_lagwk3"]

In [63]:
# Random forest for the ILI_lagwk3 target, same hyperparameters as rreg1
rreg3 = RandomForestRegressor(
    n_estimators=300,
    max_depth=5,
    min_samples_leaf=8,
    random_state=0,
)

In [64]:
# Fit the third forest on the training predictors and the ILI_lagwk3 target
rreg3.fit(X_train, yyy_train) 

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=5,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=8, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=300, n_jobs=None,
           oob_score=False, random_state=0, verbose=0, warm_start=False)

In [65]:
# ILI_lagwk3 predictions for the test weeks
yyy_pred = rreg3.predict(X_test)

In [66]:
# R-squared of the ILI_lagwk3 predictions on the test weeks
r2_score(yyy_test, yyy_pred)

0.3613050755092445

In [67]:
# Mean absolute error of the ILI_lagwk3 predictions on the test weeks
mean_absolute_error(yyy_test, yyy_pred)

0.33474969902135765

In [68]:
# Train vs. test R-squared for the ILI_lagwk3 model. The label names the
# target, as the lag-1 and lag-2 cells do, so the printed output can be told
# apart from the other models' scores.
print("R-squared for ILI_lagwk3 Train: %.2f" % rreg3.score(X_train, yyy_train))
print("R-squared for ILI_lagwk3 Test: %.2f" % rreg3.score(X_test, yyy_test))

R-squared for Train: 0.79
R-squared for Test: 0.36


In [69]:
# Same call as the earlier yyy_pred cell; this name is the one used when the
# prediction table pred_df is built
predictions3 = rreg3.predict(X_test) 

In [70]:
# Repeats the rreg3 prediction under the name `predictions`; no later live
# cell reads it for this model (predictions3 is used instead)
predictions = rreg3.predict(X_test)
# Observed ILI_lagwk3 test values as a flat NumPy array
yyy_test_unraveled = yyy_test.values.ravel()
# Show the YearWeek labels covered by the test set
yyy_test.index

Int64Index([201540, 201541, 201542, 201543, 201544, 201545, 201546, 201547,
            201548, 201549, 201550, 201551, 201552, 201601, 201602, 201603,
            201604, 201605, 201606, 201607, 201608, 201609, 201610, 201611,
            201612, 201613, 201614, 201615, 201616, 201617, 201618, 201619,
            201620],
           dtype='int64', name='YearWeek')

In [71]:
# Side-by-side table of ILI_lagwk3 predictions and observed values, indexed
# by the test weeks
pred_df = pd.DataFrame(
    {"Prediction_ILI_lagwk3": predictions3, "Actual": yyy_test_unraveled},
    index=yyy_test.index,
)
pred_df.head()

Unnamed: 0_level_0,Prediction_ILI_lagwk3,Actual
YearWeek,Unnamed: 1_level_1,Unnamed: 2_level_1
201540,1.479204,1.36882
201541,1.508086,1.48309
201542,1.557465,1.54494
201543,1.53634,1.62532
201544,1.551335,1.91565


# building a model for y_test = test["ILI_lagwk4"]

In [72]:
# Random forest for the ILI_lagwk4 target, same hyperparameters as rreg1
rreg4 = RandomForestRegressor(
    n_estimators=300,
    max_depth=5,
    min_samples_leaf=8,
    random_state=0,
)

In [73]:
# Fit the fourth forest on the training predictors and the ILI_lagwk4 target
rreg4.fit(X_train, yyyy_train) 

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=5,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=8, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=300, n_jobs=None,
           oob_score=False, random_state=0, verbose=0, warm_start=False)

In [74]:
# ILI_lagwk4 predictions for the test weeks
yyyy_pred = rreg4.predict(X_test)

In [75]:
# R-squared of the ILI_lagwk4 predictions on the test weeks
r2_score(yyyy_test, yyyy_pred)

0.24352061391148994

In [76]:
# Mean absolute error of the ILI_lagwk4 predictions on the test weeks
mean_absolute_error(yyyy_test, yyyy_pred)

0.3779918328292862

In [77]:
# Train vs. test R-squared for the ILI_lagwk4 model. The label names the
# target, as the lag-1 and lag-2 cells do, so the printed output can be told
# apart from the other models' scores.
print("R-squared for ILI_lagwk4 Train: %.2f" % rreg4.score(X_train, yyyy_train))
print("R-squared for ILI_lagwk4 Test: %.2f" % rreg4.score(X_test, yyyy_test))

R-squared for Train: 0.77
R-squared for Test: 0.24


In [78]:
# ILI_lagwk4 predictions for the test weeks, stored under the name used when
# the prediction table pred_df is built (same call as the yyyy_pred cell)
predictions4 = rreg4.predict(X_test)
# Observed ILI_lagwk4 test values as a flat NumPy array
yyyy_test_unraveled = yyyy_test.values.ravel()
# Show the YearWeek labels covered by the test set
yyyy_test.index

Int64Index([201540, 201541, 201542, 201543, 201544, 201545, 201546, 201547,
            201548, 201549, 201550, 201551, 201552, 201601, 201602, 201603,
            201604, 201605, 201606, 201607, 201608, 201609, 201610, 201611,
            201612, 201613, 201614, 201615, 201616, 201617, 201618, 201619,
            201620],
           dtype='int64', name='YearWeek')

In [79]:
# Side-by-side table of ILI_lagwk4 predictions and observed values, indexed
# by the test weeks. The prediction column is labelled lagwk4 because the
# values come from rreg4; it was previously mislabelled as lagwk1.
pred_df = pd.DataFrame(
    {"Prediction_ILI_lagwk4": predictions4, "Actual": yyyy_test_unraveled},
    index=yyyy_test.index,
)
pred_df.head()

Unnamed: 0_level_0,Prediction_ILI_lagwk1,Actual
YearWeek,Unnamed: 1_level_1,Unnamed: 2_level_1
201540,1.603582,1.48309
201541,1.621012,1.54494
201542,1.640749,1.62532
201543,1.66374,1.91565
201544,1.651463,1.74368
