In [1]:
%matplotlib inline
    
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor 

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [3]:
# Final dataframe used for model building.
# Rows with any missing value are discarded and the two unused columns are
# removed; "week" is kept because combine_year_week() reads it later.
df1 = (
    pd.read_csv("ML_model_wk40_to_20.csv")
    .dropna(how="any")
    .drop(columns=["ILI_weeks", "Unnamed: 0"])
)
df1.head(2)

Unnamed: 0,Year,week,a_influenza,acute_bronchitis,body_temperature,braun_thermoscan,break_a_fever,bronchitis,chest_cold,cold_and_flu,...,walking_pneumonia,what_to_do_if_you_have_the_flu,Flu_Visit_Count,ILI_Visit_Count,Unspecified,CDC_Unweighted_ILI,ILI_lagwk1,ILI_lagwk2,ILI_lagwk3,ILI_lagwk4
0,2009,40,44,33,81,69,22,40,35,36,...,47,56,0.01338,0.01763,0.03074,5.66087,6.81522,7.61889,7.38836,6.33927
1,2009,41,51,51,77,46,24,43,35,43,...,53,58,0.0162,0.02103,0.03554,6.81522,7.61889,7.38836,6.33927,4.94349


In [4]:
# Combine the Year and week columns into one integer key
def combine_year_week(row):
    """Return ``Year * 100 + week`` for one row as an int (e.g. 2009, 40 -> 200940).

    ``row`` is any mapping-like object indexable by "Year" and "week";
    both values are converted with ``int()`` before being combined.
    """
    year = int(row["Year"])
    week = int(row["week"])
    return year * 100 + week

In [5]:
# Add an integer YearWeek key (Year * 100 + week) to every row, computed row by row
df1["YearWeek"] = df1.apply(combine_year_week, axis=1)

In [6]:
# Use YearWeek as the row index so the frame can later be split by comparing
# the index against a YearWeek cut-off; the YearWeek column itself is kept.
df1.index = df1["YearWeek"]

In [7]:
# Preview: YearWeek now appears both as the index and as a regular column
df1.head()

Unnamed: 0_level_0,Year,week,a_influenza,acute_bronchitis,body_temperature,braun_thermoscan,break_a_fever,bronchitis,chest_cold,cold_and_flu,...,what_to_do_if_you_have_the_flu,Flu_Visit_Count,ILI_Visit_Count,Unspecified,CDC_Unweighted_ILI,ILI_lagwk1,ILI_lagwk2,ILI_lagwk3,ILI_lagwk4,YearWeek
YearWeek,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
200940,2009,40,44,33,81,69,22,40,35,36,...,56,0.01338,0.01763,0.03074,5.66087,6.81522,7.61889,7.38836,6.33927,200940
200941,2009,41,51,51,77,46,24,43,35,43,...,58,0.0162,0.02103,0.03554,6.81522,7.61889,7.38836,6.33927,4.94349,200941
200942,2009,42,63,38,88,76,30,48,34,45,...,82,0.02078,0.02626,0.04458,7.61889,7.38836,6.33927,4.94349,3.80996,200942
200943,2009,43,70,51,100,88,29,54,39,55,...,54,0.02862,0.035,0.05885,7.38836,6.33927,4.94349,3.80996,3.44106,200943
200944,2009,44,53,52,96,44,30,55,44,53,...,77,0.02927,0.03515,0.05945,6.33927,4.94349,3.80996,3.44106,2.66773,200944


In [8]:
# Year, week and YearWeek only identify the row (YearWeek is already the index),
# so they are left out of the modelling frame.
model_df = df1.drop(columns=["Year", "week", "YearWeek"])
model_df.head(2)

Unnamed: 0_level_0,a_influenza,acute_bronchitis,body_temperature,braun_thermoscan,break_a_fever,bronchitis,chest_cold,cold_and_flu,cold_or_flu,cold_versus_flu,...,walking_pneumonia,what_to_do_if_you_have_the_flu,Flu_Visit_Count,ILI_Visit_Count,Unspecified,CDC_Unweighted_ILI,ILI_lagwk1,ILI_lagwk2,ILI_lagwk3,ILI_lagwk4
YearWeek,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
200940,44,33,81,69,22,40,35,36,37,30,...,47,56,0.01338,0.01763,0.03074,5.66087,6.81522,7.61889,7.38836,6.33927
200941,51,51,77,46,24,43,35,43,49,41,...,53,58,0.0162,0.02103,0.03554,6.81522,7.61889,7.38836,6.33927,4.94349


In [9]:
# Select the modelling columns: 74 predictors (CDC_Unweighted_ILI, the
# Flu_Visit_Count / ILI_Visit_Count / Unspecified columns and 70 search-term
# columns) followed by the four target columns ILI_lagwk1 .. ILI_lagwk4.
model_74important_features_df = model_df[['CDC_Unweighted_ILI',
 'type_a_influenza',
 'influenza_type_a',
 'Flu_Visit_Count',
 'ILI_Visit_Count',
 'how_to_treat_flu',
 'flu_remedy',
 'exposed_to_flu',
 'get_over_the_flu',
 'sinus',
 'cure_the_flu',
 'incubation_period_for_the_flu',
 'treat_the_flu',
 'how_to_treat_the_flu',
 'low_body',
 'influenza_treatment',
 'oscillococcinum',
 'flu_contagious_period',
 'tussionex',
 'flu_care',
 'walking_pneumonia',
 'Unspecified',
 'sinus_infections',
 'flu_germs',
 'pneumonia',
 'flu_complications',
 'symptoms_of_pneumonia',
 'braun_thermoscan',
 'treat_flu',
 'over_the_counter_flu_medicine',
 'acute_bronchitis',
 'robitussin',
 'contagious_flu',
 'thermoscan',
 'fever_flu',
 'expectorant',
 'strep_throat',
 'flu_contagious',
 'flu_incubation',
 'early_flu_symptoms',
 'flu_remedies',
 'flu_headache',
 'chest_cold',
 'cold_versus_flu',
 'cold_and_flu',
 'cure_flu',
 'tussin',
 'flu_medicine',
 'remedies_for_the_flu',
 'get_rid_of_the_flu',
 'over_the_counter_flu',
 'tessalon',
 'influenza_incubation_period',
 'ear_thermometer',
 'flu_children',
 'flu_and_fever',
 'flu_treatments',
 'break_a_fever',
 'reduce_a_fever',
 'flu_vs_cold',
 'treating_the_flu',
 'taking_temperature',
 'flu_versus_cold',
 'symptoms_of_the_flu',
 'influenza_symptoms',
 'bronchitis',
 'remedies_for_flu',
 'flu_cough',
 'high_fever',
 'cold_vs_flu',
 'signs_of_the_flu',
 'influenza_a_and_b',
 'flu_length',
 'body_temperature', 
 'ILI_lagwk1',
 'ILI_lagwk2', 
 'ILI_lagwk3',
 'ILI_lagwk4']]

In [10]:
# Preview the selected predictor and target columns
model_74important_features_df.head()

Unnamed: 0_level_0,CDC_Unweighted_ILI,type_a_influenza,influenza_type_a,Flu_Visit_Count,ILI_Visit_Count,how_to_treat_flu,flu_remedy,exposed_to_flu,get_over_the_flu,sinus,...,high_fever,cold_vs_flu,signs_of_the_flu,influenza_a_and_b,flu_length,body_temperature,ILI_lagwk1,ILI_lagwk2,ILI_lagwk3,ILI_lagwk4
YearWeek,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
200940,5.66087,78,78,0.01338,0.01763,50,31,59,28,58,...,54,48,31,40,28,81,6.81522,7.61889,7.38836,6.33927
200941,6.81522,77,77,0.0162,0.02103,32,47,78,44,60,...,61,52,35,35,44,77,7.61889,7.38836,6.33927,4.94349
200942,7.61889,94,94,0.02078,0.02626,66,58,90,40,60,...,74,59,50,21,35,88,7.38836,6.33927,4.94349,3.80996
200943,7.38836,100,100,0.02862,0.035,60,49,99,58,62,...,87,82,50,51,51,100,6.33927,4.94349,3.80996,3.44106
200944,6.33927,77,77,0.02927,0.03515,80,64,99,47,59,...,81,73,57,39,57,96,4.94349,3.80996,3.44106,2.66773


# Independent variables
        # 3 independent variables from athena EHR 
            # [(flu visit counts)/ (total patient visit counts) 
            # (ILI visit counts)/ (total patient visit counts)
            # (unspecified viral or ILI visit counts)/ (total patient visit counts)]
        # CDC historical CDC_Unweighted_ILI values: collected from 2009 to 2016 (week 40 to 20)
        # 70 Google search terms related to flu
        
      ======> 3 + 1 + 70 = 74 predictors
      
# Dependent variables 
        # 4 ILI targets (ILI_lagwk1 to ILI_lagwk4): the CDC_Unweighted_ILI value 1, 2, 3 and 4 weeks ahead

# Split df into the training dataset: rows before 2015 week 40 are used as the training dataset

In [11]:
# Training rows: every week strictly before 2015 week 40 (YearWeek < 201540)
train = model_74important_features_df.loc[model_74important_features_df.index < 201540]
# train.head()


# defining  targets/labels to  y axis
                y_train = ILI lag week 1

                yy_train = ILI lag week 2

                yyy_train = ILI lag week 3

                yyyy_train = ILI lag week 4

In [12]:
# Training target for the ILI_lagwk1 model
y_train = train.loc[:, "ILI_lagwk1"]
# y_train

In [13]:
# Training target for the ILI_lagwk2 model
yy_train = train.loc[:, "ILI_lagwk2"]

In [14]:
# Training target for the ILI_lagwk3 model
yyy_train = train.loc[:, "ILI_lagwk3"]

In [15]:
# Training target for the ILI_lagwk4 model
yyyy_train = train.loc[:, "ILI_lagwk4"]

In [16]:
# Training predictors: every selected column except the four targets
X_train = train.drop(columns=["ILI_lagwk1", "ILI_lagwk2", "ILI_lagwk3", "ILI_lagwk4"])
# X_train

# Split df into the test dataset: rows from 2015 week 40 onward are used as the test dataset

In [17]:
# Test rows: 2015 week 40 and every later week (YearWeek >= 201540)
test = model_74important_features_df.loc[model_74important_features_df.index >= 201540]
# test.head()

In [18]:
# Test target for the ILI_lagwk1 model
y_test = test.loc[:, "ILI_lagwk1"]

In [19]:
# Test target for the ILI_lagwk2 model
yy_test = test.loc[:, "ILI_lagwk2"]

In [20]:
# Test target for the ILI_lagwk3 model
yyy_test = test.loc[:, "ILI_lagwk3"]

In [21]:
# Test target for the ILI_lagwk4 model
yyyy_test = test.loc[:, "ILI_lagwk4"]

In [22]:
# Test predictors: every selected column except the four targets
X_test = test.drop(columns=["ILI_lagwk1", "ILI_lagwk2", "ILI_lagwk3", "ILI_lagwk4"])

# Reference https://shankarmsy.github.io/stories/gbrt-sklearn.html#

##https://www.youtube.com/watch?v=IXZKgIsZRm0

# building a model for y_test = test["ILI_lagwk1"]

In [23]:
# Random forest used for the ILI_lagwk1 target; random_state is fixed so the
# fit is repeatable.
rreg1 = RandomForestRegressor(
    n_estimators=300,
    max_depth=5,
    min_samples_leaf=8,
    random_state=0,
)

In [24]:
# Fit on the training rows; as the cell's last expression, the fitted estimator is displayed
rreg1.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=5,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=8, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=300, n_jobs=None,
           oob_score=False, random_state=0, verbose=0, warm_start=False)

In [25]:
# ILI_lagwk1 predictions for the test rows
y_pred = rreg1.predict(X_test)

In [26]:
# R-squared of the ILI_lagwk1 predictions on the test rows
r2_score(y_test, y_pred)

0.813202656518623

In [27]:
# Test-set mean squared error for the ILI_lagwk1 model, rounded to 3 decimals.
# Built-in round() works whether the metric comes back as a NumPy scalar or as
# a plain Python float (which has no .round method).
round(mean_squared_error(y_test, y_pred), 3)

0.069

In [28]:
# Test-set mean absolute error for the ILI_lagwk1 model, rounded to 3 decimals.
# Built-in round() works whether the metric comes back as a NumPy scalar or as
# a plain Python float (which has no .round method).
round(mean_absolute_error(y_test, y_pred), 3)

0.189

In [29]:
# Compare fit quality on the training rows against the held-out test rows
print("R-squared for ILI_lagwk1 Train: %.3f" %rreg1.score(X_train, y_train)) 
print("R-squared for ILI_lagwk1 Test : %.3f" %rreg1.score(X_test, y_test)) 

R-squared for ILI_lagwk1 Train: 0.905
R-squared for ILI_lagwk1 Test : 0.813


In [30]:
# Same predict call as y_pred above, stored under the name used for the comparison table
predictions1 = rreg1.predict(X_test) 

In [31]:
# Predict on the test rows (the same call that produced predictions1) and
# flatten the actual values to a 1-D array for the comparison table.
predictions = rreg1.predict(X_test)
y_test_unraveled = y_test.values.ravel()
# Show the YearWeek labels of the test rows
y_test.index

Int64Index([201540, 201541, 201542, 201543, 201544, 201545, 201546, 201547,
            201548, 201549, 201550, 201551, 201552, 201601, 201602, 201603,
            201604, 201605, 201606, 201607, 201608, 201609, 201610, 201611,
            201612, 201613, 201614, 201615, 201616, 201617, 201618, 201619,
            201620],
           dtype='int64', name='YearWeek')

In [32]:
# Table of predicted and actual ILI_lagwk1 values, indexed by YearWeek
pred_df = pd.DataFrame(
    {"Prediction_ILI_lagwk1": predictions1, "Actual": y_test_unraveled},
    index=y_test.index,
)
pred_df.head()

Unnamed: 0_level_0,Prediction_ILI_lagwk1,Actual
YearWeek,Unnamed: 1_level_1,Unnamed: 2_level_1
201540,1.375312,1.33029
201541,1.393985,1.41368
201542,1.489888,1.36882
201543,1.438706,1.48309
201544,1.505197,1.54494


In [33]:
# pred_df.to_csv("Prediction_ILI_lagwk1.csv")
# pred_df.head() 

# # Create the GridSearchCV model

In [34]:
# NOTE(review): disabled grid-search draft. It targets GradientBoostingRegressor,
# which is not imported in the cells above, and 'learning_rate' is not among the
# RandomForestRegressor parameters. GridSearchCV exposes best_estimator_ and
# best_params_; `best_est` below is not one of its attributes.
# from sklearn.model_selection import GridSearchCV
# param_grid = {'learning_rate':[ 0.02, 0.01, 0.1, 0.05], 
#             'max_depth':[1, 3, 5 ,7], 
#             'min_samples_leaf':[3, 5, 7 ,9], 
# #             'max_features':[0.1,0.3,1.0],
#             'n_estimators': [300, 500, 1000, 2000]
#              } 

# est = GradientBoostingRegressor()
# gs_cv = GridSearchCV(est, param_grid).fit(X_train, y_train)

# # best hyperparameter setting

# gs_cv.best_est 

# selecting important features  ["ILI_lagwk1"]

In [35]:
# Feature importances of the fitted random forest rreg1 (one value per X_train column)
importances = rreg1.feature_importances_

In [36]:
# Sanity check: the importances add up to approximately 1
rreg1.feature_importances_.sum()

0.9999999999999996

In [37]:
# Re-assign the rreg1 feature importances (same value as the earlier cell) and display the array
importances = rreg1.feature_importances_
importances

array([8.88928957e-01, 2.95585064e-02, 2.10116064e-02, 8.04763388e-03,
       3.12925744e-03, 3.06531754e-03, 3.01628169e-03, 3.12318423e-03,
       2.81179221e-03, 2.77344571e-03, 2.59061688e-03, 3.09286247e-03,
       2.51914758e-03, 2.44144611e-03, 2.06084423e-03, 1.38055901e-03,
       1.31104054e-03, 1.19766427e-03, 1.18296166e-03, 1.21432809e-03,
       9.47841098e-04, 5.96314026e-04, 8.05995804e-04, 7.90948548e-04,
       6.59730464e-04, 8.32959673e-04, 5.83504991e-04, 6.16255760e-04,
       4.91215729e-04, 4.51464818e-04, 4.31462930e-04, 4.12867358e-04,
       3.09437930e-04, 3.63975307e-04, 5.57822560e-05, 3.49934631e-04,
       3.77882046e-04, 1.53254220e-04, 0.00000000e+00, 2.78563893e-04,
       3.79841475e-04, 3.25739285e-04, 2.24484145e-04, 1.75939652e-04,
       2.58172429e-04, 2.27973606e-04, 2.37609272e-04, 2.49981719e-04,
       2.19670717e-04, 2.13927674e-04, 1.34513334e-04, 2.01128856e-04,
       2.05095535e-04, 1.88012484e-04, 1.88064865e-04, 1.77868898e-04,
      

In [38]:
# Pair each importance with its column name and order the pairs from most to
# least important.
sorted_features = list(zip(importances, X_train.columns))
sorted_features.sort(reverse=True)
sorted_features

[(0.88892895726736, 'CDC_Unweighted_ILI'),
 (0.029558506383541048, 'type_a_influenza'),
 (0.02101160637119068, 'influenza_type_a'),
 (0.008047633884935124, 'Flu_Visit_Count'),
 (0.003129257439815969, 'ILI_Visit_Count'),
 (0.00312318423256284, 'exposed_to_flu'),
 (0.00309286247376027, 'incubation_period_for_the_flu'),
 (0.003065317535351497, 'how_to_treat_flu'),
 (0.003016281687537139, 'flu_remedy'),
 (0.0028117922077526007, 'get_over_the_flu'),
 (0.002773445706693761, 'sinus'),
 (0.002590616879802216, 'cure_the_flu'),
 (0.0025191475821987916, 'treat_the_flu'),
 (0.0024414461147424624, 'how_to_treat_the_flu'),
 (0.0020608442335672056, 'low_body'),
 (0.0013805590076392457, 'influenza_treatment'),
 (0.001311040541873167, 'oscillococcinum'),
 (0.0012143280902807336, 'flu_care'),
 (0.0011976642737822813, 'flu_contagious_period'),
 (0.0011829616592513063, 'tussionex'),
 (0.0009478410976431177, 'walking_pneumonia'),
 (0.0008329596734644825, 'flu_complications'),
 (0.0008059958042667919, 'sinu

In [39]:
# Names of the features whose importance exceeds 0.001, ordered from most to
# least important.
important_columns = [
    name
    for fimportance, name in sorted(zip(importances, X_train.columns), reverse=True)
    if fimportance > 0.001
]

In [40]:
# Alias for the same list object as important_columns; displayed for inspection
important_feature_columns = important_columns
important_feature_columns

['CDC_Unweighted_ILI',
 'type_a_influenza',
 'influenza_type_a',
 'Flu_Visit_Count',
 'ILI_Visit_Count',
 'exposed_to_flu',
 'incubation_period_for_the_flu',
 'how_to_treat_flu',
 'flu_remedy',
 'get_over_the_flu',
 'sinus',
 'cure_the_flu',
 'treat_the_flu',
 'how_to_treat_the_flu',
 'low_body',
 'influenza_treatment',
 'oscillococcinum',
 'flu_care',
 'flu_contagious_period',
 'tussionex']

In [41]:
# Keep only the columns that passed the importance threshold. The "20" in the
# name reflects how many columns passed in this run.
important_features20_df = model_df.loc[:, important_feature_columns]
important_features20_df.head()

Unnamed: 0_level_0,CDC_Unweighted_ILI,type_a_influenza,influenza_type_a,Flu_Visit_Count,ILI_Visit_Count,exposed_to_flu,incubation_period_for_the_flu,how_to_treat_flu,flu_remedy,get_over_the_flu,sinus,cure_the_flu,treat_the_flu,how_to_treat_the_flu,low_body,influenza_treatment,oscillococcinum,flu_care,flu_contagious_period,tussionex
YearWeek,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
200940,5.66087,78,78,0.01338,0.01763,59,40,50,31,28,58,10,28,33,84,36,69,63,45,67
200941,6.81522,77,77,0.0162,0.02103,78,38,32,47,44,60,10,44,39,85,40,67,75,56,64
200942,7.61889,94,94,0.02078,0.02626,90,52,66,58,40,60,17,55,60,89,44,60,92,64,63
200943,7.38836,100,100,0.02862,0.035,99,71,60,49,58,62,20,53,50,99,68,95,100,98,62
200944,6.33927,77,77,0.02927,0.03515,99,70,80,64,47,59,20,68,65,97,46,84,97,100,66


In [42]:
# important_features74_df.to_csv("ML_model_wk40_to_20_with_74_important_features.csv")
# important_features74_df.head()

# building a model for y_test = test["ILI_lagwk2"]

In [43]:
# Random forest used for the ILI_lagwk2 target; same hyperparameters and seed as rreg1
rreg2 = RandomForestRegressor(
    n_estimators=300,
    max_depth=5,
    min_samples_leaf=8,
    random_state=0,
)

In [44]:
# Fit on the training rows against the ILI_lagwk2 target; the fitted estimator is displayed
rreg2.fit(X_train, yy_train) 

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=5,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=8, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=300, n_jobs=None,
           oob_score=False, random_state=0, verbose=0, warm_start=False)

In [45]:
# ILI_lagwk2 predictions for the test rows
yy_pred = rreg2.predict(X_test)

In [46]:
# R-squared of the ILI_lagwk2 predictions on the test rows
r2_score(yy_test, yy_pred)

0.581665844790752

In [47]:
# Test-set mean squared error for the ILI_lagwk2 model, rounded to 3 decimals.
# Built-in round() works whether the metric comes back as a NumPy scalar or as
# a plain Python float (which has no .round method).
round(mean_squared_error(yy_test, yy_pred), 3)

0.154

In [48]:
# Test-set mean absolute error for the ILI_lagwk2 model, rounded to 3 decimals.
# Built-in round() works whether the metric comes back as a NumPy scalar or as
# a plain Python float (which has no .round method).
round(mean_absolute_error(yy_test, yy_pred), 3)

0.317

In [49]:
# Compare fit quality on the training rows against the held-out test rows
print("R-squared for ILI_lagwk2 Train: %.3f" %rreg2.score(X_train, yy_train)) 
print("R-squared for ILI_lagwk2 Test: %.3f" %rreg2.score(X_test, yy_test)) 

R-squared for ILI_lagwk2 Train: 0.858
R-squared for ILI_lagwk2 Test: 0.582


In [50]:
# Same predict call as yy_pred above, stored under the name used for the comparison table
predictions2 = rreg2.predict(X_test) 

In [51]:
# Flatten the actual ILI_lagwk2 test values to a 1-D array, then show the
# YearWeek labels of the test rows.
yy_test_unraveled = yy_test.values.ravel()
yy_test.index

Int64Index([201540, 201541, 201542, 201543, 201544, 201545, 201546, 201547,
            201548, 201549, 201550, 201551, 201552, 201601, 201602, 201603,
            201604, 201605, 201606, 201607, 201608, 201609, 201610, 201611,
            201612, 201613, 201614, 201615, 201616, 201617, 201618, 201619,
            201620],
           dtype='int64', name='YearWeek')

In [52]:
# Table of predicted and actual ILI_lagwk2 values, indexed by YearWeek
pred_df = pd.DataFrame(
    {"Prediction_ILI_lagwk2": predictions2, "Actual": yy_test_unraveled},
    index=yy_test.index,
)
pred_df.head()

Unnamed: 0_level_0,Prediction_ILI_lagwk2,Actual
YearWeek,Unnamed: 1_level_1,Unnamed: 2_level_1
201540,1.45841,1.41368
201541,1.492654,1.36882
201542,1.562681,1.48309
201543,1.538793,1.54494
201544,1.559991,1.62532


In [53]:
# # Create the GridSearchCV model
# NOTE(review): disabled grid-search draft. It targets GradientBoostingRegressor,
# which is not imported in the cells above, and 'learning_rate' is not among the
# RandomForestRegressor parameters. GridSearchCV exposes best_estimator_ and
# best_params_; `best_est` below is not one of its attributes.

# from sklearn.model_selection import GridSearchCV
# param_grid = {'learning_rate':[ 0.02, 0.01, 0.1, 0.05], 
#             'max_depth':[1, 3, 5 ,7], 
#             'min_samples_leaf':[3, 5, 7 ,9], 
# #             'max_features':[0.1,0.3,1.0],
#             'n_estimators': [300, 500, 1000, 2000]
#              } 

# est = GradientBoostingRegressor()
# gs_cv = GridSearchCV(est, param_grid).fit(X_train, y_train)

# # best hyperparameter setting

# gs_cv.best_est 

# building a model for y_test = test["ILI_lagwk3"]

In [54]:
# Random forest used for the ILI_lagwk3 target; same hyperparameters and seed as rreg1
rreg3 = RandomForestRegressor(
    n_estimators=300,
    max_depth=5,
    min_samples_leaf=8,
    random_state=0,
)

In [55]:
# Fit on the training rows against the ILI_lagwk3 target; the fitted estimator is displayed
rreg3.fit(X_train, yyy_train) 

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=5,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=8, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=300, n_jobs=None,
           oob_score=False, random_state=0, verbose=0, warm_start=False)

In [56]:
# ILI_lagwk3 predictions for the test rows
yyy_pred = rreg3.predict(X_test)

In [57]:
# R-squared of the ILI_lagwk3 predictions on the test rows
r2_score(yyy_test, yyy_pred)

0.42744923889462494

In [58]:
# Test-set mean squared error for the ILI_lagwk3 model, rounded to 3 decimals.
# Built-in round() works whether the metric comes back as a NumPy scalar or as
# a plain Python float (which has no .round method).
round(mean_squared_error(yyy_test, yyy_pred), 3)

0.216

In [59]:
# Test-set mean absolute error for the ILI_lagwk3 model, rounded to 3 decimals.
# Built-in round() works whether the metric comes back as a NumPy scalar or as
# a plain Python float (which has no .round method).
round(mean_absolute_error(yyy_test, yyy_pred), 3)

0.35

In [60]:
# Train vs. test R-squared for the ILI_lagwk3 model (rreg3); note the printed
# labels do not name the target.
print("R-squared for Train: %.3f" %rreg3.score(X_train, yyy_train)) 
print("R-squared for Test: %.3f" %rreg3.score(X_test, yyy_test)) 

R-squared for Train: 0.821
R-squared for Test: 0.427


In [61]:
# Same predict call as yyy_pred above, stored under the name used for the comparison table
predictions3 = rreg3.predict(X_test) 

In [62]:
# Predict on the test rows (the same call that produced predictions3) and
# flatten the actual values to a 1-D array for the comparison table.
predictions = rreg3.predict(X_test)
yyy_test_unraveled = yyy_test.values.ravel()
# Show the YearWeek labels of the test rows
yyy_test.index

Int64Index([201540, 201541, 201542, 201543, 201544, 201545, 201546, 201547,
            201548, 201549, 201550, 201551, 201552, 201601, 201602, 201603,
            201604, 201605, 201606, 201607, 201608, 201609, 201610, 201611,
            201612, 201613, 201614, 201615, 201616, 201617, 201618, 201619,
            201620],
           dtype='int64', name='YearWeek')

In [63]:
# Table of predicted and actual ILI_lagwk3 values, indexed by YearWeek
pred_df = pd.DataFrame(
    {"Prediction_ILI_lagwk3": predictions3, "Actual": yyy_test_unraveled},
    index=yyy_test.index,
)
pred_df.head()

Unnamed: 0_level_0,Prediction_ILI_lagwk3,Actual
YearWeek,Unnamed: 1_level_1,Unnamed: 2_level_1
201540,1.571427,1.36882
201541,1.66377,1.48309
201542,1.709253,1.54494
201543,1.681436,1.62532
201544,1.659081,1.91565


# building a model for y_test = test["ILI_lagwk4"]

In [64]:
# Random forest used for the ILI_lagwk4 target; same hyperparameters and seed as rreg1
rreg4 = RandomForestRegressor(
    n_estimators=300,
    max_depth=5,
    min_samples_leaf=8,
    random_state=0,
)

In [65]:
# Fit on the training rows against the ILI_lagwk4 target; the fitted estimator is displayed
rreg4.fit(X_train, yyyy_train) 

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=5,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=8, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=300, n_jobs=None,
           oob_score=False, random_state=0, verbose=0, warm_start=False)

In [66]:
# ILI_lagwk4 predictions for the test rows
yyyy_pred = rreg4.predict(X_test)

In [67]:
# R-squared of the ILI_lagwk4 predictions on the test rows
r2_score(yyyy_test, yyyy_pred)

0.2510017249961435

In [68]:
# Test-set mean squared error for the ILI_lagwk4 model, rounded to 3 decimals.
# Built-in round() works whether the metric comes back as a NumPy scalar or as
# a plain Python float (which has no .round method).
round(mean_squared_error(yyyy_test, yyyy_pred), 3)

0.292

In [69]:
# Test-set mean absolute error for the ILI_lagwk4 model, rounded to 3 decimals.
# Built-in round() works whether the metric comes back as a NumPy scalar or as
# a plain Python float (which has no .round method).
round(mean_absolute_error(yyyy_test, yyyy_pred), 3)

0.4

In [70]:
# Train vs. test R-squared for the ILI_lagwk4 model (rreg4); note the printed
# labels do not name the target.
print("R-squared for Train: %.3f" %rreg4.score(X_train, yyyy_train)) 
print("R-squared for Test: %.3f" %rreg4.score(X_test, yyyy_test)) 

R-squared for Train: 0.818
R-squared for Test: 0.251


In [71]:
# Predict ILI_lagwk4 on the test rows (same call as yyyy_pred) and flatten the
# actual values to a 1-D array for the comparison table.
predictions4 = rreg4.predict(X_test)
yyyy_test_unraveled = yyyy_test.values.ravel()
# Show the YearWeek labels of the test rows
yyyy_test.index

Int64Index([201540, 201541, 201542, 201543, 201544, 201545, 201546, 201547,
            201548, 201549, 201550, 201551, 201552, 201601, 201602, 201603,
            201604, 201605, 201606, 201607, 201608, 201609, 201610, 201611,
            201612, 201613, 201614, 201615, 201616, 201617, 201618, 201619,
            201620],
           dtype='int64', name='YearWeek')

In [72]:
# Table of predicted and actual ILI_lagwk4 values, indexed by YearWeek.
# The prediction column is labelled with the target it belongs to (ILI_lagwk4),
# since predictions4 comes from the week-4 model rreg4.
pred_df = pd.DataFrame(
    {"Prediction_ILI_lagwk4": predictions4, "Actual": yyyy_test_unraveled},
    index=yyyy_test.index,
)
pred_df.head()

Unnamed: 0_level_0,Prediction_ILI_lagwk1,Actual
YearWeek,Unnamed: 1_level_1,Unnamed: 2_level_1
201540,1.605964,1.48309
201541,1.829258,1.54494
201542,1.840123,1.62532
201543,1.952775,1.91565
201544,1.804449,1.74368
