In [1]:
%matplotlib inline
    
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor 

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [3]:
# Final dataframe for model building: read the CSV, keep only complete rows,
# and discard the two columns that are not used downstream.
# Chained, non-mutating calls replace `inplace=True` / `del`, so each step
# yields a new frame instead of modifying one in place.
df1 = (
    pd.read_csv("ML_model_wk40_to_20.csv")
    .dropna(how="any")
    .drop(columns=["ILI_weeks", "Unnamed: 0"])
)
df1.head(2)

Unnamed: 0,Year,week,a_influenza,acute_bronchitis,body_temperature,braun_thermoscan,break_a_fever,bronchitis,chest_cold,cold_and_flu,...,walking_pneumonia,what_to_do_if_you_have_the_flu,Flu_Visit_Count,ILI_Visit_Count,Unspecified,CDC_Unweighted_ILI,ILI_lagwk1,ILI_lagwk2,ILI_lagwk3,ILI_lagwk4
0,2009,40,44,33,81,69,22,40,35,36,...,47,56,0.01338,0.01763,0.03074,5.66087,6.81522,7.61889,7.38836,6.33927
1,2009,41,51,51,77,46,24,43,35,43,...,53,58,0.0162,0.02103,0.03554,6.81522,7.61889,7.38836,6.33927,4.94349


In [4]:
# Combine the year and week columns into a single integer key.
def combine_year_week(row):
    """Return ``Year * 100 + week`` for one row, with both fields cast to int.

    ``row`` is anything indexable by the keys "Year" and "week"
    (for example a DataFrame row passed in by ``DataFrame.apply``).
    """
    year = int(row["Year"])
    week = int(row["week"])
    return year * 100 + week

In [5]:
# Build the Year * 100 + week key column-wise instead of calling a Python
# function once per row: same arithmetic, but vectorised, it reads only the
# two columns involved, and it also works when the frame has no rows.
df1["YearWeek"] = df1["Year"].astype("int64") * 100 + df1["week"].astype("int64")

In [6]:
# Use YearWeek as the row index while keeping it available as a column.
df1 = df1.set_index("YearWeek", drop=False)

In [7]:
# Modelling frame: df1 without the Year / week / YearWeek columns
# (YearWeek remains available as the index).
model_df = df1.drop(columns=["Year", "week", "YearWeek"])
model_df.head(2)

Unnamed: 0_level_0,a_influenza,acute_bronchitis,body_temperature,braun_thermoscan,break_a_fever,bronchitis,chest_cold,cold_and_flu,cold_or_flu,cold_versus_flu,...,walking_pneumonia,what_to_do_if_you_have_the_flu,Flu_Visit_Count,ILI_Visit_Count,Unspecified,CDC_Unweighted_ILI,ILI_lagwk1,ILI_lagwk2,ILI_lagwk3,ILI_lagwk4
YearWeek,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
200940,44,33,81,69,22,40,35,36,37,30,...,47,56,0.01338,0.01763,0.03074,5.66087,6.81522,7.61889,7.38836,6.33927
200941,51,51,77,46,24,43,35,43,49,41,...,53,58,0.0162,0.02103,0.03554,6.81522,7.61889,7.38836,6.33927,4.94349


In [8]:
# Keep the 74 selected predictor columns followed by the four ILI_lagwk
# target columns, in this exact order.
model_74important_features_df = model_df.loc[:, [
    "CDC_Unweighted_ILI",
    "type_a_influenza", "influenza_type_a",
    "Flu_Visit_Count", "ILI_Visit_Count",
    "how_to_treat_flu", "flu_remedy", "exposed_to_flu", "get_over_the_flu",
    "sinus", "cure_the_flu", "incubation_period_for_the_flu", "treat_the_flu",
    "how_to_treat_the_flu", "low_body", "influenza_treatment", "oscillococcinum",
    "flu_contagious_period", "tussionex", "flu_care", "walking_pneumonia",
    "Unspecified",
    "sinus_infections", "flu_germs", "pneumonia", "flu_complications",
    "symptoms_of_pneumonia", "braun_thermoscan", "treat_flu",
    "over_the_counter_flu_medicine", "acute_bronchitis", "robitussin",
    "contagious_flu", "thermoscan", "fever_flu", "expectorant", "strep_throat",
    "flu_contagious", "flu_incubation", "early_flu_symptoms", "flu_remedies",
    "flu_headache", "chest_cold", "cold_versus_flu", "cold_and_flu", "cure_flu",
    "tussin", "flu_medicine", "remedies_for_the_flu", "get_rid_of_the_flu",
    "over_the_counter_flu", "tessalon", "influenza_incubation_period",
    "ear_thermometer", "flu_children", "flu_and_fever", "flu_treatments",
    "break_a_fever", "reduce_a_fever", "flu_vs_cold", "treating_the_flu",
    "taking_temperature", "flu_versus_cold", "symptoms_of_the_flu",
    "influenza_symptoms", "bronchitis", "remedies_for_flu", "flu_cough",
    "high_fever", "cold_vs_flu", "signs_of_the_flu", "influenza_a_and_b",
    "flu_length", "body_temperature",
    # target columns
    "ILI_lagwk1", "ILI_lagwk2", "ILI_lagwk3", "ILI_lagwk4",
]]

# Independent variables
        # 3 independent variables from athena EHR 
            # [(flu visit counts)/ (total patient visit counts) 
            # (ILI visit counts)/ (total patient visit counts)
            # (unspecified viral or ILI visit counts)/ (total patient visit counts)]
        # CDC historical CDC_Unweighted_ILI values: collected from 2009 to 2016 (week 40 to 20)
        # 129 google search terms related to flu
        
      ======> 3 + 1 + 129
      
# Dependent variables 
        # 4 ILI weeks offset by 1 week

# Split df into the training dataset: rows before week 40 of 2015 are used for training

In [9]:
# Training rows: every YearWeek index value strictly below 201540.
train = model_74important_features_df.loc[
    model_74important_features_df.index < 201540
]
# train.head()


# defining  targets/labels to  y axis
                y_train = ILI lag week 1

                yy_train = ILI lag week 2

                yyy_train = ILI lag week 3

                yyyy_train = ILI lag week 4

In [10]:
# Training target: the ILI_lagwk1 column.
y_train = train.loc[:, "ILI_lagwk1"]

In [11]:
# Training target: the ILI_lagwk2 column.
yy_train = train.loc[:, "ILI_lagwk2"]

In [12]:
# Training target: the ILI_lagwk3 column.
yyy_train = train.loc[:, "ILI_lagwk3"]

In [13]:
# Training target: the ILI_lagwk4 column.
yyyy_train = train.loc[:, "ILI_lagwk4"]

In [14]:
# Training features: all columns of `train` except the four ILI_lagwk targets.
X_train = train.drop(
    columns=["ILI_lagwk1", "ILI_lagwk2", "ILI_lagwk3", "ILI_lagwk4"]
)

# Split df into the test dataset: rows from week 40 of 2015 onward are used for testing

In [15]:
# Test rows: every YearWeek index value of 201540 or later.
test = model_74important_features_df.loc[
    model_74important_features_df.index >= 201540
]
# test.head()

In [16]:
# Test target: the ILI_lagwk1 column.
y_test = test.loc[:, "ILI_lagwk1"]

In [17]:
# Test target: the ILI_lagwk2 column.
yy_test = test.loc[:, "ILI_lagwk2"]

In [18]:
# Test target: the ILI_lagwk3 column.
yyy_test = test.loc[:, "ILI_lagwk3"]

In [19]:
# Test target: the ILI_lagwk4 column.
yyyy_test = test.loc[:, "ILI_lagwk4"]

In [20]:
# Test features: all columns of `test` except the four ILI_lagwk targets.
X_test = test.drop(
    columns=["ILI_lagwk1", "ILI_lagwk2", "ILI_lagwk3", "ILI_lagwk4"]
)

# building a model for y_test = test["ILI_lagwk1"]

In [21]:
# Gradient boosting regressor for the ILI_lagwk1 target; the seed is fixed so
# repeated fits give the same model.
gbrt1 = GradientBoostingRegressor(
    random_state=0,
    n_estimators=100,
    max_depth=5,
    min_samples_leaf=9,
)

In [22]:
# Fit on the training features against the ILI_lagwk1 target.
gbrt1.fit(X=X_train, y=y_train)

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=5, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=9,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=100, n_iter_no_change=None, presort='auto',
             random_state=0, subsample=1.0, tol=0.0001,
             validation_fraction=0.1, verbose=0, warm_start=False)

In [23]:
# ILI_lagwk1 predictions for the test features.
y_pred = gbrt1.predict(X=X_test)

In [24]:
# R-squared of the ILI_lagwk1 predictions on the test set.
r2_score(y_true=y_test, y_pred=y_pred)

0.7792685347472983

In [25]:
# Test-set mean squared error for ILI_lagwk1, rounded to three decimals.
# Built-in round() is used instead of the `.round()` method so the cell works
# whether the metric is returned as a NumPy scalar or as a plain Python float.
round(mean_squared_error(y_test, y_pred), 3)

0.082

In [26]:
# Test-set mean absolute error for ILI_lagwk1, rounded to three decimals.
# Built-in round() is used instead of the `.round()` method so the cell works
# whether the metric is returned as a NumPy scalar or as a plain Python float.
round(mean_absolute_error(y_test, y_pred), 3)

0.214

In [27]:
# Report R-squared on the training and test splits for the ILI_lagwk1 model.
print("R-squared for ILI_lagwk1 Train: %.3f" % (gbrt1.score(X=X_train, y=y_train),))
print("R-squared for ILI_lagwk1 Test : %.3f" % (gbrt1.score(X=X_test, y=y_test),))

R-squared for ILI_lagwk1 Train: 0.998
R-squared for ILI_lagwk1 Test : 0.779


In [28]:
# Test-set predictions of the ILI_lagwk1 model, stored for the comparison table.
predictions1 = gbrt1.predict(X=X_test)

In [29]:
# Flatten the actual ILI_lagwk1 test values to a 1-D array, then show the
# YearWeek index of the test rows.
y_test_unraveled = np.ravel(y_test.values)
y_test.index

Int64Index([201540, 201541, 201542, 201543, 201544, 201545, 201546, 201547,
            201548, 201549, 201550, 201551, 201552, 201601, 201602, 201603,
            201604, 201605, 201606, 201607, 201608, 201609, 201610, 201611,
            201612, 201613, 201614, 201615, 201616, 201617, 201618, 201619,
            201620],
           dtype='int64', name='YearWeek')

In [30]:
# Side-by-side table of predicted vs. actual ILI_lagwk1, indexed by YearWeek.
pred_df = pd.DataFrame(
    {"Prediction_ILI_lagwk1": predictions1, "Actual": y_test_unraveled},
    index=y_test.index,
)
pred_df.head()

Unnamed: 0_level_0,Prediction_ILI_lagwk1,Actual
YearWeek,Unnamed: 1_level_1,Unnamed: 2_level_1
201540,1.407798,1.33029
201541,1.408961,1.41368
201542,1.504146,1.36882
201543,1.439901,1.48309
201544,1.499151,1.54494


# # Create the GridSearchCV model

In [31]:
# Disabled hyperparameter search over GradientBoostingRegressor settings.
# from sklearn.model_selection import GridSearchCV
# param_grid = {'learning_rate':[ 0.02, 0.01, 0.1, 0.05],
#             'max_depth':[1, 3, 5 ,7],
#             'min_samples_leaf':[3, 5, 7 ,9],
#             'max_features':[0.1, 0.3, 1.0],
#             'n_estimators': [300, 500, 1000, 2000]
#              }

# est = GradientBoostingRegressor()
# gs_cv = GridSearchCV(est, param_grid).fit(X_train, y_train)

# # best hyperparameter setting (the fitted attribute is `best_estimator_`;
# # `best_params_` holds just the chosen parameter values)

# gs_cv.best_estimator_

# building a model for y_test = test["ILI_lagwk2"]

In [32]:
# Gradient boosting regressor for the ILI_lagwk2 target; the seed is fixed so
# repeated fits give the same model.
gbrt2 = GradientBoostingRegressor(
    random_state=0,
    n_estimators=100,
    max_depth=5,
    min_samples_leaf=9,
)

In [33]:
# Fit on the training features against the ILI_lagwk2 target.
gbrt2.fit(X=X_train, y=yy_train)

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=5, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=9,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=100, n_iter_no_change=None, presort='auto',
             random_state=0, subsample=1.0, tol=0.0001,
             validation_fraction=0.1, verbose=0, warm_start=False)

In [34]:
# ILI_lagwk2 predictions for the test features.
yy_pred = gbrt2.predict(X=X_test)

In [35]:
# R-squared of the ILI_lagwk2 predictions on the test set.
r2_score(y_true=yy_test, y_pred=yy_pred)

0.1848584790036324

In [36]:
# Test-set mean squared error for ILI_lagwk2, rounded to three decimals.
# Built-in round() is used instead of the `.round()` method so the cell works
# whether the metric is returned as a NumPy scalar or as a plain Python float.
round(mean_squared_error(yy_test, yy_pred), 3)

0.299

In [37]:
# Test-set mean absolute error for ILI_lagwk2, rounded to three decimals.
# Built-in round() is used instead of the `.round()` method so the cell works
# whether the metric is returned as a NumPy scalar or as a plain Python float.
round(mean_absolute_error(yy_test, yy_pred), 3)

0.373

In [38]:
# Report R-squared on the training and test splits for the ILI_lagwk2 model.
print("R-squared for ILI_lagwk2 Train: %.2f" % (gbrt2.score(X=X_train, y=yy_train),))
print("R-squared for ILI_lagwk2 Test: %.2f" % (gbrt2.score(X=X_test, y=yy_test),))

R-squared for ILI_lagwk2 Train: 1.00
R-squared for ILI_lagwk2 Test: 0.18


In [39]:
# Test-set predictions of the ILI_lagwk2 model, stored for the comparison table.
predictions2 = gbrt2.predict(X=X_test)

In [40]:
# Flatten the actual ILI_lagwk2 test values to a 1-D array, then show the
# YearWeek index of the test rows.
yy_test_unraveled = np.ravel(yy_test.values)
yy_test.index

Int64Index([201540, 201541, 201542, 201543, 201544, 201545, 201546, 201547,
            201548, 201549, 201550, 201551, 201552, 201601, 201602, 201603,
            201604, 201605, 201606, 201607, 201608, 201609, 201610, 201611,
            201612, 201613, 201614, 201615, 201616, 201617, 201618, 201619,
            201620],
           dtype='int64', name='YearWeek')

In [41]:
# Side-by-side table of predicted vs. actual ILI_lagwk2, indexed by YearWeek.
pred_df = pd.DataFrame(
    {"Prediction_ILI_lagwk2": predictions2, "Actual": yy_test_unraveled},
    index=yy_test.index,
)
pred_df.head()

Unnamed: 0_level_0,Prediction_ILI_lagwk2,Actual
YearWeek,Unnamed: 1_level_1,Unnamed: 2_level_1
201540,1.411119,1.41368
201541,1.462309,1.36882
201542,1.443631,1.48309
201543,1.708527,1.54494
201544,1.617081,1.62532


# building a model for y_test = test["ILI_lagwk3"]

In [42]:
# Gradient boosting regressor for the ILI_lagwk3 target; the seed is fixed so
# repeated fits give the same model.
gbrt3 = GradientBoostingRegressor(
    random_state=0,
    n_estimators=100,
    max_depth=5,
    min_samples_leaf=9,
)

In [43]:
# Fit on the training features against the ILI_lagwk3 target.
gbrt3.fit(X=X_train, y=yyy_train)

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=5, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=9,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=100, n_iter_no_change=None, presort='auto',
             random_state=0, subsample=1.0, tol=0.0001,
             validation_fraction=0.1, verbose=0, warm_start=False)

In [44]:
# ILI_lagwk3 predictions for the test features.
yyy_pred = gbrt3.predict(X=X_test)

In [45]:
# R-squared of the ILI_lagwk3 predictions on the test set.
r2_score(y_true=yyy_test, y_pred=yyy_pred)

0.12774584105243558

In [46]:
# Test-set mean squared error for ILI_lagwk3, rounded to three decimals.
# Built-in round() is used instead of the `.round()` method so the cell works
# whether the metric is returned as a NumPy scalar or as a plain Python float.
round(mean_squared_error(yyy_test, yyy_pred), 3)

0.33

In [47]:
# Test-set mean absolute error for ILI_lagwk3, rounded to three decimals.
# Built-in round() is used instead of the `.round()` method so the cell works
# whether the metric is returned as a NumPy scalar or as a plain Python float.
round(mean_absolute_error(yyy_test, yyy_pred), 3)

0.451

In [48]:
# Report R-squared on the training and test splits for the gbrt3 (ILI_lagwk3) model.
print("R-squared for Train: %.3f" % (gbrt3.score(X=X_train, y=yyy_train),))
print("R-squared for Test: %.3f" % (gbrt3.score(X=X_test, y=yyy_test),))

R-squared for Train: 0.994
R-squared for Test: 0.128


In [49]:
# Test-set predictions of the ILI_lagwk3 model, stored for the comparison table.
predictions3 = gbrt3.predict(X=X_test)

In [50]:
# Flatten the actual ILI_lagwk3 test values and show the YearWeek test index.
# `predictions` used to be a second, identical gbrt3.predict(X_test) call; it
# now reuses the array already stored in predictions3. The name is kept bound
# in case cells outside this section read it.
predictions = predictions3
yyy_test_unraveled = yyy_test.values.ravel()
yyy_test.index

Int64Index([201540, 201541, 201542, 201543, 201544, 201545, 201546, 201547,
            201548, 201549, 201550, 201551, 201552, 201601, 201602, 201603,
            201604, 201605, 201606, 201607, 201608, 201609, 201610, 201611,
            201612, 201613, 201614, 201615, 201616, 201617, 201618, 201619,
            201620],
           dtype='int64', name='YearWeek')

In [51]:
# Side-by-side table of predicted vs. actual ILI_lagwk3, indexed by YearWeek.
pred_df = pd.DataFrame(
    {"Prediction_ILI_lagwk3": predictions3, "Actual": yyy_test_unraveled},
    index=yyy_test.index,
)
pred_df.head()

Unnamed: 0_level_0,Prediction_ILI_lagwk3,Actual
YearWeek,Unnamed: 1_level_1,Unnamed: 2_level_1
201540,1.57523,1.36882
201541,1.766843,1.48309
201542,1.737605,1.54494
201543,1.803373,1.62532
201544,1.602201,1.91565


# building a model for y_test = test["ILI_lagwk4"]

In [52]:
# Gradient boosting regressor for the ILI_lagwk4 target; the seed is fixed so
# repeated fits give the same model.
gbrt4 = GradientBoostingRegressor(
    random_state=0,
    n_estimators=100,
    max_depth=5,
    min_samples_leaf=9,
)

In [53]:
# Fit on the training features against the ILI_lagwk4 target.
gbrt4.fit(X=X_train, y=yyyy_train)

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=5, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=9,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=100, n_iter_no_change=None, presort='auto',
             random_state=0, subsample=1.0, tol=0.0001,
             validation_fraction=0.1, verbose=0, warm_start=False)

In [54]:
# ILI_lagwk4 predictions for the test features.
yyyy_pred = gbrt4.predict(X=X_test)

In [55]:
# R-squared of the ILI_lagwk4 predictions on the test set.
r2_score(y_true=yyyy_test, y_pred=yyyy_pred)

0.12484854509579268

In [56]:
# Test-set mean squared error for ILI_lagwk4, rounded to three decimals.
# Built-in round() is used instead of the `.round()` method so the cell works
# whether the metric is returned as a NumPy scalar or as a plain Python float.
round(mean_squared_error(yyyy_test, yyyy_pred), 3)

0.342

In [57]:
# Test-set mean absolute error for ILI_lagwk4, rounded to three decimals.
# Built-in round() is used instead of the `.round()` method so the cell works
# whether the metric is returned as a NumPy scalar or as a plain Python float.
round(mean_absolute_error(yyyy_test, yyyy_pred), 3)

0.447

In [58]:
# Report R-squared on the training and test splits for the gbrt4 (ILI_lagwk4) model.
print("R-squared for Train: %.3f" % (gbrt4.score(X=X_train, y=yyyy_train),))
print("R-squared for Test: %.3f" % (gbrt4.score(X=X_test, y=yyyy_test),))

R-squared for Train: 0.996
R-squared for Test: 0.125


In [59]:
# Predict ILI_lagwk4 for the test features, flatten the actual test values to
# a 1-D array, then show the YearWeek index of the test rows.
predictions4 = gbrt4.predict(X=X_test)
yyyy_test_unraveled = np.ravel(yyyy_test.values)
yyyy_test.index

Int64Index([201540, 201541, 201542, 201543, 201544, 201545, 201546, 201547,
            201548, 201549, 201550, 201551, 201552, 201601, 201602, 201603,
            201604, 201605, 201606, 201607, 201608, 201609, 201610, 201611,
            201612, 201613, 201614, 201615, 201616, 201617, 201618, 201619,
            201620],
           dtype='int64', name='YearWeek')

In [60]:
# Side-by-side table of predicted vs. actual ILI_lagwk4, indexed by YearWeek.
# The prediction column was previously labelled "Prediction_ILI_lagwk1"
# (copy-paste slip); these values come from gbrt4, the ILI_lagwk4 model.
pred_df = pd.DataFrame(
    {"Prediction_ILI_lagwk4": predictions4, "Actual": yyyy_test_unraveled},
    index=yyyy_test.index,
)
pred_df.head()

Unnamed: 0_level_0,Prediction_ILI_lagwk1,Actual
YearWeek,Unnamed: 1_level_1,Unnamed: 2_level_1
201540,1.647414,1.48309
201541,1.567865,1.54494
201542,1.745921,1.62532
201543,1.736107,1.91565
201544,1.884959,1.74368
