In [1]:
%matplotlib inline

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

In [2]:
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestRegressor

import seaborn as sns
np.random.seed(0)

In [3]:
df1 = pd.read_csv("ML_model_wk40_to_20.csv")
df1.dropna(how='any', inplace=True)
del df1['ILI_weeks']
del df1['Unnamed: 0']
#del df1['week']
df1.head()

Unnamed: 0,Year,week,a_influenza,acute_bronchitis,body_temperature,braun_thermoscan,break_a_fever,bronchitis,chest_cold,cold_and_flu,...,walking_pneumonia,what_to_do_if_you_have_the_flu,Flu_Visit_Count,ILI_Visit_Count,Unspecified,CDC_Unweighted_ILI,ILI_lagwk1,ILI_lagwk2,ILI_lagwk3,ILI_lagwk4
0,2009,40,44,33,81,69,22,40,35,36,...,47,56,0.01338,0.01763,0.03074,5.66087,6.81522,7.61889,7.38836,6.33927
1,2009,41,51,51,77,46,24,43,35,43,...,53,58,0.0162,0.02103,0.03554,6.81522,7.61889,7.38836,6.33927,4.94349
2,2009,42,63,38,88,76,30,48,34,45,...,57,82,0.02078,0.02626,0.04458,7.61889,7.38836,6.33927,4.94349,3.80996
3,2009,43,70,51,100,88,29,54,39,55,...,64,54,0.02862,0.035,0.05885,7.38836,6.33927,4.94349,3.80996,3.44106
4,2009,44,53,52,96,44,30,55,44,53,...,59,77,0.02927,0.03515,0.05945,6.33927,4.94349,3.80996,3.44106,2.66773


In [4]:
def combine_year_week(row):
    return int(row["Year"]) * 100 + int(row["week"])

In [5]:
df1["YearWeek"] = df1.apply(combine_year_week, axis=1)

In [6]:
df1.head()

Unnamed: 0,Year,week,a_influenza,acute_bronchitis,body_temperature,braun_thermoscan,break_a_fever,bronchitis,chest_cold,cold_and_flu,...,what_to_do_if_you_have_the_flu,Flu_Visit_Count,ILI_Visit_Count,Unspecified,CDC_Unweighted_ILI,ILI_lagwk1,ILI_lagwk2,ILI_lagwk3,ILI_lagwk4,YearWeek
0,2009,40,44,33,81,69,22,40,35,36,...,56,0.01338,0.01763,0.03074,5.66087,6.81522,7.61889,7.38836,6.33927,200940
1,2009,41,51,51,77,46,24,43,35,43,...,58,0.0162,0.02103,0.03554,6.81522,7.61889,7.38836,6.33927,4.94349,200941
2,2009,42,63,38,88,76,30,48,34,45,...,82,0.02078,0.02626,0.04458,7.61889,7.38836,6.33927,4.94349,3.80996,200942
3,2009,43,70,51,100,88,29,54,39,55,...,54,0.02862,0.035,0.05885,7.38836,6.33927,4.94349,3.80996,3.44106,200943
4,2009,44,53,52,96,44,30,55,44,53,...,77,0.02927,0.03515,0.05945,6.33927,4.94349,3.80996,3.44106,2.66773,200944


In [8]:
df1.index = df1["YearWeek"]
df_ili_wklag1 = df1.drop(["Year", "week", "YearWeek", "ILI_lagwk2", "ILI_lagwk3", "ILI_lagwk4"], axis=1)


In [10]:
df_ili_wklag1.head(2)

Unnamed: 0_level_0,a_influenza,acute_bronchitis,body_temperature,braun_thermoscan,break_a_fever,bronchitis,chest_cold,cold_and_flu,cold_or_flu,cold_versus_flu,...,tussionex,type_a_influenza,upper_respiratory,walking_pneumonia,what_to_do_if_you_have_the_flu,Flu_Visit_Count,ILI_Visit_Count,Unspecified,CDC_Unweighted_ILI,ILI_lagwk1
YearWeek,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
200940,44,33,81,69,22,40,35,36,37,30,...,67,78,44,47,56,0.01338,0.01763,0.03074,5.66087,6.81522
200941,51,51,77,46,24,43,35,43,49,41,...,64,77,45,53,58,0.0162,0.02103,0.03554,6.81522,7.61889


In [11]:
train = df_ili_wklag1[df_ili_wklag1.index < 201540]
test = df_ili_wklag1[df_ili_wklag1.index >= 201540]

In [14]:
X_train = train.drop(["ILI_lagwk1"], axis=1)
y_train = train[["ILI_lagwk1"]]
X_test = test.drop(["ILI_lagwk1"], axis=1)
y_test = test[["ILI_lagwk1"]]

In [103]:
# independent variables
# 3 independent variables from athena EHR 
# [(flu visit counts)/ (total patient visit counts) 
# (ILI visit counts)/ (total patient visit counts)
# (unspecified viral or ILI visit counts)/ (total patient visit counts)]
# CDC historical CDC_Unweighted_ILI values: collected from 2009 to 2016 (week 40 to 20)
# 129 google search terms related to flu
# 3 + 1 + 129


In [104]:
# dependent varables are "ILI_lagwk1", "ILI_lagwk2", "ILI_lagwk3", "ILI_lagwk4"

In [15]:
X.shape, y.shape

NameError: name 'X' is not defined

In [16]:
X[X.index > 2015]

NameError: name 'X' is not defined

### training data will flu season from 2009 to 2015
# testing data will flu season 2016
# 231 rows × 137 columns


# https://www.youtube.com/watch?v=YkVscKsV_qk

# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html#sklearn.ensemble.RandomForestRegressor

In [109]:
rreg = RandomForestRegressor(max_depth = 5, n_estimators = 70,  random_state = 0)                           

In [110]:
rreg.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=5,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=70, n_jobs=None,
           oob_score=False, random_state=0, verbose=0, warm_start=False)

In [111]:
# RF predictors variable importance
# store most important variavles under importances
importances = rreg.feature_importances_

In [112]:
sorted(zip(importances, data.columns), reverse=True)

[(0.37968963288237667, 'CDC_Unweighted_ILI'),
 (0.07334341357530601, 'over_the_counter_flu_medicine'),
 (0.04539595360487909, 'oscillococcinum'),
 (0.035010928206534106, 'robitussin'),
 (0.03260489328895029, 'how_to_treat_the_flu'),
 (0.028677991238524506, 'flu_remedies'),
 (0.02163696517914514, 'sinus'),
 (0.015762760568277488, 'type_a_influenza'),
 (0.014851596796613381, 'sinus_infections'),
 (0.01246760862667083, 'flu_remedy'),
 (0.011445228005060617, 'ILI_Visit_Count'),
 (0.0110120135866399, 'flu_duration'),
 (0.010502406880963442, 'get_over_the_flu'),
 (0.010390908509161051, 'over_the_counter_flu'),
 (0.010345044831638653, 'low_body'),
 (0.009433460309020331, 'cure_the_flu'),
 (0.009366430010194339, 'remedies_for_the_flu'),
 (0.009331564001302316, 'how_to_get_rid_of_the_flu'),
 (0.009125643907659893, 'get_rid_of_the_flu'),
 (0.009023819957423513, 'flu_germs'),
 (0.00896850961421362, 'influenza_incubation_period'),
 (0.008836659895766024, 'flu_or_cold'),
 (0.008679819195927698, 'fl

In [113]:
# sort imprtant varibles and sotre them under indices
# indices = np.argsort(importances)[::-1]
# for f in range(X.shape[1]):
#     print("%d. feature %d (%f)" % (f +1, indices[f], importances[indices[f]])) 

In [114]:
# rreg.feature_importances_.sum()

In [115]:
# np.argsort(rreg.feature_importances_)

In [116]:
# plt.plot(np.cumsum(rreg.feature_importances_));

In [117]:
# predict the values of y
y_pred = rreg.predict(X_test)

In [118]:
# the prdicted ILI values
# y_pred

In [119]:
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [120]:
# y_test are the hold out values i.e. the actual values of ILIs, y_pred are the predicted values of ILIs
# mean square error (8133187.704816607) is the magnitude of difference between our actual CDC ILI values 
# and predicted ILI values. 
mean_squared_error(y_test, y_pred)

0.20635616207931512

In [121]:
mean_absolute_error(y_test, y_pred)

0.3386944004539239

In [122]:
from sklearn.metrics import r2_score

In [123]:
r2_score(y_test, y_pred)

0.4572913182038027

In [124]:
# Print the r2 score for the test data
# YOUR CODE HERE
print(f"R-squared for Train: {rreg.score(X_train, y_train)}")
print(f"R-squared for Test:  {rreg.score(X_test, y_test)}")

R-squared for Train: 0.9368715319721965
R-squared for Test:  0.45183076575713066


In [125]:
print("R-squared for Train: %.2f" %rreg.score(X_train, y_train)) 
print("R-squared for Test: %.2f" %rreg.score(X_test, y_test)) 

R-squared for Train: 0.94
R-squared for Test: 0.45


In [126]:
# Make predictions using the X_test and y_test data
# Print at least 10 predictions vs their actual labels
# predictions = rreg.predict(X_test)
print(f"First 10 Predictions: {predictions[:10]}")
print(f"First 10 Actual labels: {y_test[:10]}")
# print(f"First 10 Actual labels: {y_test[:10].tolist()}")

First 10 Predictions: [[1.47887916 1.50776662 1.55105597 1.61128666]
 [1.53123439 1.57714633 1.67157038 1.77383233]
 [1.57588361 1.61959707 1.7243171  1.84332032]
 [1.63388446 1.72188146 1.87626873 2.0904892 ]
 [1.60517787 1.6773264  1.81157473 2.01097882]
 [1.68638792 1.77322706 1.87109842 2.00074252]
 [1.76682722 1.85782855 1.96333147 2.18290095]
 [1.83337956 1.95021562 2.06982546 2.28784327]
 [1.9281391  2.08376538 2.2783654  2.63803293]
 [2.05040072 2.20319152 2.35928218 2.645258  ]]
First 10 Actual labels:           ILI_lagwk1  ILI_lagwk2  ILI_lagwk3  ILI_lagwk4
YearWeek                                                
201540       1.33029     1.41368     1.36882     1.48309
201541       1.41368     1.36882     1.48309     1.54494
201542       1.36882     1.48309     1.54494     1.62532
201543       1.48309     1.54494     1.62532     1.91565
201544       1.54494     1.62532     1.91565     1.74368
201545       1.62532     1.91565     1.74368     1.83277
201546       1.91565     1.

In [127]:
# pred_df = pd.DataFrame({"Prediction": predictions, "Actual": y_test}).reset_index(drop=True)
# pred_df.head()

# https://towardsdatascience.com/hyperparameter-tuning-the-random-forest-in-python-using-scikit-learn-28d2aa77dd74

In [39]:
# from sklearn.model_selection import GridSearchCV
# # Create the parameter grid based on the results of random search 
# param_grid = {
#     'bootstrap': [True],
#     'max_depth': [80, 90, 100, 110],
#     'max_features': [2, 3],
#     'min_samples_leaf': [3, 4, 5],
#     'min_samples_split': [8, 10, 12],
#     'n_estimators': [100, 200, 300, 1000]
# }
# # Create a based model
# rf = RandomForestRegressor()
# # Instantiate the grid search model
# grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
#                           cv = 3, n_jobs = -1, verbose = 2)