In [9]:
%matplotlib inline
    
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

In [10]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor 

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [11]:
# Final frame for model building, assembled as one chained expression:
# read the CSV, discard every row containing a missing value, then remove
# the two columns that are not needed downstream.
df1 = (
    pd.read_csv("ML_model_wk40_to_20.csv")
    .dropna(how='any')
    .drop(columns=['ILI_weeks', 'Unnamed: 0'])
)
#del df1['week']
df1.head(2)

Unnamed: 0,Year,week,a_influenza,acute_bronchitis,body_temperature,braun_thermoscan,break_a_fever,bronchitis,chest_cold,cold_and_flu,...,walking_pneumonia,what_to_do_if_you_have_the_flu,Flu_Visit_Count,ILI_Visit_Count,Unspecified,CDC_Unweighted_ILI,ILI_lagwk1,ILI_lagwk2,ILI_lagwk3,ILI_lagwk4
0,2009,40,44,33,81,69,22,40,35,36,...,47,56,0.01338,0.01763,0.03074,5.66087,6.81522,7.61889,7.38836,6.33927
1,2009,41,51,51,77,46,24,43,35,43,...,53,58,0.0162,0.02103,0.03554,6.81522,7.61889,7.38836,6.33927,4.94349


In [12]:
#  combining year and week column 
def combine_year_week(row):
    """Fuse a row's year and week into a single sortable integer key (YYYYWW).

    Args:
        row: mapping-like object (e.g. a DataFrame row) exposing "Year" and
            "week" entries that can be converted with ``int``.

    Returns:
        int: ``year * 100 + week`` -- for example 2009 and 40 give 200940.
    """
    year, week = int(row["Year"]), int(row["week"])
    return 100 * year + week

In [13]:
# Row-wise apply: each row is handed to combine_year_week, yielding one YYYYWW key per row.
df1["YearWeek"] = df1.apply(combine_year_week, axis="columns")

In [14]:
# Use YearWeek as the row index; drop=False keeps it available as a regular column too.
df1 = df1.set_index("YearWeek", drop=False)

In [15]:
# Preview the first rows; the frame is now indexed by YearWeek.
df1.head()

Unnamed: 0_level_0,Year,week,a_influenza,acute_bronchitis,body_temperature,braun_thermoscan,break_a_fever,bronchitis,chest_cold,cold_and_flu,...,what_to_do_if_you_have_the_flu,Flu_Visit_Count,ILI_Visit_Count,Unspecified,CDC_Unweighted_ILI,ILI_lagwk1,ILI_lagwk2,ILI_lagwk3,ILI_lagwk4,YearWeek
YearWeek,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
200940,2009,40,44,33,81,69,22,40,35,36,...,56,0.01338,0.01763,0.03074,5.66087,6.81522,7.61889,7.38836,6.33927,200940
200941,2009,41,51,51,77,46,24,43,35,43,...,58,0.0162,0.02103,0.03554,6.81522,7.61889,7.38836,6.33927,4.94349,200941
200942,2009,42,63,38,88,76,30,48,34,45,...,82,0.02078,0.02626,0.04458,7.61889,7.38836,6.33927,4.94349,3.80996,200942
200943,2009,43,70,51,100,88,29,54,39,55,...,54,0.02862,0.035,0.05885,7.38836,6.33927,4.94349,3.80996,3.44106,200943
200944,2009,44,53,52,96,44,30,55,44,53,...,77,0.02927,0.03515,0.05945,6.33927,4.94349,3.80996,3.44106,2.66773,200944


In [16]:
# Modelling frame: everything except the calendar bookkeeping columns
# (the YearWeek key is still carried by the index).
model_df = df1.drop(columns=["Year", "week", "YearWeek"])
model_df.head(2)

Unnamed: 0_level_0,a_influenza,acute_bronchitis,body_temperature,braun_thermoscan,break_a_fever,bronchitis,chest_cold,cold_and_flu,cold_or_flu,cold_versus_flu,...,walking_pneumonia,what_to_do_if_you_have_the_flu,Flu_Visit_Count,ILI_Visit_Count,Unspecified,CDC_Unweighted_ILI,ILI_lagwk1,ILI_lagwk2,ILI_lagwk3,ILI_lagwk4
YearWeek,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
200940,44,33,81,69,22,40,35,36,37,30,...,47,56,0.01338,0.01763,0.03074,5.66087,6.81522,7.61889,7.38836,6.33927
200941,51,51,77,46,24,43,35,43,49,41,...,53,58,0.0162,0.02103,0.03554,6.81522,7.61889,7.38836,6.33927,4.94349


In [17]:
# Hand-picked subset: 20 predictor columns followed by the four ILI_lagwk*
# target columns. Column order is kept exactly as listed.
model_20important_features_df = model_df.loc[:, [
    'CDC_Unweighted_ILI', 'type_a_influenza', 'influenza_type_a',
    'Flu_Visit_Count', 'ILI_Visit_Count',
    'how_to_treat_flu', 'flu_remedy', 'exposed_to_flu', 'get_over_the_flu',
    'sinus', 'cure_the_flu', 'incubation_period_for_the_flu',
    'treat_the_flu', 'how_to_treat_the_flu', 'low_body',
    'influenza_treatment', 'oscillococcinum', 'flu_contagious_period',
    'tussionex', 'flu_care',
    # targets
    'ILI_lagwk1', 'ILI_lagwk2', 'ILI_lagwk3', 'ILI_lagwk4',
]]

In [18]:
# Preview the hand-picked feature subset.
model_20important_features_df.head()

Unnamed: 0_level_0,CDC_Unweighted_ILI,type_a_influenza,influenza_type_a,Flu_Visit_Count,ILI_Visit_Count,how_to_treat_flu,flu_remedy,exposed_to_flu,get_over_the_flu,sinus,...,low_body,influenza_treatment,oscillococcinum,flu_contagious_period,tussionex,flu_care,ILI_lagwk1,ILI_lagwk2,ILI_lagwk3,ILI_lagwk4
YearWeek,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
200940,5.66087,78,78,0.01338,0.01763,50,31,59,28,58,...,84,36,69,45,67,63,6.81522,7.61889,7.38836,6.33927
200941,6.81522,77,77,0.0162,0.02103,32,47,78,44,60,...,85,40,67,56,64,75,7.61889,7.38836,6.33927,4.94349
200942,7.61889,94,94,0.02078,0.02626,66,58,90,40,60,...,89,44,60,64,63,92,7.38836,6.33927,4.94349,3.80996
200943,7.38836,100,100,0.02862,0.035,60,49,99,58,62,...,99,68,95,98,62,100,6.33927,4.94349,3.80996,3.44106
200944,6.33927,77,77,0.02927,0.03515,80,64,99,47,59,...,97,46,84,100,66,97,4.94349,3.80996,3.44106,2.66773


# Independent variables
        # 3 independent variables from athena EHR 
            # [(flu visit counts)/ (total patient visit counts) 
            # (ILI visit counts)/ (total patient visit counts)
            # (unspecified viral or ILI visit counts)/ (total patient visit counts)]
        # CDC historical CDC_Unweighted_ILI values: collected from 2009 to 2016 (week 40 to 20)
        # 74 google search terms related to flu
        
      ======> 3 + 1 + 74 = 78 independent variables
      
# Dependent variables 
        # 4 ILI targets (ILI_lagwk1 to ILI_lagwk4): the CDC_Unweighted_ILI value 1, 2, 3 and 4 weeks after the current row

# Split df into the training dataset: rows before week 40 of 2015 (YearWeek < 201540) are used for training

In [19]:
# Training rows: every YearWeek strictly before 201540.
train = model_df.loc[model_df.index < 201540]
# train.head()


# defining  targets/labels to  y axis
                y_train = ILI lag week 1

                yy_train = ILI lag week 2

                yyy_train = ILI lag week 3

                yyyy_train = ILI lag week 4

In [None]:
# Training target for model 1: the ILI_lagwk1 column.
y_train = train["ILI_lagwk1"]
# y_train

In [None]:
# Training target for model 2: the ILI_lagwk2 column.
yy_train = train["ILI_lagwk2"]

In [None]:
# Training target for model 3: the ILI_lagwk3 column.
yyy_train = train["ILI_lagwk3"]

In [None]:
# Training target for model 4: the ILI_lagwk4 column.
yyyy_train =train["ILI_lagwk4"]

In [None]:
# Training predictors: all columns of `train` other than the four targets.
X_train = train.drop(columns=["ILI_lagwk1", "ILI_lagwk2", "ILI_lagwk3", "ILI_lagwk4"])
# X_train

# Split df into the test dataset: rows from week 40 of 2015 onwards (YearWeek >= 201540) are used for testing

In [None]:
# Hold-out rows: YearWeek 201540 and later.
test = model_df.loc[model_df.index >= 201540]
# test.head()

In [None]:
# Hold-out target for model 1: the ILI_lagwk1 column.
y_test = test["ILI_lagwk1"]

In [None]:
# Hold-out target for model 2: the ILI_lagwk2 column.
yy_test = test["ILI_lagwk2"]

In [None]:
# Hold-out target for model 3: the ILI_lagwk3 column.
yyy_test = test["ILI_lagwk3"]

In [None]:
# Hold-out target for model 4: the ILI_lagwk4 column.
yyyy_test = test["ILI_lagwk4"]

In [None]:
# Hold-out predictors: all columns of `test` other than the four targets.
X_test = test.drop(columns=["ILI_lagwk1", "ILI_lagwk2", "ILI_lagwk3", "ILI_lagwk4"])

# Reference https://shankarmsy.github.io/stories/gbrt-sklearn.html#

##https://www.youtube.com/watch?v=IXZKgIsZRm0

# building a model for y_test = test["ILI_lagwk1"]

In [None]:
# Random forest for the ILI_lagwk1 target; random_state is fixed so reruns build the same forest.
rreg1 = RandomForestRegressor(
    n_estimators=300,
    max_depth=5,
    min_samples_leaf=8,
    random_state=0,
)

In [None]:
# Fit the ILI_lagwk1 forest on the training split.
rreg1.fit(X_train, y_train)

In [None]:
# Predicted ILI_lagwk1 for every hold-out row.
y_pred = rreg1.predict(X_test)

In [None]:
# Hold-out R^2 for ILI_lagwk1 (displayed as the cell output).
r2_score(y_test, y_pred)

In [None]:
# Hold-out mean absolute error for ILI_lagwk1, in the same units as the target.
mean_absolute_error(y_test, y_pred)

In [None]:
# R^2 on the training split versus the hold-out split; a large gap points to over-fitting.
print("R-squared for ILI_lagwk1 Train: %.2f" % (rreg1.score(X_train, y_train),))
print("R-squared for ILI_lagwk1 Test : %.2f" % (rreg1.score(X_test, y_test),))

In [None]:
# Hold-out predictions for ILI_lagwk1; this is the array stored in pred_df further down.
# NOTE(review): same call as the earlier `y_pred = rreg1.predict(X_test)`.
predictions1 = rreg1.predict(X_test) 

In [None]:
# predict the values of y
# NOTE(review): `predictions` repeats the predict call already stored in `y_pred` /
# `predictions1`; the visible active cells build pred_df from `predictions1`, not from this.
predictions = rreg1.predict(X_test)
# Actual ILI_lagwk1 values as a flat NumPy array (drops the YearWeek index).
y_test_unraveled = y_test.values.ravel()
# Cell output: the YearWeek labels of the hold-out rows.
y_test.index

In [None]:
# Predicted vs. actual ILI_lagwk1, one row per hold-out week, labelled by YearWeek.
pred_df = pd.DataFrame(
    {"Prediction_ILI_lagwk1": predictions1, "Actual": y_test_unraveled},
    index=y_test.index,
)
pred_df.head()

In [None]:
# pred_df.to_csv("Prediction_ILI_lagwk1.csv")
# pred_df.head() 

# # Create the GridSearchCV model

In [None]:
# NOTE(review): disabled hyper-parameter search, kept for reference only. As written it
# tunes GradientBoostingRegressor (no import for it in the visible import cells), and
# 'learning_rate' is not a RandomForestRegressor parameter -- adapt the grid before reuse.
# from sklearn.model_selection import GridSearchCV
# param_grid = {'learning_rate':[ 0.02, 0.01, 0.1, 0.05], 
#             'max_depth':[1, 3, 5 ,7], 
#             'min_samples_leaf':[3, 5, 7 ,9], 
# #             'max_features':[0.1,0.3,1.0],
#             'n_estimators': [300, 500, 1000, 2000]
#              } 

# est = GradientBoostingRegressor()
# gs_cv = GridSearchCV(est, param_grid).fit(X_train, y_train)

# # best hyperparameter setting (GridSearchCV exposes best_estimator_ / best_params_)

# gs_cv.best_estimator_

# selecting important features  ["ILI_lagwk1"]

In [None]:
# Random-forest feature importances of the ILI_lagwk1 model (rreg1),
# one value per X_train column, in column order.
importances = rreg1.feature_importances_

In [None]:
# Sanity check: total of all feature importances (displayed as the cell output).
rreg1.feature_importances_.sum()

In [None]:
# Store the per-feature importances of rreg1 and display the raw array.
# NOTE(review): repeats the assignment made two cells earlier.
importances = rreg1.feature_importances_
importances

In [None]:
# (importance, column name) pairs ordered from most to least important.
sorted_features = sorted(zip(importances, X_train.columns), reverse=True)
sorted_features

In [None]:
# Names of the features whose importance exceeds 0.001, most important first.
important_columns = [
    col_name
    for weight, col_name in sorted(zip(importances, X_train.columns), reverse=True)
    if weight > 0.001
]

In [None]:
# Alias for the filtered column names (same list object, not a copy); displayed as the cell output.
important_feature_columns = important_columns
important_feature_columns

In [None]:
# Restrict the modelling frame to the columns that passed the importance cut-off.
# NOTE(review): the "20" in the name is not enforced -- the number of columns is whatever
# the 0.001 threshold yields, and the ILI_lagwk* targets are not included here.
important_features20_df = model_df[important_feature_columns]
important_features20_df.head()

In [None]:
# important_features74_df.to_csv("ML_model_wk40_to_20_with_74_important_features.csv")
# important_features74_df.head()

# defining important feature X train and X test dataset

In [None]:
# X_trainim = important_features_df[important_features_df.index < 201540]
# # trainim
# # .head(2)

In [None]:
# X_testim = important_features_df[important_features_df.index >= 201540]
# # testim
# # .head()

In [None]:
# gbrtim_ili_lag1 = GradientBoostingRegressor(n_estimators = 500, max_depth = 5) # number of sequential trees to be modeled

In [None]:
# gbrtim_ili_lag1.fit(X_trainim, y_train) 

In [None]:
# from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [None]:
# r2_score(y_test, y_pred)

In [None]:
# mean_absolute_error(y_test, y_pred)

In [None]:
# # GBR with important features for ili_lag1 week 1
# print("R-squared for Train gbrtim_ili_lag1: %.2f" %gbrtim_ili_lag1.score(X_trainim, y_train)) 
# print("R-squared for Test gbrtim_ili_lag1: %.2f" %gbrtim_ili_lag1.score(X_testim, y_test)) 

In [None]:
# # predict the values of y with important feature extraction for ili week 1
# predictions = gbrtim_ili_lag1.predict(X_testim)
# y_test_unraveled = y_test.values.ravel()
# y_test.index

In [None]:
# Make predictions using the X_test and y_test data
# Print at least 10 predictions vs their actual labels
# predictions = gbrtim_ili_lag1.predict(X_testim)
# print(f"First 10 Predictions: {predictions[:10]}")
# print(f"First 10 Actual labels: {y_test_unraveled[:10]}")

In [None]:
# # Print predictions vs their actual labels
# pred_df = pd.DataFrame({"Prediction": predictions, "Actual": y_test_unraveled}).reset_index(drop=True)
# pred_df.index = y_test.index
# pred_df.head()

# building a model for y_test = test["ILI_lagwk2"]

In [None]:
# Random forest for the ILI_lagwk2 target; same hyper-parameters and seed as the week-1 model.
rreg2 = RandomForestRegressor(
    n_estimators=300,
    max_depth=5,
    min_samples_leaf=8,
    random_state=0,
)

In [None]:
# Fit the ILI_lagwk2 forest on the training split.
rreg2.fit(X_train, yy_train) 

In [None]:
# Predicted ILI_lagwk2 for every hold-out row.
yy_pred = rreg2.predict(X_test)

In [None]:
# Hold-out R^2 for ILI_lagwk2 (displayed as the cell output).
r2_score(yy_test, yy_pred)

In [None]:
# Hold-out mean absolute error for ILI_lagwk2, in the same units as the target.
mean_absolute_error(yy_test, yy_pred)

In [None]:
# R^2 on the training split versus the hold-out split for the week-2 model.
print("R-squared for ILI_lagwk2 Train: %.2f" % (rreg2.score(X_train, yy_train),))
print("R-squared for ILI_lagwk2 Test: %.2f" % (rreg2.score(X_test, yy_test),))

In [None]:
# Hold-out predictions for ILI_lagwk2; this is the array stored in pred_df further down.
# NOTE(review): same call as the earlier `yy_pred = rreg2.predict(X_test)`.
predictions2 = rreg2.predict(X_test) 

In [None]:
# Actual ILI_lagwk2 values as a flat NumPy array (no prediction happens in this cell).
yy_test_unraveled = yy_test.values.ravel()
# Cell output: the YearWeek labels of the hold-out rows.
yy_test.index

In [None]:
# Predicted vs. actual ILI_lagwk2, one row per hold-out week, labelled by YearWeek.
pred_df = pd.DataFrame(
    {"Prediction_ILI_lagwk2": predictions2, "Actual": yy_test_unraveled},
    index=yy_test.index,
)
pred_df.head()

In [None]:
# # Create the GridSearchCV model
# NOTE(review): disabled copy of the week-1 search. It still fits on `y_train`
# (the ILI_lagwk1 target) although this section models ILI_lagwk2, and it tunes
# GradientBoostingRegressor rather than the RandomForestRegressor used here.

# from sklearn.model_selection import GridSearchCV
# param_grid = {'learning_rate':[ 0.02, 0.01, 0.1, 0.05], 
#             'max_depth':[1, 3, 5 ,7], 
#             'min_samples_leaf':[3, 5, 7 ,9], 
# #             'max_features':[0.1,0.3,1.0],
#             'n_estimators': [300, 500, 1000, 2000]
#              } 

# est = GradientBoostingRegressor()
# gs_cv = GridSearchCV(est, param_grid).fit(X_train, y_train)

# # best hyperparameter setting (GridSearchCV exposes best_estimator_ / best_params_)

# gs_cv.best_estimator_

# selecting important features  ["ILI_lagwk2"]

# defining important feature X train and X test dataset

# building a model for y_test = test["ILI_lagwk3"]

In [None]:
# Random forest for the ILI_lagwk3 target; same hyper-parameters and seed as the other models.
rreg3 = RandomForestRegressor(
    n_estimators=300,
    max_depth=5,
    min_samples_leaf=8,
    random_state=0,
)

In [None]:
# Fit the ILI_lagwk3 forest on the training split.
rreg3.fit(X_train, yyy_train) 

In [None]:
# Predicted ILI_lagwk3 for every hold-out row.
yyy_pred = rreg3.predict(X_test)

In [None]:
# Hold-out R^2 for ILI_lagwk3 (displayed as the cell output).
r2_score(yyy_test, yyy_pred)

In [None]:
# Hold-out mean absolute error for ILI_lagwk3, in the same units as the target.
mean_absolute_error(yyy_test, yyy_pred)

In [None]:
# R^2 on the training split versus the hold-out split for the week-3 model.
# The target name is part of the label (as in the ILI_lagwk1 / ILI_lagwk2 cells) so the
# printed lines can be told apart from the otherwise identical week-4 output.
print("R-squared for ILI_lagwk3 Train: %.2f" %rreg3.score(X_train, yyy_train))
print("R-squared for ILI_lagwk3 Test: %.2f" %rreg3.score(X_test, yyy_test))

In [None]:
# Hold-out predictions for ILI_lagwk3; this is the array stored in pred_df further down.
# NOTE(review): same call as the earlier `yyy_pred = rreg3.predict(X_test)`.
predictions3 = rreg3.predict(X_test) 

In [None]:
# predict the values of y
# NOTE(review): `predictions` repeats the predict call already stored in `yyy_pred` /
# `predictions3`; the visible active cells build pred_df from `predictions3`, not from this.
predictions = rreg3.predict(X_test)
# Actual ILI_lagwk3 values as a flat NumPy array (drops the YearWeek index).
yyy_test_unraveled = yyy_test.values.ravel()
# Cell output: the YearWeek labels of the hold-out rows.
yyy_test.index

In [None]:
# Predicted vs. actual ILI_lagwk3, one row per hold-out week, labelled by YearWeek.
pred_df = pd.DataFrame(
    {"Prediction_ILI_lagwk3": predictions3, "Actual": yyy_test_unraveled},
    index=yyy_test.index,
)
pred_df.head()

# building a model for y_test = test["ILI_lagwk4"]

In [None]:
# Random forest for the ILI_lagwk4 target; same hyper-parameters and seed as the other models.
rreg4 = RandomForestRegressor(
    n_estimators=300,
    max_depth=5,
    min_samples_leaf=8,
    random_state=0,
)

In [None]:
# Fit the ILI_lagwk4 forest on the training split.
rreg4.fit(X_train, yyyy_train) 

In [None]:
# Predicted ILI_lagwk4 for every hold-out row.
yyyy_pred = rreg4.predict(X_test)

In [None]:
# Hold-out R^2 for ILI_lagwk4 (displayed as the cell output).
r2_score(yyyy_test, yyyy_pred)

In [None]:
# Hold-out mean absolute error for ILI_lagwk4, in the same units as the target.
mean_absolute_error(yyyy_test, yyyy_pred)

In [None]:
# R^2 on the training split versus the hold-out split for the week-4 model.
# The target name is part of the label (as in the ILI_lagwk1 / ILI_lagwk2 cells) so the
# printed lines can be told apart from the otherwise identical week-3 output.
print("R-squared for ILI_lagwk4 Train: %.2f" %rreg4.score(X_train, yyyy_train))
print("R-squared for ILI_lagwk4 Test: %.2f" %rreg4.score(X_test, yyyy_test))

In [None]:
# predict the values of y
# NOTE(review): same call as the earlier `yyyy_pred = rreg4.predict(X_test)`.
predictions4 = rreg4.predict(X_test)
# Actual ILI_lagwk4 values as a flat NumPy array (drops the YearWeek index).
yyyy_test_unraveled = yyyy_test.values.ravel()
# Cell output: the YearWeek labels of the hold-out rows.
yyyy_test.index

In [None]:
# Predicted vs. actual ILI_lagwk4, one row per hold-out week, labelled by YearWeek.
# The prediction column is named after the week-4 target; it was previously labelled
# "Prediction_ILI_lagwk1" (copy-paste slip), which misidentified the model it came from.
pred_df = pd.DataFrame(
    {"Prediction_ILI_lagwk4": predictions4, "Actual": yyyy_test_unraveled},
    index=yyyy_test.index,
)
pred_df.head()