### DengAI-Classifier
Notebook to train a classifier which indicates whether or not the total cases is in a particular class of severity.

This notebook tested whether or not a classifier could be created based on the test data to indicate the severity of an outbreak. The classifier was then applied to the holdout data to assign a severity. Severity has a high correlation to total_cases, so severity was used as an input feature to training several models including a DNN and BinomialRegression model. It should be noted that the training data has very few records which fall into the 'severe outbreak' category so SMOTE was employed to try to balance the dataset. Since this disrupts the time series, the models were trained using a random selection of data.

#### Data and Environment Prep

In [None]:
#Run the utils notebook
%run DengAI-Utils.ipynb


#### Classifier without Using SMOTE

In [71]:
from sklearn.model_selection import train_test_split
#Load data
df_i,df_l,df_h=load_all_data()
df_h=load_holdout_data()
df_i=interp(df_i)
df_h=interp(df_h)

#select each city
df_i_sj=df_i[df_i['city']=='sj']
df_h_sj=df_h[df_h['city']=='sj']
df_l_sj=df_i_sj['outbreak_severity']

df_i_sj=df_i_sj.drop(columns=['city','week_start_date','outbreak_severity','year','total_cases','avg_total_cases_2_wks','cum_total_cases_2_wks'])
df_h_sj=df_h_sj.drop(columns=['city','week_start_date','outbreak_severity','year','total_cases','avg_total_cases_2_wks','cum_total_cases_2_wks'])

df_i_iq=df_i[df_i['city']=='iq']
df_h_iq=df_h[df_h['city']=='iq']
df_l_iq=df_i_iq['outbreak_severity']
df_i_iq.drop(columns=['city','week_start_date','outbreak_severity','year','total_cases','avg_total_cases_2_wks','cum_total_cases_2_wks'],inplace=True)
df_h_iq.drop(columns=['city','week_start_date','outbreak_severity','year','total_cases','avg_total_cases_2_wks','cum_total_cases_2_wks'],inplace=True)

#randomly segment in to train, test, valid
df_x_train_sj,df_x_test_sj,df_y_train_sj,df_y_test_sj=train_test_split(df_i_sj,df_l_sj,test_size=0.2,random_state=42) #split to get the validation file
#df_x_train_sj,df_x_test_sj,df_y_train_sj,df_y_test_sj=train_test_split(df_x_train_sj,df_y_train_sj,test_size=0.25,random_state=2) #further split to get train and test

df_x_train_iq,df_x_test_iq,df_y_train_iq,df_y_test_iq=train_test_split(df_i_iq,df_l_iq,test_size=0.2,random_state=42) #split to get the validation file
#df_x_train_iq,df_x_test_iq,df_y_train_iq,df_y_test_iq=train_test_split(df_x_train_iq,df_y_train_iq,test_size=0.25,random_state=0) #further split to get train and test



In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report

rfc_sj=RandomForestClassifier(n_estimators=100,class_weight={0:18,1:88,2:96,3:99,4:99},max_depth=1)
results_sj=rfc_sj.fit(df_x_train_sj,df_y_train_sj)

y_pred_sj=rfc_sj.predict(df_x_test_sj)
score_sj=rfc_sj.score(df_x_test_sj,df_y_test_sj)

rfc_iq=RandomForestClassifier(n_estimators=100,class_weight={0:10,1:92,2:99},max_depth=5)
results_iq=rfc_iq.fit(df_x_train_iq,df_y_train_iq)

y_pred_iq=rfc_iq.predict(df_x_test_iq)
score_iq=rfc_iq.score(df_x_test_iq,df_y_test_iq)

print('Accuracy score for sj: ' + str(score_sj))
print(confusion_matrix(df_y_test_sj,y_pred_sj))
#print(classification_report(df_y_test_sj,y_pred_sj))
print('-'*50)
print('Accuracy score for iq: ' + str(score_iq))
print(confusion_matrix(df_y_test_iq,y_pred_iq))
#print(classification_report(df_y_test_iq,y_pred_iq))

In [49]:
#predict holdout values
#predict the values for the holdout set
h_pred_sj=rfc_sj.predict(df_h_sj)
h_pred_iq=rfc_iq.predict(df_h_iq)

#### Classifier Using SMOTE

In [75]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report

#Load data
df_i,df_l,df_h=load_all_data()
df_h=load_holdout_data()
df_i=interp(df_i)
df_h=interp(df_h)

#select each city
df_i_sj=df_i[df_i['city']=='sj']
df_h_sj=df_h[df_h['city']=='sj']
df_l_sj=df_i_sj['outbreak_severity']

df_i_sj=df_i_sj.drop(columns=['city','week_start_date','outbreak_severity','year','total_cases','avg_total_cases_2_wks','cum_total_cases_2_wks'])
df_h_sj=df_h_sj.drop(columns=['city','week_start_date','outbreak_severity','year','total_cases','avg_total_cases_2_wks','cum_total_cases_2_wks'])

df_i_iq=df_i[df_i['city']=='iq']
df_h_iq=df_h[df_h['city']=='iq']
df_l_iq=df_i_iq['outbreak_severity']
df_i_iq.drop(columns=['city','week_start_date','outbreak_severity','year','total_cases','avg_total_cases_2_wks','cum_total_cases_2_wks'],inplace=True)
df_h_iq.drop(columns=['city','week_start_date','outbreak_severity','year','total_cases','avg_total_cases_2_wks','cum_total_cases_2_wks'],inplace=True)

#create smote records for each city dataset
smt_sj=SMOTE(random_state=42)
df_i_sj,df_l_sj=smt_sj.fit_sample(df_i_sj,df_l_sj)

smt_iq=SMOTE(random_state=42)
df_i_iq,df_l_iq=smt_iq.fit_sample(df_i_iq,df_l_iq)

#Create test and train splits
df_x_train_sj,df_x_test_sj,df_y_train_sj,df_y_test_sj=train_test_split(df_i_sj,df_l_sj,test_size=0.2,random_state=42) #split to get the validation file
df_x_train_iq,df_x_test_iq,df_y_train_iq,df_y_test_iq=train_test_split(df_i_iq,df_l_iq,test_size=0.2,random_state=42) #split to get the validation file

#Create random forest classifier and run the model
rfc_sj=RandomForestClassifier(n_estimators=50,class_weight={0:18,1:88,2:96,3:99,4:99})
results_sj=rfc_sj.fit(df_x_train_sj,df_y_train_sj)

y_pred_sj=rfc_sj.predict(df_x_test_sj)
score_sj=rfc_sj.score(df_x_test_sj,df_y_test_sj)

rfc_iq=RandomForestClassifier(n_estimators=50,class_weight={0:10,1:92,2:99})
results_iq=rfc_iq.fit(df_x_train_iq,df_y_train_iq)

y_pred_iq=rfc_iq.predict(df_x_test_iq)
score_iq=rfc_iq.score(df_x_test_iq,df_y_test_iq)

print('Accuracy score for sj: ' + str(score_sj))
print(confusion_matrix(df_y_test_sj,y_pred_sj))
print('-'*50)
print(classification_report(df_y_test_sj,y_pred_sj))
print('-'*50)
print('-'*50)
print('Accuracy score for iq: ' + str(score_iq))
print('-'*50)
print(confusion_matrix(df_y_test_iq,y_pred_iq))
print(classification_report(df_y_test_iq,y_pred_iq))


Accuracy score for sj: 0.9919678714859438
[[165   3   0   0   0]
 [  0 141   1   0   0]
 [  2   0 146   0   0]
 [  0   0   0 143   0]
 [  0   0   0   0 146]]
--------------------------------------------------
              precision    recall  f1-score   support

           0       0.99      0.98      0.99       168
           1       0.98      0.99      0.99       142
           2       0.99      0.99      0.99       148
           3       1.00      1.00      1.00       143
           4       1.00      1.00      1.00       146

    accuracy                           0.99       747
   macro avg       0.99      0.99      0.99       747
weighted avg       0.99      0.99      0.99       747

--------------------------------------------------
--------------------------------------------------
Accuracy score for iq: 0.9925650557620818
--------------------------------------------------
[[82  2  0]
 [ 0 99  0]
 [ 0  0 86]]
              precision    recall  f1-score   support

           0   

##### Predict Class Values for Holdout Set

In [76]:
#predict the values for the holdout set
h_pred_sj=rfc_sj.predict(df_h_sj)
h_pred_iq=rfc_iq.predict(df_h_iq)

#create a single series of results
h_pred_combined=np.hstack((h_pred_sj,h_pred_iq))

#save the estimated class labels to the holdout file
df_h['outbreak_severity']=h_pred_combined
df_h.to_csv('holdout_all.csv')


##### Test MAE Just Using Severity

In [77]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from statistics import mean

#Load data
df_i,df_l,df_h=load_all_data()
df_h=load_holdout_data()
df_i=interp(df_i)
df_h=interp(df_h)

#select each city
df_i_sj=df_i[df_i['city']=='sj']
df_h_sj=df_h[df_h['city']=='sj']
df_l_sj=df_l[df_l['city']=='sj']

#Create test and train splits
df_x_train_sj,df_x_test_sj,df_y_train_sj,df_y_test_sj=train_test_split(df_i_sj,df_l_sj,test_size=0.2,random_state=42) #split to get the validation file

rfr=RandomForestRegressor(n_estimators=100,max_depth=6,criterion='mae',random_state=42)
#score=(cross_val_score(rfr,df_x_train_sj['outbreak_severity'],df_y_train_sj['total_cases'],scoring='neg_mean_absolute_error',cv=3))

df_x_train_sj=np.array(df_x_train_sj['outbreak_severity'])
df_x_train_sj=np.reshape(df_x_train_sj,(-1,1))
df_x_test_sj=np.array(df_x_test_sj['outbreak_severity'])
df_x_test_sj=np.reshape(df_x_test_sj,(-1,1))
df_x_train_sj.shape
df_y_train_sj['total_cases'].shape

rfr.fit(df_x_train_sj,df_y_train_sj['total_cases'])
pred=rfr.predict(df_x_test_sj)
mae_score=mean_absolute_error(pred,df_y_test_sj['total_cases'])
mae_score

11.598743169398908

#### Results
Despite a good fit to the model, the severity score did not generalize well to the holdout set. Using the severity classifier in several models only reduced the accuracy of those models