In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV
import sklearn.model_selection as model_selection
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
import pickle
import xgboost as xgb
from sklearn.metrics import f1_score, precision_score, recall_score,accuracy_score, make_scorer, classification_report, confusion_matrix
from scipy import stats
from scipy.stats import randint

In [2]:
#drops duplicated columns created
def dropDuplicates(data):
    duplicates = data.columns[data.columns.duplicated()]
    if len(duplicates) > 0:
        data = data.loc[:,~data.columns.duplicated()]
    return data

#drops all the columns not used in X_train
def colsToDrop(dataframe):
    if 'outcome' in dataframe.columns:
        dataframe = dataframe.drop('outcome',1)
        print("dropping the outcome column")
    if 'Combined_Key' in dataframe.columns:
        dataframe = dataframe.drop('Combined_Key',1)
        print("dropping Combined_Key")
    if 'dist_between_in_km' in dataframe.columns:
        dataframe = dataframe.drop('dist_between_in_km',1)
        print("dropping dist in km column")
    if 'date_confirmation' in dataframe.columns:
        dataframe = dataframe.drop('date_confirmation',1)
        print("dropping date")
    if 'date_labels' in dataframe.columns:
        dataframe.date_labels = dataframe.date_labels.astype('object')
        print("converting date to categorical")
    if 'age' in dataframe.columns:
        dataframe.age = dataframe.age.astype('object')
        print("converting age to categorical")
    return dataframe
    
    # once hot encode and add new cols to dataframe
def oneHotEncode_df(dataframe):
    dataframe = colsToDrop(dataframe)
    col2Encode = list(dataframe.select_dtypes(include=['object'])) #gets a list of all the features that are objects assumption is that those are categorical
    dummies = pd.get_dummies(dataframe,columns=col2Encode,prefix=col2Encode,sparse=True)
    res = pd.concat([dataframe, dummies], axis=1)
    #if we decide to drop one hot encoded values
    res = res.drop(col2Encode, axis=1)
    output = dropDuplicates(res)
    return output

    # SMOTENC needs a list of the indices of all the categorical variables in dataset
    # ex: if country is column 2 and age is col 13. returns [2,13]
def getCategoricalIndices(dataframe):
    #get the indices of all the categorical variables
    numericalVar = list(dataframe.select_dtypes(include=['float64']))
    catlist = list(dataframe.columns.difference(numericalVar))
    print("these are the categorical features: {}".format(catlist))
    indlist = []
    for i in catlist:
        indlist.append(dataframe.columns.get_loc(i))
    return indlist

# Read Data in

In [3]:
df = pd.read_csv('deceased20percentwithDate.csv')

In [4]:
# ensure age is categorical, drop unwanted columns
df.age = df.age.astype('object')
df.date_labels = df.date_labels.astype('object')
y_tr = df.outcome
X_tr = df
if 'outcome' in X_tr.columns:
    X_tr = X_tr.drop('outcome',1)
    print("dropping the outcome column")
if 'Unnamed: 0' in X_tr.columns:
    X_tr = X_tr.drop('Unnamed: 0',1)
    print("dropping the Unnamed column")      

dropping the outcome column
dropping the Unnamed column


In [6]:
# add dummy variables  
ohe_df = oneHotEncode_df(X_tr)
ohe_df
res = ohe_df.to_numpy()

converting date to categorical
converting age to categorical


In [7]:
%%time
score_dict = {'deceased_f1' : make_scorer(f1_score, average='micro', labels=['deceased']),
              'deceased_recall' : make_scorer(recall_score, average='micro', labels=['deceased']),
              'overall_accuracy' : make_scorer(accuracy_score),
              'overall_recall' : make_scorer(recall_score, average='micro')
             }

X = res
y = df['outcome'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)

CPU times: user 29.4 s, sys: 6.31 s, total: 35.7 s
Wall time: 38.5 s


# Random Forest

In [8]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 30, stop = 70, num = 5)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(32, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

In [9]:
%%time

rf = RandomForestClassifier()
rf_random = RandomizedSearchCV(estimator = rf, 
                               param_distributions = random_grid, 
                               n_iter = 10,
                               scoring = score_dict,
                               refit= 'deceased_f1',
                               cv = 3, 
                               verbose=2, 
                               n_jobs = 4)
# Fit the random search model
rf_random.fit(X_train, y_train)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  30 out of  30 | elapsed: 120.7min finished


CPU times: user 3min 37s, sys: 14.9 s, total: 3min 52s
Wall time: 2h 4min 28s


RandomizedSearchCV(cv=3, estimator=RandomForestClassifier(), n_jobs=4,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [32, 39, 47, 55, 63, 71,
                                                      78, 86, 94, 102, 110,
                                                      None],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [30, 40, 50, 60, 70]},
                   refit='deceased_f1',
                   scoring={'deceased_f1': make_scorer(f1_score, average=micro, labels=['deceased']),
                            'deceased_recall': make_scorer(recall_score, average=micro, labels=['deceased']),
                            'overall_accuracy': make_scorer(accuracy_score),
                    

In [10]:
pickle.dump(rf_random, open( "rf_rs_deceased20.pkl", "wb" ) )

In [45]:
randomForestprediction = list(rf_random.best_estimator_.predict(X_test))
randomForestConfusionMatrix = confusion_matrix(y_test, randomForestprediction, labels=["recovered","nonhospitalized","hospitalized","deceased"])
cm_rf = pd.DataFrame(randomForestConfusionMatrix).rename(index = {0:"recovered", 1: "nonhospitalized", 2: "hospitalized", 3: "deceased"}, columns = {0:"recovered", 1: "nonhospitalized", 2: "hospitalized", 3: "deceased"})
cm_rf

Unnamed: 0,recovered,nonhospitalized,hospitalized,deceased
recovered,34692,514,17813,8677
nonhospitalized,404,103495,21,1079
hospitalized,18117,5,59759,9619
deceased,13049,266,21478,31356


In [None]:
(34692+103495+59759+31356)/

In [46]:
print(classification_report(y_test, randomForestprediction))

                 precision    recall  f1-score   support

       deceased       0.62      0.47      0.54     66149
   hospitalized       0.60      0.68      0.64     87500
nonhospitalized       0.99      0.99      0.99    104999
      recovered       0.52      0.56      0.54     61696

       accuracy                           0.72    320344
      macro avg       0.68      0.68      0.68    320344
   weighted avg       0.72      0.72      0.71    320344



In [14]:
f1 = f1_score(y_test,randomForestprediction,labels=["deceased"],average='micro')
recall = recall_score(y_test,randomForestprediction,labels=["deceased"],average='micro')
precision = precision_score(y_test,randomForestprediction,labels=["deceased"],average='micro')
print(f'f1score:{f1}, precision:{precision}, recall:{recall}')

f1score:0.5350234685640827, precision:0.615025087634889, recall:0.47343915343915344


In [41]:
hyperparameters = rf_random.cv_results_['params']
deceased_f1 = rf_random.cv_results_['mean_test_deceased_f1']
deceased_recall = rf_random.cv_results_['mean_test_deceased_recall']
overall_acc = rf_random.cv_results_['mean_test_overall_accuracy']
overall_recall = rf_random.cv_results_['mean_test_overall_recall']

In [63]:
pd.DataFrame({'Hyperparameter': hyperparameters, 
              "F1-Score on 'deceased' ": deceased_f1,
              "Recall on 'deceased' ": deceased_recall,
              "Overall Accuracy": overall_acc,
              "Overall Recall": overall_recall} ).sort_values("F1-Score on 'deceased' ",ascending=False)

Unnamed: 0,Hyperparameter,F1-Score on 'deceased',Recall on 'deceased',Overall Accuracy,Overall Recall
3,"{'n_estimators': 30, 'min_samples_split': 2, '...",0.5317,0.465707,0.71198,0.71198
4,"{'n_estimators': 50, 'min_samples_split': 2, '...",0.531404,0.465496,0.711969,0.711969
1,"{'n_estimators': 70, 'min_samples_split': 2, '...",0.530672,0.463485,0.712111,0.712111
6,"{'n_estimators': 40, 'min_samples_split': 2, '...",0.529981,0.462599,0.711849,0.711849
0,"{'n_estimators': 40, 'min_samples_split': 2, '...",0.529285,0.460231,0.711997,0.711997
5,"{'n_estimators': 60, 'min_samples_split': 10, ...",0.513669,0.435125,0.706987,0.706987
9,"{'n_estimators': 70, 'min_samples_split': 2, '...",0.5125,0.435218,0.706515,0.706515
7,"{'n_estimators': 30, 'min_samples_split': 2, '...",0.507146,0.424318,0.705029,0.705029
2,"{'n_estimators': 60, 'min_samples_split': 10, ...",0.502128,0.422929,0.700653,0.700653
8,"{'n_estimators': 50, 'min_samples_split': 10, ...",0.493554,0.408471,0.698271,0.698271


# XGBoost

In [22]:
X = res
y = df['outcome'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.7, stratify=y)

In [23]:
%%time
clf_xgb = xgb.XGBClassifier()
param_dist = {'n_estimators': [int(x) for x in np.linspace(start = 20, stop = 50, num = 5)],
              'learning_rate': stats.uniform(0.01, 1),
              'subsample': stats.uniform(0.3, 0.9),
              'max_depth': [3, 4, 5, 6, 7, 8, 9],              
              'min_child_weight': [1, 2, 3, 4]
             }

xgb_rs = RandomizedSearchCV(clf_xgb, 
                         param_distributions = param_dist,
                         cv = 3,  
                         n_iter = 10,
                         scoring = score_dict,
                         refit = 'deceased_f1',
                         verbose = 3, 
                         n_jobs = 3)
xgb_rs.fit(X_train, y_train)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.


KeyboardInterrupt: 

In [24]:
pickle.dump(xgb_rs, open( "xgb_rs_deceased20.pkl", "wb" ) )

In [None]:
xgb_rs.best_params_

In [None]:
XGprediction = list(xgb_rs.best_estimator_.predict(X_test))
xgConfusionMatrix = confusion_matrix(y_test, XGprediction, labels=["recovered","nonhospitalized","hospitalized","deceased"])
cm_xg = pd.DataFrame(xgConfusionMatrix).rename(index = {0:"recovered", 1: "nonhospitalized", 2: "hospitalized", 3: "deceased"}, columns = {0:"recovered", 1: "nonhospitalized", 2: "hospitalized", 3: "deceased"})
cm_xg

In [None]:
print(classification_report(y_test, XGprediction))

In [None]:
accuracy_score(y_test,XGprediction)

In [None]:
f1 = f1_score(y_train,XGprediction,labels=["deceased"],average='micro')
recall = recall_score(y_train,XGprediction,labels=["deceased"],average='micro')
precision = precision_score(y_train,XGprediction,labels=["deceased"],average='micro')
print(f'XGBoost \nf1score:{f1}, precision:{precision}, recall:{recall}')

# AdaBoost

In [64]:
X = res
y = df['outcome'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)

In [65]:
%%time
clf_ada = AdaBoostClassifier(n_estimators=10, random_state=0)
param_dist = {'n_estimators': [int(x) for x in np.linspace(start = 20, stop = 65, num = 5)],
              'learning_rate': stats.uniform(0.01, 1),              
             }

ada_rs = RandomizedSearchCV(clf_ada, 
                         param_distributions = param_dist,
                         cv = 3,  
                         n_iter = 10,
                         scoring = score_dict,
                         refit = 'deceased_f1', 
                         verbose = 3,
                         n_jobs = 4)
ada_rs.fit(X_train, y_train)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


KeyboardInterrupt: 

In [66]:
pickle.dump(ada_rs, open( "ada_rs_deceased20.pkl", "wb" ) )

In [None]:
Adaprediction = list(ada_rs.best_estimator_.predict(X_test))
AdaConfusionMatrix = confusion_matrix(y_test, Adaprediction, labels=["recovered","nonhospitalized","hospitalized","deceased"])
cm_ada = pd.DataFrame(AdaConfusionMatrix).rename(index = {0:"recovered", 1: "nonhospitalized", 2: "hospitalized", 3: "deceased"}, columns = {0:"recovered", 1: "nonhospitalized", 2: "hospitalized", 3: "deceased"})
cm_ada

In [None]:
print(classification_report(y_test, randomForestprediction))

In [None]:
accuracy_score(y_test,Adaprediction)

In [None]:
f1 = f1_score(y_train,Adaprediction,labels=["deceased"],average='micro')
recall = recall_score(y_train,Adaprediction,labels=["deceased"],average='micro')
precision = precision_score(y_train,Adaprediction,labels=["deceased"],average='micro')
print(f'Adaboost \nf1score:{f1}, precision:{precision}, recall:{recall}')