In [None]:
import numpy as np
import pandas as pd
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score, precision_score, f1_score
from sklearn.metrics import roc_auc_score, make_scorer
from sklearn.model_selection import GridSearchCV, cross_val_score, cross_validate
import matplotlib.pylab as plt
from matplotlib.pyplot import figure
import seaborn as sns
import datetime

# Prepare Data 

In [None]:
def load_data(path='data/corona_tested_individuals_ver_006.english_cleaned.csv'):
    """Load the cleaned test records and split them chronologically.

    Parameters
    ----------
    path : str, optional
        CSV file to read. Defaults to the cleaned data set used by this
        notebook. The file must contain the columns ``'Unnamed: 0'``,
        ``'test_date'`` (formatted ``%Y-%m-%d``) and ``'corona_result'``.

    Returns
    -------
    X_train, X_test, y_train, y_test
        Features and ``corona_result`` labels for 2020-03-22..2020-03-31
        (train) and 2020-04-01..2020-04-07 (test), both ranges inclusive.
        All values are cast to ``int``.
    """
    df = pd.read_csv(path)
    # Non-inplace drop keeps the function free of mutation on shared frames.
    df = df.drop('Unnamed: 0', axis=1)
    df['test_date'] = pd.to_datetime(df['test_date'], format='%Y-%m-%d')
    df = df.set_index('test_date')
    # Clears the index name left behind by set_index.
    df = df.rename_axis(index=None, axis=1)
    # astype returns a new frame; the result used to be discarded, so the
    # integer cast silently never took effect.
    df = df.astype(int)

    # truncate() keeps both end points, so the two windows do not overlap.
    df_train_0322_0331 = df.truncate(before=pd.Timestamp('2020-03-22'), after=pd.Timestamp('2020-03-31'))
    df_test_0401_0407 = df.truncate(before=pd.Timestamp('2020-04-01'), after=pd.Timestamp('2020-04-07'))

    # Every column except the label is a feature.
    var_col = [c for c in df if c not in ['corona_result']]
    X_train = df_train_0322_0331.loc[:, var_col]
    y_train = df_train_0322_0331.loc[:, 'corona_result']
    X_test = df_test_0401_0407.loc[:, var_col]
    y_test = df_test_0401_0407.loc[:, 'corona_result']
    return X_train, X_test, y_train, y_test

In [None]:
X_train, X_test, y_train, y_test = load_data()

# Train and Evaluate

In [None]:
# Display names; each entry labels the estimator at the same position in `models`.
names = [
    'SVM',
    'Random Forest',
    'XGBoost',
    'KNN',
    'Logistic Regression',
    'Decision Tree',
    'Naive Bayes',
]

# Estimators with their baseline settings.
models = [
    SVC(class_weight='balanced', probability=True, random_state=42),
    RandomForestClassifier(class_weight='balanced', n_jobs=-1, random_state=42),
    XGBClassifier(
        # Count of label 0 divided by count of label 1 in the training labels.
        scale_pos_weight=y_train.value_counts()[0]/y_train.value_counts()[1],
        n_jobs=-1,
        random_state=42,
    ),
    KNeighborsClassifier(n_jobs=-1),
    LogisticRegression(n_jobs=-1, random_state=42),
    DecisionTreeClassifier(random_state=42),
    GaussianNB(),
]

In [None]:
def sens_func(y_true, y_pred):
    """Return sensitivity, ``tp / (tp + fn)``, for binary labels 0 and 1.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels (0 = negative, 1 = positive).
    y_pred : array-like
        Predicted labels.

    Returns
    -------
    float
        Fraction of actual positives predicted as positive. The result is
        ``nan`` when ``y_true`` contains no positives.
    """
    # Pin the label set so the matrix is always 2x2. Without it, input that
    # contains a single class yields a smaller matrix and the 4-way unpack
    # below raises ValueError.
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
    tn, fp, fn, tp = cm.ravel()
    return tp/(tp+fn)
def spec_func(y_true, y_pred):
    """Return specificity, ``tn / (tn + fp)``, for binary labels 0 and 1.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels (0 = negative, 1 = positive).
    y_pred : array-like
        Predicted labels.

    Returns
    -------
    float
        Fraction of actual negatives predicted as negative. The result is
        ``nan`` when ``y_true`` contains no negatives.
    """
    # Pin the label set so the matrix is always 2x2. Without it, input that
    # contains a single class yields a smaller matrix and the 4-way unpack
    # below raises ValueError.
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
    tn, fp, fn, tp = cm.ravel()
    return tn/(tn+fp)
# Wrap the two metric functions as scorer objects so they can be passed in a
# `scoring` mapping to cross_validate.
sensitivity_scorer, specificity_scorer = (
    make_scorer(sens_func),
    make_scorer(spec_func),
)

In [None]:
def evalModel(clf, X_train, y_train, X_test, y_test):
    """Cross-validate ``clf`` on the training data, then score it on the test data.

    Parameters
    ----------
    clf : estimator
        Classifier exposing ``fit``, ``predict`` and ``predict_proba``.
        It is fitted in place on the full training set.
    X_train, y_train : training features and labels.
    X_test, y_test : held-out features and labels.

    Returns
    -------
    train_scores : list
        Mean 5-fold cross-validation scores on the training data, as
        percentages rounded to two decimals, in the order
        [sensitivity, specificity, accuracy, precision, ROC AUC].
    test_scores : list
        The same five metrics computed on the test data.
    cm : ndarray
        Confusion matrix of the test-set predictions.
    """
    print('training & evaluating: {}'.format(clf))

    scoring = dict(
        sensitivity=sensitivity_scorer,
        specificity=specificity_scorer,
        accuracy='accuracy',
        precision='precision',
        roc_auc='roc_auc',
    )
    cv_results = cross_validate(clf, X_train, y_train, scoring=scoring, cv=5)
    # cross_validate stores each metric under the key 'test_<name>'.
    train_scores = [
        round(cv_results['test_' + metric].mean()*100, 2)
        for metric in ('sensitivity', 'specificity', 'accuracy', 'precision', 'roc_auc')
    ]

    # Refit on the whole training window before scoring the held-out data.
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    cm = confusion_matrix(y_test, y_pred)
    tn, fp, fn, tp = cm.ravel()
    test_scores = [
        round(tp/(tp+fn)*100, 2),
        round(tn/(tn+fp)*100, 2),
        round(accuracy_score(y_test,y_pred)*100, 2),
        round(precision_score(y_test,y_pred)*100, 2),
    ]
    # ROC AUC is computed from the second column of predict_proba.
    positive_prob = clf.predict_proba(X_test)[:, 1]
    test_scores.append(round(roc_auc_score(y_test, positive_prob)*100, 2))

    return train_scores, test_scores, cm

In [None]:
# Evaluate every baseline model and collect its score rows.
train_scores_list, test_scores_list = [], []
for model in models:
    train_scores, test_scores, cm = evalModel(model, X_train, y_train, X_test, y_test)
    train_scores_list += [train_scores]
    test_scores_list += [test_scores]

# One row per model (labelled by `names`), one column per metric.
score_columns = ['Sensitivity(%)', 'Specificity(%)', 'Accuracy(%)', 'Precision(%)', 'ROC(%)']
df_train = pd.DataFrame(train_scores_list, columns=score_columns, index=names)
df_test = pd.DataFrame(test_scores_list, columns=score_columns, index=names)

df_train.to_csv('csv/df_train_default.csv')
df_test.to_csv('csv/df_test_default.csv')

# GridSearchCV

In [None]:
def gs_evalModel(clf, X_train, y_train, X_test, y_test, grid_values, scoring):
    """Optionally grid-search ``clf``, then cross-validate and test it.

    Parameters
    ----------
    clf : estimator
        Classifier exposing ``fit``, ``predict`` and ``predict_proba``.
    X_train, y_train : training features and labels.
    X_test, y_test : held-out features and labels.
    grid_values : dict or None
        Parameter grid for ``GridSearchCV``. ``None`` skips the search and
        uses ``clf`` with its current parameters.
    scoring : str
        Metric that ``GridSearchCV`` optimises.

    Returns
    -------
    train_scores : list
        Mean 5-fold cross-validation scores on the training data, as
        percentages rounded to two decimals, in the order
        [sensitivity, specificity, accuracy, precision, ROC AUC].
    test_scores : list
        The same five metrics computed on the test data.
    best_params : dict or None
        Best parameters found by the search, or ``None`` when no search ran.
    cm : ndarray
        Confusion matrix of the test-set predictions.
    grid_clf : estimator
        The fitted ``GridSearchCV`` object, or the fitted ``clf`` itself when
        no search ran.
    """
    if grid_values is not None:
        grid_clf = GridSearchCV(clf, param_grid = grid_values, scoring = scoring, n_jobs = -1)
        print('training: {}'.format(grid_clf))
        grid_clf.fit(X_train, y_train)
        best_params = grid_clf.best_params_
        print('Best Parameters: {}'.format(best_params))
    else:
        grid_clf = clf
        best_params = None
        print('training: {}'.format(grid_clf))
        # Fit explicitly: the test-set predictions below previously worked
        # only if an earlier cell had already fitted this same object.
        grid_clf.fit(X_train, y_train)

    scoring_cv = {'sensitivity': sensitivity_scorer,
                  'specificity': specificity_scorer,
                  'accuracy': 'accuracy',
                  'precision': 'precision',
                  'roc_auc': 'roc_auc'}
    # NOTE: when grid_clf is a GridSearchCV, cross_validate refits it on each
    # of the 5 folds, so the whole search is repeated per fold (nested CV).
    scores = cross_validate(grid_clf, X_train, y_train, scoring=scoring_cv, cv=5)
    sensitivity = round(scores['test_sensitivity'].mean()*100, 2)
    specificity = round(scores['test_specificity'].mean()*100, 2)
    accuracy = round(scores['test_accuracy'].mean()*100, 2)
    precision = round(scores['test_precision'].mean()*100, 2)
    ROC = round(scores['test_roc_auc'].mean()*100, 2)
    train_scores = [sensitivity, specificity, accuracy, precision, ROC]

    # Held-out evaluation uses the estimator fitted above on the full training set.
    y_pred = grid_clf.predict(X_test)
    cm = confusion_matrix(y_test, y_pred)
    tn, fp, fn, tp = cm.ravel()
    sensitivity = round(tp/(tp+fn)*100, 2)
    specificity = round(tn/(tn+fp)*100, 2)
    accuracy = round(accuracy_score(y_test,y_pred)*100, 2)
    precision = round(precision_score(y_test,y_pred)*100, 2)
    probs = grid_clf.predict_proba(X_test)
    # ROC AUC is computed from the second column of predict_proba.
    prob = probs[:, 1]
    ROC = round(roc_auc_score(y_test, prob)*100, 2)
    test_scores = [sensitivity, specificity, accuracy, precision, ROC]

    return train_scores, test_scores, best_params, cm, grid_clf

## For Accuracy

In [None]:
# Grid search optimising accuracy; results are collected per model.
train_scores_list_gs, test_scores_list_gs = [], []
best_params_list, best_models_list = [], []

# One grid per entry of `models`, in the same order; None means no search.
grid_values_list = [
    {'C': [1.0, 0.1, 0.01]},
    {'n_estimators': [100, 200, 400, 600]},
    {'n_estimators': [100, 200, 400, 600], 'learning_rate': [None, 1e-1, 1e-2]},
    {'leaf_size': [10, 20, 30]},
    {'C': [1.0, 0.1, 0.01]},
    None,
    {'var_smoothing': [1e-3, 1e-4, 1e-5]},
]

for model, grid_values in zip(models, grid_values_list):
    train_scores, test_scores, best_params, cm, best_model = gs_evalModel(
        model, X_train, y_train, X_test, y_test, grid_values, 'accuracy')
    train_scores_list_gs += [train_scores]
    test_scores_list_gs += [test_scores]
    best_params_list += [best_params]
    best_models_list += [best_model]

# One row per model (labelled by `names`), one column per metric.
score_columns = ['Sensitivity(%)', 'Specificity(%)', 'Accuracy(%)', 'Precision(%)', 'ROC(%)']
df_train_gs = pd.DataFrame(train_scores_list_gs, columns=score_columns, index=names)
df_test_gs = pd.DataFrame(test_scores_list_gs, columns=score_columns, index=names)

df_train_gs.to_csv('csv/df_train_gs_acc.csv')
df_test_gs.to_csv('csv/df_test_gs_acc.csv')

### Best Parameters for Accuracy
- SVM: {'C': 1.0}
- Random Forest: {'n_estimators': 400}
- XGBoost: {'n_estimators': 400, 'learning_rate': 1e-2}
- KNN: {'leaf_size': 30}
- Logistic Regression: {'C': 1.0} 
- Decision Tree: None (no grid was searched; the model keeps its initial parameters)
- Naive Bayes: {'var_smoothing': 1e-4}

***

### Further GridSearch for SVM

In [None]:
# Base SVM for the wider C/gamma search; probability=True enables predict_proba.
clf = SVC(
    random_state=42,
    probability=True,
)

In [None]:
# Search C and gamma jointly, selecting the combination with the best ROC AUC.
grid_values = dict(
    C=[0.1, 1, 10, 100, 1000],
    gamma=[1, 0.1, 0.01, 0.001, 0.0001],
)
grid_clf = GridSearchCV(
    clf,
    param_grid=grid_values,
    scoring='roc_auc',
    n_jobs=-1,
)
print('training: {}'.format(grid_clf))
grid_clf.fit(X_train, y_train)

best_params = grid_clf.best_params_
print('Best Parameters: {}'.format(best_params))

In [None]:
# 5-fold cross-validation of the grid-search object on the training data.
scoring_cv = dict(
    sensitivity=sensitivity_scorer,
    specificity=specificity_scorer,
    accuracy='accuracy',
    precision='precision',
    roc_auc='roc_auc',
)
scores = cross_validate(grid_clf, X_train, y_train, scoring=scoring_cv, cv=5)

# Mean of each metric across folds, as a percentage rounded to two decimals.
# cross_validate stores each metric under the key 'test_<name>'.
train_scores = [
    round(scores['test_' + metric].mean()*100, 2)
    for metric in ('sensitivity', 'specificity', 'accuracy', 'precision', 'roc_auc')
]
sensitivity, specificity, accuracy, precision, ROC = train_scores

In [None]:
# Score the fitted grid-search model on the held-out test data.
y_pred = grid_clf.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
tn, fp, fn, tp = cm.ravel()

# Label-based metrics first, as percentages rounded to two decimals.
test_scores = [
    round(tp/(tp+fn)*100, 2),
    round(tn/(tn+fp)*100, 2),
    round(accuracy_score(y_test,y_pred)*100, 2),
    round(precision_score(y_test,y_pred)*100, 2),
]

# ROC AUC is computed from the second column of predict_proba.
probs = grid_clf.predict_proba(X_test)
prob = probs[:, 1]
test_scores.append(round(roc_auc_score(y_test, prob)*100, 2))

sensitivity, specificity, accuracy, precision, ROC = test_scores

- Best parameters from the SVM grid search above: C = 0.1, gamma = 0.001

***

## For ROC_AUC

In [None]:
# Grid search optimising ROC AUC; results are collected per model.
train_scores_list_gs, test_scores_list_gs = [], []
best_params_list, best_models_list = [], []

# One grid per entry of `models`, in the same order; None means no search.
grid_values_list = [
    {'C': [1.0, 0.1, 0.01]},
    {'n_estimators': [100, 200, 400, 600]},
    {'n_estimators': [100, 200, 400, 600], 'learning_rate': [None, 1e-1, 1e-2]},
    {'leaf_size': [10, 20, 30]},
    {'C': [1.0, 0.1, 0.01]},
    None,
    {'var_smoothing': [1e-3, 1e-4, 1e-5]},
]

for model, grid_values in zip(models, grid_values_list):
    train_scores, test_scores, best_params, cm, best_model = gs_evalModel(
        model, X_train, y_train, X_test, y_test, grid_values, 'roc_auc')
    train_scores_list_gs += [train_scores]
    test_scores_list_gs += [test_scores]
    best_params_list += [best_params]
    best_models_list += [best_model]

# One row per model (labelled by `names`), one column per metric.
score_columns = ['Sensitivity(%)', 'Specificity(%)', 'Accuracy(%)', 'Precision(%)', 'ROC(%)']
df_train_gs = pd.DataFrame(train_scores_list_gs, columns=score_columns, index=names)
df_test_gs = pd.DataFrame(test_scores_list_gs, columns=score_columns, index=names)

df_train_gs.to_csv('csv/df_train_gs_roc_auc.csv')
df_test_gs.to_csv('csv/df_test_gs_roc_auc.csv')