In [None]:
import sys
sys.path.append('../src') 

# Importing libraries
import pandas as pd
import numpy as np

# Libraries for machine learning
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier

# Libraries for plotting curves
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import average_precision_score
from itertools import cycle

# Importing script
import etl as etl
import visualize_data as visualize_data

import warnings
# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides warnings raised by the modelling cells below
# (e.g. non-convergence or divide-by-zero) — confirm that is intended.
warnings.filterwarnings('ignore')

## Disease Risk Classification Model

The purpose of this notebook is to explore a variety of models and tune them in order to figure out which is best for disease risk classification. According to the article "Comparing different supervised machine learning algorithms for disease prediction" by Shahadat Uddin et al. (https://bmcmedinformdecismak.biomedcentral.com/articles/10.1186/s12911-019-1004-8), Support Vector Machines, Naive Bayes, and Random Forest are some of the most common machine learning algorithms applied to disease prediction. We will explore these along with other algorithms such as Logistic Regression and K Nearest Neighbors in this notebook.

Let's get straight into it and simulate a data set of 5,000 individuals.

In [None]:
# GWAS file passed to the simulation helper.
simulated_gwas_fp = '../testdata/coronary_artery/EFO_0001645.csv' 
# NOTE(review): presumably writes ./simulated_data.csv (it is read back on the
# next line) and 1000 is the number of simulated individuals — the text above
# mentions 5,000, so confirm which figure is intended.
etl.simulate_data('.', 'simulated_data', simulated_gwas_fp, 1000)
simulated_data = pd.read_csv('simulated_data.csv')
# Preview the first rows.
simulated_data.head()

We will not be using the simulated data above for building the model. Given that the class label (disease risk category) was assigned to an individual depending on the weighted sum of all and only the SNPs in this data set, it would be too easy for a machine learning model to figure this out. Essentially, the above data set is a simulated 'ground truth'. In machine learning problems you typically don't work with all the variables that determine the label so something needs to be done about this.

As a solution, we will be making classifications based on a subset of SNPs in the above data set. To do this, we will be using a separate GWAS to inform us of SNPs that are most important in predicting disease. Only those SNPs that are in the above data set and the new GWAS will be used. The model will then be trained on this subset.

Let's load in the other GWAS data and filter the above data.

In [None]:
# Second GWAS: its variant ids decide which SNP columns the model may use.
model_gwas_fp = '../testdata/coronary_artery/EFO_0000378.csv'
model_data = pd.read_csv(model_gwas_fp)

# SNPs present both in the simulated data and in the second GWAS.
subset = set(simulated_data.columns).intersection(model_data['variant_id'].unique())

# Keep the shared SNPs in their original column order. Building the list
# straight from the set gives an arbitrary order that can change between
# kernel sessions (string hashing is randomised), which makes order-sensitive
# models harder to reproduce.
new_columns = [col for col in simulated_data.columns if col in subset] + ['Class']

data = simulated_data[new_columns]

Now let's create a training and test set on the above data.

In [None]:
# Features are every selected SNP column; the target is the 'Class' column.
X = data.drop('Class', axis=1)
y = data['Class']
# Hold out 25% of the rows for testing; the fixed seed keeps the split repeatable.
# NOTE(review): the split is not stratified, so class proportions may differ
# between train and test — consider stratify=y if the classes are imbalanced.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=10)

In [None]:
# Fraction of rows in each class over the full (train + test) label column;
# useful for judging how imbalanced the classes are.
prop_per_class = y.value_counts(normalize=True)
prop_per_class

Now we can train different models on these sets and assess their results.

**Note:**

>It is important to note that accuracy is not the only metric that matters when assessing the performance of the models. Since we are predicting the disease risk of individuals, **Type I** and **Type II** errors are also very important. It is potentially dangerous to classify an individual as low risk when they are actually high risk (Type II, False Negative). Additionally, classifying an individual as high risk when they are actually low risk could cause unnecessary stress for the individual and their family (Type I, False Positive). Therefore, it is important to control for such errors in our model. To do so, we will prioritize the maximization of **Recall** (TP/(TP+FN)), since it reflects the dangerous False Negatives in our model, while at the same time attempting to maximize **Precision** (TP/(TP+FP)), since it reflects False Positives. 

>In general:
    - Accuracy is a great metric only when you have symmetric datasets (meaning false negatives & false positives counts are close) and when false negatives & false positives have similar costs. If the cost of false positives and false negatives are different, then F1-score is the best option. F1-score is best if you have an uneven class distribution.
    - Recall/Sensitivity is the best metric if false positives are far more tolerable than false negatives. In other words, if false negatives are unacceptable, then you'd rather accept some extra false positives (false alarms) than miss some true cases. For example, in diabetes screening you'd rather have some healthy people labeled diabetic than leave a diabetic person labeled healthy.
    - Precision is the best metric if you want to be more confident of your true positives. For example, with spam emails, you'd rather have some spam emails in your inbox rather than some regular emails in your spam box. So, the email company wants to be extra sure that email Y is spam before they put it in the spam box and you never get to see it.
    - Specificity is the best metric if you want to cover all true negatives, meaning you don't want any false alarms (false positives). For example, if you're running a drug test in which all people who test positive will immediately go to jail, then you don't want anyone drug-free going to jail. In this example, false positives are intolerable.

### Logistic Regression

In [None]:
# Baseline: Logistic Regression with scikit-learn's default hyperparameters.
lg = LogisticRegression()
lg.fit(X_train, y_train)

# Predictions and score (reported as a percentage) on the held-out test set.
preds_lg = lg.predict(X_test)
accuracy_lg = lg.score(X_test, y_test)*100
print('The accuracy for the Logistic Regression model is {}%'.format(accuracy_lg))

In [None]:
# Weighted-average F1 of the baseline Logistic Regression predictions.
f1_lr = f1_score(y_test.values, preds_lg, average='weighted')
print('f1-score:', f1_lr, 'with average: weighted')

In [None]:
# One-vs-rest rates for the baseline Logistic Regression predictions, derived
# from the multiclass confusion matrix (rows: true class, columns: predicted).
cnf_matrix_lr = confusion_matrix(y_test.values, preds_lg)

# Per-class counts as float arrays: the diagonal holds the hits, column totals
# minus the diagonal are false positives, row totals minus it false negatives.
TP = np.diag(cnf_matrix_lr).astype(float)
FP = (cnf_matrix_lr.sum(axis=0) - np.diag(cnf_matrix_lr)).astype(float)
FN = (cnf_matrix_lr.sum(axis=1) - np.diag(cnf_matrix_lr)).astype(float)
TN = (cnf_matrix_lr.sum() - (FP + FN + TP)).astype(float)

TPR = TP / (TP + FN)                    # sensitivity / recall / hit rate
TNR = TN / (TN + FP)                    # specificity
PPV = TP / (TP + FP)                    # precision
NPV = TN / (TN + FN)                    # negative predictive value
FPR = FP / (FP + TN)                    # fall-out
FNR = FN / (TP + FN)                    # false negative rate
FDR = FP / (TP + FP)                    # false discovery rate
ACC = (TP + TN) / (TP + FP + FN + TN)   # one-vs-rest accuracy per class

print('metric: specificity, with value:', TNR)

# NOTE(review): TNR follows the label order of the confusion matrix, while
# target_names in this notebook lists Low/Medium/High in that order — confirm
# that the high/med/low naming of this unpacking matches the label encoding.
tnr_highrisk_lr, tnr_medrisk_lr, tnr_lowrisk_lr = TNR
print(tnr_highrisk_lr, tnr_medrisk_lr, tnr_lowrisk_lr)

# Unweighted mean of the per-class specificities.
tnr_lr = TNR.mean()
print('average specificity:', tnr_lr)

Let's attempt to improve the model using grid search.

In [None]:
# First tuning pass: search the solver tolerance and the inverse
# regularisation strength C, using GridSearchCV's default settings.
lg_parameters = {'tol':[.1, .001, .0001], 'C':[10, 1, .1]}
lg = LogisticRegression()
clf1 = GridSearchCV(lg, lg_parameters)

clf1.fit(X_train, y_train)
# Test-set predictions of the fitted search and their accuracy as a percentage.
preds_lr = clf1.predict(X_test)
accuracy_lr = np.mean(y_test == preds_lr)*100


print('The accuracy for the refined Logistic Regression model is {}%'.format(accuracy_lr))

In [None]:
# print(clf1.best_params_)
# print(clf1.best_score_)
# display(pd.DataFrame.from_dict(clf1.cv_results_).sort_values('rank_test_score'))

In [None]:
# Display names for the report rows; reused by the report cells of later models.
# NOTE(review): these are paired with the class labels in sorted order, so this
# assumes the smallest label means low risk and the largest high risk — confirm.
target_names = ['Low Risk', 'Medium Risk', 'High Risk']
# Per-class precision/recall/F1 for the current preds_lr (when run top to
# bottom, these are the predictions of the first grid search, clf1).
print(classification_report(y_test, preds_lr, target_names=target_names))

##### K-Fold Cross-Validation

In [None]:
# 5-fold cross-validation of an untuned Logistic Regression.
# NOTE(review): this runs on the full X, y (train and test rows together),
# not only on the training split.
lr_cv = LogisticRegression()

cv_scores_LR = cross_val_score(lr_cv, X, y, cv=5)

print('Cross-Validation Scores: ' + str(cv_scores_LR))
print('Mean of Cross-Validation Scores: {}%'.format(np.mean(cv_scores_LR)*100))

##### Tuning Model Hyperparameters

In [None]:
# Second tuning pass: 5-fold grid search over tolerance and C.
lr = LogisticRegression()

params = {'tol':[.1, .0001, 1e-5], 'C':[10, 1, .1]}

lr_gscv = GridSearchCV(lr, params, cv=5)

lr_gscv.fit(X_train, y_train)
# Overwrites preds_lr from the first grid search with this search's predictions.
preds_lr = lr_gscv.predict(X_test)
accuracy_lr_gscv = np.mean(y_test == preds_lr)*100

print('The accuracy for the refined Logistic Regression model is {}%'.format(accuracy_lr_gscv))

# Report the winning hyperparameters. The commented-out line at the end would
# additionally print the mean cross-validated score of the best estimator.
print('The best parameters are:')
for key, val in lr_gscv.best_params_.items():
    print(str(key) + ':', val)
# print('\nThe best score for the refined Logistic Regression model is {}%'.format(lr_gscv.best_score_*100))

In [None]:
# Weighted-average F1 of the current preds_lr (the cv=5 grid search when the
# notebook is run top to bottom).
f1_lr_gscv = f1_score(y_test.values, preds_lr, average='weighted')
print('f1-score with grid search:', f1_lr_gscv, 'with average: weighted')

In [None]:
# One-vs-rest rates for the tuned Logistic Regression predictions, derived
# from the multiclass confusion matrix (rows: true class, columns: predicted).
cnf_matrix_lr_gscv = confusion_matrix(y_test.values, preds_lr)

# Per-class counts as float arrays: the diagonal holds the hits, column totals
# minus the diagonal are false positives, row totals minus it false negatives.
TP = np.diag(cnf_matrix_lr_gscv).astype(float)
FP = (cnf_matrix_lr_gscv.sum(axis=0) - np.diag(cnf_matrix_lr_gscv)).astype(float)
FN = (cnf_matrix_lr_gscv.sum(axis=1) - np.diag(cnf_matrix_lr_gscv)).astype(float)
TN = (cnf_matrix_lr_gscv.sum() - (FP + FN + TP)).astype(float)

TPR = TP / (TP + FN)                    # sensitivity / recall / hit rate
TNR = TN / (TN + FP)                    # specificity
PPV = TP / (TP + FP)                    # precision
NPV = TN / (TN + FN)                    # negative predictive value
FPR = FP / (FP + TN)                    # fall-out
FNR = FN / (TP + FN)                    # false negative rate
FDR = FP / (TP + FP)                    # false discovery rate
ACC = (TP + TN) / (TP + FP + FN + TN)   # one-vs-rest accuracy per class

print('metric: specificity, with value:', TNR)

# NOTE(review): TNR follows the label order of the confusion matrix, while
# target_names in this notebook lists Low/Medium/High in that order — confirm
# that the high/med/low naming of this unpacking matches the label encoding.
tnr_highrisk_lr_gscv, tnr_medrisk_lr_gscv, tnr_lowrisk_lr_gscv = TNR
print(tnr_highrisk_lr_gscv, tnr_medrisk_lr_gscv, tnr_lowrisk_lr_gscv)

# Unweighted mean of the per-class specificities.
tnr_lr_gscv = TNR.mean()
print('average specificity:', tnr_lr_gscv)

##### Plotting ROC and P-R Curves

In [None]:
# Fit a fresh model with the hyperparameters selected by the cv=5 grid search
# (lr_gscv) instead of hard-coded values, so the plot always reflects the
# search result even when the data or the grid changes.
lr_best = LogisticRegression(**lr_gscv.best_params_)
lr_best.fit(X_train, y_train)

# plot multiclass P-R curve
lr_pr = visualize_data.plot_precision_recall(
    'Logistic Regression', lr_best, X_test, y_test,
    n_classes=3, figsize=(16, 10))

In [None]:
# Fit a fresh model with the hyperparameters selected by the cv=5 grid search
# (lr_gscv) instead of hard-coded values, so the plot always reflects the
# search result even when the data or the grid changes.
lr_best = LogisticRegression(**lr_gscv.best_params_)
lr_best.fit(X_train, y_train)

# plot multiclass ROC curve
lr_roc = visualize_data.plot_multiclass_roc(
    'Logistic Regression', lr_best, X_test, y_test,
    n_classes=3, figsize=(16, 10))

### K Nearest Neighbors

In [None]:
# Baseline: K Nearest Neighbors with scikit-learn's default hyperparameters.
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)

# Predictions and score (reported as a percentage) on the held-out test set.
preds_knn = knn.predict(X_test)
accuracy_knn = knn.score(X_test, y_test)*100
print('The accuracy for the K Nearest Neighbors model is {}%'.format(accuracy_knn))

In [None]:
# Weighted-average F1 of the baseline K Nearest Neighbors predictions.
f1_knn = f1_score(y_test.values, preds_knn, average='weighted')
# Report the KNN score; printing f1_lr here would show the Logistic
# Regression value under the KNN heading.
print('f1-score:', f1_knn, 'with average: weighted')

In [None]:
# One-vs-rest rates for the baseline K Nearest Neighbors predictions, derived
# from the multiclass confusion matrix (rows: true class, columns: predicted).
cnf_matrix_knn = confusion_matrix(y_test.values, preds_knn)

# Per-class counts as float arrays: the diagonal holds the hits, column totals
# minus the diagonal are false positives, row totals minus it false negatives.
TP = np.diag(cnf_matrix_knn).astype(float)
FP = (cnf_matrix_knn.sum(axis=0) - np.diag(cnf_matrix_knn)).astype(float)
FN = (cnf_matrix_knn.sum(axis=1) - np.diag(cnf_matrix_knn)).astype(float)
TN = (cnf_matrix_knn.sum() - (FP + FN + TP)).astype(float)

TPR = TP / (TP + FN)                    # sensitivity / recall / hit rate
TNR = TN / (TN + FP)                    # specificity
PPV = TP / (TP + FP)                    # precision
NPV = TN / (TN + FN)                    # negative predictive value
FPR = FP / (FP + TN)                    # fall-out
FNR = FN / (TP + FN)                    # false negative rate
FDR = FP / (TP + FP)                    # false discovery rate
ACC = (TP + TN) / (TP + FP + FN + TN)   # one-vs-rest accuracy per class

print('metric: specificity, with value:', TNR)

# NOTE(review): TNR follows the label order of the confusion matrix, while
# target_names in this notebook lists Low/Medium/High in that order — confirm
# that the high/med/low naming of this unpacking matches the label encoding.
tnr_highrisk_knn, tnr_medrisk_knn, tnr_lowrisk_knn = TNR
print(tnr_highrisk_knn, tnr_medrisk_knn, tnr_lowrisk_knn)

# Unweighted mean of the per-class specificities.
tnr_knn = TNR.mean()
print('average specificity:', tnr_knn)

Let's attempt to improve the model using grid search.

In [None]:
# First tuning pass: search the neighbour count and the Minkowski power p,
# using GridSearchCV's default settings.
knn_parameters = {'n_neighbors':[10, 5, 3, 1], 'p':[2, 1]}
knn = KNeighborsClassifier()
clf2 = GridSearchCV(knn, knn_parameters)

clf2.fit(X_train, y_train)
# Overwrites the baseline preds_knn with the search's test-set predictions.
preds_knn = clf2.predict(X_test)
accuracy_knn2 = np.mean(y_test == preds_knn)*100

print('The accuracy for the refined K Nearest Neighbors model is {}%'.format(accuracy_knn2))

In [None]:
# Per-class precision/recall/F1 for the current preds_knn, labelled with the
# target_names list defined in the Logistic Regression section.
print(classification_report(y_test, preds_knn, target_names=target_names))

##### K-Fold Cross-Validation

In [None]:
# 5-fold cross-validation of an untuned K Nearest Neighbors classifier.
# NOTE(review): this runs on the full X, y (train and test rows together),
# not only on the training split.
knn_cv = KNeighborsClassifier()

cv_scores = cross_val_score(knn_cv, X, y, cv=5)

print('Cross-Validation Scores: ' + str(cv_scores))
print('Mean of Cross-Validation Scores: {}%'.format(np.mean(cv_scores)*100))

##### Tuning Model Hyperparameters

In [None]:
# Second tuning pass: 5-fold grid search over the neighbour count and p.
knn = KNeighborsClassifier()

params = {'n_neighbors': [10, 5, 3, 1], 'p':[3, 2, 1]}

knn_gscv = GridSearchCV(knn, params, cv=5)

knn_gscv.fit(X_train, y_train)
# Overwrites preds_knn from the first grid search with this search's predictions.
preds_knn = knn_gscv.predict(X_test)
accuracy_knn_gscv = np.mean(y_test == preds_knn)*100

print('The accuracy for the refined K Nearest Neighbors model is {}%'.format(accuracy_knn_gscv))

# Report the winning hyperparameters. The commented-out line at the end would
# additionally print the mean cross-validated score of the best estimator.
print('The best parameters are:')
for key, val in knn_gscv.best_params_.items():
    print(str(key) + ':', val)
# print('\nThe best score for the refined K Nearest Neighbors model is {}%'.format(knn_gscv.best_score_*100))

In [None]:
# Weighted-average F1 of the current preds_knn (the cv=5 grid search when the
# notebook is run top to bottom).
f1_knn_gscv = f1_score(y_test.values, preds_knn, average='weighted')
print('f1-score with grid search:', f1_knn_gscv, 'with average: weighted')

In [None]:
# One-vs-rest rates for the tuned K Nearest Neighbors predictions, derived
# from the multiclass confusion matrix (rows: true class, columns: predicted).
cnf_matrix_knn_gscv = confusion_matrix(y_test.values, preds_knn)

# Per-class counts as float arrays: the diagonal holds the hits, column totals
# minus the diagonal are false positives, row totals minus it false negatives.
TP = np.diag(cnf_matrix_knn_gscv).astype(float)
FP = (cnf_matrix_knn_gscv.sum(axis=0) - np.diag(cnf_matrix_knn_gscv)).astype(float)
FN = (cnf_matrix_knn_gscv.sum(axis=1) - np.diag(cnf_matrix_knn_gscv)).astype(float)
TN = (cnf_matrix_knn_gscv.sum() - (FP + FN + TP)).astype(float)

TPR = TP / (TP + FN)                    # sensitivity / recall / hit rate
TNR = TN / (TN + FP)                    # specificity
PPV = TP / (TP + FP)                    # precision
NPV = TN / (TN + FN)                    # negative predictive value
FPR = FP / (FP + TN)                    # fall-out
FNR = FN / (TP + FN)                    # false negative rate
FDR = FP / (TP + FP)                    # false discovery rate
ACC = (TP + TN) / (TP + FP + FN + TN)   # one-vs-rest accuracy per class

print('metric: specificity, with value:', TNR)

# NOTE(review): TNR follows the label order of the confusion matrix, while
# target_names in this notebook lists Low/Medium/High in that order — confirm
# that the high/med/low naming of this unpacking matches the label encoding.
tnr_highrisk_knn_gscv, tnr_medrisk_knn_gscv, tnr_lowrisk_knn_gscv = TNR
print(tnr_highrisk_knn_gscv, tnr_medrisk_knn_gscv, tnr_lowrisk_knn_gscv)

# Unweighted mean of the per-class specificities.
tnr_knn_gscv = TNR.mean()
print('average specificity:', tnr_knn_gscv)

##### Plotting ROC and P-R Curves

In [None]:
# Fit a fresh model with the hyperparameters selected by the cv=5 grid search
# (knn_gscv) instead of hard-coded values, so the plot always reflects the
# search result even when the data or the grid changes.
knn_best = KNeighborsClassifier(**knn_gscv.best_params_)
knn_best.fit(X_train, y_train)

# plot multiclass P-R curve
visualize_data.plot_precision_recall(
    'K Nearest Neighbors', knn_best, X_test, y_test,
    n_classes=3, figsize=(16, 10))

In [None]:
# Fit a fresh model with the hyperparameters selected by the cv=5 grid search
# (knn_gscv) instead of hard-coded values, so the plot always reflects the
# search result even when the data or the grid changes.
knn_best = KNeighborsClassifier(**knn_gscv.best_params_)
knn_best.fit(X_train, y_train)

# plot multiclass ROC curve
visualize_data.plot_multiclass_roc(
    'K Nearest Neighbors', knn_best, X_test, y_test,
    n_classes=3, figsize=(16, 10))

### Support Vector Machine

In [None]:
# Baseline: Support Vector Machine with scikit-learn's default hyperparameters.
svc = SVC()
svc.fit(X_train, y_train)

# Predictions and score (reported as a percentage) on the held-out test set.
preds_svc = svc.predict(X_test)
accuracy_svc = svc.score(X_test, y_test)*100
print('The accuracy for the Support Vector Machine model is {}%'.format(accuracy_svc))

In [None]:
# Weighted-average F1 of the baseline Support Vector Machine predictions.
f1_svc = f1_score(y_test.values, preds_svc, average='weighted')
print('f1-score:', f1_svc, 'with average: weighted')

In [None]:
# One-vs-rest rates for the baseline Support Vector Machine predictions,
# derived from the multiclass confusion matrix (rows: true, columns: predicted).
cnf_matrix_svc = confusion_matrix(y_test.values, preds_svc)

# Per-class counts as float arrays: the diagonal holds the hits, column totals
# minus the diagonal are false positives, row totals minus it false negatives.
TP = np.diag(cnf_matrix_svc).astype(float)
FP = (cnf_matrix_svc.sum(axis=0) - np.diag(cnf_matrix_svc)).astype(float)
FN = (cnf_matrix_svc.sum(axis=1) - np.diag(cnf_matrix_svc)).astype(float)
TN = (cnf_matrix_svc.sum() - (FP + FN + TP)).astype(float)

TPR = TP / (TP + FN)                    # sensitivity / recall / hit rate
TNR = TN / (TN + FP)                    # specificity
PPV = TP / (TP + FP)                    # precision
NPV = TN / (TN + FN)                    # negative predictive value
FPR = FP / (FP + TN)                    # fall-out
FNR = FN / (TP + FN)                    # false negative rate
FDR = FP / (TP + FP)                    # false discovery rate
ACC = (TP + TN) / (TP + FP + FN + TN)   # one-vs-rest accuracy per class

print('metric: specificity, with value:', TNR)

# NOTE(review): TNR follows the label order of the confusion matrix, while
# target_names in this notebook lists Low/Medium/High in that order — confirm
# that the high/med/low naming of this unpacking matches the label encoding.
tnr_highrisk_svc, tnr_medrisk_svc, tnr_lowrisk_svc = TNR
print(tnr_highrisk_svc, tnr_medrisk_svc, tnr_lowrisk_svc)

# Unweighted mean of the per-class specificities.
tnr_svc = TNR.mean()
print('average specificity:', tnr_svc)

In [None]:
# First tuning pass: search the stopping tolerance and the regularisation
# parameter C, using GridSearchCV's default settings.
svc_parameters = {'tol': [.1, .001, .0001], 'C':[10, 1, .1]}
svc = SVC()
clf3 = GridSearchCV(svc, svc_parameters)

clf3.fit(X_train, y_train)
# Overwrites the baseline preds_svc with the search's test-set predictions.
preds_svc = clf3.predict(X_test)
accuracy_svc2 = np.mean(y_test == preds_svc)*100

print('The accuracy for the refined Support Vector Machine model is {}%'.format(accuracy_svc2))

In [None]:
# Per-class precision/recall/F1 for the current preds_svc, labelled with the
# target_names list defined in the Logistic Regression section.
print(classification_report(y_test, preds_svc, target_names=target_names))

##### K-Fold Cross-Validation

In [None]:
# 5-fold cross-validation of an untuned Support Vector Machine.
# NOTE(review): this runs on the full X, y (train and test rows together),
# not only on the training split.
svc_cv = SVC()

cv_scores = cross_val_score(svc_cv, X, y, cv=5)

print('Cross-Validation Scores: ' + str(cv_scores))
print('Mean of Cross-Validation Scores: {}%'.format(np.mean(cv_scores)*100))

##### Tuning Model Hyperparameters

In [None]:
# Second tuning pass: 5-fold grid search over tolerance and C
# (the same grid as the first pass, now with an explicit cv=5).
svc = SVC()
params = {'tol': [.1, .001, .0001], 'C':[10, 1, .1]}
svc_gscv = GridSearchCV(svc, params, cv=5)

svc_gscv.fit(X_train, y_train)
# Overwrites preds_svc from the first grid search with this search's predictions.
preds_svc = svc_gscv.predict(X_test)
accuracy_svc_gscv = np.mean(y_test == preds_svc)*100

print('The accuracy for the refined Support Vector Machine model is {}%'.format(accuracy_svc_gscv))

# Report the winning hyperparameters. The commented-out line at the end would
# additionally print the mean cross-validated score of the best estimator.
print('The best parameters are:')
for key, val in svc_gscv.best_params_.items():
    print(str(key) + ':', val)
# print('\nThe best score for the refined Support Vector Machine model is {}%'.format(svc_gscv.best_score_*100))

In [None]:
# Weighted-average F1 of the current preds_svc (the cv=5 grid search when the
# notebook is run top to bottom).
f1_svc_gscv = f1_score(y_test.values, preds_svc, average='weighted')
print('f1-score with grid search:', f1_svc_gscv, 'with average: weighted')

In [None]:
# One-vs-rest rates for the tuned Support Vector Machine predictions, derived
# from the multiclass confusion matrix (rows: true class, columns: predicted).
cnf_matrix_svc_gscv = confusion_matrix(y_test.values, preds_svc)

# Per-class counts as float arrays: the diagonal holds the hits, column totals
# minus the diagonal are false positives, row totals minus it false negatives.
TP = np.diag(cnf_matrix_svc_gscv).astype(float)
FP = (cnf_matrix_svc_gscv.sum(axis=0) - np.diag(cnf_matrix_svc_gscv)).astype(float)
FN = (cnf_matrix_svc_gscv.sum(axis=1) - np.diag(cnf_matrix_svc_gscv)).astype(float)
TN = (cnf_matrix_svc_gscv.sum() - (FP + FN + TP)).astype(float)

TPR = TP / (TP + FN)                    # sensitivity / recall / hit rate
TNR = TN / (TN + FP)                    # specificity
PPV = TP / (TP + FP)                    # precision
NPV = TN / (TN + FN)                    # negative predictive value
FPR = FP / (FP + TN)                    # fall-out
FNR = FN / (TP + FN)                    # false negative rate
FDR = FP / (TP + FP)                    # false discovery rate
ACC = (TP + TN) / (TP + FP + FN + TN)   # one-vs-rest accuracy per class

print('metric: specificity, with value:', TNR)

# NOTE(review): TNR follows the label order of the confusion matrix, while
# target_names in this notebook lists Low/Medium/High in that order — confirm
# that the high/med/low naming of this unpacking matches the label encoding.
tnr_highrisk_svc_gscv, tnr_medrisk_svc_gscv, tnr_lowrisk_svc_gscv = TNR
print(tnr_highrisk_svc_gscv, tnr_medrisk_svc_gscv, tnr_lowrisk_svc_gscv)

# Unweighted mean of the per-class specificities.
tnr_svc_gscv = TNR.mean()
print('average specificity:', tnr_svc_gscv)

##### Plotting ROC and P-R Curves

In [None]:
# Fit a fresh model with the hyperparameters selected by the cv=5 grid search
# (svc_gscv) instead of hard-coded values, so the plot always reflects the
# search result even when the data or the grid changes.
svc_best = SVC(**svc_gscv.best_params_)
svc_best.fit(X_train, y_train)

# plot multiclass P-R curve
visualize_data.plot_precision_recall(
    'Support Vector Machine', svc_best, X_test, y_test,
    n_classes=3, figsize=(16, 10))

In [None]:
# Fit a fresh model with the hyperparameters selected by the cv=5 grid search
# (svc_gscv) instead of hard-coded values, so the plot always reflects the
# search result even when the data or the grid changes.
svc_best = SVC(**svc_gscv.best_params_)
svc_best.fit(X_train, y_train)

# plot multiclass ROC curve
visualize_data.plot_multiclass_roc(
    'Support Vector Machine', svc_best, X_test, y_test,
    n_classes=3, figsize=(16, 10))

### Naive Bayes

In [None]:
# Baseline: Gaussian Naive Bayes with scikit-learn's default hyperparameters.
gnb = GaussianNB()
gnb.fit(X_train, y_train)

# Predictions and score (reported as a percentage) on the held-out test set.
preds_gnb = gnb.predict(X_test)
accuracy_gnb = gnb.score(X_test, y_test)*100
print('The accuracy for the Naive Bayes is {}%'.format(accuracy_gnb))

In [None]:
# Weighted-average F1 of the baseline Naive Bayes predictions.
f1_gnb = f1_score(y_test.values, preds_gnb, average='weighted')
print('f1-score:', f1_gnb, 'with average: weighted')

In [None]:
# One-vs-rest rates for the baseline Naive Bayes predictions, derived from
# the multiclass confusion matrix (rows: true class, columns: predicted).
cnf_matrix_gnb = confusion_matrix(y_test.values, preds_gnb)

# Per-class counts as float arrays: the diagonal holds the hits, column totals
# minus the diagonal are false positives, row totals minus it false negatives.
TP = np.diag(cnf_matrix_gnb).astype(float)
FP = (cnf_matrix_gnb.sum(axis=0) - np.diag(cnf_matrix_gnb)).astype(float)
FN = (cnf_matrix_gnb.sum(axis=1) - np.diag(cnf_matrix_gnb)).astype(float)
TN = (cnf_matrix_gnb.sum() - (FP + FN + TP)).astype(float)

TPR = TP / (TP + FN)                    # sensitivity / recall / hit rate
TNR = TN / (TN + FP)                    # specificity
PPV = TP / (TP + FP)                    # precision
NPV = TN / (TN + FN)                    # negative predictive value
FPR = FP / (FP + TN)                    # fall-out
FNR = FN / (TP + FN)                    # false negative rate
FDR = FP / (TP + FP)                    # false discovery rate
ACC = (TP + TN) / (TP + FP + FN + TN)   # one-vs-rest accuracy per class

print('metric: specificity, with value:', TNR)

# NOTE(review): TNR follows the label order of the confusion matrix, while
# target_names in this notebook lists Low/Medium/High in that order — confirm
# that the high/med/low naming of this unpacking matches the label encoding.
tnr_highrisk_gnb, tnr_medrisk_gnb, tnr_lowrisk_gnb = TNR
print(tnr_highrisk_gnb, tnr_medrisk_gnb, tnr_lowrisk_gnb)

# Unweighted mean of the per-class specificities.
tnr_gnb = TNR.mean()
print('average specificity:', tnr_gnb)

In [None]:
# Show the estimator's current hyperparameters, i.e. what is available to tune.
gnb.get_params()

In [None]:
# First tuning pass: search var_smoothing only, using GridSearchCV's
# default settings.
gnb_parameters = {'var_smoothing':[1e-3, 1e-6, 1e-9]}
gnb = GaussianNB()
clf4 = GridSearchCV(gnb, gnb_parameters)

clf4.fit(X_train, y_train)
# Overwrites the baseline preds_gnb with the search's test-set predictions.
preds_gnb = clf4.predict(X_test)
accuracy_gnb2 = np.mean(y_test == preds_gnb)*100

print('The accuracy for the refined Naive Bayes model is {}%'.format(accuracy_gnb2))

In [None]:
# Per-class precision/recall/F1 for the current preds_gnb, labelled with the
# target_names list defined in the Logistic Regression section.
print(classification_report(y_test, preds_gnb, target_names=target_names))

##### K-Fold Cross-Validation

In [None]:
# Perform 5-fold cross-validation of an untuned Gaussian Naive Bayes model.
# NOTE(review): this runs on the full X, y (train and test rows together),
# not only on the training split.
gnb_cv = GaussianNB()

# %time reports how long the cross-validation takes.
%time cv_scores = cross_val_score(gnb_cv, X, y, cv=5)

print('Cross-Validation Scores: ' + str(cv_scores))
print('Mean of Cross-Validation Scores: {}%'.format(np.mean(cv_scores)*100))

##### Tuning Model Hyperparameters

In [None]:
# Second tuning pass: 5-fold grid search, then report accuracy and best params.
gnb = GaussianNB()

# Only 'var_smoothing' is searched; the 'priors' entry below is commented out.
# For reference, the disabled 'priors' candidates are:
# - the first list gives (near-)equal weight to the 3 classes 0, 1 and 2
# - the second list refers to the weights
# of the classes that we provided initially
# NOTE(review): var_smoothing=0 is among the candidates; presumably this can
# leave zero variances for constant features — confirm it is wanted.
params = {#'priors': [[0.333, 0.333, 0.334], [0.55, 0.30, 0.15]],
          'var_smoothing': [0,1e-3,1e-6, 1e-9,0.01,0.1,0.5,1]}

gnb_gscv = GridSearchCV(gnb, params, cv=5)

gnb_gscv.fit(X_train, y_train)

# Overwrites preds_gnb from the first grid search with this search's predictions.
preds_gnb = gnb_gscv.predict(X_test)
accuracy_gnb_gscv = np.mean(y_test == preds_gnb)*100

print('The accuracy for the refined Naive Bayes model is {}%'.format(accuracy_gnb_gscv))

# Report the winning hyperparameters. The commented-out line at the end would
# additionally print the mean cross-validated score of the best estimator.
print('The best parameters are:')
for key, val in gnb_gscv.best_params_.items():
    print(str(key) + ':', val)
# print('\nThe best score for the refined Naive Bayes model is {}%'.format(gnb_gscv.best_score_*100))

In [None]:
# Weighted-average F1 of the current preds_gnb (the cv=5 grid search when the
# notebook is run top to bottom).
f1_gnb_gscv = f1_score(y_test.values, preds_gnb, average='weighted')
print('f1-score with grid search:', f1_gnb_gscv, 'with average: weighted')

In [None]:
# One-vs-rest rates for the tuned Naive Bayes predictions, derived from the
# multiclass confusion matrix (rows: true class, columns: predicted).
cnf_matrix_gnb_gscv = confusion_matrix(y_test.values, preds_gnb)

# Per-class counts as float arrays: the diagonal holds the hits, column totals
# minus the diagonal are false positives, row totals minus it false negatives.
TP = np.diag(cnf_matrix_gnb_gscv).astype(float)
FP = (cnf_matrix_gnb_gscv.sum(axis=0) - np.diag(cnf_matrix_gnb_gscv)).astype(float)
FN = (cnf_matrix_gnb_gscv.sum(axis=1) - np.diag(cnf_matrix_gnb_gscv)).astype(float)
TN = (cnf_matrix_gnb_gscv.sum() - (FP + FN + TP)).astype(float)

TPR = TP / (TP + FN)                    # sensitivity / recall / hit rate
TNR = TN / (TN + FP)                    # specificity
PPV = TP / (TP + FP)                    # precision
NPV = TN / (TN + FN)                    # negative predictive value
FPR = FP / (FP + TN)                    # fall-out
FNR = FN / (TP + FN)                    # false negative rate
FDR = FP / (TP + FP)                    # false discovery rate
ACC = (TP + TN) / (TP + FP + FN + TN)   # one-vs-rest accuracy per class

print('metric: specificity, with value:', TNR)

# NOTE(review): TNR follows the label order of the confusion matrix, while
# target_names in this notebook lists Low/Medium/High in that order — confirm
# that the high/med/low naming of this unpacking matches the label encoding.
tnr_highrisk_gnb_gscv, tnr_medrisk_gnb_gscv, tnr_lowrisk_gnb_gscv = TNR
print(tnr_highrisk_gnb_gscv, tnr_medrisk_gnb_gscv, tnr_lowrisk_gnb_gscv)

# Unweighted mean of the per-class specificities.
tnr_gnb_gscv = TNR.mean()
print('average specificity:', tnr_gnb_gscv)

##### Plotting ROC and P-R Curves

In [None]:
# Fit a fresh model with the hyperparameters selected by the cv=5 grid search
# (gnb_gscv). Hard-coded values are avoided because they drift from the
# search: a fixed `priors` setting, for instance, is not something the
# current grid tunes, so it cannot be a search result.
gnb_best = GaussianNB(**gnb_gscv.best_params_)
gnb_best.fit(X_train, y_train)

# plot multiclass P-R curve
visualize_data.plot_precision_recall(
    'Naive Bayes', gnb_best, X_test, y_test,
    n_classes=3, figsize=(16, 10))

In [None]:
# Fit a fresh model with the hyperparameters selected by the cv=5 grid search
# (gnb_gscv). Hard-coded values are avoided because they drift from the
# search: a fixed `priors` setting, for instance, is not something the
# current grid tunes, so it cannot be a search result.
gnb_best = GaussianNB(**gnb_gscv.best_params_)
gnb_best.fit(X_train, y_train)

# plot multiclass ROC curve
visualize_data.plot_multiclass_roc(
    'Naive Bayes', gnb_best, X_test, y_test,
    n_classes=3, figsize=(16, 10))

### Random Forest

In [None]:
# Baseline: Random Forest with default hyperparameters. The seed is pinned
# (same value as the train/test split) so the reported scores are repeatable;
# an unseeded forest gives different numbers on every run.
rf = RandomForestClassifier(random_state=10)
rf.fit(X_train, y_train)

# Predictions and score (reported as a percentage) on the held-out test set.
preds_rf = rf.predict(X_test)
accuracy_rf = rf.score(X_test, y_test)*100
print('The accuracy for the Random Forest is {}%'.format(accuracy_rf))

In [None]:
# Weighted-average F1 of the baseline Random Forest predictions.
f1_rf = f1_score(y_test.values, preds_rf, average='weighted')
print('f1-score:', f1_rf, 'with average: weighted')

In [None]:
# One-vs-rest rates for the baseline Random Forest predictions, derived from
# the multiclass confusion matrix (rows: true class, columns: predicted).
cnf_matrix_rf = confusion_matrix(y_test.values, preds_rf)

# Per-class counts as float arrays: the diagonal holds the hits, column totals
# minus the diagonal are false positives, row totals minus it false negatives.
TP = np.diag(cnf_matrix_rf).astype(float)
FP = (cnf_matrix_rf.sum(axis=0) - np.diag(cnf_matrix_rf)).astype(float)
FN = (cnf_matrix_rf.sum(axis=1) - np.diag(cnf_matrix_rf)).astype(float)
TN = (cnf_matrix_rf.sum() - (FP + FN + TP)).astype(float)

TPR = TP / (TP + FN)                    # sensitivity / recall / hit rate
TNR = TN / (TN + FP)                    # specificity
PPV = TP / (TP + FP)                    # precision
NPV = TN / (TN + FN)                    # negative predictive value
FPR = FP / (FP + TN)                    # fall-out
FNR = FN / (TP + FN)                    # false negative rate
FDR = FP / (TP + FP)                    # false discovery rate
ACC = (TP + TN) / (TP + FP + FN + TN)   # one-vs-rest accuracy per class

print('metric: specificity, with value:', TNR)

# NOTE(review): TNR follows the label order of the confusion matrix, while
# target_names in this notebook lists Low/Medium/High in that order — confirm
# that the high/med/low naming of this unpacking matches the label encoding.
tnr_highrisk_rf, tnr_medrisk_rf, tnr_lowrisk_rf = TNR
print(tnr_highrisk_rf, tnr_medrisk_rf, tnr_lowrisk_rf)

# Unweighted mean of the per-class specificities.
tnr_rf = TNR.mean()
print('average specificity:', tnr_rf)

In [None]:
# First tuning pass: search the number of trees with GridSearchCV's default
# settings. The forest is seeded so that differences between candidates come
# from n_estimators rather than from run-to-run randomness.
rf_parameters = {'n_estimators':[100, 50, 10]}
rf = RandomForestClassifier(random_state=10)
clf5 = GridSearchCV(rf, rf_parameters)

clf5.fit(X_train, y_train)
# Overwrites the baseline preds_rf with the search's test-set predictions.
preds_rf = clf5.predict(X_test)
accuracy_rf2 = np.mean(y_test == preds_rf)*100

print('The accuracy for the refined Random Forest model is {}%'.format(accuracy_rf2))

In [None]:
# Per-class precision/recall/F1 for the current preds_rf, labelled with the
# target_names list defined in the Logistic Regression section.
print(classification_report(y_test, preds_rf, target_names=target_names))

##### K-Fold Cross-Validation

In [None]:
# perform k-fold cross-validation
rf_cv = RandomForestClassifier()

%time cv_scores = cross_val_score(rf_cv, X, y, cv=5)

print('Cross-Validation Scores: ' + str(cv_scores))
print('Mean of Cross-Validation Scores: {}%'.format(np.mean(cv_scores)*100))

##### Tuning Model Hyperparameters

In [None]:
# Second tuning pass: 5-fold grid search, then report accuracy and best params.
# The forest is seeded so the selected n_estimators is repeatable and not an
# artefact of run-to-run randomness.
rf = RandomForestClassifier(random_state=10)

# Only 'n_estimators' is searched; the 'class_weight' entry is commented out.
# For reference, the disabled 'class_weight' candidates are:
# - the first dict is equal weights between the 3 classes 0, 1 and 2
# - the second dict refers to the weights
# of the classes that we provided initially
params = {#'class_weight': [{0: 1, 1: 1, 2: 1}, {0: 0.55, 1: 0.30, 2: 0.15}],
          'n_estimators':[200, 100, 50, 10]}

rf_gscv = GridSearchCV(rf, params, cv=5)

rf_gscv.fit(X_train, y_train)

# Overwrites preds_rf from the first grid search with this search's predictions.
preds_rf = rf_gscv.predict(X_test)
accuracy_rf_gscv = np.mean(y_test == preds_rf)*100

print('The accuracy for the refined Random Forest model is {}%'.format(accuracy_rf_gscv))

# Report the winning hyperparameters. The commented-out line at the end would
# additionally print the mean cross-validated score of the best estimator.
print('The best parameters are:')
for key, val in rf_gscv.best_params_.items():
    print(str(key) + ':', val)
# print('\nThe best score for the refined Random Forest model is {}%'.format(rf_gscv.best_score_*100))

In [None]:
# Weighted-average F1 of the current preds_rf (the cv=5 grid search when the
# notebook is run top to bottom).
f1_rf_gscv = f1_score(y_test.values, preds_rf, average='weighted')
print('f1-score with grid search:', f1_rf_gscv, 'with average: weighted')

In [None]:
# One-vs-rest rates for the tuned Random Forest predictions, derived from the
# multiclass confusion matrix (rows: true class, columns: predicted).
cnf_matrix_rf_gscv = confusion_matrix(y_test.values, preds_rf)

# Per-class counts as float arrays: the diagonal holds the hits, column totals
# minus the diagonal are false positives, row totals minus it false negatives.
TP = np.diag(cnf_matrix_rf_gscv).astype(float)
FP = (cnf_matrix_rf_gscv.sum(axis=0) - np.diag(cnf_matrix_rf_gscv)).astype(float)
FN = (cnf_matrix_rf_gscv.sum(axis=1) - np.diag(cnf_matrix_rf_gscv)).astype(float)
TN = (cnf_matrix_rf_gscv.sum() - (FP + FN + TP)).astype(float)

TPR = TP / (TP + FN)                    # sensitivity / recall / hit rate
TNR = TN / (TN + FP)                    # specificity
PPV = TP / (TP + FP)                    # precision
NPV = TN / (TN + FN)                    # negative predictive value
FPR = FP / (FP + TN)                    # fall-out
FNR = FN / (TP + FN)                    # false negative rate
FDR = FP / (TP + FP)                    # false discovery rate
ACC = (TP + TN) / (TP + FP + FN + TN)   # one-vs-rest accuracy per class

print('metric: specificity, with value:', TNR)

# NOTE(review): TNR follows the label order of the confusion matrix, while
# target_names in this notebook lists Low/Medium/High in that order — confirm
# that the high/med/low naming of this unpacking matches the label encoding.
tnr_highrisk_rf_gscv, tnr_medrisk_rf_gscv, tnr_lowrisk_rf_gscv = TNR
print(tnr_highrisk_rf_gscv, tnr_medrisk_rf_gscv, tnr_lowrisk_rf_gscv)

# Unweighted mean of the per-class specificities.
tnr_rf_gscv = TNR.mean()
print('average specificity:', tnr_rf_gscv)

##### Plotting ROC and P-R Curves

In [None]:
# Fit a fresh model with the hyperparameters selected by the cv=5 grid search
# (rf_gscv). Hard-coded values are avoided because they drift from the
# search: a fixed `class_weight`, for instance, is not something the current
# grid tunes, so it cannot be a search result. The seed keeps the plotted
# curves repeatable.
rf_best = RandomForestClassifier(**rf_gscv.best_params_, random_state=10)
rf_best.fit(X_train, y_train)

# plot multiclass P-R curve
visualize_data.plot_precision_recall(
    'Random Forest', rf_best, X_test, y_test,
    n_classes=3, figsize=(16, 10))

In [None]:
# Fit a fresh model with the hyperparameters selected by the cv=5 grid search
# (rf_gscv). Hard-coded values are avoided because they drift from the
# search: a fixed `class_weight`, for instance, is not something the current
# grid tunes, so it cannot be a search result. The seed keeps the plotted
# curves repeatable.
rf_best = RandomForestClassifier(**rf_gscv.best_params_, random_state=10)
rf_best.fit(X_train, y_train)

# plot multiclass ROC curve
visualize_data.plot_multiclass_roc(
    'Random Forest', rf_best, X_test, y_test,
    n_classes=3, figsize=(16, 10))

### Decision Tree

In [None]:
# Baseline: Decision Tree with default hyperparameters. The seed is pinned
# (same value as the train/test split) because the tree breaks ties between
# equally good splits at random, so unseeded scores vary between runs.
dt = DecisionTreeClassifier(random_state=10)
dt.fit(X_train, y_train)

# Predictions and score (reported as a percentage) on the held-out test set.
preds_dt = dt.predict(X_test)
accuracy_dt = dt.score(X_test, y_test)*100
print('The accuracy for the Decision Tree model is {}%'.format(accuracy_dt))

In [None]:
# Weighted-average F1 of the baseline Decision Tree predictions.
f1_dt = f1_score(y_test.values, preds_dt, average='weighted')
print('f1-score:', f1_dt, 'with average: weighted')

In [None]:
# One-vs-rest rates for the baseline Decision Tree predictions, derived from
# the multiclass confusion matrix (rows: true class, columns: predicted).
cnf_matrix_dt = confusion_matrix(y_test.values, preds_dt)

# Per-class counts as float arrays: the diagonal holds the hits, column totals
# minus the diagonal are false positives, row totals minus it false negatives.
TP = np.diag(cnf_matrix_dt).astype(float)
FP = (cnf_matrix_dt.sum(axis=0) - np.diag(cnf_matrix_dt)).astype(float)
FN = (cnf_matrix_dt.sum(axis=1) - np.diag(cnf_matrix_dt)).astype(float)
TN = (cnf_matrix_dt.sum() - (FP + FN + TP)).astype(float)

TPR = TP / (TP + FN)                    # sensitivity / recall / hit rate
TNR = TN / (TN + FP)                    # specificity
PPV = TP / (TP + FP)                    # precision
NPV = TN / (TN + FN)                    # negative predictive value
FPR = FP / (FP + TN)                    # fall-out
FNR = FN / (TP + FN)                    # false negative rate
FDR = FP / (TP + FP)                    # false discovery rate
ACC = (TP + TN) / (TP + FP + FN + TN)   # one-vs-rest accuracy per class

print('metric: specificity, with value:', TNR)

# NOTE(review): TNR follows the label order of the confusion matrix, while
# target_names in this notebook lists Low/Medium/High in that order — confirm
# that the high/med/low naming of this unpacking matches the label encoding.
tnr_highrisk_dt, tnr_medrisk_dt, tnr_lowrisk_dt = TNR
print(tnr_highrisk_dt, tnr_medrisk_dt, tnr_lowrisk_dt)

# Unweighted mean of the per-class specificities.
tnr_dt = TNR.mean()
print('average specificity:', tnr_dt)

In [None]:
# perform hyperparameter tuning and output the best params and score
dt = DecisionTreeClassifier()
params = {'class_weight': [{0: 1, 1: 1, 2: 1}, {0: 0.55, 1: 0.30, 2: 0.15}]}

dt_gscv = GridSearchCV(dt, params, cv=5)

dt_gscv.fit(X_train, y_train)

preds_dt = dt_gscv.predict(X_test)
accuracy_dt_gscv = np.mean(y_test == preds_dt)*100

print('The accuracy for the refined Decision Tree model is {}%'.format(accuracy_dt_gscv))

# NOTE: the following commented-out code is to print
# what the best parameter values are and 
# the mean cross-validated score of the best estimator
print('The best parameters are:')
for key, val in dt_gscv.best_params_.items():
    print(str(key) + ':', val)
# print('\nThe best score for the refined Random Forest model is {}%'.format(rf_gscv.best_score_*100))

In [None]:
# Per-class precision / recall / F1 table for the current `preds_dt`.
print(classification_report(y_true=y_test, y_pred=preds_dt, target_names=target_names))

In [None]:
# Weighted-average F1 of the current Decision Tree predictions (`preds_dt`).
f1_dt_gscv = f1_score(y_true=y_test.values, y_pred=preds_dt, average='weighted')
print('f1-score with grid search:', f1_dt_gscv, 'with average: weighted')

In [None]:
# Confusion matrix of the current Decision Tree predictions (`preds_dt`);
# scikit-learn puts true labels on the rows and predicted labels on the columns.
cnf_matrix_dt_gscv = confusion_matrix(y_test.values, preds_dt)

# Per-class one-vs-rest counts, held as floats so the ratios below are
# computed in floating point.
TP = np.diag(cnf_matrix_dt_gscv).astype(float)
FP = cnf_matrix_dt_gscv.sum(axis=0).astype(float) - TP   # predicted as the class, but wrong
FN = cnf_matrix_dt_gscv.sum(axis=1).astype(float) - TP   # belongs to the class, but missed
TN = float(cnf_matrix_dt_gscv.sum()) - (FP + FN + TP)    # everything else

# Denominators shared by several of the rates below.
actual_pos = TP + FN
actual_neg = TN + FP
predicted_pos = TP + FP
predicted_neg = TN + FN

TPR = TP / actual_pos        # sensitivity, hit rate, recall, true positive rate
TNR = TN / actual_neg        # specificity, true negative rate
PPV = TP / predicted_pos     # precision, positive predictive value
NPV = TN / predicted_neg     # negative predictive value
FPR = FP / actual_neg        # fall out, false positive rate
FNR = FN / actual_pos        # false negative rate
FDR = FP / predicted_pos     # false discovery rate
ACC = (TP + TN) / (TP + FP + FN + TN)  # per-class (one-vs-rest) accuracy

print('metric: specificity, with value:', TNR)

# One specificity value per class.
# NOTE(review): assumes the class order is high, medium, low risk - confirm
# against the label encoding used for `target_names`.
tnr_highrisk_dt_gscv, tnr_medrisk_dt_gscv, tnr_lowrisk_dt_gscv = TNR
print(tnr_highrisk_dt_gscv, tnr_medrisk_dt_gscv, tnr_lowrisk_dt_gscv)

# Unweighted mean of the per-class specificities.
tnr_dt_gscv = TNR.mean()
print('average specificity:', tnr_dt_gscv)

# metrics = [TPR, TNR]
# for metric in metrics:
#     print('metric:', str(metric), 'with value:', metric)

##### Plotting ROC and P-R Curves

In [None]:
# fit a model with the best parameters
dt_best = DecisionTreeClassifier()
dt_best.fit(X_train, y_train)

# plot multiclass P-R curve
visualize_data.plot_precision_recall(
    'Decision Tree', dt_best, X_test, y_test, 
    n_classes=3, figsize=(16, 10))

In [None]:
# fit a model with the best parameters
dt_best = DecisionTreeClassifier()
dt_best.fit(X_train, y_train)

# plot multiclass ROC curve
visualize_data.plot_multiclass_roc(
    'Decision Tree', dt_best, X_test, y_test, 
    n_classes=3, figsize=(16, 10))

### Artificial Neural Network (Multi-Layer Perceptron (MLP) Classifier)

In [None]:
# mlp = MLPClassifier()
# mlp.fit(X_train, y_train)

# preds_mlp = mlp.predict(X_test)
# accuracy_mlp = mlp.score(X_test, y_test)*100
# print('The accuracy for the MLP model is {}%'.format(accuracy_mlp))

In [None]:
# f1_mlp = f1_score(y_test.values, preds_mlp, average='weighted')
# print('f1-score:', f1_mlp, 'with average: weighted')

In [None]:
# cnf_matrix_mlp = confusion_matrix(y_test.values, preds_mlp)

# FP = cnf_matrix_mlp.sum(axis=0) - np.diag(cnf_matrix_mlp)  
# FN = cnf_matrix_mlp.sum(axis=1) - np.diag(cnf_matrix_mlp)
# TP = np.diag(cnf_matrix_mlp)
# TN = cnf_matrix_mlp.sum() - (FP + FN + TP)

# FP = FP.astype(float)
# FN = FN.astype(float)
# TP = TP.astype(float)
# TN = TN.astype(float)

# # Sensitivity, hit rate, recall, or true positive rate
# TPR = TP/(TP+FN)
# # Specificity or true negative rate
# TNR = TN/(TN+FP) 
# # Precision or positive predictive value
# PPV = TP/(TP+FP)
# # Negative predictive value
# NPV = TN/(TN+FN)
# # Fall out or false positive rate
# FPR = FP/(FP+TN)
# # False negative rate
# FNR = FN/(TP+FN)
# # False discovery rate
# FDR = FP/(TP+FP)
# # Overall accuracy
# ACC = (TP+TN)/(TP+FP+FN+TN)

# print('metric: specificity, with value:', TNR)

# tnr_highrisk_mlp, tnr_medrisk_mlp, tnr_lowrisk_mlp = TNR
# print(tnr_highrisk_mlp, tnr_medrisk_mlp, tnr_lowrisk_mlp)

# tnr_mlp = np.mean(TNR)
# print('average specificity:', tnr_mlp)

# # metrics = [TPR, TNR]
# # for metric in metrics:
# #     print('metric:', str(metric), 'with value:', metric)

In [None]:
# # perform hyperparameter tuning and output the best params and score
# mlp = MLPClassifier()
# params = {'alpha':[1e-6, 0.0001, 0.1]}

# mlp_gscv = GridSearchCV(mlp, params, cv=5)

# mlp_gscv.fit(X_train, y_train)

# preds_mlp = mlp_gscv.predict(X_test)
# accuracy_mlp_gscv = np.mean(y_test == preds_mlp)*100

# print('The accuracy for the refined MLP model is {}%'.format(accuracy_mlp_gscv))

# # NOTE: the following commented-out code is to print
# # what the best parameter values are and 
# # the mean cross-validated score of the best estimator
# print('The best parameters are:')
# for key, val in mlp_gscv.best_params_.items():
#     print(str(key) + ':', val)
# # print('\nThe best score for the refined MLP model is {}%'.format(mlp_gscv.best_score_*100))

In [None]:
# print(classification_report(y_test, preds_mlp, target_names=target_names))

In [None]:
# f1_mlp_gscv = f1_score(y_test.values, preds_mlp, average='weighted')
# print('f1-score with grid search:', f1_mlp_gscv, 'with average: weighted')

In [None]:
# cnf_matrix_mlp_gscv = confusion_matrix(y_test.values, preds_mlp)

# FP = cnf_matrix_mlp_gscv.sum(axis=0) - np.diag(cnf_matrix_mlp_gscv)  
# FN = cnf_matrix_mlp_gscv.sum(axis=1) - np.diag(cnf_matrix_mlp_gscv)
# TP = np.diag(cnf_matrix_mlp_gscv)
# TN = cnf_matrix_mlp_gscv.sum() - (FP + FN + TP)

# FP = FP.astype(float)
# FN = FN.astype(float)
# TP = TP.astype(float)
# TN = TN.astype(float)

# # Sensitivity, hit rate, recall, or true positive rate
# TPR = TP/(TP+FN)
# # Specificity or true negative rate
# TNR = TN/(TN+FP) 
# # Precision or positive predictive value
# PPV = TP/(TP+FP)
# # Negative predictive value
# NPV = TN/(TN+FN)
# # Fall out or false positive rate
# FPR = FP/(FP+TN)
# # False negative rate
# FNR = FN/(TP+FN)
# # False discovery rate
# FDR = FP/(TP+FP)
# # Overall accuracy
# ACC = (TP+TN)/(TP+FP+FN+TN)

# print('metric: specificity, with value:', TNR)

# tnr_highrisk_mlp_gscv, tnr_medrisk_mlp_gscv, tnr_lowrisk_mlp_gscv = TNR
# print(tnr_highrisk_mlp_gscv, tnr_medrisk_mlp_gscv, tnr_lowrisk_mlp_gscv)

# tnr_mlp_gscv = np.mean(TNR)
# print('average specificity:', tnr_mlp_gscv)

# # metrics = [TPR, TNR]
# # for metric in metrics:
# #     print('metric:', str(metric), 'with value:', metric)

##### Plotting ROC and P-R Curves

In [None]:
# # fit a model with the best parameters
# mlp_best = MLPClassifier()
# mlp_best.fit(X_train, y_train)

# # plot multiclass P-R curve
# visualize_data.plot_precision_recall(
#     'Multi-Layer Perceptron', mlp_best, X_test, y_test, 
#     n_classes=3, figsize=(16, 10))

In [None]:
# # fit a model with the best parameters
# mlp_best = MLPClassifier()
# mlp_best.fit(X_train, y_train)

# # plot multiclass ROC curve
# visualize_data.plot_multiclass_roc(
#     'Multi-Layer Perceptron', mlp_best, X_test, y_test, 
#     n_classes=3, figsize=(16, 10))

### Model Comparison

In [None]:
# Summary table: one row per model, with baseline and grid-searched values of
# accuracy, weighted F1 and average specificity side by side.
# (The Multi-Layer Perceptron entries stay disabled, matching its commented-out cells.)
models = [
    'Logistic Regression', 'K Nearest Neighbors', 'Support Vector Machine',
    'Naive Bayes', 'Random Forest', 'Decision Tree',
]  # 'Multi-Layer Perceptron'

# NOTE(review): `accuracy_lg` breaks the `_lr` naming used by every other
# Logistic Regression variable here - confirm it matches the name assigned in
# the Logistic Regression section.
model_accuracies = [
    accuracy_lg, accuracy_knn, accuracy_svc,
    accuracy_gnb, accuracy_rf, accuracy_dt,
]  # accuracy_mlp
gscv_accuracies = [
    accuracy_lr_gscv, accuracy_knn_gscv, accuracy_svc_gscv,
    accuracy_gnb_gscv, accuracy_rf_gscv, accuracy_dt_gscv,
]  # accuracy_mlp_gscv

model_f1 = [f1_lr, f1_knn, f1_svc, f1_gnb, f1_rf, f1_dt]  # f1_mlp
gscv_f1 = [
    f1_lr_gscv, f1_knn_gscv, f1_svc_gscv,
    f1_gnb_gscv, f1_rf_gscv, f1_dt_gscv,
]  # f1_mlp_gscv

model_tnr = [tnr_lr, tnr_knn, tnr_svc, tnr_gnb, tnr_rf, tnr_dt]  # tnr_mlp
gscv_tnr = [
    tnr_lr_gscv, tnr_knn_gscv, tnr_svc_gscv,
    tnr_gnb_gscv, tnr_rf_gscv, tnr_dt_gscv,
]  # tnr_mlp_gscv

# Build the frame column by column so the numeric values are never turned
# into strings on the way in.
model_comparison = pd.DataFrame({
    'Model': models,
    'Model Accuracy': model_accuracies,
    'Grid Search CV Accuracy': gscv_accuracies,
    'Model F1 Score': model_f1,
    'Grid Search CV F1 Score': gscv_f1,
    'Model Average Specificity': model_tnr,
    'Grid Search CV Average Specificity': gscv_tnr,
})

# Every column except the leading 'Model' label is a float metric.
model_comparison = model_comparison.astype(
    {metric: 'float' for metric in model_comparison.columns[1:]})
model_comparison = model_comparison.round(2)
model_comparison

In [None]:
# Report the top-ranked model for every metric column.
# - When Support Vector Machine (SVM) ranks first, the runner-up is reported
#   instead, to account for SVM's alleged unreliability.
# - Any other winner is reported as-is.
for column in model_comparison.columns:
    if column == 'Model':
        continue  # label column, not a metric

    print('metric:', column)
    # Rank once, highest value first; positions 0 and 1 are the winner and runner-up.
    ranked_models = (
        model_comparison
        .sort_values(column, ascending=False)
        .reset_index(drop=True)['Model']
    )
    best_model = ranked_models[0]
    if best_model == 'Support Vector Machine':
        _2nd_best_model = ranked_models[1]
        print('the second best model:', _2nd_best_model)
    else:
        print('the best model:', best_model)
    print()