# 60:40 Split

In [1]:
import pickle 

# Load the feature matrix and target vector that were pickled as a 2-tuple.
# NOTE(review): pickle.load can execute arbitrary code while deserializing, so
# 'mydata.pkl' must come from a trusted source. The path is relative to the
# notebook's working directory.
with open('mydata.pkl', 'rb') as f:
    X_encoded_scaled, y = pickle.load(f)

In [2]:
from sklearn.model_selection import train_test_split

# Hold out 40% of the rows for testing; random_state=0 fixes the shuffle so the
# split is reproducible. These four names are used by every later cell.
# NOTE(review): the split is not stratified on y (no `stratify=` argument), so
# class balance may differ between the two partitions — confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(X_encoded_scaled, y, test_size=.40, random_state=0)

# Sanity check: print the shapes of the features and labels in each partition
print("Training set size:", X_train.shape, y_train.shape)
print("Testing set size:", X_test.shape, y_test.shape)

Training set size: (181, 25) (181,)
Testing set size: (122, 25) (122,)


# Modeling 

## Helper Functions

In [3]:
import numpy as np
from scipy import stats 

def calculate_acc_ci(accuracy, n, confidence=0.95):
    """
    Calculate a normal-approximation (Wald) confidence interval for a classification accuracy.

    The interval is ``accuracy +/- z * sqrt(accuracy * (1 - accuracy) / n)`` where ``z`` is
    the two-sided standard-normal critical value for ``confidence``. The approximation can
    overshoot the valid range of a proportion when the accuracy is close to 0 or 1 or when
    ``n`` is small, so both bounds are clipped to [0, 1].

    Parameters:
    - accuracy: The observed accuracy (proportion of correct classifications), in [0, 1].
    - n: The total number of predictions made (sample size); must be positive.
    - confidence: The desired confidence level, strictly between 0 and 1.

    Returns:
    - A tuple containing the lower and upper bounds of the confidence interval.

    Raises:
    - ValueError: If ``confidence`` is not in (0, 1), ``n`` is not positive, or
      ``accuracy`` lies outside [0, 1].
    """
    # Validate up front: otherwise bad inputs surface as ZeroDivisionError, or as
    # silent NaN/inf values that propagate into reported results.
    if not 0 < confidence < 1:
        raise ValueError(f"confidence must be strictly between 0 and 1, got {confidence}")
    if np.any(np.asarray(n) <= 0):
        raise ValueError(f"n must be positive, got {n}")
    if np.any((np.asarray(accuracy) < 0) | (np.asarray(accuracy) > 1)):
        raise ValueError(f"accuracy must be within [0, 1], got {accuracy}")

    # Calculate the z-score for the desired confidence level
    z = np.abs(stats.norm.ppf((1 - confidence) / 2))

    # Calculate the margin of error
    margin_of_error = z * np.sqrt((accuracy * (1 - accuracy)) / n)

    # Calculate the confidence interval, clipped to the valid range of a proportion
    ci_lower = np.clip(accuracy - margin_of_error, 0.0, 1.0)
    ci_upper = np.clip(accuracy + margin_of_error, 0.0, 1.0)

    return ci_lower, ci_upper

In [4]:
from math import sqrt
from scipy.stats import norm

def calculate_sensitivity_confidence_interval(TP, FN, confidence_level=0.95):
    """
    Calculate a normal-approximation (Wald) confidence interval for sensitivity
    (true positive rate), ``TP / (TP + FN)``.

    The bounds are clipped to [0, 1] because the normal approximation can
    overshoot the valid range of a proportion when sensitivity is near 0 or 1
    or when there are few positive samples.

    Parameters:
    - TP: Number of true positives (non-negative).
    - FN: Number of false negatives (non-negative).
    - confidence_level: Desired confidence level for the interval, strictly
      between 0 and 1.

    Returns:
    - A tuple containing the lower and upper bounds of the confidence interval.

    Raises:
    - ValueError: If ``confidence_level`` is not in (0, 1), a count is negative,
      or there are no positive samples (``TP + FN == 0``).
    """
    if not 0 < confidence_level < 1:
        raise ValueError(
            f"confidence_level must be strictly between 0 and 1, got {confidence_level}"
        )
    if TP < 0 or FN < 0:
        raise ValueError(f"TP and FN must be non-negative, got TP={TP}, FN={FN}")

    # Number of actual positives. Checked explicitly because plain ints would raise
    # ZeroDivisionError while numpy ints (e.g. from a confusion matrix) would
    # silently produce NaN bounds.
    n = TP + FN
    if n == 0:
        raise ValueError("sensitivity is undefined when TP + FN == 0")

    # Calculate the point estimate of sensitivity
    sensitivity = TP / n

    # Calculate the standard error
    standard_error = sqrt(sensitivity * (1 - sensitivity) / n)

    # Find the z-score for the confidence level
    z_score = norm.ppf((1 + confidence_level) / 2)

    # Calculate the margin of error
    margin_of_error = z_score * standard_error

    # Calculate the confidence interval, clipped to the valid range of a proportion
    ci_lower = max(0.0, sensitivity - margin_of_error)
    ci_upper = min(1.0, sensitivity + margin_of_error)

    return ci_lower, ci_upper


In [5]:
def binary_sensitivity(c_matrix):
    """
    Calculate the true positives, false negatives, and sensitivity from a
    binary confusion matrix.

    The matrix is indexed as ``[actual, predicted]`` with index 1 treated as the
    positive class, so ``c_matrix[1, 1]`` is read as the true positives and
    ``c_matrix[1, 0]`` as the false negatives.

    Parameters:
    - c_matrix: 2x2 array-like (numpy array or nested sequence) holding the
      confusion matrix.

    Returns
    - TP: Number of true positives in confusion matrix
    - FN: Number of false negatives in confusion matrix
    - sen: float
        Sensitivity score ``TP / (TP + FN)``; NaN when the matrix contains no
        actual positives.

    Raises
    - ValueError: If ``c_matrix`` is not 2x2.
    """
    # Coerce so that nested lists work too; tuple indexing needs an ndarray.
    cm = np.asarray(c_matrix)

    # A larger (multi-class) matrix would otherwise be indexed without error and
    # yield a number that is not the binary sensitivity.
    if cm.shape != (2, 2):
        raise ValueError(
            f"expected a 2x2 binary confusion matrix, got shape {cm.shape}"
        )

    TP = cm[1, 1]
    FN = cm[1, 0]

    # Sensitivity is undefined without actual positives; return NaN explicitly
    # rather than relying on a numpy divide-by-zero warning.
    positives = TP + FN
    sen = TP / positives if positives != 0 else float("nan")

    return TP, FN, sen

In [6]:
from sklearn.metrics import confusion_matrix, f1_score, roc_auc_score, accuracy_score

def evaluation_metrics(pred, pred_proba, cm, X_test=X_test, y_test=y_test):
    """
    Compute accuracy, sensitivity, F1 and AUC-ROC for one set of predictions.

    Parameters:
    - pred: Predicted class labels, scored against ``y_test``.
    - pred_proba: Scores passed to ``roc_auc_score`` together with ``y_test``.
    - cm: Confusion matrix handed to ``binary_sensitivity``.
    - X_test: Accepted but not used in the body.
    - y_test: True labels. The default is the module-level ``y_test`` as it
      existed when this function was defined, so re-running the split without
      re-running this cell keeps the old labels.

    Returns:
    - A tuple ``(accuracy, sensitivity, f1, auc_roc)``.
    """
    accuracy = accuracy_score(y_test, pred)
    # binary_sensitivity returns (TP, FN, sensitivity); only the last is needed
    sensitivity = binary_sensitivity(cm)[-1]
    f1_value = f1_score(y_test, pred)
    roc_auc = roc_auc_score(y_test, pred_proba)
    return accuracy, sensitivity, f1_value, roc_auc


## Logistic Regression

In [7]:
from sklearn.linear_model import LogisticRegressionCV

# Logistic regression whose regularization strength is chosen by 5-fold
# cross-validation on the training partition (liblinear solver, fixed seed).
lr = LogisticRegressionCV(cv=5, random_state=1, solver='liblinear')
lr.fit(X_train, y_train)

# Hard labels, scores and the confusion matrix on the held-out test partition.
lr_pred = lr.predict(X_test)
# Column 1 of predict_proba is used as the score for AUC-ROC
# (assumes index 1 is the positive class — TODO confirm against lr.classes_).
lr_scores = lr.predict_proba(X_test)[:, 1]
lr_cm = confusion_matrix(y_test, lr_pred)

# evaluation_metrics scores against its default y_test
lr_acc, lr_sen, lr_f1_score, lr_auc_roc = evaluation_metrics(lr_pred, lr_scores, lr_cm)

print(f'LR Acc. Score: {lr_acc:.2f}')
print(f'LR Sen. Score: {lr_sen:.2f}')
print(f'LR F1 Score: {lr_f1_score:.2f}')
print(f'LR AUC-ROC: {lr_auc_roc:.2f}')

LR Acc. Score: 0.81
LR Sen. Score: 0.77
LR F1 Score: 0.81
LR AUC-ROC: 0.89


## Random Forest

In [8]:
from sklearn.ensemble import RandomForestClassifier
from sklearn. model_selection import GridSearchCV

# Hyperparameter grid for the random forest: 3 forest sizes x 4 depth limits
# (max_depth=None leaves tree depth unrestricted).
param_grid = {
    'n_estimators' : [100, 200, 300],
    'max_depth' : [None, 10, 20, 30],
}

# Base estimator for the search; the seed keeps each candidate fit repeatable.
rf = RandomForestClassifier(random_state=2)

# Exhaustive search with 5-fold CV on the training partition, using all cores.
# No `scoring` is passed, so the estimator's default score is used.
# NOTE: `param_grid` and `grid_search` are reassigned by later model sections.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, n_jobs=-1, verbose=0)

grid_search.fit(X_train, y_train)

# Best parameters and best mean cross-validation score
print("Best parameters:", grid_search.best_params_)
print("Best cross-validation score:", grid_search.best_score_)


Best parameters: {'max_depth': None, 'n_estimators': 300}
Best cross-validation score: 0.8451951951951951


In [9]:
# Refit a random forest on the full training partition.
# NOTE(review): n_estimators=300 is hard-coded (presumably copied from the grid
# search output) and random_state=3 differs from the search estimator's seed
# (2), so this is not the exact model that was cross-validated. Consider
# grid_search.best_estimator_ if that is not intended.
clf = RandomForestClassifier(n_estimators=300, random_state=3)
clf.fit(X_train, y_train)

# Hard labels, scores and the confusion matrix on the held-out test partition.
clf_pred = clf.predict(X_test)
# Column 1 of predict_proba is used as the score for AUC-ROC
# (assumes index 1 is the positive class — TODO confirm against clf.classes_).
clf_scores = clf.predict_proba(X_test)[:, 1]
clf_cm = confusion_matrix(y_test, clf_pred)

# evaluation_metrics scores against its default y_test
clf_acc, clf_sen, clf_f1_score, clf_auc_roc = evaluation_metrics(clf_pred, clf_scores, clf_cm)

print(f'CLF Acc. Score: {clf_acc:.2f}')
print(f'CLF Sen. Score: {clf_sen:.2f}')
print(f'CLF F1 Score: {clf_f1_score:.2f}')
print(f'CLF AUC-ROC: {clf_auc_roc:.2f}')

CLF Acc. Score: 0.76
CLF Sen. Score: 0.71
CLF F1 Score: 0.75
CLF AUC-ROC: 0.86


## Support-Vector Machine

In [10]:
from sklearn.svm import SVC

# Define the parameter grid to search (4 x 3 x 2 x 3 = 72 combinations).
# NOTE: 'degree' only matters for the 'poly' kernel, so the 'linear' and 'rbf'
# combinations are fitted once per degree value with identical results.
param_grid = {
    'C': [0.1, 1, 10, 100],  # Regularization parameter
    'kernel': ['linear', 'rbf', 'poly'],  # Type of kernel
    'gamma': ['scale', 'auto'],  # Kernel coefficient for 'rbf', 'poly', and 'sigmoid'
    'degree': [2, 3, 4]  # Degree of the polynomial kernel function ('poly'). Ignored by other kernels.
}

# Initialize the SVC (base estimator for the search, fixed seed)
svc = SVC(random_state=4)

# Initialize GridSearchCV: 5-fold CV on all cores; no `scoring` is passed, so
# the estimator's default score is used. This reassigns `grid_search`.
grid_search = GridSearchCV(estimator=svc, param_grid=param_grid, cv=5, verbose=0, n_jobs=-1)

# Fit GridSearchCV on the training partition only
grid_search.fit(X_train, y_train)

# Best parameters and best mean cross-validation score
print("Best parameters found: ", grid_search.best_params_)
print("Best cross-validation score: ", grid_search.best_score_)


Best parameters found:  {'C': 1, 'degree': 2, 'gamma': 'auto', 'kernel': 'rbf'}
Best cross-validation score:  0.8786786786786787


In [16]:
# Refit an RBF-kernel SVM on the full training partition with the hyperparameters
# reported by the grid search (degree is kept for fidelity but is ignored by 'rbf').
# probability=True enables predict_proba, which is calibrated with an internal
# randomized cross-validation; random_state is therefore set (matching the seed
# of the search estimator) so the scores and AUC-ROC are reproducible across runs.
svm = SVC(kernel='rbf', C=1, degree=2, gamma='auto', probability=True, random_state=4)
svm.fit(X_train, y_train)

# Hard labels, scores and the confusion matrix on the held-out test partition.
svm_pred = svm.predict(X_test)
# Column 1 of predict_proba is used as the score for AUC-ROC
# (assumes index 1 is the positive class — TODO confirm against svm.classes_).
svm_scores = svm.predict_proba(X_test)[:, 1]
svm_cm = confusion_matrix(y_test, svm_pred)

# evaluation_metrics scores against its default y_test
svm_acc, svm_sen, svm_f1_score, svm_auc_roc = evaluation_metrics(svm_pred, svm_scores, svm_cm)

print(f'SVM Acc. Score: {svm_acc:.2f}')
print(f'SVM Sen. Score: {svm_sen:.2f}')
print(f'SVM F1 Score: {svm_f1_score:.2f}')
print(f'SVM AUC-ROC: {svm_auc_roc:.2f}')


SVM Acc. Score: 0.80
SVM Sen. Score: 0.69
SVM F1 Score: 0.78
SVM AUC-ROC: 0.88


## K-Nearest Neighbors

In [17]:
from sklearn.neighbors import KNeighborsClassifier

# Hyperparameter grid for k-nearest neighbors: 5 odd neighborhood sizes x 2
# neighbor-weighting schemes.
param_grid = {
    'n_neighbors' : [3, 5, 7, 9, 11],
    'weights' : ['uniform', 'distance']
}

# Base estimator for the search (all other settings left at their defaults)
knn = KNeighborsClassifier()

# Exhaustive search with 5-fold CV on the training partition, using all cores.
# No `scoring` is passed, so the estimator's default score is used.
# This reassigns `param_grid` and `grid_search` from the previous sections.
grid_search = GridSearchCV(estimator=knn, param_grid=param_grid, cv=5, verbose=0, n_jobs=-1)

grid_search.fit(X_train, y_train)

# Print the best parameters and the best mean cross-validation score
print("Best parameters found:", grid_search.best_params_)
print("Best cross-validation score:", grid_search.best_score_)

Best parameters found: {'n_neighbors': 5, 'weights': 'uniform'}
Best cross-validation score: 0.8567567567567569


In [21]:
# Refit k-nearest neighbors on the full training partition.
# NOTE(review): the hyperparameters are hard-coded (presumably copied from the
# grid search output), and `knn` is rebound here, replacing the unfitted base
# estimator that was passed to the grid search.
knn = KNeighborsClassifier(n_neighbors=5, weights='uniform')
knn.fit(X_train, y_train)

# Hard labels, scores and the confusion matrix on the held-out test partition.
knn_pred = knn.predict(X_test)
# Column 1 of predict_proba is used as the score for AUC-ROC
# (assumes index 1 is the positive class — TODO confirm against knn.classes_).
knn_scores = knn.predict_proba(X_test)[:, 1]
knn_cm = confusion_matrix(y_test, knn_pred)

# evaluation_metrics scores against its default y_test
knn_acc, knn_sen, knn_f1_score, knn_auc_roc = evaluation_metrics(knn_pred, knn_scores, knn_cm)

print(f'KNN Acc. Score: {knn_acc:.2f}')
print(f'KNN Sen. Score: {knn_sen:.2f}')
print(f'KNN F1 Score: {knn_f1_score:.2f}')
print(f'KNN AUC-ROC: {knn_auc_roc:.2f}')

KNN Acc. Score: 0.80
KNN Sen. Score: 0.69
KNN F1 Score: 0.78
KNN AUC-ROC: 0.85


# Evaluation

In [22]:
import pandas as pd 

# Collect the test-set metrics of the four fitted models, one list per metric,
# each ordered the same way as `models`.
models = ['LR', 'RF', 'SVM', 'KNN']
acc_scores = [lr_acc, clf_acc, svm_acc, knn_acc]
sen_scores = [lr_sen, clf_sen, svm_sen, knn_sen]
f1_scores = [lr_f1_score, clf_f1_score, svm_f1_score, knn_f1_score]
auc_roc_scores = [lr_auc_roc, clf_auc_roc, svm_auc_roc, knn_auc_roc]

# One row per model, one column per metric (column order follows the dict).
results_df = pd.DataFrame(
    {
        'Model': models,
        'ACC': acc_scores,
        'SEN': sen_scores,
        'F1': f1_scores,
        'AUC-ROC': auc_roc_scores,
    }
)

results_df

Unnamed: 0,Model,ACC,SEN,F1,AUC-ROC
0,LR,0.811475,0.774194,0.806723,0.887097
1,RF,0.762295,0.709677,0.752137,0.861425
2,SVM,0.803279,0.693548,0.781818,0.876344
3,KNN,0.803279,0.693548,0.781818,0.845565


In [23]:
from tabulate import tabulate 

# Round the metrics to two decimals for presentation.
formatted_results_df = results_df.round(2)

# Render the table as a text grid without the index column. floatfmt='.2f' pins
# two decimals for every float: rounding alone drops trailing zeros, so a value
# such as 0.80 would otherwise be printed as "0.8" next to "0.81".
print(tabulate(formatted_results_df, headers='keys', tablefmt='fancy_grid', showindex=False,
               numalign='center', floatfmt='.2f'))

╒═════════╤═══════╤═══════╤══════╤═══════════╕
│ Model   │  ACC  │  SEN  │  F1  │  AUC-ROC  │
╞═════════╪═══════╪═══════╪══════╪═══════════╡
│ LR      │ 0.81  │ 0.77  │ 0.81 │   0.89    │
├─────────┼───────┼───────┼──────┼───────────┤
│ RF      │ 0.76  │ 0.71  │ 0.75 │   0.86    │
├─────────┼───────┼───────┼──────┼───────────┤
│ SVM     │  0.8  │ 0.69  │ 0.78 │   0.88    │
├─────────┼───────┼───────┼──────┼───────────┤
│ KNN     │  0.8  │ 0.69  │ 0.78 │   0.85    │
╘═════════╧═══════╧═══════╧══════╧═══════════╛
