# Modeling

In [None]:
## Import all the libraries
import pandas as pd
import numpy as np

## Data Viz 
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.style as style
from matplotlib import rc

## Transformation
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler

## for statistical tests
from math import sqrt
import scipy
from scipy.fft import fft, fftfreq
import statistics
from statistics import mean

## SMOTE
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler,  ClusterCentroids
from imblearn.metrics import classification_report_imbalanced, geometric_mean_score
from sklearn.svm import SVC

## Cluster Centroids
from imblearn.under_sampling import ClusterCentroids
from sklearn.cluster import KMeans

## Modelling 
from sklearn import datasets, decomposition, ensemble, feature_selection, linear_model, metrics, model_selection, preprocessing, svm, tree
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.feature_selection import SelectFromModel, SelectKBest
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
from sklearn.metrics import roc_auc_score, accuracy_score, classification_report, make_scorer, confusion_matrix, explained_variance_score, f1_score, mean_absolute_error, mean_squared_error, precision_score, r2_score, recall_score, roc_curve, roc_auc_score, precision_recall_curve, average_precision_score, auc, precision_recall_fscore_support
from sklearn.model_selection import cross_validate, cross_val_predict, cross_val_score, GridSearchCV, ShuffleSplit, train_test_split, RandomizedSearchCV
from sklearn.calibration import calibration_curve
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC, SVC, SVR
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils import resample
from xgboost import XGBClassifier
import xgboost as xgb

import shap

from sklearn.pipeline import Pipeline

import warnings
warnings.filterwarnings('ignore')

from tqdm import tqdm
import time

from tabulate import tabulate

print('Libraries imported successfully')

In [None]:
# set the style for the notebook, in this case colorblind, alternative would be grayscale
# NOTE(review): `uf` is not imported in any cell visible in this notebook, so this
# call raises NameError unless a helper module is bound to `uf` elsewhere — confirm.
uf.set_custom_palette("colorblind")
# Target folder for figures; not referenced by the cells visible here.
figures_folder = '../figures' 

In [None]:
# Set APA style parameters
# NOTE: the style sheet path is an absolute Kaggle input path.
plt.style.use('/kaggle/input/apa-mplstyle/apa.mplstyle') # selecting the style sheet
# Use the regular text font for mathtext.
rc('mathtext', **{'default': 'regular'})
# IPython magics: inline plots, rendered in retina resolution.
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

# NOTE(review): this cell only applies settings; it defines no functions.
print('APA Style functions defined')

In [None]:
# Load Code for Kaggle
# The first CSV column is used as the DataFrame index (index_col=0).
df = pd.read_csv('/kaggle/input/arrhythmia-preprocessed/arrhythmia_preprocessed_cleaned_classes_label(1).csv', sep=',', index_col=0)
print('Dataset imported successfully')
# Preview the first three rows.
df.head(3)

In [None]:
# Load Code for GitHub
# df = pd.read_csv('arrhythmia_preprocessed_cleaned_classes_label.csv')

# Separate Features, Standardize and Split

In [None]:
# Separate features and target variable
X = df.drop(['class','label'], axis=1)  # Features
y = df['label']  # Target variable

# Split the raw data first so the scaler never sees the test rows.
# Test size and seed are unchanged, so the train/test partition is the same as before.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features with statistics estimated on the training split only
# (fitting on the full dataset would leak test-set information into training).
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Scaled copy of the full feature matrix, used by the exploratory PCA cells below
X_scaled = scaler.transform(X)

# Save X train and X_test without PCA
X_train_saved = X_train
X_test_saved = X_test
y_train_saved = y_train
y_test_saved = y_test

## PCA - Principal Component Analysis

In [None]:
# Exploratory sweep: fit one PCA per candidate component count on the full scaled
# feature matrix and plot the total explained variance for each count.
# Define different numbers of components to try
n_components_list = [10,20,30,40,50,60,70,80,90,100,110]
cumulative_variance_ratios = []

# Initialize a list to store transformed dataframes
# (not read again by the cells visible in this notebook)
transformed_dataframes = []

for n_components in n_components_list:
    # Instantiate PCA with desired number of components
    pca = PCA(n_components=n_components)
    
    # Fit PCA to the standardized data
    pca.fit(X_scaled)
    
    # Transform the data into the new feature space
    X_pca = pca.transform(X_scaled)
    
    # Save the transformed data in a dataframe
    transformed_dataframes.append(pd.DataFrame(X_pca))
    
    # Get the explained variance ratio
    explained_variance_ratio = pca.explained_variance_ratio_
    # Total variance explained by the first n_components components
    cumulative_variance_ratio = sum(explained_variance_ratio)
    cumulative_variance_ratios.append(cumulative_variance_ratio)
    
    # Print cumulative variance ratio
    #print(f'Cumulative variance ratio with {n_components} components:', cumulative_variance_ratio)

# NOTE: after the loop `pca` and `X_pca` refer to the last candidate
# (the final entry of n_components_list).

# Plot cumulative variance ratio
plt.figure(figsize=(8, 6))
plt.plot(n_components_list, cumulative_variance_ratios, marker='o', linestyle='--', color='black')
plt.title('Cumulative Variance Ratio vs. Number of Components', fontsize=14)
plt.xlabel('Number of Components', fontsize=14)
plt.ylabel('Cumulative Variance Ratio', fontsize=14)

# Set x-ticks to intervals of 10
plt.xticks(np.arange(0, max(n_components_list) + 1, 10))

# Add vertical grid lines in grey
plt.grid(True, axis='x', color='grey', linestyle='--')

# Add annotations (each point is labelled with its ratio, two decimals)
for i, txt in enumerate(cumulative_variance_ratios):
    plt.annotate(f'{txt:.2f}', (n_components_list[i], cumulative_variance_ratios[i]), textcoords="offset points", xytext=(-15,10), ha='center')
plt.tight_layout()
# The figure is written to the current working directory.
plt.savefig('PCA_cumulative_variance_ratio_vs_nr_components.png')
plt.show()


In [None]:
# Re-run PCA on the full scaled data.
# NOTE(review): `pca` is whatever instance the previous cell left behind, i.e. the
# last candidate of the component sweep, not an explicitly selected component count.
X_pca = pca.fit_transform(X_scaled)

# Get the explained variance ratio
explained_variance_ratio = pca.explained_variance_ratio_

# Visualize the explained variance ratio (one bar per principal component)
plt.figure(figsize=(8, 6))
plt.bar(range(1, len(explained_variance_ratio) + 1), explained_variance_ratio, color='black', width=0.75)
plt.xlabel('Principal Component')
plt.ylabel('Explained Variance Ratio')
plt.title('Explained Variance Ratio for Principal Components')
plt.xlim(0, len(explained_variance_ratio) + 1)
#plt.ylim(0, 1)
plt.show()


# Setting the PCA Threshold for the Training & Test Datasets

In [None]:
# PCA on training data set 
# A float n_components between 0 and 1 asks PCA to keep as many components as are
# needed to explain that fraction of the variance (here 90%).
pca = PCA(n_components = .9)
# Fit on the training split only; the test split is projected with the same components.
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

# Number of components actually retained
print(pca.n_components_)

In [None]:
# Initialize PCA with 'mle' to automatically determine the number of components
#pca = PCA(n_components='mle')
# Fit PCA to the standardized data
#pca.fit(X_scaled)
# Get the number of components that explain at least 90% of the variance
#n_components = pca.explained_variance_ratio_.cumsum().searchsorted(0.90) + 1
# Print the selected number of components
#print(f'Selected number of components: {n_components}')

In [None]:
#SMOTE
#smo = SMOTE()
#X_train_sm, y_train_sm = smo.fit_resample(X_train, y_train)

#print("Shape of X_train resampled with smote:", X_train_sm.shape)
#print("Shape of y_train resampled with smote:", y_train_sm.shape)
#print('SMOTE :', dict(pd.Series(y_train_sm).value_counts()))

# Set X_train and X_test 

In [None]:
# From here on the models are trained on the PCA-reduced features.
# The un-reduced arrays remain available as X_train_saved / X_test_saved.
X_train = X_train_pca
X_test = X_test_pca
print(X_train.shape)
print(X_test.shape)

**Model Experiments**

# Define Classifiers and parameters

In [None]:
# Set random seed for NumPy
np.random.seed(42)

# Define classifiers with specified parameters
clf_lr = LogisticRegression(random_state=22, max_iter=2000)
clf_rf = RandomForestClassifier(random_state=22)
clf_svc = SVC(random_state=22)
clf_en = LogisticRegression(penalty='elasticnet', solver='saga', max_iter=10000)
clf_gb = GradientBoostingClassifier(random_state=42)
clf_ada = AdaBoostClassifier()
clf_xgb = xgb.XGBClassifier()


# Define parameter grids for each classifier
# The default 'lbfgs' solver does not support the 'l1' penalty, so those candidates
# used to fail during the search (silently, because warnings are suppressed).
# 'liblinear' supports both 'l1' and 'l2'; keeping the solver inside the grid means
# it is also part of best_params_ when the winning model is rebuilt later.
param_grid_lr = [{'C': [c], 'penalty': [penalty], 'solver': ['liblinear']}
                 for c in np.logspace(-4, 2, 9)
                 for penalty in ['l1', 'l2']]

param_grid_rf = [{'n_estimators': [10, 50, 100, 250, 500, 1000], 
                  'min_samples_leaf': [1, 3, 5], 
                  'max_features': ['sqrt', 'log2']}]

# The *_list variants below enumerate the same combinations one dict at a time;
# they are not referenced by the cells visible in this notebook.
param_grid_svc = {'C': np.logspace(-4, 2, 9), 'kernel': ['linear', 'rbf']}
param_grid_svc_list = [{'C': [c], 'kernel': [kernel]} for c in np.logspace(-4, 2, 9) for kernel in ['linear', 'rbf']]

param_grid_en = {'C': np.logspace(-4, 2, 9), 'l1_ratio': np.linspace(0.1, 0.9, 9)}
param_grid_en_list = [{'C': [C], 'l1_ratio': [l1_ratio]} for C in np.logspace(-4, 2, 9) for l1_ratio in np.linspace(0.1, 0.9, 9)]


param_grid_gb = {'n_estimators': [50, 100, 200], 'learning_rate': [0.01, 0.1, 1.0], 'max_depth': [3, 5, 7]}
param_grid_gb_list = [{'n_estimators': [n_estimators], 'learning_rate': [learning_rate], 'max_depth': [max_depth]} 
                                 for n_estimators in [50, 100, 200] 
                                 for learning_rate in [0.01, 0.1, 1.0] 
                                 for max_depth in [3, 5, 7]]

param_grid_ada = {'n_estimators': [50, 100, 200], 'learning_rate': [0.01, 0.1, 1.0]}
param_grid_ada_list = [{'n_estimators': [n_estimators], 'learning_rate': [learning_rate]} 
                       for n_estimators in [50, 100, 200] 
                       for learning_rate in [0.01, 0.1, 1.0]]

param_grid_xgb = {'n_estimators': [50, 100, 200], 'learning_rate': [0.01, 0.1, 1.0], 'max_depth': [3, 5, 7]}
param_grid_xgb_list = [{'n_estimators': [n_estimators], 'learning_rate': [learning_rate], 'max_depth': [max_depth]} 
                       for n_estimators in [50, 100, 200] 
                       for learning_rate in [0.01, 0.1, 1.0] 
                       for max_depth in [3, 5, 7]]

print('parameters set')

# Model Training Function with Progress Bar and Score Output

In [None]:
def perform_randomized_search(clf, param_grid, X_train, y_train, X_test, y_test):
    """Tune ``clf`` with a randomized search and score the refitted best model.

    Runs ``RandomizedSearchCV`` (10 sampled candidates, 3-fold CV, fixed seed)
    on the training data. Candidates are ranked with the estimator's default
    scorer, because no ``scoring`` argument is passed to the search.

    Args:
        clf: Unfitted scikit-learn compatible estimator.
        param_grid: Parameter distributions/grid passed as ``param_distributions``.
        X_train, y_train: Training features and labels used for the search.
        X_test, y_test: Held-out features and labels, used only for scoring.

    Returns:
        tuple: ``(best_params, train_metrics, test_metrics, elapsed_total)`` where
        the metric dicts come from ``get_metrics`` and ``elapsed_total`` is the
        wall-clock duration of the search in seconds.

    Note:
        ``get_metrics`` must be defined before this function is called.
    """
    print("Starting hyperparameter search...")
    start_total = time.time()
    gcv = RandomizedSearchCV(estimator=clf, param_distributions=param_grid, n_iter=10, cv=3, random_state=42, n_jobs=-1, verbose=0)
    gcv.fit(X_train, y_train)

    # Stop the clock right after the search. The previous version then ran a
    # simulated "progress" loop that slept 10 x 1 s after fitting had already
    # finished, which inflated every reported duration by about ten seconds.
    elapsed_total = time.time() - start_total

    # gcv.predict delegates to the best estimator refitted on the full training set.
    train_predictions = gcv.predict(X_train)
    test_predictions = gcv.predict(X_test)
    train_metrics = get_metrics(y_train, train_predictions)
    test_metrics = get_metrics(y_test, test_predictions)

    print(f"\nHyperparameter search completed in {elapsed_total:.2f} seconds.")

    return gcv.best_params_, train_metrics, test_metrics, elapsed_total

print('ok')

# Evaluation Function for Metrics Display and Confusion Matrix

In [None]:
# Function to calculate evaluation metrics
def get_metrics(true_labels, predicted_labels):
    """Score binary predictions against the ground truth.

    Args:
        true_labels: Ground-truth class labels.
        predicted_labels: Predicted class labels (hard labels, not scores).

    Returns:
        dict: Keys ``accuracy``, ``precision``, ``recall``, ``f1`` and ``roc_auc``.
        Precision, recall and F1 use ``average='binary'``; ROC AUC is computed
        from the predicted labels themselves.
    """
    scores = {'accuracy': accuracy_score(true_labels, predicted_labels)}

    # The three positive-class metrics share the same call signature.
    binary_scorers = (
        ('precision', precision_score),
        ('recall', recall_score),
        ('f1', f1_score),
    )
    for key, scorer in binary_scorers:
        scores[key] = scorer(true_labels, predicted_labels, average='binary')

    scores['roc_auc'] = roc_auc_score(true_labels, predicted_labels)
    return scores

def display_metrics(model_name, train_metrics, test_metrics):
    """Show train and test metrics of one model as a small table.

    Args:
        model_name (str): Headline printed above the table.
        train_metrics (dict): Metric name -> value for the training data.
        test_metrics (dict): Metric name -> value for the test data.

    The table has one row per data split ('Train', 'Test') and one column per
    metric, with values rounded to four decimals.
    """
    # Use the notebook's rich display when available, plain printing otherwise.
    try:
        from IPython.display import display as show_table
    except ImportError:
        show_table = print

    pretty_names = {'accuracy': 'Accuracy', 'precision': 'Precision', 'recall': 'Recall', 'f1': 'F1 Score', 'roc_auc': 'ROC AUC'}

    # Combine train and test metrics into a DataFrame: rows = split, columns = metric
    metrics_df = pd.DataFrame({'Train': train_metrics, 'Test': test_metrics}).T

    # After the transpose the metric names are the COLUMNS; renaming the index
    # (as before) matched nothing and left the raw keys in the table.
    metrics_df = metrics_df.rename(columns=pretty_names)

    # Format numerical values to display a maximum of 4 decimal points
    metrics_df = metrics_df.round(4)

    # Add model name as headline
    print(f"\n{'='*20}\n{model_name}\n{'='*20}\n")

    # Display the DataFrame
    show_table(metrics_df)

print('Evaluation Functions defined')

In [None]:
def plot_confusion_matrix(y_true, y_pred, title=None, labels=None):
    """
    Draw a single annotated confusion-matrix heatmap and show it.

    Each cell is annotated with its raw count and with its share of the
    corresponding true-label row, in percent.

    Args:
        y_true (array-like): True labels.
        y_pred (array-like): Predicted labels.
        title (str): Optional plot title.
        labels (list): Tick labels; defaults to '0', '1', ... per matrix row.
    """
    matrix = confusion_matrix(y_true, y_pred)
    size = len(matrix)

    # Row-wise percentages: every cell divided by the total of its true-label row.
    row_share = (matrix.T / np.sum(matrix, axis=1)).T * 100

    cell_text = []
    for row in range(size):
        cell_text.append(
            [f'({matrix[row, col]})\n{row_share[row, col]:.2f}%' for col in range(size)]
        )

    tick_names = labels if labels is not None else [str(k) for k in range(size)]

    sns.heatmap(
        matrix,
        annot=cell_text,
        fmt='',
        cmap='binary',
        cbar=False,
        xticklabels=tick_names,
        yticklabels=tick_names,
    )
    plt.xlabel('Predicted Labels')
    plt.ylabel('True Labels')
    if title:
        plt.title(title)
    plt.show()

def plot_confusion_matrices_side_by_side(model_name, data, titles, labels=None):
    """
    Plot binary confusion matrices side by side.

    Each cell is annotated with its role (True Neg, False Pos, False Neg,
    True Pos), its count and its share of all samples in that matrix. The
    annotation layout is hard-wired to 2x2 matrices, i.e. binary targets.

    Args:
        model_name (str): Name of the model, used as the figure title.
        data (list of tuples): Each tuple contains true labels and predicted labels for a set of data.
        titles (list of str): Titles for each confusion matrix plot.
        labels (list): Label order passed to ``confusion_matrix``; the tick
            labels themselves are always '0' and '1'.
    """
    num_matrices = len(data)
    # squeeze=False always yields a 2-D array of axes, so indexing also works
    # when only a single matrix is plotted.
    fig, axes = plt.subplots(1, num_matrices, figsize=(7 * num_matrices, 7), squeeze=False)
    axes = axes[0]
    fig.subplots_adjust(wspace=0.5)

    def add_labels_and_percentages(conf_matrix, ax):
        total_samples = np.sum(conf_matrix)
        group_names = ['True Neg', 'False Pos', 'False Neg', 'True Pos']
        cell_labels = [f'{name}\n{count}\n{count/total_samples:.2%}' for name, count in zip(group_names, conf_matrix.flatten())]
        cell_labels = np.asarray(cell_labels).reshape(2, 2)
        sns.heatmap(conf_matrix, annot=cell_labels, fmt='', cmap='binary', ax=ax,
                    annot_kws={'size': 14, 'fontweight': 'normal'}, cbar=False,
                    xticklabels=['0', '1'], yticklabels=['0', '1'])

    for i, (y_true, y_pred) in enumerate(data):
        current_ax = axes[i]
        conf_matrix = confusion_matrix(y_true, y_pred, labels=labels)
        current_ax.set_title(titles[i], fontsize=14, fontweight='bold')
        add_labels_and_percentages(conf_matrix, current_ax)

        current_ax.set_xlabel('Predicted labels', fontsize=14)
        current_ax.set_ylabel('True labels', fontsize=14)
        # `Tick.label` no longer exists in current Matplotlib releases;
        # tick_params sets the major tick label size on both axes instead.
        current_ax.tick_params(axis='both', labelsize=10)

    fig.suptitle(model_name, fontsize=18, fontweight='bold', y=1.05, ha='center')
    plt.show()

print('Confusion Matrix Functions defined')


# Logistic Regression

In [None]:
# Logistic Regression
print("Logistic Regression")

# Perform hyperparameter tuning for LogReg
best_params_lr, train_metrics_lr, test_metrics_lr, elapsed_total_lr = perform_randomized_search(clf_lr, param_grid_lr, X_train, y_train, X_test, y_test)

# Fit the model with the best hyperparameters.
# Start from the base estimator's settings (random_state, max_iter) and override
# only the tuned values, so this model has the same configuration as the one the
# search evaluated; building it from best_params alone dropped those settings.
clf_lr_best = LogisticRegression(**{**clf_lr.get_params(), **best_params_lr})
clf_lr_best.fit(X_train, y_train)

# Obtain predictions
train_predictions_lr = clf_lr_best.predict(X_train)
test_predictions_lr = clf_lr_best.predict(X_test)

# Store results 
lr_results = {
    'best_params': best_params_lr,
    'train_metrics': train_metrics_lr,
    'test_metrics': test_metrics_lr,
    'elapsed_total': elapsed_total_lr,
    'test_predictions': test_predictions_lr,
}

# Displaying the model metrics
print("Best Parameters:", best_params_lr)
display_metrics('Logistic Regression', train_metrics_lr, test_metrics_lr)

# Plot confusion matrices for training and testing data
titles = ['LR Training Data', 'LR Testing Data']
plot_confusion_matrices_side_by_side('Logistic Regression', [(y_train, train_predictions_lr), (y_test, test_predictions_lr)], titles)

# Random Forest

In [None]:
print("Random Forest")
# Perform hyperparameter tuning for RandomForest
best_params_rf, train_metrics_rf, test_metrics_rf, elapsed_total_rf = perform_randomized_search(clf_rf, param_grid_rf, X_train, y_train, X_test, y_test)

# Fit the model with the best hyperparameters.
# Start from the base estimator's settings (notably random_state) and override only
# the tuned values; without the seed the refitted forest is non-deterministic and
# its confusion matrices need not match the metrics reported by the search.
clf_rf_best = RandomForestClassifier(**{**clf_rf.get_params(), **best_params_rf})
clf_rf_best.fit(X_train, y_train)

# Obtain predictions
train_predictions_rf = clf_rf_best.predict(X_train)
test_predictions_rf = clf_rf_best.predict(X_test)

# Store results 
rf_results = {
    'best_params': best_params_rf,
    'train_metrics': train_metrics_rf,
    'test_metrics': test_metrics_rf,
    'elapsed_total': elapsed_total_rf,
    'test_predictions': test_predictions_rf,
}

# Displaying the model metrics
print("Best Parameters:", best_params_rf)
display_metrics('Random Forest', train_metrics_rf, test_metrics_rf)

# Plot confusion matrices for training and testing data
titles = ['RF Training Data', 'RF Testing Data']
plot_confusion_matrices_side_by_side('Random Forest', [(y_train, train_predictions_rf), (y_test, test_predictions_rf)], titles)

# SVC

In [None]:
print("SVC")
# Perform hyperparameter tuning for SVC
best_params_svc, train_metrics_svc, test_metrics_svc, elapsed_total_svc = perform_randomized_search(clf_svc, param_grid_svc, X_train, y_train, X_test, y_test)

# Fit the model with the best hyperparameters
# (a fresh SVC built from the tuned values only; the reported metrics above come
# from the estimator refitted inside the search)
clf_svc_best = SVC(**best_params_svc)
clf_svc_best.fit(X_train, y_train)

# Obtain predictions
train_predictions_svc = clf_svc_best.predict(X_train)
test_predictions_svc = clf_svc_best.predict(X_test)

# Store results 
svc_results = {
    'best_params': best_params_svc,
    'train_metrics': train_metrics_svc,
    'test_metrics': test_metrics_svc,
    'elapsed_total': elapsed_total_svc,
    'test_predictions': test_predictions_svc
}

# Displaying the model metrics
print("Best Parameters:", best_params_svc)
display_metrics('SVC', train_metrics_svc, test_metrics_svc)

# Plot confusion matrices for training and testing data
titles = ['SVC Training Data', 'SVC Testing Data']
plot_confusion_matrices_side_by_side('SVC', [(y_train, train_predictions_svc), (y_test, test_predictions_svc)], titles)

# ElasticNet

In [None]:
print("ElasticNet")
# Perform hyperparameter tuning for ElasticNet
best_params_en, train_metrics_en, test_metrics_en, elapsed_total_en = perform_randomized_search(clf_en, param_grid_en, X_train, y_train, X_test, y_test)

# Fit the model with the best hyperparameters
# (penalty, solver and max_iter repeat the settings of clf_en)
clf_en_best = LogisticRegression(penalty='elasticnet', solver='saga', max_iter=10000, **best_params_en)
clf_en_best.fit(X_train, y_train)

# Obtain predictions
train_predictions_en = clf_en_best.predict(X_train)
test_predictions_en = clf_en_best.predict(X_test)

# Store results 
en_results = {
    'best_params': best_params_en,
    'train_metrics': train_metrics_en,
    'test_metrics': test_metrics_en,
    'elapsed_total': elapsed_total_en,
    'test_predictions': test_predictions_en
}

# Displaying the model metrics
print("Best Parameters:", best_params_en)
display_metrics('ElasticNet', train_metrics_en, test_metrics_en)

# Plot confusion matrices for training and testing data
titles = ['EN Training Data', 'EN Testing Data']
plot_confusion_matrices_side_by_side('ElasticNet', [(y_train, train_predictions_en), (y_test, test_predictions_en)], titles)

# Gradientboost

In [None]:
print("Gradient Boosting Classifier")
# Perform hyperparameter tuning for GBoost
best_params_gb, train_metrics_gb, test_metrics_gb, elapsed_total_gb = perform_randomized_search(clf_gb, param_grid_gb, X_train, y_train, X_test, y_test)

# Fit the model with the best hyperparameters.
# Start from the base estimator's settings (notably random_state) and override only
# the tuned values, so the refitted model matches the configuration that was tuned.
clf_gb_best = GradientBoostingClassifier(**{**clf_gb.get_params(), **best_params_gb})
clf_gb_best.fit(X_train, y_train)

# Obtain predictions
train_predictions_gb = clf_gb_best.predict(X_train)
test_predictions_gb = clf_gb_best.predict(X_test)

# Store results 
gb_results = {
    'best_params': best_params_gb,
    'train_metrics': train_metrics_gb,
    'test_metrics': test_metrics_gb,
    'elapsed_total': elapsed_total_gb,
    'test_predictions': test_predictions_gb
}

# Displaying the model metrics
print("Best Parameters:", best_params_gb)
display_metrics('Gradient Boost', train_metrics_gb, test_metrics_gb)

# Plot confusion matrices for training and testing data
titles = ['GB Training Data', 'GB Testing Data']
plot_confusion_matrices_side_by_side('GradientBoost', [(y_train, train_predictions_gb), (y_test, test_predictions_gb)], titles)

# Adaboost

In [None]:
print("AdaBoost Classifier")
# Perform hyperparameter tuning for AdaBoost
best_params_ada, train_metrics_ada, test_metrics_ada, elapsed_total_ada = perform_randomized_search(clf_ada, param_grid_ada, X_train, y_train, X_test, y_test)

# Fit the model with the best hyperparameters
# (a fresh classifier built from the tuned values; the reported metrics above come
# from the estimator refitted inside the search)
clf_ada_best = AdaBoostClassifier(**best_params_ada)
clf_ada_best.fit(X_train, y_train)

# Obtain predictions
train_predictions_ada = clf_ada_best.predict(X_train)
test_predictions_ada = clf_ada_best.predict(X_test)

# Store results 
ada_results = {
    'best_params': best_params_ada,
    'train_metrics': train_metrics_ada,
    'test_metrics': test_metrics_ada,
    'elapsed_total': elapsed_total_ada,
    'test_predictions': test_predictions_ada
}

# Displaying the model metrics
print("Best Parameters:", best_params_ada)
display_metrics('Ada Boost', train_metrics_ada, test_metrics_ada)

# Plot confusion matrices for training and testing data
titles = ['ADA Training Data', 'ADA Testing Data']
plot_confusion_matrices_side_by_side('AdaBoost', [(y_train, train_predictions_ada), (y_test, test_predictions_ada)], titles)

# XGBoost

In [None]:
print("XGBoost Classifier")
# Perform hyperparameter tuning for XGBoost
best_params_xgb, train_metrics_xgb, test_metrics_xgb, elapsed_total_xgb = perform_randomized_search(clf_xgb, param_grid_xgb, X_train, y_train, X_test, y_test)

# Fit the model with the best hyperparameters
# (a fresh classifier built from the tuned values; the reported metrics above come
# from the estimator refitted inside the search)
clf_xgb_best = XGBClassifier(**best_params_xgb)
clf_xgb_best.fit(X_train, y_train)

# Obtain predictions
train_predictions_xgb = clf_xgb_best.predict(X_train)
test_predictions_xgb = clf_xgb_best.predict(X_test)

# Store results 
xgb_results = {
    'best_params': best_params_xgb,
    'train_metrics': train_metrics_xgb,
    'test_metrics': test_metrics_xgb,
    'elapsed_total': elapsed_total_xgb,
    'test_predictions': test_predictions_xgb
}

# Displaying the model metrics
print("Best Parameters:", best_params_xgb)
display_metrics('XGBoost', train_metrics_xgb, test_metrics_xgb)

# Plot confusion matrices for training and testing data
titles = ['XGB Training Data', 'XGB Testing Data']
plot_confusion_matrices_side_by_side('XGBoost', [(y_train, train_predictions_xgb), (y_test, test_predictions_xgb)], titles)

# Model Comparison

In [None]:
original_results = [lr_results, rf_results, svc_results, en_results, ada_results, gb_results, xgb_results]

In [None]:
# Define model names and their corresponding results
models = ['LR', 'RF', 'SVC', 'EN', 'ADA', 'GB', 'XGB']
results = [lr_results, rf_results, svc_results, en_results, ada_results, gb_results, xgb_results]

In [None]:
# Collect accuracy and recall per model, for both the test and the training data
test_accuracies = [result['test_metrics']['accuracy'] for result in results]
train_accuracies = [result['train_metrics']['accuracy'] for result in results]
test_recalls = [result['test_metrics']['recall'] for result in results]
train_recalls = [result['train_metrics']['recall'] for result in results]

# Geometry of the grouped bars: four bars per model, each bar_width wide
bar_width = 0.2
index = np.arange(len(models))

plt.figure(figsize=(12, 8))

# One entry per bar series: (values, hatch pattern, legend label).
# Series k is shifted k bar widths to the right of the group's first bar.
bar_series = [
    (test_accuracies, '/', 'Test Accuracy'),
    (train_accuracies, '\\', 'Train Accuracy'),
    (test_recalls, 'x', 'Test Recall'),
    (train_recalls, '.', 'Train Recall'),
]
for position, (values, hatch, label) in enumerate(bar_series):
    plt.bar(index + position * bar_width, values, bar_width, edgecolor='black', hatch=hatch, label=label)

plt.xlabel('Model')
plt.ylabel('Scores')
plt.title('Comparison of Model Performances')
# Centre each model name under its group of four bars
plt.xticks(index + 1.5*bar_width, models)
plt.legend(bbox_to_anchor=(1, 1), loc='upper left')

plt.show()


# AUROC for best performing models

In [None]:
# Plot ROC curves for each model
# NOTE(review): the stored test predictions are the outputs of .predict (class
# labels), not scores or probabilities, so each curve is built from label
# predictions only — confirm this is the intended ROC input.
plt.figure(figsize=(12, 8))

for model_name, result in zip(models, results):
    # Get the correct test predictions variable for the current model
    if 'test_predictions' in result:
        test_predictions = result['test_predictions']
    # Fallback for result dicts that nest the predictions inside 'test_metrics'
    elif 'test_metrics' in result and 'predictions' in result['test_metrics']:
        test_predictions = result['test_metrics']['predictions']
    else:
        raise KeyError(f"Test predictions not found for model: {model_name}")
        
    # Compute ROC curve for test data
    fpr, tpr, _ = roc_curve(y_test, test_predictions)
    
    # Plot ROC curve (the AUC in the legend is the value stored in the test metrics)
    plt.plot(fpr, tpr, label=f'{model_name} (AUC = {result["test_metrics"]["roc_auc"]:.2f})')

# Plot ROC curve for random guessing (baseline)
plt.plot([0, 1], [0, 1], linestyle='--', color='black', label='Random Guessing')

plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend()

plt.show()


# Model Metrics Display with Highlights

In [None]:
# Function to apply different highlighting based on column names
def highlight_max_min(col):
    """Return one CSS style string per cell of a DataFrame column.

    Columns whose name starts with 'Elapsed Total Time (s)' get their maximum
    marked in orange and their minimum in light blue (the maximum wins when
    both apply). Every other column only gets its maximum marked in green.
    Cells that match neither receive an empty style.
    """
    is_timing_column = col.name.startswith('Elapsed Total Time (s)')
    highest = col.max()

    if not is_timing_column:
        return ['background-color: #D3ECA1' if value == highest else '' for value in col]

    lowest = col.min()
    styles = []
    for value in col:
        if value == highest:
            styles.append('background-color: #FFD699')
        elif value == lowest:
            styles.append('background-color: #ADD8E6')
        else:
            styles.append('')
    return styles

In [None]:
# Define model names and their corresponding results
# (the two lists must stay in the same order: names are paired with results by position)
models = ['LR', 'RF', 'SVC', 'EN', 'ADA', 'GB', 'XGB']
original_results = [lr_results, rf_results, svc_results, en_results, ada_results, gb_results, xgb_results]

# Create a list of dictionaries containing the test scores and elapsed total time for each model
original_scores = []
for model_name, result in zip(models, original_results):
    original_scores.append({
        'Model': model_name,
        'Accuracy': result['test_metrics']['accuracy'],
        'Precision': result['test_metrics']['precision'],
        'Recall': result['test_metrics']['recall'],
        'F1 Score': result['test_metrics']['f1'],
        'ROC AUC': result['test_metrics']['roc_auc'],
        'Elapsed Total Time (s)': result['elapsed_total']
    })

# Create a DataFrame from the list of dictionaries
scores_df = pd.DataFrame(original_scores)

# Set the 'Model' column as the index
scores_df.set_index('Model', inplace=True)

# Find the maximum value for each test score
# (these values are not read again by the cells visible in this notebook)
max_accuracy = scores_df['Accuracy'].max()
max_recall = scores_df['Recall'].max()
max_precision = scores_df['Precision'].max()
max_f1 = scores_df['F1 Score'].max()
max_roc_auc = scores_df['ROC AUC'].max()

# Find the minimum and maximum value for elapsed total time
min_time = scores_df['Elapsed Total Time (s)'].min()
max_time = scores_df['Elapsed Total Time (s)'].max()

scores_df

In [None]:
# Apply the styling function to highlight the maximum value in each column
styled_df = scores_df.style.apply(highlight_max_min)

# Display the styled DataFrame
styled_df

# CV for Optimal PCA

In [None]:
# Extract best parameters if they exist, otherwise set defaults
best_kernel = best_params_svc.get('kernel', 'rbf')
best_C = best_params_svc.get('C', 1.0)

# Candidate component counts: 1 .. min(number of features, 200), INCLUSIVE.
# The previous range stopped one short, so keeping all features was never
# evaluated and a single-feature input produced an empty candidate list.
num_features = max(X_train.shape[1], 1)
param_grid = {
    'pca__n_components': range(1, min(num_features, 200) + 1),
}

# Initialize variables to store the best PCA and its corresponding score
best_pca = None
best_score = float('-inf')
mean_scores = []

# Perform grid search with cross-validation
for n_components in param_grid['pca__n_components']:
    # Define the pipeline with PCA followed by SVM using the best parameters.
    # PCA lives inside the pipeline, so it is refitted on each training fold.
    pipeline = Pipeline([
        ('pca', PCA(n_components=n_components)),
        ('svc', SVC(kernel=best_kernel, C=best_C))
    ])

    # Perform cross-validation and get mean accuracy
    scores = cross_val_score(pipeline, X_train, y_train, cv=5)
    mean_score = np.mean(scores)

    # Check if the current PCA is better than the previous best
    # (strict comparison: on ties the smaller component count is kept)
    if mean_score > best_score:
        best_score = mean_score
        best_pca = n_components

    # Store the mean accuracy
    mean_scores.append(mean_score)

# Print the best performing PCA and its corresponding score
print("Best Performing PCA (SVC):", best_pca)
print("Best Cross-validation Accuracy (SVC):", best_score)

# Plot results
plt.figure(figsize=(10, 6))
plt.plot(param_grid['pca__n_components'], mean_scores, '-o')
plt.xlabel('Number of Principal Components')
plt.ylabel('Cross-validation Accuracy')
plt.title('Cross-validation Accuracy vs. Number of Principal Components')
plt.grid(True)
plt.show()


In [None]:
# Fit PCA with the selected number of components on the training set,
# then project the test set with that same fitted transform
pca = PCA(n_components=best_pca)
X_train, X_test = pca.fit_transform(X_train), pca.transform(X_test)

# NOTE(review): X_train / X_test are overwritten in place, so running this cell a second
# time would apply PCA to data that is already projected

# Number of components kept by the fitted PCA
print(pca.n_components_)

# Rerunning Models with Optimal PCA

In [None]:
# Logistic Regression
print("Logistic Regression")

# Randomized hyperparameter search on the current X_train / X_test
(
    best_params_lr,
    train_metrics_lr,
    test_metrics_lr,
    elapsed_total_lr,
) = perform_randomized_search(clf_lr, param_grid_lr, X_train, y_train, X_test, y_test)

# Build a new model from the selected hyperparameters and fit it on the training set
# NOTE(review): only best_params_lr is passed on; any other settings of clf_lr are not
# carried over - confirm these predictions correspond to the reported metrics
clf_lr_best = LogisticRegression(**best_params_lr).fit(X_train, y_train)

# Class predictions on both splits
train_predictions_lr, test_predictions_lr = (
    clf_lr_best.predict(features) for features in (X_train, X_test)
)

# Bundle everything the comparison tables need
lr_results = dict(
    best_params=best_params_lr,
    train_metrics=train_metrics_lr,
    test_metrics=test_metrics_lr,
    elapsed_total=elapsed_total_lr,
    test_predictions=test_predictions_lr,
)

# Report the chosen hyperparameters and the train/test metrics
print("Best Parameters:", best_params_lr)
display_metrics('Logistic Regression', train_metrics_lr, test_metrics_lr)


In [None]:
print("Random Forest")

# Randomized hyperparameter search on the current X_train / X_test
(
    best_params_rf,
    train_metrics_rf,
    test_metrics_rf,
    elapsed_total_rf,
) = perform_randomized_search(clf_rf, param_grid_rf, X_train, y_train, X_test, y_test)

# Build a new forest from the selected hyperparameters and fit it on the training set
# NOTE(review): only best_params_rf is passed on; any other settings of clf_rf (e.g. a
# fixed random_state, if one was set) are not carried over - confirm this is intended
clf_rf_best = RandomForestClassifier(**best_params_rf).fit(X_train, y_train)

# Class predictions on both splits
train_predictions_rf, test_predictions_rf = (
    clf_rf_best.predict(features) for features in (X_train, X_test)
)

# Bundle everything the comparison tables need
rf_results = dict(
    best_params=best_params_rf,
    train_metrics=train_metrics_rf,
    test_metrics=test_metrics_rf,
    elapsed_total=elapsed_total_rf,
    test_predictions=test_predictions_rf,
)

# Report the chosen hyperparameters and the train/test metrics
print("Best Parameters:", best_params_rf)
display_metrics('Random Forest', train_metrics_rf, test_metrics_rf)


In [None]:
print("SVC")

# Randomized hyperparameter search for the SVC on the current X_train / X_test
(
    best_params_svc,
    train_metrics_svc,
    test_metrics_svc,
    elapsed_total_svc,
) = perform_randomized_search(clf_svc, param_grid_svc, X_train, y_train, X_test, y_test)

# Build a new SVC from the selected hyperparameters and fit it on the training set
# NOTE(review): only best_params_svc is passed on; any other settings of clf_svc are not
# carried over - confirm these predictions correspond to the reported metrics
clf_svc_best = SVC(**best_params_svc).fit(X_train, y_train)

# Class predictions on both splits
train_predictions_svc, test_predictions_svc = (
    clf_svc_best.predict(features) for features in (X_train, X_test)
)

# Bundle everything the comparison tables need
svc_results = dict(
    best_params=best_params_svc,
    train_metrics=train_metrics_svc,
    test_metrics=test_metrics_svc,
    elapsed_total=elapsed_total_svc,
    test_predictions=test_predictions_svc,
)

# Report the chosen hyperparameters and the train/test metrics
print("Best Parameters:", best_params_svc)
display_metrics('SVC', train_metrics_svc, test_metrics_svc)


In [None]:
print("ElasticNet")

# Randomized hyperparameter search on the current X_train / X_test
(
    best_params_en,
    train_metrics_en,
    test_metrics_en,
    elapsed_total_en,
) = perform_randomized_search(clf_en, param_grid_en, X_train, y_train, X_test, y_test)

# Elastic-net logistic regression: penalty, solver and iteration budget are fixed here,
# the remaining hyperparameters come from the search
# NOTE(review): presumably clf_en uses the same fixed settings - verify against its definition
clf_en_best = LogisticRegression(
    penalty='elasticnet',
    solver='saga',
    max_iter=10000,
    **best_params_en,
).fit(X_train, y_train)

# Class predictions on both splits
train_predictions_en, test_predictions_en = (
    clf_en_best.predict(features) for features in (X_train, X_test)
)

# Bundle everything the comparison tables need
en_results = dict(
    best_params=best_params_en,
    train_metrics=train_metrics_en,
    test_metrics=test_metrics_en,
    elapsed_total=elapsed_total_en,
    test_predictions=test_predictions_en,
)

# Report the chosen hyperparameters and the train/test metrics
print("Best Parameters:", best_params_en)
display_metrics('ElasticNet', train_metrics_en, test_metrics_en)


In [None]:
print("Gradient Boosting Classifier")

# Randomized hyperparameter search on the current X_train / X_test
(
    best_params_gb,
    train_metrics_gb,
    test_metrics_gb,
    elapsed_total_gb,
) = perform_randomized_search(clf_gb, param_grid_gb, X_train, y_train, X_test, y_test)

# Build a new booster from the selected hyperparameters and fit it on the training set
# NOTE(review): only best_params_gb is passed on; any other settings of clf_gb are not
# carried over - confirm these predictions correspond to the reported metrics
clf_gb_best = GradientBoostingClassifier(**best_params_gb).fit(X_train, y_train)

# Class predictions on both splits
train_predictions_gb, test_predictions_gb = (
    clf_gb_best.predict(features) for features in (X_train, X_test)
)

# Bundle everything the comparison tables need
gb_results = dict(
    best_params=best_params_gb,
    train_metrics=train_metrics_gb,
    test_metrics=test_metrics_gb,
    elapsed_total=elapsed_total_gb,
    test_predictions=test_predictions_gb,
)

# Report the chosen hyperparameters and the train/test metrics
print("Best Parameters:", best_params_gb)
display_metrics('Gradient Boost', train_metrics_gb, test_metrics_gb)


In [None]:
print("AdaBoost Classifier")

# Randomized hyperparameter search on the current X_train / X_test
(
    best_params_ada,
    train_metrics_ada,
    test_metrics_ada,
    elapsed_total_ada,
) = perform_randomized_search(clf_ada, param_grid_ada, X_train, y_train, X_test, y_test)

# Build a new AdaBoost model from the selected hyperparameters and fit it on the training set
# NOTE(review): only best_params_ada is passed on; any other settings of clf_ada are not
# carried over - confirm these predictions correspond to the reported metrics
clf_ada_best = AdaBoostClassifier(**best_params_ada).fit(X_train, y_train)

# Class predictions on both splits
train_predictions_ada, test_predictions_ada = (
    clf_ada_best.predict(features) for features in (X_train, X_test)
)

# Bundle everything the comparison tables need
ada_results = dict(
    best_params=best_params_ada,
    train_metrics=train_metrics_ada,
    test_metrics=test_metrics_ada,
    elapsed_total=elapsed_total_ada,
    test_predictions=test_predictions_ada,
)

# Report the chosen hyperparameters and the train/test metrics
print("Best Parameters:", best_params_ada)
display_metrics('Ada Boost', train_metrics_ada, test_metrics_ada)


In [None]:
print("XGBoost Classifier")

# Randomized hyperparameter search on the current X_train / X_test
(
    best_params_xgb,
    train_metrics_xgb,
    test_metrics_xgb,
    elapsed_total_xgb,
) = perform_randomized_search(clf_xgb, param_grid_xgb, X_train, y_train, X_test, y_test)

# Build a new XGBoost model from the selected hyperparameters and fit it on the training set
# NOTE(review): only best_params_xgb is passed on; any other settings of clf_xgb are not
# carried over - confirm these predictions correspond to the reported metrics
clf_xgb_best = XGBClassifier(**best_params_xgb).fit(X_train, y_train)

# Class predictions on both splits
train_predictions_xgb, test_predictions_xgb = (
    clf_xgb_best.predict(features) for features in (X_train, X_test)
)

# Bundle everything the comparison tables need
xgb_results = dict(
    best_params=best_params_xgb,
    train_metrics=train_metrics_xgb,
    test_metrics=test_metrics_xgb,
    elapsed_total=elapsed_total_xgb,
    test_predictions=test_predictions_xgb,
)

# Report the chosen hyperparameters and the train/test metrics
print("Best Parameters:", best_params_xgb)
display_metrics('XGBoost', train_metrics_xgb, test_metrics_xgb)


# Original vs Optimal PCA Model Comparison

In [None]:
# Model names and the results of the runs on the PCA-transformed data.
# The list is named pca_results (not original_results) so it matches pca_scores /
# pca_scores_df and does not rebind the list built for the original-data comparison.
models = ['LR', 'RF', 'SVC', 'EN', 'ADA', 'GB', 'XGB']
pca_results = [lr_results, rf_results, svc_results, en_results, ada_results, gb_results, xgb_results]

# One row per model: its test metrics and the total elapsed time
pca_scores = []
for model_name, result in zip(models, pca_results):
    pca_scores.append({
        'Model': model_name,
        'Accuracy': result['test_metrics']['accuracy'],
        'Precision': result['test_metrics']['precision'],
        'Recall': result['test_metrics']['recall'],
        'F1 Score': result['test_metrics']['f1'],
        'ROC AUC': result['test_metrics']['roc_auc'],
        'Elapsed Total Time (s)': result['elapsed_total']
    })

# Create a DataFrame from the list of dictionaries
pca_scores_df = pd.DataFrame(pca_scores)

# Set the 'Model' column as the index
pca_scores_df.set_index('Model', inplace=True)

# Largest value of every test metric across the models
max_accuracy = pca_scores_df['Accuracy'].max()
max_recall = pca_scores_df['Recall'].max()
max_precision = pca_scores_df['Precision'].max()
max_f1 = pca_scores_df['F1 Score'].max()
max_roc_auc = pca_scores_df['ROC AUC'].max()

# Shortest and longest total elapsed time across the models
min_time = pca_scores_df['Elapsed Total Time (s)'].min()
max_time = pca_scores_df['Elapsed Total Time (s)'].max()

pca_scores_df

In [None]:
# Run highlight_max_min over the PCA table one column at a time
# (axis=0 is Styler.apply's default, spelled out here for clarity)
styled_pca_df = pca_scores_df.style.apply(highlight_max_min, axis=0)

# Last expression of the cell, so the notebook renders the styled table
styled_pca_df

In [None]:
# Tag every column of pca_scores_df with a ' (PCA)' suffix.
# Columns that already carry the suffix are left untouched, so re-running this cell
# does not produce ' (PCA) (PCA)' labels.
pca_suffix = ' (PCA)'
pca_scores_df.columns = [
    col if col.endswith(pca_suffix) else col + pca_suffix
    for col in pca_scores_df.columns
]
pca_scores_df

In [None]:
# Put scores_df and pca_scores_df side by side; both are indexed by model name
merged_scores_df = pd.concat([scores_df, pca_scores_df], axis=1)

# Column layout: every metric immediately followed by its ' (PCA)' counterpart
base_columns = ['Accuracy', 'Precision', 'Recall', 'F1 Score', 'ROC AUC', 'Elapsed Total Time (s)']
desired_order = []
for base_column in base_columns:
    desired_order += [base_column, base_column + ' (PCA)']

# Apply the layout
merged_scores_df = merged_scores_df[desired_order]

# Last expression of the cell, so the notebook renders the merged table
merged_scores_df


In [None]:
# Run highlight_max_min over the merged table one column at a time
# (axis=0 is Styler.apply's default, spelled out here for clarity)
styled_merged_df = merged_scores_df.style.apply(highlight_max_min, axis=0)

# Last expression of the cell, so the notebook renders the styled table
styled_merged_df

In [None]:
##