**<center>===========================================================================================================</center>**
**<center>All Necessary Imports</center>**
**<center>===========================================================================================================</center>**

In [None]:
import os
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import psutil
import pickle

from joblib import Parallel, delayed
from multiprocessing import Lock

from sklearn.ensemble import (
    RandomForestClassifier, 
    AdaBoostClassifier, 
    GradientBoostingClassifier, 
    ExtraTreesClassifier
)
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,  
    precision_recall_curve, 
    roc_auc_score, 
    roc_curve,
    precision_score, 
    recall_score, 
    f1_score, 
    confusion_matrix
)
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

from xgboost import XGBClassifier

# Get the number of logical cores
num_cores = psutil.cpu_count(logical=True)
print(f"Total number of CPU cores: {num_cores}")

# Get the percentage utilization of each core.
# An explicit blocking interval is used on purpose: with the default
# interval=None the first call compares against the moment psutil was imported,
# and psutil documents that first reading as meaningless. Sampling over one
# second gives a real utilization figure per core.
cpu_percent_per_core = psutil.cpu_percent(interval=1.0, percpu=True)
print(f"CPU utilization per core: {cpu_percent_per_core}")

# Calculate the number of free cores: a core counts as free when its
# utilization (in percent) is strictly below `threshold`. The figure is only
# printed for information here.
threshold = 10.0
free_cores = sum(1 for percent in cpu_percent_per_core if percent < threshold)
print(f"Number of free CPU cores: {free_cores}")


Total number of CPU cores: 32
CPU utilization per core: [79.7, 36.8, 77.9, 32.9, 78.3, 28.6, 75.4, 63.4, 80.0, 27.5, 77.1, 27.5, 81.4, 30.0, 95.7, 28.6, 54.3, 52.2, 52.2, 52.2, 52.2, 50.0, 51.5, 52.9, 52.2, 50.0, 50.7, 52.9, 51.4, 52.2, 50.7, 52.2]
Number of free CPU cores: 0


**<center>===========================================================================================================</center>**
**<center>Load the Data</center>**
**<center>===========================================================================================================</center>**

In [None]:
# Read the table of pre-selected features; the label is stored in the
# 'target' column alongside the predictors.
selected_features_df = pd.read_csv('data/processed/selected_features.csv')

# Split the frame into the label series and the predictor matrix.
target = selected_features_df['target']
features = selected_features_df.drop(columns=['target'])

**<center>===========================================================================================================</center>**
**<center>Model Performance Evaluation</center>**
**<center>===========================================================================================================</center>**

* __Necessary functions__

In [None]:
def evaluate_model(name, model, X_train, y_train, X_test, y_test):
    """Fit ``model`` on the training split and collect evaluation metrics.

    The model is (re)fitted on the training data, then scored on both splits.
    Column 1 of ``predict_proba`` is used as the positive-class score for the
    AUC, ROC curve and precision-recall curve, so the estimator must expose
    ``predict_proba``.

    Parameters
    ----------
    name : str
        Label stored under ``'name'`` in the returned dict.
    model : estimator
        Object providing ``fit``, ``predict`` and ``predict_proba``.
    X_train, y_train, X_test, y_test :
        Train/test features and labels; each must support ``.copy()``.

    Returns
    -------
    dict
        Keys: ``name``, ``train_accuracy``, ``test_accuracy``,
        ``test_precision``, ``test_recall``, ``test_f1``, ``test_auc``,
        ``test_confusion_matrix``, ``fpr``, ``tpr``, ``precision``, ``recall``.
    """
    # Work on copies so the caller's data is never touched by the estimator.
    X_train_copy = X_train.copy()
    y_train_copy = y_train.copy()
    X_test_copy = X_test.copy()
    y_test_copy = y_test.copy()

    model.fit(X_train_copy, y_train_copy)
    train_predictions = model.predict(X_train_copy)
    test_predictions = model.predict(X_test_copy)

    # Positive-class scores feed the AUC, the ROC curve and the PR curve.
    # Compute them once instead of calling predict_proba three times.
    test_scores = model.predict_proba(X_test_copy)[:, 1]

    train_accuracy = accuracy_score(y_train_copy, train_predictions)
    test_accuracy = accuracy_score(y_test_copy, test_predictions)

    test_precision = precision_score(y_test_copy, test_predictions)
    test_recall = recall_score(y_test_copy, test_predictions)
    test_f1 = f1_score(y_test_copy, test_predictions)
    test_auc = roc_auc_score(y_test_copy, test_scores)
    test_conf_matrix = confusion_matrix(y_test_copy, test_predictions)

    # Calculate FPR, TPR, precision, recall (thresholds are not kept)
    fpr, tpr, _ = roc_curve(y_test_copy, test_scores)
    precision, recall, _ = precision_recall_curve(y_test_copy, test_scores)

    results = {
        'name': name,
        'train_accuracy': train_accuracy,
        'test_accuracy': test_accuracy,
        'test_precision': test_precision,
        'test_recall': test_recall,
        'test_f1': test_f1,
        'test_auc': test_auc,
        'test_confusion_matrix': test_conf_matrix,
        'fpr': fpr,
        'tpr': tpr,
        'precision': precision,
        'recall': recall
    }
    return results


def results_to_df(results):
    """Build a DataFrame from ``results``, one row per result record."""
    return pd.DataFrame(results)


def plot_roc_curves(results, df_results):
    """Draw one ROC curve per result on a shared axes and print a summary.

    Parameters
    ----------
    results : iterable of dict
        Each dict must provide ``'name'``, ``'fpr'``, ``'tpr'`` and
        ``'test_auc'``. ``'best_params'`` is optional: it is printed when
        present and skipped otherwise, so results that never went through a
        grid search can be plotted too.
    df_results :
        Object printed after the figure is shown.
    """
    fig, ax = plt.subplots(figsize=(10, 8))

    for result in results:
        name = result['name']
        fpr = np.asarray(result['fpr'])
        tpr = np.asarray(result['tpr'])
        ax.plot(fpr, tpr, label=f"{name} (AUC = {result['test_auc']:.2f})")

        # Only grid-searched results carry the tuned hyperparameters.
        best_params = result.get('best_params')
        if best_params is not None:
            print(f"Best parameters for {name}: {best_params}")

    # Diagonal reference line for a chance-level classifier.
    ax.plot([0, 1], [0, 1], 'k--', lw=2)
    ax.set_xlim([0.0, 1.0])
    ax.set_ylim([0.0, 1.05])
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title('Receiver Operating Characteristic')
    ax.legend(loc="lower right")
    plt.show()

    print(df_results)


checkpoint_file = 'grid_search_checkpoint.pkl'
lock = Lock()

# Load checkpoint if exists; otherwise start with an empty one.
# 'completed' holds the names of finished models, 'results' their result dicts.
checkpoint = {'completed': [], 'results': []}
if os.path.exists(checkpoint_file):
    # SECURITY: pickle.load can execute arbitrary code embedded in the file.
    # Only keep checkpoint files that were written by this notebook.
    try:
        with open(checkpoint_file, 'rb') as f:
            loaded_checkpoint = pickle.load(f)
    except (EOFError, pickle.UnpicklingError) as exc:
        # A truncated or corrupt file (e.g. an interrupted write) must not
        # block the run; report it and fall back to the empty checkpoint.
        print(f"Ignoring unreadable checkpoint {checkpoint_file!r}: {exc}")
    else:
        if isinstance(loaded_checkpoint, dict) and {'completed', 'results'} <= loaded_checkpoint.keys():
            checkpoint = loaded_checkpoint
        else:
            print(f"Ignoring checkpoint {checkpoint_file!r}: unexpected structure")
    
# Function to perform grid search and evaluate model
def grid_search_evaluate_model(name, model, param_grid, X_train, y_train, X_test, y_test, lock=None):
    """Tune ``model`` with a 5-fold ROC-AUC grid search, evaluate it, checkpoint it.

    Parameters
    ----------
    name : str
        Model label; stored in the result and in ``checkpoint['completed']``.
    model : estimator
        Base estimator handed to ``GridSearchCV``.
    param_grid : dict
        Hyperparameter grid for ``GridSearchCV``.
    X_train, y_train, X_test, y_test :
        Train/test features and labels; each must support ``.copy()``.
    lock : context manager, optional
        Held while the module-level ``checkpoint`` is updated and written to
        ``checkpoint_file``. When omitted, the update runs unguarded, so
        callers that do not share a lock can still use the function.

    Returns
    -------
    dict
        The ``evaluate_model`` result plus a ``'best_params'`` entry.
    """
    from contextlib import nullcontext

    X_train_copy = X_train.copy()
    y_train_copy = y_train.copy()
    X_test_copy = X_test.copy()
    y_test_copy = y_test.copy()

    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='roc_auc', n_jobs=-1)
    grid_search.fit(X_train_copy, y_train_copy)
    best_model = grid_search.best_estimator_
    # evaluate_model fits best_model again on the training data before scoring.
    result = evaluate_model(name, best_model, X_train_copy, y_train_copy, X_test_copy, y_test_copy)
    result['best_params'] = grid_search.best_params_

    # Using the lock (when one is supplied) to ensure only one writer updates
    # the checkpoint at a time.
    # NOTE(review): when this function runs inside separate worker processes,
    # each worker presumably mutates its own copy of `checkpoint`; confirm the
    # checkpoint is complete if a process-based backend is used.
    guard = lock if lock is not None else nullcontext()
    with guard:
        checkpoint['results'].append(result)
        checkpoint['completed'].append(name)

        # Write to a per-process temp file and swap it in, so an interrupted
        # run never leaves a truncated checkpoint behind.
        tmp_file = f"{checkpoint_file}.{os.getpid()}.tmp"
        with open(tmp_file, 'wb') as f:
            pickle.dump(checkpoint, f)
        os.replace(tmp_file, checkpoint_file)

    return result

* __Train-Test Split__

In [None]:
# Hold out 20% of the RFE-applied data for testing; the fixed seed keeps the
# partition identical between runs.
# NOTE(review): the split is not stratified - confirm that is acceptable for
# the class balance of `target`.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=42,
)

n_train_rows = X_train.shape[0]
n_test_rows = X_test.shape[0]
print(f"Training set size: {n_train_rows}")
print(f"Testing set size: {n_test_rows}")

* __Model Training with Grid Search__

In [None]:
# Silence warnings in this process and, via the environment variable, in any
# Python child processes started afterwards.
os.environ["PYTHONWARNINGS"] = "ignore"
warnings.filterwarnings("ignore")

# Parameter search space: one grid per model, keyed by the same names as the
# `models` dict.
param_grids = {
    'XGBoost': {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.01, 0.1, 0.3],
        'max_depth': [3, 7, 10],
        'subsample': [0.8, 1.0],
        'colsample_bytree': [0.8, 1.0],
        'gamma': [0, 0.2],
        'reg_alpha': [0, 1],
        'reg_lambda': [1, 10],
        'scale_pos_weight': [1, 10]
    },
    'AdaBoost': {
        # 'algorithm' is intentionally not searched: 'SAMME.R' is deprecated in
        # recent scikit-learn releases and rejected by newer ones, while on
        # older releases it is already the default, so omitting the key keeps
        # the grid valid across versions.
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.1, 0.5, 1]
    },
    'NaiveBayes': {
        'var_smoothing': np.logspace(-9, -1, 20)
    },
    'Decision Tree': {
        'criterion': ['gini', 'entropy'],
        'max_depth': [None, 10, 30],
        'min_samples_split': [2, 10],
        'min_samples_leaf': [1, 5],
        'max_features': [None, 'sqrt'],
        'class_weight': [None, 'balanced']
    },
    'Random Forest': {
        'n_estimators': [100, 200, 500],
        'criterion': ['gini', 'entropy'],
        'max_depth': [None, 10, 30],
        'min_samples_split': [2, 10],
        'min_samples_leaf': [1, 5],
        'max_features': ['sqrt', 'log2'],
        'bootstrap': [True],
        'class_weight': [None, 'balanced']
    },
    'Logistic Regression': {
        'penalty': ['l2'],
        'C': np.logspace(-4, 4, 10),
        'solver': ['liblinear', 'saga'],
        # NOTE(review): this value replaces any max_iter given to the
        # LogisticRegression constructor during the search - confirm 200
        # iterations is the intended budget (warnings are silenced above, so
        # non-convergence would go unnoticed).
        'max_iter': [200],
        'class_weight': [None, 'balanced']
    },
    'SVM': {
        'C': [0.1, 1, 10],
        'kernel': ['linear', 'rbf'],
        'gamma': ['scale', 'auto'],
        'class_weight': [None, 'balanced']
    },
    'Gradient Boosting': {
        'n_estimators': [100, 200],
        'learning_rate': [0.01, 0.1],
        'max_depth': [3, 7],
        'subsample': [0.8, 1.0],
        'min_samples_split': [2, 10],
        'min_samples_leaf': [1, 5]
    },
    'KNN': {
        'n_neighbors': [3, 5, 7],
        'weights': ['uniform', 'distance'],
        'p': [1, 2]
    },
    'Extra Trees': {
        'n_estimators': [100, 200],
        'criterion': ['gini', 'entropy'],
        'max_depth': [None, 10, 30],
        'min_samples_split': [2, 10],
        'min_samples_leaf': [1, 5],
        'max_features': ['sqrt', 'log2'],
        'bootstrap': [True]
    }
}

# Seed shared by every stochastic estimator so repeated runs give the same
# fitted models; it matches the seed used for the train/test split.
RANDOM_STATE = 42

# Base estimators to tune, keyed by the same names as `param_grids`.
# GaussianNB and KNeighborsClassifier take no random_state.
models = {
    'XGBoost': XGBClassifier(eval_metric='logloss', random_state=RANDOM_STATE),
    'AdaBoost': AdaBoostClassifier(random_state=RANDOM_STATE),
    'NaiveBayes': GaussianNB(),
    'Decision Tree': DecisionTreeClassifier(random_state=RANDOM_STATE),
    'Random Forest': RandomForestClassifier(random_state=RANDOM_STATE),
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=RANDOM_STATE),
    # probability=True is required because evaluation calls predict_proba.
    'SVM': SVC(probability=True, random_state=RANDOM_STATE),
    'Gradient Boosting': GradientBoostingClassifier(random_state=RANDOM_STATE),
    'KNN': KNeighborsClassifier(),
    'Extra Trees': ExtraTreesClassifier(random_state=RANDOM_STATE),
}

# Perform grid search and evaluation, one model at a time.
# The searches run sequentially in this process because each GridSearchCV
# already parallelises its own fits (n_jobs=-1 inside
# grid_search_evaluate_model), and in-process calls can share the module-level
# `checkpoint` dict and `lock`. A multiprocessing.Lock cannot be pickled and
# shipped to joblib worker processes.
#
# Models already present in the checkpoint are restored instead of re-trained.
# Delete the checkpoint file and re-run the checkpoint cell to force a full
# re-run, e.g. after changing the data or the parameter grids.
restored_results = {
    saved['name']: saved
    for saved in checkpoint['results']
    if saved['name'] in models
}

results = []
for name, model in models.items():
    if name in restored_results:
        print(f"Skipping {name}: result restored from {checkpoint_file}")
        results.append(restored_results[name])
        continue
    results.append(
        grid_search_evaluate_model(
            name, model, param_grids[name], X_train, y_train, X_test, y_test, lock
        )
    )

df_results = results_to_df(results)

plot_roc_curves(results, df_results)

# The per-threshold curve arrays are only needed for plotting; drop them for
# the summary table.
df_results = df_results.drop(columns=['fpr', 'tpr', 'precision', 'recall'])

print(df_results)

for result in results:
    name = result['name']

    print(f"Best parameters for {name}: {result['best_params']}")