In [None]:
import numpy as np
import pandas as pd
import os
import seaborn as sns
import matplotlib.pyplot as plt 

# Directory that holds the prepared CSV files
data_dir = "./data"

# The full (scaled and cleaned), balanced, and semi-balanced variants of the dataset,
# listed in the order they are unpacked below
data_file = [
    "credit_card_scaled_and_cleaned.csv",
    "balanced_credit_card.csv",
    "semi_balanced_credit_card.csv",
]

# Read each CSV (first row is the header) into its own DataFrame
df, balanced_df, semi_balanced_df = (
    pd.read_csv(os.path.join(data_dir, csv_name), header=0)
    for csv_name in data_file
)

# Kept so that `file_path` still ends up pointing at the last file that was loaded
file_path = os.path.join(data_dir, data_file[2])

### Modeling:

In [None]:
from sklearn.model_selection import StratifiedKFold, GridSearchCV, cross_val_score, train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, ConfusionMatrixDisplay

Split the Data into Training and Testing Sets
Separate the `Class` attribute (the target) from the rest of the features, and split the dataset into training and testing sets.

In [None]:
# Balanced dataset: 'Class' is the target, every other column is a feature
y_balanced = balanced_df['Class']
X_balanced = balanced_df.drop(columns='Class')

# Hold out 30% for testing; stratify on the target and fix the seed so the split is repeatable
(
    X_train_balanced,
    X_test_balanced,
    y_train_balanced,
    y_test_balanced,
) = train_test_split(
    X_balanced,
    y_balanced,
    test_size=0.3,
    random_state=42,
    stratify=y_balanced,
)


#### Logistic Regression:

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic Regression estimator and the hyperparameter grid searched for it
lr_model = LogisticRegression(random_state=42, max_iter=1000)
lr_params = {
    "C": [0.01, 0.1, 1, 10],    # inverse regularization strength (smaller C = stronger penalty)
    "penalty": ["l1", "l2"],    # regularization norm
    "solver": ["liblinear"],    # liblinear works with both the L1 and the L2 penalty
}


### SVM:

In [None]:
from sklearn.svm import SVC

# SVM estimator and the hyperparameter grid searched for it.
# probability=True enables predict_proba, which is needed for ROC AUC.
svm_model = SVC(random_state=42, probability=True)
svm_params = {
    "C": [0.1, 1, 10],            # regularization parameter
    "kernel": ["linear", "rbf"],  # kernel type
    # NOTE(review): gamma is not used by the linear kernel, so each linear
    # candidate is fitted once per gamma value with identical results.
    "gamma": ["scale", "auto"],   # kernel coefficient
}


### Random Forest:

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random Forest estimator (seeded) and the hyperparameter grid searched for it
rf_model = RandomForestClassifier(random_state=42)
rf_params = {
    "n_estimators": [50, 100, 200],  # how many trees to grow
    "max_depth": [None, 10, 20],     # depth cap per tree (None = no cap)
    "min_samples_split": [2, 5],     # samples a node needs before it may be split
    "min_samples_leaf": [1, 2],      # samples every leaf must keep
}


### Defining Scoring Metrics:

In [None]:

# Metrics computed for every grid-search candidate.
# Key order matters: the cross-validation report iterates over this dict.
scoring = {
    "accuracy": make_scorer(accuracy_score),
    "precision": make_scorer(precision_score, pos_label=1),
    "recall": make_scorer(recall_score, pos_label=1),
    "f1": make_scorer(f1_score),
    # ROC AUC is scored from predicted probabilities rather than hard labels
    "roc_auc": make_scorer(roc_auc_score, response_method="predict_proba"),
}


#### Grid search with Stratified K-Fold cross-validation:

In [None]:
## Question: are these models using generically good hyperparameters, or are the hyperparameters tuned for our data?
## If they aren't already, we should tune them on our dataset(s) specifically so as to get the best results.
## Note: GridSearchCV below is fit on X_train_balanced, so the selected parameters are specific to the balanced training split.
## It isn't too big a deal either way, though; our current models are pretty good (I think).

def train_with_kfold(model, params, model_name, X=None, y=None,
                     scoring_metrics=None, refit='accuracy', n_jobs=None):
    """Grid-search ``model`` with stratified 5-fold CV and report the score spread.

    Every hyperparameter combination in ``params`` is evaluated with a shuffled,
    seeded ``StratifiedKFold(n_splits=5)``. For each metric, the best and worst
    mean cross-validated score across all candidates are printed and collected.

    Parameters
    ----------
    model : estimator
        Unfitted scikit-learn estimator handed to ``GridSearchCV``.
    params : dict or list of dict
        Hyperparameter grid handed to ``GridSearchCV``.
    model_name : str
        Label used in the printed report.
    X, y : optional
        Training features and labels. When omitted, the notebook-level
        ``X_train_balanced`` / ``y_train_balanced`` are used, which is what
        this function always used before these parameters existed.
    scoring_metrics : dict, optional
        Mapping of metric name to scorer. Defaults to the notebook-level
        ``scoring`` dict.
    refit : str, default 'accuracy'
        Metric name used to select and refit ``best_estimator_``; it must be
        one of the names in ``scoring_metrics``.
    n_jobs : int or None, default None
        Forwarded to ``GridSearchCV``.

    Returns
    -------
    tuple
        ``(best_estimator, improvements)`` where ``best_estimator`` is
        ``grid.best_estimator_`` and ``improvements`` is a DataFrame with one
        row per metric and the columns ``Metric``, ``Top Score``,
        ``Lowest Score`` and ``Difference``.
    """
    # Fall back to the notebook globals so existing three-argument calls behave as before.
    if X is None:
        X = X_train_balanced
    if y is None:
        y = y_train_balanced
    if scoring_metrics is None:
        scoring_metrics = scoring

    grid = GridSearchCV(
        model,
        params,
        cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
        scoring=scoring_metrics,
        refit=refit,
        n_jobs=n_jobs,
        verbose=1
    )
    grid.fit(X, y)

    # Extract cross-validation results
    cv_results = grid.cv_results_
    improvements = []

    print(f"\n{model_name} Cross-Validation Results:")
    print(f"Best Parameters: {grid.best_params_}")

    # Calculate the spread between the best and worst candidate for each metric
    for metric in scoring_metrics:
        metric_scores = np.asarray(cv_results[f'mean_test_{metric}'], dtype=float)
        # A candidate whose fit failed is scored NaN by GridSearchCV's default
        # error_score; skip those so one failure does not turn the summary into NaN.
        top_score = np.nanmax(metric_scores)
        lowest_score = np.nanmin(metric_scores)
        difference = top_score - lowest_score
        improvements.append({
            "Metric": metric.capitalize(),
            "Top Score": top_score,
            "Lowest Score": lowest_score,
            "Difference": difference
        })
        print(f"{metric.capitalize()}: Top={top_score:.4f}, Lowest={lowest_score:.4f}, Difference={difference:.4f}")

    # Return the trained model and improvements
    return grid.best_estimator_, pd.DataFrame(improvements)

# Tune every model; keep the refit best estimator and its per-metric score-spread table
best_lr, lr_improvements = train_with_kfold(
    model=lr_model,
    params=lr_params,
    model_name="Logistic Regression",
)
best_svm, svm_improvements = train_with_kfold(
    model=svm_model,
    params=svm_params,
    model_name="Support Vector Machine (SVM)",
)
best_rf, rf_improvements = train_with_kfold(
    model=rf_model,
    params=rf_params,
    model_name="Random Forest",
)


Fitting 5 folds for each of 8 candidates, totalling 40 fits

Logistic Regression Cross-Validation Results:
Best Parameters: {'C': 1, 'penalty': 'l1', 'solver': 'liblinear'}
Accuracy: Top=0.9433, Lowest=0.7820, Difference=0.1614
Precision: Top=0.9781, Lowest=0.7085, Difference=0.2696
Recall: Top=0.9594, Lowest=0.9012, Difference=0.0582
F1: Top=0.9416, Lowest=0.8148, Difference=0.1268
Roc_auc: Top=0.9852, Lowest=0.9637, Difference=0.0214
Fitting 5 folds for each of 12 candidates, totalling 60 fits

Support Vector Machine (SVM) Cross-Validation Results:
Best Parameters: {'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}
Accuracy: Top=0.9462, Lowest=0.9128, Difference=0.0335
Precision: Top=0.9902, Lowest=0.9039, Difference=0.0863
Recall: Top=0.9302, Lowest=0.8460, Difference=0.0843
F1: Top=0.9437, Lowest=0.9115, Difference=0.0322
Roc_auc: Top=0.9825, Lowest=0.9666, Difference=0.0159
Fitting 5 folds for each of 36 candidates, totalling 180 fits

Random Forest Cross-Validation Results:
Best Parame

### Cross-Validation Improvements Table:

In [None]:
# Pair each heading with its score-spread table, then print them in order
improvement_tables = [
    ("Logistic Regression Improvements:\n", lr_improvements),
    ("SVM Improvements:\n", svm_improvements),
    ("Random Forest Improvements:\n", rf_improvements),
]

print("\nCross-Validation Improvements:")
for heading, table in improvement_tables:
    print(heading, table)



Cross-Validation Improvements:
Logistic Regression Improvements:
       Metric  Top Score  Lowest Score  Difference
0   Accuracy   0.943330      0.781974    0.161356
1  Precision   0.978102      0.708533    0.269568
2     Recall   0.959378      0.901151    0.058227
3         F1   0.941632      0.814850    0.126782
4    Roc_auc   0.985169      0.963721    0.021448
SVM Improvements:
       Metric  Top Score  Lowest Score  Difference
0   Accuracy   0.946250      0.912779    0.033471
1  Precision   0.990206      0.903868    0.086339
2     Recall   0.930222      0.845951    0.084271
3         F1   0.943685      0.911489    0.032196
4    Roc_auc   0.982480      0.966610    0.015870
Random Forest Improvements:
       Metric  Top Score  Lowest Score  Difference
0   Accuracy   0.937544      0.930276    0.007268
1  Precision   0.977894      0.968325    0.009569
2     Recall   0.895354      0.889557    0.005797
3         F1   0.934498      0.926919    0.007580
4    Roc_auc   0.978219      0.9756