In [1]:
# ========================
# 1. Imports
# ========================
import pandas as pd
import numpy as np
import pickle

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, f1_score, classification_report,roc_auc_score,precision_score, recall_score

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier

import warnings
warnings.filterwarnings('ignore')

In [2]:
# ========================
# 2. Load Preprocessed Data
# ========================
df = pd.read_csv("Prep_Loan_default.csv")

X = df.drop(columns=["Default"])
y = df["Default"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42)

In [3]:
# ========================
# 3. Base Pipeline - Feature Engineering
# ========================
# Common preprocessing steps for all models
base_steps = [
    ("scaler", StandardScaler()),
    ("selector", SelectKBest(score_func=f_classif, k=20)),  # tune k if needed
    ("pca", PCA(n_components=10, random_state=42))          # tune n if needed
]


In [4]:
# ========================
# 4. Define Models & Hyperparameters
# ========================
models = {
    "Logistic Regression": (
        Pipeline(base_steps + [("model", LogisticRegression(max_iter=1000, random_state=42))]),
        {"model__C": [0.01, 0.1, 1, 10], "model__penalty": ["l2"]}
    ),
    "Random Forest": (
        Pipeline(base_steps + [("model", RandomForestClassifier(random_state=42))]),
        {"model__n_estimators": [10, 20], "model__max_depth": [3, 5, None]}
    ),
    "XGBoost": (
        Pipeline(base_steps + [("model", XGBClassifier(eval_metric="logloss", random_state=42))]),
        {"model__n_estimators": [10, 20], "model__max_depth": [3, 5]}
    ),
    #"SVM": (
    #    Pipeline(base_steps + [("model", SVC(probability=True, random_state=42))]),
    #    {"model__C": [0.1, 1, 10], "model__kernel": ["linear", "rbf"]}
    #),
    #"KNN": (
    #    Pipeline(base_steps + [("model", KNeighborsClassifier())]),
    #    {"model__n_neighbors": [3, 5, 7], "model__weights": ["uniform", "distance"]}
    #),
    "Gradient Boosting": (
        Pipeline(base_steps + [("model", GradientBoostingClassifier(random_state=42))]),
        {"model__n_estimators": [10, 20], "model__learning_rate": [0.05, 0.1]}
    ),
    "Naive Bayes": (
        Pipeline(base_steps + [("model", GaussianNB())]),
        {}  # no hyperparameters
    )
}

In [5]:
# ========================
# 5. Train, Tune & Compare
# ========================
results = []
best_models = {}

#Since dataset is imbalaced and never catching positives of Recall for class 1 = 0.01, so that calculate weights

# Calculate class weights: 
from sklearn.utils.class_weight import compute_class_weight

classes = np.unique(y_train)
class_weights = compute_class_weight(class_weight="balanced", classes=classes, y=y_train)
class_weight_dict = dict(zip(classes, class_weights))
print("Class Weights:", class_weight_dict)

for name, (pipe, params) in models.items():
    print(f"\nTraining {name} ...")

    # Special handling for XGBoost (scale_pos_weight)
    if name == "XGBoost":
        # ratio of negative/positive class
        scale_pos_weight = (y_train == 0).sum() / (y_train == 1).sum()
        if "model__scale_pos_weight" not in params:
            params["model__scale_pos_weight"] = [scale_pos_weight]
        print(f"Using scale_pos_weight={scale_pos_weight:.2f}")

    # Special handling for Logistic Regression
    if name == "Logistic Regression":
        if "model__class_weight" not in params:
            params["model__class_weight"] = ["balanced"]
    
    if params:
        grid = GridSearchCV(pipe, params, cv=3, scoring="accuracy", n_jobs=-1)
        grid.fit(X_train, y_train)
        best_model = grid.best_estimator_
        best_params = grid.best_params_
    else:
        best_model = pipe.fit(X_train, y_train)
        best_params = {}
    
     # Predictions
    y_pred = best_model.predict(X_test)
    y_proba = best_model.predict_proba(X_test)[:, 1]  # needed for AUC

   
    # Metrics
    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average="weighted")
    auc = roc_auc_score(y_test, y_proba)
    precision = precision_score(y_test, y_pred, average="weighted")
    recall = recall_score(y_test, y_pred, average="weighted")

    print(f"{name} -> Accuracy: {acc:.4f}, F1: {f1:.4f}, AUC: {auc:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}")

    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))
    
    results.append({
        "Model": name,
        "Accuracy": acc,
        "F1 Score": f1,
        "AUC": auc,
        "Precision": precision,
        "Recall": recall,
        "Best Params": best_params
    })
    best_models[name] = best_model

# ========================
# 6. Compare Models
# ========================
results_df = pd.DataFrame(results).sort_values(by="AUC", ascending=False)
print("\n==== Model Comparison (sorted by AUC) ====")
print(results_df)

# ========================
# 7. Classification Report for Best Model
# ========================
best_row = results_df.iloc[0]
best_model_name = best_row["Model"]
final_pipeline = best_models[best_model_name]

print(f"\n==== Classification Report: {best_model_name} ====")
y_pred_best = final_pipeline.predict(X_test)
print(classification_report(y_test, y_pred_best))



Class Weights: {0: 0.5656918944365983, 1: 4.30564454936346}

Training Logistic Regression ...
Logistic Regression -> Accuracy: 0.6166, F1: 0.6855, AUC: 0.6677, Precision: 0.8387, Recall: 0.6166

Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.62      0.74     45139
           1       0.18      0.62      0.27      5931

    accuracy                           0.62     51070
   macro avg       0.55      0.62      0.51     51070
weighted avg       0.84      0.62      0.69     51070


Training Random Forest ...
Random Forest -> Accuracy: 0.8839, F1: 0.8294, AUC: 0.6442, Precision: 0.7812, Recall: 0.8839

Classification Report:
              precision    recall  f1-score   support

           0       0.88      1.00      0.94     45139
           1       0.00      0.00      0.00      5931

    accuracy                           0.88     51070
   macro avg       0.44      0.50      0.47     51070
weighted avg       0.78      0.88    

In [6]:
# ========================
# 8. Save Best Model Pipeline
# ========================
with open("final_pipeline.pkl", "wb") as f:
    pickle.dump(final_pipeline, f)

print(f"\nFinal pipeline saved as final_pipeline.pkl ({best_model_name})")


Final pipeline saved as final_pipeline.pkl (XGBoost)


Handling Class Imbalance

In this dataset, the classes are imbalanced: the majority of samples belong to class 0, while class 1 (defaults) are fewer. This imbalance can cause models to predict the majority class more often, leading to high accuracy but poor recall for the minority class.

To address this, I evaluated models not only by accuracy, but also by F1-score and AUC (Area Under ROC Curve), which provide a fairer view of performance on imbalanced data.

Some models (e.g., Logistic Regression) were trained with the class_weight="balanced" option, which automatically adjusts weights inversely to class frequencies. This helped the model pay more attention to the minority class without modifying the dataset.

While oversampling methods such as SMOTE (Synthetic Minority Oversampling Technique) could further improve recall of the minority class, they were not applied here to keep the project simple and focused. This remains an area for future work.

Conclusion:
The evaluation shows that while accuracy is high across all models, metrics such as F1-score and AUC are more informative for imbalanced datasets. Logistic Regression and XGBoost with class balancing achieved the best balance between metrics.