In [1]:
# ========================
# 1. Imports
# ========================
import pandas as pd
import numpy as np
import pickle

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, f1_score, classification_report

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier

In [2]:
# ========================
# 2. Load Preprocessed Data
# ========================
df = pd.read_csv("Prep_Loan_default.csv")

X = df.drop(columns=["Default"])
y = df["Default"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

In [3]:
# ========================
# 3. Base Pipeline - Feature Engineering
# ========================
# Common preprocessing steps for all models
base_steps = [
    ("scaler", StandardScaler()),
    ("selector", SelectKBest(score_func=f_classif, k=20)),  # tune k if needed
    ("pca", PCA(n_components=10, random_state=42))          # tune n if needed
]

In [4]:
# ========================
# 4. Define Models & Hyperparameters
# ========================
models = {
    "Logistic Regression": (
        Pipeline(base_steps + [("model", LogisticRegression(max_iter=1000, random_state=42))]),
        {"model__C": [0.01, 0.1, 1, 10], "model__penalty": ["l2"]}
    ),
    "Random Forest": (
        Pipeline(base_steps + [("model", RandomForestClassifier(random_state=42))]),
        {"model__n_estimators": [10, 20], "model__max_depth": [3, 5, None]}
    ),
    "XGBoost": (
        Pipeline(base_steps + [("model", XGBClassifier(eval_metric="logloss", random_state=42))]),
        {"model__n_estimators": [10, 20], "model__max_depth": [3, 5]}
    ),
    #"SVM": (
    #    Pipeline(base_steps + [("model", SVC(probability=True, random_state=42))]),
    #    {"model__C": [0.1, 1, 10], "model__kernel": ["linear", "rbf"]}
    #),
    #"KNN": (
    #    Pipeline(base_steps + [("model", KNeighborsClassifier())]),
    #    {"model__n_neighbors": [3, 5, 7], "model__weights": ["uniform", "distance"]}
    #),
    "Gradient Boosting": (
        Pipeline(base_steps + [("model", GradientBoostingClassifier(random_state=42))]),
        {"model__n_estimators": [10, 20], "model__learning_rate": [0.05, 0.1]}
    ),
    "Naive Bayes": (
        Pipeline(base_steps + [("model", GaussianNB())]),
        {}  # no hyperparameters
    )
}

In [5]:
# ========================
# 5. Train, Tune & Compare
# ========================
results = []
best_models = {}

for name, (pipe, params) in models.items():
    print(f"\nTraining {name} ...")
    if params:
        grid = GridSearchCV(pipe, params, cv=3, scoring="accuracy", n_jobs=-1)
        grid.fit(X_train, y_train)
        best_model = grid.best_estimator_
        best_params = grid.best_params_
    else:
        best_model = pipe.fit(X_train, y_train)
        best_params = {}
    
    # Predictions
    y_pred = best_model.predict(X_test)
    
    # Metrics
    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average="weighted")
    
    print(f"{name} -> Accuracy: {acc:.4f}, F1: {f1:.4f}, Params: {best_params}")
    
    results.append({"Model": name, "Accuracy": acc, "F1 Score": f1, "Best Params": best_params})
    best_models[name] = best_model

results_df = pd.DataFrame(results).sort_values(by="Accuracy", ascending=False)
print("\n==== Model Comparison ====")
print(results_df)

# ========================
# 6. Save Best Model Pipeline
# ========================
best_row = results_df.iloc[0]
best_model_name = best_row["Model"]
final_pipeline = best_models[best_model_name]

with open("final_pipeline.pkl", "wb") as f:
    pickle.dump(final_pipeline, f)

print(f"\nFinal pipeline saved as final_pipeline.pkl ({best_model_name})")


Training Logistic Regression ...
Logistic Regression -> Accuracy: 0.8839, F1: 0.8295, Params: {'model__C': 0.01, 'model__penalty': 'l2'}

Training Random Forest ...
Random Forest -> Accuracy: 0.8839, F1: 0.8294, Params: {'model__max_depth': 3, 'model__n_estimators': 10}

Training XGBoost ...
XGBoost -> Accuracy: 0.8839, F1: 0.8307, Params: {'model__max_depth': 5, 'model__n_estimators': 20}

Training Gradient Boosting ...
Gradient Boosting -> Accuracy: 0.8839, F1: 0.8294, Params: {'model__learning_rate': 0.05, 'model__n_estimators': 10}

Training Naive Bayes ...
Naive Bayes -> Accuracy: 0.8839, F1: 0.8295, Params: {}

==== Model Comparison ====
                 Model  Accuracy  F1 Score  \
0  Logistic Regression  0.883904  0.829473   
4          Naive Bayes  0.883885  0.829463   
1        Random Forest  0.883865  0.829378   
2              XGBoost  0.883865  0.830730   
3    Gradient Boosting  0.883865  0.829378   

                                         Best Params  
0         {'mod