In [44]:
# ==========================================
# MODULE 1: Import Required Libraries
# ==========================================

import os
import joblib
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

from sklearn.metrics import (
    accuracy_score,
    roc_auc_score,
    precision_score,
    recall_score,
    f1_score,
    matthews_corrcoef
)

In [45]:
# ==========================================
# MODULE 2: Load Dataset
# ==========================================

# Relative path of the input CSV file.
DATA_PATH = "bank-full.csv"

# The file uses ";" as its field separator (`delimiter` is pandas' alias for `sep`).
df = pd.read_csv(DATA_PATH, delimiter=";")

print("Dataset Shape:", df.shape)
# Last expression of the cell: rich preview of the first five rows.
df.head(n=5)

Dataset Shape: (45211, 17)


Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [46]:
# ==========================================
# MODULE 3: Data Preprocessing
# ==========================================

# Convert target variable y -> target (1/0).
# The frame is rebuilt instead of being mutated in place, and the conversion is
# skipped when "y" has already been replaced, so re-running this cell no longer
# fails with a KeyError on the missing "y" column.
if "y" in df.columns:
    df = df.assign(target=df["y"].map({"yes": 1, "no": 0})).drop(columns="y")

# Series.map leaves NaN for any label that is not a key of the mapping; stop
# here with a clear message rather than handing NaN targets to later cells.
if df["target"].isna().any():
    raise ValueError("Column 'y' contains labels other than 'yes'/'no'.")

# One-hot encoding for categorical variables (first level of each is dropped)
df_encoded = pd.get_dummies(df, drop_first=True)

# Feature matrix = every encoded column except the label; y = the label.
X = df_encoded.drop(columns="target")
y = df_encoded["target"]

print("Encoded Dataset Shape:", df_encoded.shape)

Encoded Dataset Shape: (45211, 43)


In [47]:
# ==========================================
# MODULE 4: Train-Test Split
# ==========================================

# Hold out 25% of the rows for testing. The split is stratified on y and uses
# a fixed seed so the same partition is produced on every run.
split_options = {"test_size": 0.25, "random_state": 42, "stratify": y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print("Training Shape:", X_train.shape)
print("Testing Shape:", X_test.shape)

Training Shape: (33908, 42)
Testing Shape: (11303, 42)


In [48]:
# ==========================================
# MODULE 5: Feature Scaling
# ==========================================

# Fit the scaler on the training rows only, then apply that same fitted
# transformation to both partitions so no test-set statistics are used.
scaler = StandardScaler().fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [49]:
# ==========================================
# MODULE 6: Define Models
# ==========================================

# Hyper-parameters of the XGBoost model, grouped so they can be read at a glance.
xgb_params = {
    "n_estimators": 200,
    "max_depth": 4,
    "learning_rate": 0.1,
    "subsample": 0.9,
    "colsample_bytree": 0.9,
    "eval_metric": "logloss",
    "random_state": 42,
}

# (display name, unfitted estimator) pairs, in the order they are trained.
# The display names are reused later for the results table and the saved
# file names, so they must stay unchanged.
models = [
    ("Logistic Regression", LogisticRegression(max_iter=1000)),
    ("Decision Tree Classifier", DecisionTreeClassifier(random_state=42)),
    ("K-Nearest Neighbor Classifier", KNeighborsClassifier(n_neighbors=5)),
    ("Naive Bayes Classifier", GaussianNB()),
    (
        "Ensemble Model - Random Forest",
        RandomForestClassifier(n_estimators=100, random_state=42),
    ),
    ("Ensemble Model - XGBoost", XGBClassifier(**xgb_params)),
]

In [50]:
# ==========================================
# MODULE 7: Evaluation Function
# ==========================================

def evaluate_model(y_true, y_pred, y_prob):
    """Compute the classification metrics reported for one model.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted class labels.
    y_prob : array-like
        Scores used for the ROC AUC (callers in this notebook pass the
        predicted probability of class 1).

    Returns
    -------
    dict
        Keys "Accuracy", "AUC", "Precision", "Recall", "F1 Score" and "MCC",
        in that order, each mapped to the corresponding scikit-learn score.
    """
    scores = {
        "Accuracy": accuracy_score(y_true, y_pred),
        "AUC": roc_auc_score(y_true, y_prob),
    }

    # These three share the same call shape; zero_division=0 is passed so an
    # undefined score is reported as 0.
    label_based = (
        ("Precision", precision_score),
        ("Recall", recall_score),
        ("F1 Score", f1_score),
    )
    for label, scorer in label_based:
        scores[label] = scorer(y_true, y_pred, zero_division=0)

    scores["MCC"] = matthews_corrcoef(y_true, y_pred)
    return scores

In [51]:
# ==========================================
# MODULE 8: Train & Evaluate Models
# ==========================================

# Models fitted on the standardised features; every other model is fitted on
# the unscaled encoded features.
scaled_model_names = ["Logistic Regression", "K-Nearest Neighbor Classifier"]

results = []

for name, model in models:
    print(f"\nTraining {name}...")

    # Pick the matching train/test feature matrices once, then run a single
    # fit / predict path for every model.
    if name in scaled_model_names:
        train_features, test_features = X_train_scaled, X_test_scaled
    else:
        train_features, test_features = X_train, X_test

    model.fit(train_features, y_train)
    y_pred = model.predict(test_features)
    # Column 1 of predict_proba is the probability of the second class (label 1).
    y_prob = model.predict_proba(test_features)[:, 1]

    metrics = evaluate_model(y_test, y_pred, y_prob)
    metrics["Model"] = name
    results.append(metrics)

print("\nAll models trained successfully.")


Training Logistic Regression...

Training Decision Tree Classifier...

Training K-Nearest Neighbor Classifier...

Training Naive Bayes Classifier...

Training Ensemble Model - Random Forest...

Training Ensemble Model - XGBoost...

All models trained successfully.


In [54]:
# ==========================================
# MODULE 9: Model Comparison Table
# ==========================================

# Column order of the comparison table: model name first, then the metrics.
column_order = ["Model", "Accuracy", "AUC", "Precision", "Recall", "F1 Score", "MCC"]

# One row per trained model, columns reordered, values rounded to 4 decimals.
results_df = pd.DataFrame(results)[column_order].round(4)
results_df.to_csv("model_comparison_results.csv", index=False)
results_df

Unnamed: 0,Model,Accuracy,AUC,Precision,Recall,F1 Score,MCC
0,Logistic Regression,0.9015,0.906,0.6441,0.3533,0.4563,0.4295
1,Decision Tree Classifier,0.8759,0.6958,0.4688,0.4607,0.4647,0.3945
2,K-Nearest Neighbor Classifier,0.8938,0.8099,0.5845,0.3192,0.4129,0.38
3,Naive Bayes Classifier,0.8638,0.8227,0.4309,0.5144,0.469,0.3935
4,Ensemble Model - Random Forest,0.9058,0.9264,0.6671,0.388,0.4907,0.4625
5,Ensemble Model - XGBoost,0.9081,0.9314,0.6557,0.4508,0.5343,0.4957


In [53]:
# ==========================================
# MODULE 10: Save Models & Test Data
# ==========================================

# Directory that receives the pickled estimators and the scaler.
MODEL_DIR = "model/saved_models"
os.makedirs(MODEL_DIR, exist_ok=True)

# One pickle per fitted model. The file stem is the display name lower-cased,
# with spaces turned into "_" and "-" removed, e.g.
# "Ensemble Model - Random Forest" -> "ensemble_model___random_forest.pkl".
for name, model in models:
    stem = name.lower().replace(" ", "_").replace("-", "")
    model_path = os.path.join(MODEL_DIR, stem + ".pkl")
    joblib.dump(model, model_path)

# Save scaler next to the models
scaler_path = os.path.join(MODEL_DIR, "scaler.pkl")
joblib.dump(scaler, scaler_path)

# Save test dataset: unscaled features with the label appended as the last column
test_df = pd.concat([X_test, y_test], axis="columns")
test_df.to_csv("test_data.csv", index=False)

print("Models, scaler, and test data saved successfully.")

Models, scaler, and test data saved successfully.
