In [7]:
#Step 1: Import Libraries
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier

from xgboost import XGBClassifier
import joblib
from joblib import dump, load
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    matthews_corrcoef,
    classification_report
)

In [8]:
#Step 2: Load Dataset
# The CSV is parsed with ';' as the field delimiter instead of the default ','.
data = pd.read_csv("bank-full.csv", delimiter=";")

# Report (rows, columns), then preview the first five records.
print(data.shape)
data.head(5)


(45211, 17)


Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [9]:

#Step 3: Preprocessing
# Target encoding: map the 'yes'/'no' labels to 1/0.
# The dtype guard keeps this cell idempotent. On a re-run 'y' is already
# numeric, and mapping 0/1 through the same dict would turn every label into NaN.
if not pd.api.types.is_numeric_dtype(data['y']):
    encoded_target = data['y'].map({'yes': 1, 'no': 0})
    # Any label outside the mapping becomes NaN; fail loudly instead of
    # carrying a corrupted target into the split and the models.
    if encoded_target.isna().any():
        raise ValueError("Column 'y' contains values other than 'yes'/'no'")
    data['y'] = encoded_target.astype('int64')

# Features are every column except the target.
X = data.drop('y', axis=1)
y = data['y']

# Identify categorical & numerical columns by dtype
categorical_cols = X.select_dtypes(include=['object']).columns
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns

# Preprocessing pipeline: standardise numeric columns, one-hot encode the
# categorical ones. handle_unknown='ignore' makes categories unseen during
# fit encode as all zeros instead of raising at transform time.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ]
)

# Train-Test Split: 80/20, stratified on y so both parts keep the same
# class ratio; the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [10]:
#Step 4: Define All 6 Models
# Maps a display name to an unfitted estimator. The names are also used
# verbatim in the saved model file names and in the results table.
models = {

    # max_iter raised above the library default
    "Logistic Regression": LogisticRegression(max_iter=1000),

    "Decision Tree": DecisionTreeClassifier(random_state=42),

    "KNN": KNeighborsClassifier(n_neighbors=5),

    "Naive Bayes": GaussianNB(),

    "Random Forest (Ensemble)": RandomForestClassifier(n_estimators=100, random_state=42),

    # `use_label_encoder` is intentionally not passed: the installed XGBoost
    # ignores it and emits a "Parameters: { "use_label_encoder" } are not used."
    # warning on every fit.
    "XGBoost (Ensemble)": XGBClassifier(
        eval_metric='logloss',
        random_state=42
    )
}


In [11]:
#Step 5: Train & Evaluate All Models
def _to_dense(matrix):
    """Return `matrix` as a dense array, converting sparse matrices via `.toarray()`."""
    return matrix.toarray() if hasattr(matrix, "toarray") else matrix


# One row per model: [name, accuracy, auc, precision, recall, f1, mcc]
results = []

for name, model in models.items():

    # Special handling for GaussianNB (needs dense array)
    if name == "Naive Bayes":
        # ColumnTransformer returns a sparse matrix whenever the stacked output
        # density falls below its sparse_threshold, so densify explicitly
        # instead of relying on the data happening to be dense enough.
        X_train_processed = _to_dense(preprocessor.fit_transform(X_train))
        X_test_processed = _to_dense(preprocessor.transform(X_test))

        model.fit(X_train_processed, y_train)
        # Persisted as a (preprocessor, model) tuple rather than a Pipeline;
        # code that loads it must transform (and densify) inputs itself.
        joblib.dump((preprocessor, model), f"bank_model_{name}.pkl",compress=3)
        y_pred = model.predict(X_test_processed)
        y_prob = model.predict_proba(X_test_processed)[:, 1]

    else:
        # All other models are fitted and persisted as one Pipeline, so
        # preprocessing and the estimator travel together in a single file.
        pipe = Pipeline(steps=[
            ('preprocessor', preprocessor),
            ('model', model)
        ])

        pipe.fit(X_train, y_train)
        joblib.dump(pipe, f"bank_model_{name}.pkl",compress=3)
        y_pred = pipe.predict(X_test)
        y_prob = pipe.predict_proba(X_test)[:, 1]

    # AUC is computed from the positive-class probability; every other metric
    # uses the hard class predictions.
    accuracy = accuracy_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_prob)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    mcc = matthews_corrcoef(y_test, y_pred)

    results.append([name, accuracy, auc, precision, recall, f1, mcc])

    print(f"\n{name}")
    print(classification_report(y_test, y_pred))



Logistic Regression
              precision    recall  f1-score   support

           0       0.92      0.97      0.95      7985
           1       0.64      0.35      0.45      1058

    accuracy                           0.90      9043
   macro avg       0.78      0.66      0.70      9043
weighted avg       0.89      0.90      0.89      9043


Decision Tree
              precision    recall  f1-score   support

           0       0.93      0.93      0.93      7985
           1       0.46      0.48      0.47      1058

    accuracy                           0.87      9043
   macro avg       0.70      0.70      0.70      9043
weighted avg       0.88      0.87      0.88      9043


KNN
              precision    recall  f1-score   support

           0       0.92      0.97      0.94      7985
           1       0.60      0.34      0.43      1058

    accuracy                           0.90      9043
   macro avg       0.76      0.66      0.69      9043
weighted avg       0.88      0.90

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



XGBoost (Ensemble)
              precision    recall  f1-score   support

           0       0.93      0.96      0.95      7985
           1       0.63      0.47      0.54      1058

    accuracy                           0.91      9043
   macro avg       0.78      0.72      0.74      9043
weighted avg       0.90      0.91      0.90      9043



In [12]:
#Step 6: Compare Model Performance
# Column labels, in the same order as the values in each row of `results`.
metric_columns = ["Model", "Accuracy", "AUC", "Precision", "Recall", "F1", "MCC"]
results_df = pd.DataFrame(data=results, columns=metric_columns)

# Display the comparison table ordered by its index.
results_df.sort_index()


Unnamed: 0,Model,Accuracy,AUC,Precision,Recall,F1,MCC
0,Logistic Regression,0.90125,0.905574,0.644483,0.347826,0.451811,0.426058
1,Decision Tree,0.874599,0.701457,0.46488,0.475425,0.470093,0.399021
2,KNN,0.896163,0.827721,0.599002,0.340265,0.433996,0.400128
3,Naive Bayes,0.854805,0.810095,0.405904,0.519849,0.455864,0.377358
4,Random Forest (Ensemble),0.904456,0.926286,0.650621,0.39603,0.492362,0.459724
5,XGBoost (Ensemble),0.905452,0.928747,0.626717,0.47448,0.540075,0.494383
