In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

from sklearn.metrics import (
    accuracy_score,
    roc_auc_score,
    precision_score,
    recall_score,
    f1_score,
    matthews_corrcoef
)


In [2]:
import urllib.request
import zipfile

# Download dataset
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00222/bank-additional.zip"
urllib.request.urlretrieve(url, "bank-additional.zip")

# Extract dataset
with zipfile.ZipFile("bank-additional.zip", 'r') as zip_ref:
    zip_ref.extractall("bank_dataset")

In [3]:
df = pd.read_csv(
    "bank_dataset/bank-additional/bank-additional-full.csv",
    sep=';'
)

df.head()


Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


In [4]:
df['y'] = df['y'].map({'yes': 1, 'no': 0})


In [5]:
categorical_cols = df.select_dtypes(include=['object']).columns

le = LabelEncoder()

for col in categorical_cols:
    df[col] = le.fit_transform(df[col])


In [7]:
X = df.drop("y", axis=1)
y = df["y"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [8]:
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [9]:
def evaluate_model(y_test, y_pred, y_prob):
    return {
        "Accuracy": accuracy_score(y_test, y_pred),
        "AUC": roc_auc_score(y_test, y_prob),
        "Precision": precision_score(y_test, y_pred),
        "Recall": recall_score(y_test, y_pred),
        "F1 Score": f1_score(y_test, y_pred),
        "MCC": matthews_corrcoef(y_test, y_pred)
    }


In [10]:
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train_scaled, y_train)

y_pred = lr.predict(X_test_scaled)
y_prob = lr.predict_proba(X_test_scaled)[:, 1]

lr_results = evaluate_model(y_test, y_pred, y_prob)


In [11]:
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)

y_pred = dt.predict(X_test)
y_prob = dt.predict_proba(X_test)[:, 1]

dt_results = evaluate_model(y_test, y_pred, y_prob)


In [12]:
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train_scaled, y_train)

y_pred = knn.predict(X_test_scaled)
y_prob = knn.predict_proba(X_test_scaled)[:, 1]

knn_results = evaluate_model(y_test, y_pred, y_prob)


In [13]:
nb = GaussianNB()
nb.fit(X_train, y_train)

y_pred = nb.predict(X_test)
y_prob = nb.predict_proba(X_test)[:, 1]

nb_results = evaluate_model(y_test, y_pred, y_prob)


In [14]:
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

y_pred = rf.predict(X_test)
y_prob = rf.predict_proba(X_test)[:, 1]

rf_results = evaluate_model(y_test, y_pred, y_prob)


In [15]:
xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
xgb.fit(X_train, y_train)

y_pred = xgb.predict(X_test)
y_prob = xgb.predict_proba(X_test)[:, 1]

xgb_results = evaluate_model(y_test, y_pred, y_prob)


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [16]:
results_df = pd.DataFrame.from_dict({
    "Logistic Regression": lr_results,
    "Decision Tree": dt_results,
    "KNN": knn_results,
    "Naive Bayes": nb_results,
    "Random Forest": rf_results,
    "XGBoost": xgb_results
}, orient="index")

results_df


Unnamed: 0,Accuracy,AUC,Precision,Recall,F1 Score,MCC
Logistic Regression,0.913935,0.937014,0.700183,0.412716,0.519322,0.495592
Decision Tree,0.895606,0.753019,0.534413,0.568966,0.551148,0.492457
KNN,0.905317,0.861676,0.626712,0.394397,0.484127,0.449126
Naive Bayes,0.853605,0.860651,0.402388,0.617457,0.487245,0.418938
Random Forest,0.920369,0.949055,0.688889,0.534483,0.601942,0.564037
XGBoost,0.916727,0.949524,0.650498,0.563578,0.603926,0.559456


In [17]:
import os
import pickle

if not os.path.exists("model"):
    os.makedirs("model")

pickle.dump(lr, open("model/Logistic_Regression.pkl", "wb"))
pickle.dump(dt, open("model/Decision_Tree.pkl", "wb"))
pickle.dump(knn, open("model/KNN.pkl", "wb"))
pickle.dump(nb, open("model/Naive_Bayes.pkl", "wb"))
pickle.dump(rf, open("model/Random_Forest.pkl", "wb"))
pickle.dump(xgb, open("model/XGBoost.pkl", "wb"))
pickle.dump(scaler, open("model/scaler.pkl", "wb"))

print("All models saved successfully ✅")


All models saved successfully ✅
