In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
data = pd.read_csv("bank-full.csv", delimiter=";")
data

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,51,technician,married,tertiary,no,825,no,no,cellular,17,nov,977,3,-1,0,unknown,yes
45207,71,retired,divorced,primary,no,1729,no,no,cellular,17,nov,456,2,-1,0,unknown,yes
45208,72,retired,married,secondary,no,5715,no,no,cellular,17,nov,1127,5,184,3,success,yes
45209,57,blue-collar,married,secondary,no,668,no,no,telephone,17,nov,508,4,-1,0,unknown,no


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        45211 non-null  int64 
 1   job        45211 non-null  object
 2   marital    45211 non-null  object
 3   education  45211 non-null  object
 4   default    45211 non-null  object
 5   balance    45211 non-null  int64 
 6   housing    45211 non-null  object
 7   loan       45211 non-null  object
 8   contact    45211 non-null  object
 9   day        45211 non-null  int64 
 10  month      45211 non-null  object
 11  duration   45211 non-null  int64 
 12  campaign   45211 non-null  int64 
 13  pdays      45211 non-null  int64 
 14  previous   45211 non-null  int64 
 15  poutcome   45211 non-null  object
 16  y          45211 non-null  object
dtypes: int64(7), object(10)
memory usage: 5.9+ MB


In [4]:
data.describe()

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous
count,45211.0,45211.0,45211.0,45211.0,45211.0,45211.0,45211.0
mean,40.93621,1362.272058,15.806419,258.16308,2.763841,40.197828,0.580323
std,10.618762,3044.765829,8.322476,257.527812,3.098021,100.128746,2.303441
min,18.0,-8019.0,1.0,0.0,1.0,-1.0,0.0
25%,33.0,72.0,8.0,103.0,1.0,-1.0,0.0
50%,39.0,448.0,16.0,180.0,2.0,-1.0,0.0
75%,48.0,1428.0,21.0,319.0,3.0,-1.0,0.0
max,95.0,102127.0,31.0,4918.0,63.0,871.0,275.0


In [4]:
# Linear Models
from sklearn.linear_model import LogisticRegression # Logistic Reg
# Ensemble Models
from sklearn.ensemble import RandomForestClassifier  # Random Forest
# Boosting Models
from sklearn.ensemble import GradientBoostingClassifier  # GBM
from xgboost import XGBClassifier  # XGBoost
from lightgbm import LGBMClassifier  # LightGBM
# Other Models
from sklearn.tree import DecisionTreeClassifier  # Decision Trees
from sklearn.svm import SVC  # Support Vector Classification
from sklearn.neighbors import KNeighborsClassifier  # K-Nearest Neighbors (KNN)


# Data processing
from sklearn.preprocessing import StandardScaler
from sklearn.calibration import LabelEncoder
from sklearn.model_selection import train_test_split

# Evaluate
from sklearn.metrics import roc_auc_score, roc_curve, accuracy_score, recall_score, f1_score, \
precision_score, confusion_matrix, classification_report, ConfusionMatrixDisplay
from sklearn.model_selection import cross_val_score, GridSearchCV, train_test_split, RandomizedSearchCV

from scipy.stats import randint
# setting
import warnings
warnings.filterwarnings("ignore")

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [5]:
data["y"] = data["y"].map({"yes": 1, "no": 0})

data = pd.get_dummies(data)
data

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous,y,job_admin.,job_blue-collar,...,month_jun,month_mar,month_may,month_nov,month_oct,month_sep,poutcome_failure,poutcome_other,poutcome_success,poutcome_unknown
0,58,2143,5,261,1,-1,0,0,False,False,...,False,False,True,False,False,False,False,False,False,True
1,44,29,5,151,1,-1,0,0,False,False,...,False,False,True,False,False,False,False,False,False,True
2,33,2,5,76,1,-1,0,0,False,False,...,False,False,True,False,False,False,False,False,False,True
3,47,1506,5,92,1,-1,0,0,False,True,...,False,False,True,False,False,False,False,False,False,True
4,33,1,5,198,1,-1,0,0,False,False,...,False,False,True,False,False,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,51,825,17,977,3,-1,0,1,False,False,...,False,False,False,True,False,False,False,False,False,True
45207,71,1729,17,456,2,-1,0,1,False,False,...,False,False,False,True,False,False,False,False,False,True
45208,72,5715,17,1127,5,184,3,1,False,False,...,False,False,False,True,False,False,False,False,True,False
45209,57,668,17,508,4,-1,0,0,False,True,...,False,False,False,True,False,False,False,False,False,True


In [6]:
x = data.drop(columns=["y"])
y = data["y"]

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=1)

print("Training Set Shape:", x_train.shape)
print("Testing Set Shape:", x_test.shape)

Training Set Shape: (36168, 51)
Testing Set Shape: (9043, 51)


In [7]:
scaler = StandardScaler()
scaler.fit(x_train)

x_train_scaled = scaler.transform(x_train)
x_test_scaled = scaler.transform(x_test)

In [8]:
log_model = LogisticRegression(random_state=1)
log_model.fit(x_train_scaled, y_train)

log_pred = log_model.predict(x_test_scaled)
log_accuracy = accuracy_score(y_test, log_pred)
print(f"The accuracy score for Logistic Regression is: {log_accuracy}")

The accuracy score for Logistic Regression is: 0.8992590954329316


In [9]:
gbm_model = GradientBoostingClassifier(random_state=1)
gbm_model.fit(x_train_scaled, y_train)

gbm_pred = gbm_model.predict(x_test_scaled)
gbm_accuracy = accuracy_score(y_test, gbm_pred)
print(f"The accuracy score for Gradient Boosting is: {gbm_accuracy}")

The accuracy score for Gradient Boosting is: 0.9047882339931439


In [10]:
svm_model = SVC(random_state=1)
svm_model.fit(x_train_scaled, y_train)

svm_pred = svm_model.predict(x_test_scaled)
svm_accuracy = accuracy_score(y_test, svm_pred)
print(f"The accuracy score for SVM is: {svm_accuracy}")

The accuracy score for SVM is: 0.9036824062811014


In [11]:
rf_model = RandomForestClassifier(random_state=1)
rf_model.fit(x_train_scaled, y_train)


rf_pred = rf_model.predict(x_test_scaled)
rf_accuracy = accuracy_score(y_test, rf_pred)
print(f"The accuracy score for Random Forest is: {rf_accuracy}")

The accuracy score for Random Forest is: 0.9060046444763906


In [12]:
xgb_model = XGBClassifier(random_state=1)
xgb_model.fit(x_train_scaled, y_train)

xgb_pred = xgb_model.predict(x_test_scaled)
xgb_accuracy = accuracy_score(y_test, xgb_pred)
print(f"The accuracy score for XGBoost is: {xgb_accuracy}")

The accuracy score for XGBoost is: 0.9081057171292712


In [18]:
models = {
    "log": LogisticRegression(random_state=1),
    "gbm": GradientBoostingClassifier(random_state=1),
    "svm": SVC(random_state=1),
    "rf": RandomForestClassifier(random_state=1),
    "xgb": XGBClassifier(random_state=1)
}

for name, model in models.items():
    model.fit(x_train_scaled, y_train)

In [19]:
for name, model in models.items():
    y_pred = model.predict(x_test_scaled)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, zero_division=0)
recall = recall_score(y_test, y_pred, zero_division=0)
f1 = f1_score(y_test, y_pred, zero_division=0)

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

Accuracy: 0.9081
Precision: 0.6287
Recall: 0.5095
F1 Score: 0.5629


In [20]:
# Logistic Regression
log_param_grid = {
    'C': [0.01, 0.1, 1, 10],
    'solver': ['liblinear', 'lbfgs']
}
log_grid = GridSearchCV(LogisticRegression(random_state=1), log_param_grid, cv=5)
log_grid.fit(x_train_scaled, y_train)
log_best_model = log_grid.best_estimator_
log_pred = log_best_model.predict(x_test_scaled)
log_accuracy = accuracy_score(y_test, log_pred)
print(f"Best Logistic Regression accuracy: {log_accuracy}, Best Parameters: {log_grid.best_params_}")

Best Logistic Regression accuracy: 0.8992590954329316, Best Parameters: {'C': 0.1, 'solver': 'liblinear'}
