In [2]:
!pip install imblearn



**Let's import some of the main libraries**


In [3]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, classification_report, recall_score, f1_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.utils.class_weight import compute_class_weight
from sklearn.utils import resample
from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler



In [4]:
# Load the dataset
data = pd.read_csv('Bank-term-deposit.csv')

In [5]:
data

Unnamed: 0,Age,Job,Marital,Education,Default,housing,Loan,Contact,Month,day_of_week,...,campaign,pdays,previous,poutcome,empvarrate,conspriceidx,consconfidx,euribor3m,nremployed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41183,73,retired,married,professional.course,no,yes,no,cellular,nov,fri,...,1,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,yes
41184,46,blue-collar,married,professional.course,no,no,no,cellular,nov,fri,...,1,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,no
41185,56,retired,married,university.degree,no,yes,no,cellular,nov,fri,...,2,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,no
41186,44,technician,married,professional.course,no,no,no,cellular,nov,fri,...,1,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,yes


In [6]:
# data exploration
data.info()
data['y'].value_counts()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 21 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Age           41188 non-null  int64  
 1   Job           41188 non-null  object 
 2   Marital       41188 non-null  object 
 3   Education     41188 non-null  object 
 4   Default       41188 non-null  object 
 5   housing       41188 non-null  object 
 6   Loan          41188 non-null  object 
 7   Contact       41188 non-null  object 
 8   Month         41188 non-null  object 
 9   day_of_week   41188 non-null  object 
 10  duration      41188 non-null  int64  
 11  campaign      41188 non-null  int64  
 12  pdays         41188 non-null  int64  
 13  previous      41188 non-null  int64  
 14  poutcome      41188 non-null  object 
 15  empvarrate    41188 non-null  float64
 16  conspriceidx  41188 non-null  float64
 17  consconfidx   41188 non-null  float64
 18  euribor3m     41188 non-nu

y
no     36548
yes     4640
Name: count, dtype: int64

**Convert categorical data to numerical data**

In [7]:
for col in data.select_dtypes(include=['object']).columns:
  data[col] = data[col].astype('category').cat.codes

In [8]:
data

Unnamed: 0,Age,Job,Marital,Education,Default,housing,Loan,Contact,Month,day_of_week,...,campaign,pdays,previous,poutcome,empvarrate,conspriceidx,consconfidx,euribor3m,nremployed,y
0,56,3,1,0,0,0,0,1,6,1,...,1,999,0,1,1.1,93.994,-36.4,4.857,5191.0,0
1,57,7,1,3,1,0,0,1,6,1,...,1,999,0,1,1.1,93.994,-36.4,4.857,5191.0,0
2,37,7,1,3,0,2,0,1,6,1,...,1,999,0,1,1.1,93.994,-36.4,4.857,5191.0,0
3,40,0,1,1,0,0,0,1,6,1,...,1,999,0,1,1.1,93.994,-36.4,4.857,5191.0,0
4,56,7,1,3,0,0,2,1,6,1,...,1,999,0,1,1.1,93.994,-36.4,4.857,5191.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41183,73,5,1,5,0,2,0,0,7,0,...,1,999,0,1,-1.1,94.767,-50.8,1.028,4963.6,1
41184,46,1,1,5,0,0,0,0,7,0,...,1,999,0,1,-1.1,94.767,-50.8,1.028,4963.6,0
41185,56,5,1,6,0,2,0,0,7,0,...,2,999,0,1,-1.1,94.767,-50.8,1.028,4963.6,0
41186,44,9,1,5,0,0,0,0,7,0,...,1,999,0,1,-1.1,94.767,-50.8,1.028,4963.6,1


In [9]:
#Define features and target
X = data.drop('y', axis=1)
y = data['y']

In [10]:
X

Unnamed: 0,Age,Job,Marital,Education,Default,housing,Loan,Contact,Month,day_of_week,duration,campaign,pdays,previous,poutcome,empvarrate,conspriceidx,consconfidx,euribor3m,nremployed
0,56,3,1,0,0,0,0,1,6,1,261,1,999,0,1,1.1,93.994,-36.4,4.857,5191.0
1,57,7,1,3,1,0,0,1,6,1,149,1,999,0,1,1.1,93.994,-36.4,4.857,5191.0
2,37,7,1,3,0,2,0,1,6,1,226,1,999,0,1,1.1,93.994,-36.4,4.857,5191.0
3,40,0,1,1,0,0,0,1,6,1,151,1,999,0,1,1.1,93.994,-36.4,4.857,5191.0
4,56,7,1,3,0,0,2,1,6,1,307,1,999,0,1,1.1,93.994,-36.4,4.857,5191.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41183,73,5,1,5,0,2,0,0,7,0,334,1,999,0,1,-1.1,94.767,-50.8,1.028,4963.6
41184,46,1,1,5,0,0,0,0,7,0,383,1,999,0,1,-1.1,94.767,-50.8,1.028,4963.6
41185,56,5,1,6,0,2,0,0,7,0,189,2,999,0,1,-1.1,94.767,-50.8,1.028,4963.6
41186,44,9,1,5,0,0,0,0,7,0,442,1,999,0,1,-1.1,94.767,-50.8,1.028,4963.6


In [11]:
y

0        0
1        0
2        0
3        0
4        0
        ..
41183    1
41184    0
41185    0
41186    1
41187    0
Name: y, Length: 41188, dtype: int8

In [12]:
print (y.value_counts())

y
0    36548
1     4640
Name: count, dtype: int64


In [13]:
4640/41188 * 100

11.265417111780131

In [14]:
# Train-test split
X_train, x_test, y_train, y_test =  train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)


In [15]:
X_train.value_counts()

Age  Job  Marital  Education  Default  housing  Loan  Contact  Month  day_of_week  duration  campaign  pdays  previous  poutcome  empvarrate  conspriceidx  consconfidx  euribor3m  nremployed
36   5    1        7          0        0        0     1        3      2            88        1         999    0         1          1.4        93.918        -42.7        4.966      5228.1        2
55   7    1        3          1        0        0     0        1      1            33        1         999    0         1          1.4        93.444        -36.1        4.965      5228.1        2
41   9    1        5          0        2        0     0        1      3            127       1         999    0         1          1.4        93.444        -36.1        4.966      5228.1        2
39   1    1        1          0        0        0     1        6      2            124       1         999    0         1          1.1        93.994        -36.4        4.855      5191.0        2
47   9    0        3     

In [16]:
# UnderSampling
# class_0 = X_train[y_train == 0]
# class_1 = X_train[y_train == 1]
# class_0_downsampled = resample(class_0, replace=False, n_samples=len(class_1), random_state=42)
# X_train undersampled = pd.concat([class_0_downsampled, class_1])
# y_train undersampled = y_train[X_train_undersampled.index]

# undersampling using sklearn
undersampler = RandomUnderSampler(random_state=42)
X_train_undersampled, y_train_undersampled = undersampler.fit_resample(X_train, y_train)




In [17]:
X_train.value_counts()

Age  Job  Marital  Education  Default  housing  Loan  Contact  Month  day_of_week  duration  campaign  pdays  previous  poutcome  empvarrate  conspriceidx  consconfidx  euribor3m  nremployed
36   5    1        7          0        0        0     1        3      2            88        1         999    0         1          1.4        93.918        -42.7        4.966      5228.1        2
55   7    1        3          1        0        0     0        1      1            33        1         999    0         1          1.4        93.444        -36.1        4.965      5228.1        2
41   9    1        5          0        2        0     0        1      3            127       1         999    0         1          1.4        93.444        -36.1        4.966      5228.1        2
39   1    1        1          0        0        0     1        6      2            124       1         999    0         1          1.1        93.994        -36.4        4.855      5191.0        2
47   9    0        3     

In [18]:
y_train_undersampled.value_counts()

y
0    3712
1    3712
Name: count, dtype: int64

In [19]:
## Oversampling 
oversampler = RandomOverSampler(random_state=42)
X_train_oversampled, y_train_oversampled = oversampler.fit_resample(X_train, y_train)


In [20]:
y_train_oversampled.value_counts()

y
0    29238
1    29238
Name: count, dtype: int64

In [21]:
#SMOTE 
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)


In [22]:
y_train_smote.value_counts()

y
0    29238
1    29238
Name: count, dtype: int64

In [23]:
## K-nearest Neighbors(KNN)
knn = KNeighborsClassifier(n_neighbors=5) # value of k 

In [24]:
# Perform Stratified K-fold Cross-Validation for KNN
skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
accuracies, precisions, recalls, f1_scores = [], [], [], []


In [25]:
for train_index, val_index  in skf.split(X_train_smote, y_train_smote):
    X_train_fold, X_val_fold = X_train_smote.iloc[train_index], X_train_smote.iloc[val_index]
    y_train_fold, y_val_fold = y_train_smote.iloc[train_index], y_train_smote.iloc[val_index]

    knn.fit(X_train_fold, y_train_fold) #model training

    y_pred = knn.predict(X_val_fold)

    accuracies.append(accuracy_score(y_val_fold, y_pred))
    precisions.append(precision_score(y_val_fold, y_pred))
    recalls.append(recall_score(y_val_fold, y_pred))
    f1_scores.append(f1_score(y_val_fold, y_pred))



In [26]:
print("\nKNN Results:")
print(f"Accuracy: {np.mean(accuracies):.4f} +/- {np.std(accuracies):.4f}")
print(f"Precision: {np.mean(precisions):.4f} +/- {np.std(precisions):.4f}")
print(f"Recall: {np.mean(recalls):.4f} +/- {np.std(recalls):.4f}")
print(f"F1 Score: {np.mean(f1_scores):.4f} +/- {np.std(f1_scores):.4f}")



KNN Results:
Accuracy: 0.9162 +/- 0.0018
Precision: 0.8650 +/- 0.0032
Recall: 0.9863 +/- 0.0010
F1 Score: 0.9217 +/- 0.0015


In [27]:
# support vector machine(svm)
svm = SVC(kernel='rbf', probability=True)

# Perform Stratified K-fold Cross-Validation for SVM
precision, accuracies, recalls, f1_scores = [], [], [], []


In [28]:
for train_index, val_index in skf.split(X_train_smote, y_train_smote):
    X_train_fold, X_val_fold = X_train_smote.iloc[train_index], X_train_smote.iloc[val_index]
    y_train_fold, y_val_fold = y_train_smote.iloc[train_index], y_train_smote.iloc[val_index]

    svm.fit(X_train_fold, y_train_fold)
    y_pred = svm.predict(X_val_fold)

    accuracies.append(accuracy_score(y_val_fold, y_pred))
    precisions.append(precision_score(y_val_fold, y_pred))
    recalls.append(recall_score(y_val_fold, y_pred))
    f1_scores.append(f1_score(y_val_fold, y_pred))


In [None]:
print("\nSVM Results:")
print(f"Accuracy: {np.mean(accuracies):.4f} +/- {np.std(accuracies):.4f}")
print(f"Precision: {np.mean(precisions):.4f} +/- {np.std(precisions):.4f}")
print(f"Recall: {np.mean(recalls):.4f} +/- {np.std(recalls):.4f}")
print(f"F1 Score: {np.mean(f1_scores):.4f} +/- {np.std(f1_scores):.4f}")