In [1]:
#import linear algebra and data manipulation libraries
import numpy as np
import pandas as pd

#import standard visualization
import matplotlib.pyplot as plt
import seaborn as sns

#import machine learning
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

import xgboost

from sklearn.model_selection import train_test_split #split
from sklearn.metrics import accuracy_score #metrics

#tools for hyperparameters search
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

In [2]:
inputFile = "/Users/allenayodeji/Desktop/BANK-MARKETING-CAMPAIGN-/bank.csv"
df = pd.read_csv(inputFile, sep=';')
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,30,unemployed,married,primary,no,1787,no,no,cellular,19,oct,79,1,-1,0,unknown,no
1,33,services,married,secondary,no,4789,yes,yes,cellular,11,may,220,1,339,4,failure,no
2,35,management,single,tertiary,no,1350,yes,no,cellular,16,apr,185,1,330,1,failure,no
3,30,management,married,tertiary,no,1476,yes,yes,unknown,3,jun,199,4,-1,0,unknown,no
4,59,blue-collar,married,secondary,no,0,yes,no,unknown,5,may,226,1,-1,0,unknown,no


In [3]:
# number of rows in dataset

print("Bank marketing dataset consists of {rows} rows.".format(rows = len(df)))

Bank marketing dataset consists of 4521 rows.


In [4]:
#Replacing the '-1' elements with the mean of elements in the 'Pdays' column
df = df.replace(to_replace = -1, value = 40.19782796222158)

In [5]:
#Converting the Default Column to Binary 
df['is_default'] = df['default'].apply(lambda row: 1 if row == 'yes' else 0)

In [6]:
df[['default','is_default']].head(10)


Unnamed: 0,default,is_default
0,no,0
1,no,0
2,no,0
3,no,0
4,no,0
5,no,0
6,no,0
7,no,0
8,no,0
9,no,0


In [7]:
#Convert housing column into numeric value
df['is_housing'] = df['housing'].apply(lambda row: 1 if row == 'yes' else 0)
df[['housing','is_housing']].head(10)

Unnamed: 0,housing,is_housing
0,no,0
1,yes,1
2,yes,1
3,yes,1
4,yes,1
5,no,0
6,yes,1
7,yes,1
8,yes,1
9,yes,1


In [8]:
#Convert loan column into numeric value
df['is_loan'] = df['loan'].apply(lambda row: 1 if row == 'yes' else 0)
df[['loan', 'is_loan']].tail(10)

Unnamed: 0,loan,is_loan
4511,no,0
4512,no,0
4513,no,0
4514,no,0
4515,no,0
4516,no,0
4517,yes,1
4518,no,0
4519,no,0
4520,yes,1


In [9]:
# Convert column ‘y’ into numeric value
df['target'] = df['y'].apply(lambda row: 1 if row == 'yes' else 0)
df[['y', 'target']].tail(10)

Unnamed: 0,y,target
4511,yes,1
4512,no,0
4513,no,0
4514,no,0
4515,no,0
4516,no,0
4517,no,0
4518,no,0
4519,no,0
4520,no,0


In [10]:
#Creating one-hot encoding for non-numeric marital column
marital_dummies = pd.get_dummies(df['marital'], prefix = 'marital')

#Merge marital_dummies with marital column
pd.concat([df['marital'], marital_dummies], axis=1)

# It is important to eliminate any redundancy and correlations in features as it becomes difficult to determine which feature is most important in minimizing the total error.
marital_dummies.drop('marital_divorced', axis=1, inplace=True)

#Merge marital_dummies into main dataframe
df = pd.concat([df, marital_dummies], axis=1)
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,...,pdays,previous,poutcome,y,is_default,is_housing,is_loan,target,marital_married,marital_single
0,30,unemployed,married,primary,no,1787.0,no,no,cellular,19,...,40.197828,0,unknown,no,0,0,0,0,1,0
1,33,services,married,secondary,no,4789.0,yes,yes,cellular,11,...,339.0,4,failure,no,0,1,1,0,1,0
2,35,management,single,tertiary,no,1350.0,yes,no,cellular,16,...,330.0,1,failure,no,0,1,0,0,0,1
3,30,management,married,tertiary,no,1476.0,yes,yes,unknown,3,...,40.197828,0,unknown,no,0,1,1,0,1,0
4,59,blue-collar,married,secondary,no,0.0,yes,no,unknown,5,...,40.197828,0,unknown,no,0,1,0,0,1,0


In [11]:
#Creating one hot encoding for job column
job_dummies = pd.get_dummies(df['job'], prefix = 'job')

#Dropping the unknown column
job_dummies.drop('job_unknown', axis=1, inplace=True)

#Merge job_dummies into main dataframe
df = pd.concat([df, job_dummies], axis=1)
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,...,job_blue-collar,job_entrepreneur,job_housemaid,job_management,job_retired,job_self-employed,job_services,job_student,job_technician,job_unemployed
0,30,unemployed,married,primary,no,1787.0,no,no,cellular,19,...,0,0,0,0,0,0,0,0,0,1
1,33,services,married,secondary,no,4789.0,yes,yes,cellular,11,...,0,0,0,0,0,0,1,0,0,0
2,35,management,single,tertiary,no,1350.0,yes,no,cellular,16,...,0,0,0,1,0,0,0,0,0,0
3,30,management,married,tertiary,no,1476.0,yes,yes,unknown,3,...,0,0,0,1,0,0,0,0,0,0
4,59,blue-collar,married,secondary,no,0.0,yes,no,unknown,5,...,1,0,0,0,0,0,0,0,0,0


In [12]:
#Creating one hot encoding for education column
education_dummies = pd.get_dummies(df['education'], prefix = 'education')

#Dropping the Unknown column
education_dummies.drop('education_unknown', axis=1, inplace=True)

#Merging education_dummies into main dataframe
df = pd.concat([df, education_dummies], axis=1)
df.head()


Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,...,job_management,job_retired,job_self-employed,job_services,job_student,job_technician,job_unemployed,education_primary,education_secondary,education_tertiary
0,30,unemployed,married,primary,no,1787.0,no,no,cellular,19,...,0,0,0,0,0,0,1,1,0,0
1,33,services,married,secondary,no,4789.0,yes,yes,cellular,11,...,0,0,0,1,0,0,0,0,1,0
2,35,management,single,tertiary,no,1350.0,yes,no,cellular,16,...,1,0,0,0,0,0,0,0,0,1
3,30,management,married,tertiary,no,1476.0,yes,yes,unknown,3,...,1,0,0,0,0,0,0,0,0,1
4,59,blue-collar,married,secondary,no,0.0,yes,no,unknown,5,...,0,0,0,0,0,0,0,0,1,0


In [13]:
#Creating one hot encoding for contact column
contact_dummies = pd.get_dummies(df['contact'], prefix = 'contact')

#Dropping the contact_unknown column
contact_dummies.drop('contact_unknown', axis=1, inplace=True)

#Merging contact_dummies into main dataframe
df = pd.concat([df, contact_dummies], axis=1)
df.head()


Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,...,job_self-employed,job_services,job_student,job_technician,job_unemployed,education_primary,education_secondary,education_tertiary,contact_cellular,contact_telephone
0,30,unemployed,married,primary,no,1787.0,no,no,cellular,19,...,0,0,0,0,1,1,0,0,1,0
1,33,services,married,secondary,no,4789.0,yes,yes,cellular,11,...,0,1,0,0,0,0,1,0,1,0
2,35,management,single,tertiary,no,1350.0,yes,no,cellular,16,...,0,0,0,0,0,0,0,1,1,0
3,30,management,married,tertiary,no,1476.0,yes,yes,unknown,3,...,0,0,0,0,0,0,0,1,0,0
4,59,blue-collar,married,secondary,no,0.0,yes,no,unknown,5,...,0,0,0,0,0,0,1,0,0,0


In [14]:
#Creating one hot encoding for poutcome column
poutcome_dummies = pd.get_dummies(df['poutcome'], prefix = 'poutcome')

#Dropping the poutcome_unknown
poutcome_dummies.drop('poutcome_unknown', axis=1, inplace=True)

#Merging poutcome_dummies into main dataframe
df = pd.concat([df, poutcome_dummies], axis=1)
df.head()


Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,...,job_technician,job_unemployed,education_primary,education_secondary,education_tertiary,contact_cellular,contact_telephone,poutcome_failure,poutcome_other,poutcome_success
0,30,unemployed,married,primary,no,1787.0,no,no,cellular,19,...,0,1,1,0,0,1,0,0,0,0
1,33,services,married,secondary,no,4789.0,yes,yes,cellular,11,...,0,0,0,1,0,1,0,1,0,0
2,35,management,single,tertiary,no,1350.0,yes,no,cellular,16,...,0,0,0,0,1,1,0,1,0,0
3,30,management,married,tertiary,no,1476.0,yes,yes,unknown,3,...,0,0,0,0,1,0,0,0,0,0
4,59,blue-collar,married,secondary,no,0.0,yes,no,unknown,5,...,0,0,0,1,0,0,0,0,0,0


In [15]:
#Convert month column into numeric value
months = {'jan':1, 'feb':2, 'mar':3, 'apr':4, 'may':5, 'jun':6, 'jul':7, 'aug':8, 'sep':9, 'oct':10, 'nov':11, 'dec': 12}
df['month'] = df['month'].map(months)
df['month'].head()



0    10
1     5
2     4
3     6
4     5
Name: month, dtype: int64

In [16]:
#For the Pdays Column, previously we replaced the '-1' with the mean of the column but this can affect our model due to the size of values we covertedf
#If the value of ‘pdays’ is ‘-1’, if so we will associate that with a value of 0,


df['was_contacted'] = df['pdays'].apply(lambda row: 0 if row == 40.19782796222158 else 1)
df[['pdays','was_contacted']].head()

Unnamed: 0,pdays,was_contacted
0,40.197828,0
1,339.0,1
2,330.0,1
3,40.197828,0
4,40.197828,0


In [17]:
#Deleting all unnecessary Columns for our model
df.drop(['job', 'education', 'marital', 'default', 'housing', 'loan', 'contact', 'pdays', 'poutcome', 'y'], axis=1, inplace=True)

In [18]:
df.head()

Unnamed: 0,age,balance,day,month,duration,campaign,previous,is_default,is_housing,is_loan,...,job_unemployed,education_primary,education_secondary,education_tertiary,contact_cellular,contact_telephone,poutcome_failure,poutcome_other,poutcome_success,was_contacted
0,30,1787.0,19,10,79,1,0,0,0,0,...,1,1,0,0,1,0,0,0,0,0
1,33,4789.0,11,5,220,1,4,0,1,1,...,0,0,1,0,1,0,1,0,0,1
2,35,1350.0,16,4,185,1,1,0,1,0,...,0,0,0,1,1,0,1,0,0,1
3,30,1476.0,3,6,199,4,0,0,1,1,...,0,0,0,1,0,0,0,0,0,0
4,59,0.0,5,5,226,1,0,0,1,0,...,0,0,1,0,0,0,0,0,0,0


In [19]:
df.dtypes

age                      int64
balance                float64
day                      int64
month                    int64
duration                 int64
campaign                 int64
previous                 int64
is_default               int64
is_housing               int64
is_loan                  int64
target                   int64
marital_married          uint8
marital_single           uint8
job_admin.               uint8
job_blue-collar          uint8
job_entrepreneur         uint8
job_housemaid            uint8
job_management           uint8
job_retired              uint8
job_self-employed        uint8
job_services             uint8
job_student              uint8
job_technician           uint8
job_unemployed           uint8
education_primary        uint8
education_secondary      uint8
education_tertiary       uint8
contact_cellular         uint8
contact_telephone        uint8
poutcome_failure         uint8
poutcome_other           uint8
poutcome_success         uint8
was_cont

In [20]:
c

In [21]:
#let's check the shape of the X and y
X.shape


(4521, 32)

In [22]:
y.shape

(4521,)

In [23]:
#Dividing features and target into train and test data

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 32)

In [24]:
#View the shape of X_train, X_test, y_train, y_test
X_train.shape

(3616, 32)

In [25]:
y_train.shape


(3616,)

In [26]:
X_test.shape

(905, 32)

In [27]:
y_test.shape

(905,)

## Logistic Regression Model

In [36]:
from sklearn.linear_model import LogisticRegression
model=LogisticRegression(max_iter=5000)
model.fit(X_train,y_train)

LogisticRegression(max_iter=5000)

In [38]:
y_pred = model.predict(X_test)

In [39]:
#Printing predicted and actual value
print("Predicted value: ", y_pred[:10])
print("Actual value: ", y_test[:10])

Predicted value:  [0 0 0 0 0 0 0 0 0 0]
Actual value:  4519    0
784     0
25      0
2461    0
960     0
3878    0
3399    0
1262    0
760     0
1175    0
Name: target, dtype: int64


In [40]:
#Accuracy of the Model

from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_pred = y_pred, y_true = y_test)
print(f'Accuracy of the model Logistic Regression is {accuracy*100:.2f}%')

Accuracy of the model Logistic Regression is 91.38%


## Random Forest Classifier

In [41]:
rfc = RandomForestClassifier()
rfc.fit(X_train, y_train)
rfcpredictions = rfc.predict(X_test)
print("Predicted value: ", rfcpredictions[:10])
print("Actual value: ", y_test[:10])

Predicted value:  [0 0 0 0 0 0 0 0 0 1]
Actual value:  4519    0
784     0
25      0
2461    0
960     0
3878    0
3399    0
1262    0
760     0
1175    0
Name: target, dtype: int64


In [42]:
# Accuracy of the model
accuracy = accuracy_score(y_pred = rfcpredictions, y_true = y_test)
print(f'Accuracy of the Random Forest Classifier model is {accuracy*100:.2f}%')

Accuracy of the Random Forest Classifier model is 91.16%


## SVC (support Vector Classifier)

In [43]:
from sklearn.svm import SVC
svc = SVC()
svc.fit(X_train, y_train)
svcpredictions = svc.predict(X_test)
print("Predicted value: ", svcpredictions[:10])
print("Actual value: ", y_test[:10])

Predicted value:  [0 0 0 0 0 0 0 0 0 0]
Actual value:  4519    0
784     0
25      0
2461    0
960     0
3878    0
3399    0
1262    0
760     0
1175    0
Name: target, dtype: int64


In [44]:
#Accuracy of the model
accuracy = accuracy_score(y_pred = svcpredictions, y_true = y_test)
print(f'Accuracy of the SVC model is {accuracy*100:.2f}%')

Accuracy of the SVC model is 89.61%


## Decision Tree Classifier

In [45]:
from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifier(random_state=0)
dtc.fit(X_train, y_train)
dtcprediction = dtc.predict(X_test)
print("Predicted value: ", dtcprediction[:10])
print("Actual value: ", y_test[:10])

Predicted value:  [0 0 0 0 0 0 0 0 0 1]
Actual value:  4519    0
784     0
25      0
2461    0
960     0
3878    0
3399    0
1262    0
760     0
1175    0
Name: target, dtype: int64


In [46]:
# Accuracy of the model
accuracy = accuracy_score(y_pred = dtcprediction, y_true = y_test)
print(f'Accuracy of the Decision Tree Classifier model is {accuracy*100:.2f}%')

Accuracy of the Decision Tree Classifier model is 87.29%
