<b>IMPORT RELEVANT LIBRARIES</b>

In [1]:
import pandas as pd
import numpy as np
import scipy as sp
from sklearn.preprocessing import LabelEncoder , OneHotEncoder
%matplotlib inline
from sklearn.tree import DecisionTreeClassifier

<b>LOADING DATA</b>

In [2]:
# Read the semicolon-separated training and evaluation files into DataFrames.
df_train = pd.read_csv('Bank_Train.csv', sep=';')
df_test = pd.read_csv('Bank_Test.csv', sep=';')
# Preview the first rows of the training frame.
df_train.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [3]:
# Show the dtype pandas inferred for each column of the training frame.
df_train.dtypes

age           int64
job          object
marital      object
education    object
default      object
balance       int64
housing      object
loan         object
contact      object
day           int64
month        object
duration      int64
campaign      int64
pdays         int64
previous      int64
poutcome     object
y            object
dtype: object

In [4]:
# Show the dtype pandas inferred for each column of the test frame, for
# comparison with the training frame's dtypes.
df_test.dtypes

age           int64
job          object
marital      object
education    object
default      object
balance       int64
housing      object
loan         object
contact      object
day           int64
month        object
duration      int64
campaign      int64
pdays         int64
previous      int64
poutcome     object
y            object
dtype: object

In [5]:
# Report how many rows the training frame holds.
print(f'Bank Marketing Dataset consist of {len(df_train)} rows')

Bank Marketing Dataset consist of 45211 rows


In [6]:
# Print the column index, non-null counts, dtypes and memory usage of the
# training frame.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        45211 non-null  int64 
 1   job        45211 non-null  object
 2   marital    45211 non-null  object
 3   education  45211 non-null  object
 4   default    45211 non-null  object
 5   balance    45211 non-null  int64 
 6   housing    45211 non-null  object
 7   loan       45211 non-null  object
 8   contact    45211 non-null  object
 9   day        45211 non-null  int64 
 10  month      45211 non-null  object
 11  duration   45211 non-null  int64 
 12  campaign   45211 non-null  int64 
 13  pdays      45211 non-null  int64 
 14  previous   45211 non-null  int64 
 15  poutcome   45211 non-null  object
 16  y          45211 non-null  object
dtypes: int64(7), object(10)
memory usage: 5.9+ MB


<b>CONVERT OBJECT DATA TYPES TO CATEGORY</b>

In [7]:
# String-valued columns of the training frame that are cast to pandas'
# 'category' dtype. Only df_train is converted here; df_test is left as loaded.
cols_to_category = [
    'job',
    'marital',
    'education',
    'default',
    'housing',
    'loan',
    'contact',
    'month',
    'poutcome',
    'y',
]
categorical_view = df_train[cols_to_category].astype('category')
df_train[cols_to_category] = categorical_view

In [8]:
# Binarise the target: 'no' becomes 0 and every other value becomes 1.
# NOTE: not safe to re-run -- once 'y' holds integers nothing equals 'no',
# so a second execution would set every label to 1.
df_train['y'] = np.where(df_train['y'] != 'no', 1, 0)
df_test['y'] = np.where(df_test['y'] != 'no', 1, 0)

<b>DEALING WITH MISSING VALUES</b> 

In [9]:
# For each training column, flag whether it contains at least one null entry
# (isna is the pandas alias of isnull).
missing_values = df_train.isna().any()
print(missing_values)

age          False
job          False
marital      False
education    False
default      False
balance      False
housing      False
loan         False
contact      False
day          False
month        False
duration     False
campaign     False
pdays        False
previous     False
poutcome     False
y            False
dtype: bool


<b>MODEL</b>

In [10]:
# Separate the predictors from the binary target 'y'.
train_x = df_train.drop(columns='y')
train_y = df_train['y']
train_x.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown


In [11]:
# Display the shape of the target vector (one label per training row).
train_y.shape

(45211,)

<b>ENCODING</b>

In [12]:
# Names of every predictor stored with the 'category' dtype; these are the
# candidates for one-hot encoding.
categorical_predictors = train_x.select_dtypes(include='category')
columns = categorical_predictors.columns.tolist()
columns

['job',
 'marital',
 'education',
 'default',
 'housing',
 'loan',
 'contact',
 'month',
 'poutcome']

In [13]:
# 'education' is recoded to integers with education_recoder in a later cell,
# so it is kept out of the one-hot list. Filtering (instead of list.remove)
# keeps this cell safe to re-run: a second execution no longer raises
# ValueError because the entry is already gone.
columns = [name for name in columns if name != 'education']
columns

['job',
 'marital',
 'default',
 'housing',
 'loan',
 'contact',
 'month',
 'poutcome']

In [14]:
# One-hot encode the nominal predictors listed in `columns`. drop_first=True
# drops the first level of each variable so the indicator columns for a
# variable are not all present at once.
train_x = pd.get_dummies(
    train_x,
    columns=columns,
    prefix=columns,
    drop_first=True,
)
train_x.head()

Unnamed: 0,age,education,balance,day,duration,campaign,pdays,previous,job_blue-collar,job_entrepreneur,...,month_jul,month_jun,month_mar,month_may,month_nov,month_oct,month_sep,poutcome_other,poutcome_success,poutcome_unknown
0,58,tertiary,2143,5,261,1,-1,0,False,False,...,False,False,False,True,False,False,False,False,False,True
1,44,secondary,29,5,151,1,-1,0,False,False,...,False,False,False,True,False,False,False,False,False,True
2,33,secondary,2,5,76,1,-1,0,False,True,...,False,False,False,True,False,False,False,False,False,True
3,47,unknown,1506,5,92,1,-1,0,True,False,...,False,False,False,True,False,False,False,False,False,True
4,33,unknown,1,5,198,1,-1,0,False,False,...,False,False,False,True,False,False,False,False,False,True


In [15]:
# Integer codes for education; 'unknown' is coded 0, below 'primary'.
education_recoder = {'unknown': 0, 'primary': 1, 'secondary': 2, 'tertiary': 3}
# map() + astype() is used instead of replace(): replace() would leave any label
# missing from the dict untouched as a string (and is deprecated for this use in
# recent pandas), whereas map() turns such labels into NaN and the integer cast
# then fails loudly right here instead of at model-fit time.
train_x['education'] = train_x['education'].map(education_recoder).astype('int64')
# Share of training rows at each education level.
train_x['education'].value_counts(normalize=True)

education
2    0.513194
3    0.294198
1    0.151534
0    0.041074
Name: proportion, dtype: float64

In [16]:
# Split the evaluation frame into predictors and target, mirroring the
# training split.
test_x = df_test.drop(columns='y')
test_y = df_test['y']

In [17]:
# One-hot encode the test predictors with the same settings as the training set.
test_x = pd.get_dummies(test_x, prefix=columns, columns=columns, drop_first=True)
# The test frame is encoded independently, so a category level that is absent
# from (or extra in) the test file would yield a different set or order of
# dummy columns than the classifier is fitted on. Align to the training layout:
# missing indicators are added as 0, unseen ones are dropped, order is matched.
test_x = test_x.reindex(columns=train_x.columns, fill_value=0)
test_x.head()

Unnamed: 0,age,education,balance,day,duration,campaign,pdays,previous,job_blue-collar,job_entrepreneur,...,month_jul,month_jun,month_mar,month_may,month_nov,month_oct,month_sep,poutcome_other,poutcome_success,poutcome_unknown
0,30,primary,1787,19,79,1,-1,0,False,False,...,False,False,False,False,False,True,False,False,False,True
1,33,secondary,4789,11,220,1,339,4,False,False,...,False,False,False,True,False,False,False,False,False,False
2,35,tertiary,1350,16,185,1,330,1,False,False,...,False,False,False,False,False,False,False,False,False,False
3,30,tertiary,1476,3,199,4,-1,0,False,False,...,False,True,False,False,False,False,False,False,False,True
4,59,secondary,0,5,226,1,-1,0,True,False,...,False,False,False,True,False,False,False,False,False,True


In [18]:
# Apply the same integer coding used for the training set. map() + astype() is
# used instead of replace() so that a label missing from education_recoder
# becomes NaN and the integer cast fails here, rather than a raw string
# silently reaching the classifier.
test_x['education'] = test_x['education'].map(education_recoder).astype('int64')
# Share of test rows at each education level.
test_x['education'].value_counts(normalize=True)

education
2    0.510064
3    0.298607
1    0.149967
0    0.041363
Name: proportion, dtype: float64

In [19]:
# DecisionTreeClassifier is already imported in the notebook's first cell, so
# the duplicate import that used to sit here was removed.
# Entropy is the split criterion; random_state=0 makes the fitted tree
# reproducible. No depth or leaf-size limit is set.
classifier = DecisionTreeClassifier(criterion='entropy', random_state=0)
classifier.fit(train_x, train_y)

In [20]:
# Predict a class label for every row of the encoded test predictors.
pred_y = classifier.predict(test_x)

In [21]:
from sklearn.metrics import confusion_matrix

# Rows are the true classes, columns the predicted classes.
# NOTE(review): a flawless matrix from an unconstrained tree is suspicious --
# check whether the rows of Bank_Test.csv also occur in Bank_Train.csv
# (train/test leakage) before trusting this score.
cm = confusion_matrix(y_true=test_y, y_pred=pred_y)
print(cm)

[[4000    0]
 [   0  521]]


In [22]:
from sklearn.metrics import classification_report

# Per-class precision, recall, F1 and support on the test set.
cr = classification_report(y_true=test_y, y_pred=pred_y)
print(cr)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      4000
           1       1.00      1.00      1.00       521

    accuracy                           1.00      4521
   macro avg       1.00      1.00      1.00      4521
weighted avg       1.00      1.00      1.00      4521

