**Importing the Libraries**

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

**Importing the dataset**

In [2]:
dataset=pd.read_csv('bank_data.csv')

**Decoding the Dataset**

In [3]:
dataset.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,30,unemployed,married,primary,no,1787,no,no,cellular,19,oct,79,1,-1,0,unknown,no
1,33,services,married,secondary,no,4789,yes,yes,cellular,11,may,220,1,339,4,failure,no
2,35,management,single,tertiary,no,1350,yes,no,cellular,16,apr,185,1,330,1,failure,no
3,30,management,married,tertiary,no,1476,yes,yes,unknown,3,jun,199,4,-1,0,unknown,no
4,59,blue-collar,married,secondary,no,0,yes,no,unknown,5,may,226,1,-1,0,unknown,no


In [4]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4521 entries, 0 to 4520
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        4521 non-null   int64 
 1   job        4521 non-null   object
 2   marital    4521 non-null   object
 3   education  4521 non-null   object
 4   default    4521 non-null   object
 5   balance    4521 non-null   int64 
 6   housing    4521 non-null   object
 7   loan       4521 non-null   object
 8   contact    4521 non-null   object
 9   day        4521 non-null   int64 
 10  month      4521 non-null   object
 11  duration   4521 non-null   int64 
 12  campaign   4521 non-null   int64 
 13  pdays      4521 non-null   int64 
 14  previous   4521 non-null   int64 
 15  poutcome   4521 non-null   object
 16  y          4521 non-null   object
dtypes: int64(7), object(10)
memory usage: 600.6+ KB


In [5]:
dataset.describe()

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous
count,4521.0,4521.0,4521.0,4521.0,4521.0,4521.0,4521.0
mean,41.170095,1422.657819,15.915284,263.961292,2.79363,39.766645,0.542579
std,10.576211,3009.638142,8.247667,259.856633,3.109807,100.121124,1.693562
min,19.0,-3313.0,1.0,4.0,1.0,-1.0,0.0
25%,33.0,69.0,9.0,104.0,1.0,-1.0,0.0
50%,39.0,444.0,16.0,185.0,2.0,-1.0,0.0
75%,49.0,1480.0,21.0,329.0,3.0,-1.0,0.0
max,87.0,71188.0,31.0,3025.0,50.0,871.0,25.0


In [6]:
dataset.columns

Index(['age', 'job', 'marital', 'education', 'default', 'balance', 'housing',
       'loan', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome', 'y'],
      dtype='object')

**Checking for missing values**

In [7]:
dataset.isnull().sum()

age          0
job          0
marital      0
education    0
default      0
balance      0
housing      0
loan         0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64

Insights : There are no missing values in this dataset

**Encoding the Categorical Variables**

Encoding the Month Column by mapping to respective numbers

In [8]:
month_mapping = {'jan': '01', 'feb': '02', 'mar': '03', 'apr': '04', 'may': '05', 'jun': '06',
                 'jul': '07', 'aug': '08', 'sep': '09', 'oct': '10', 'nov': '11', 'dec': '12'}
dataset['month'] = dataset['month'].map(month_mapping)
#dataset['month'] = dataset['month'].astype(int)

Label Encoding

In [9]:
from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()
categorical_variables=['education','default','housing','loan','y']
dataset[categorical_variables] = dataset[categorical_variables].apply(lambda col: le.fit_transform(col))

In [10]:
print(dataset.head())

   age          job  marital  education  default  balance  housing  loan  \
0   30   unemployed  married          0        0     1787        0     0   
1   33     services  married          1        0     4789        1     1   
2   35   management   single          2        0     1350        1     0   
3   30   management  married          2        0     1476        1     1   
4   59  blue-collar  married          1        0        0        1     0   

    contact  day month  duration  campaign  pdays  previous poutcome  y  
0  cellular   19    10        79         1     -1         0  unknown  0  
1  cellular   11    05       220         1    339         4  failure  0  
2  cellular   16    04       185         1    330         1  failure  0  
3   unknown    3    06       199         4     -1         0  unknown  0  
4   unknown    5    05       226         1     -1         0  unknown  0  


One Hot Encoding

In [11]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
Encoding_columns=['job','marital','contact','poutcome']
ct=ColumnTransformer(transformers=[('OneHotEncoding',OneHotEncoder(),Encoding_columns)],remainder='passthrough')
dataset=np.array(ct.fit_transform(dataset))

Converting the string datatype to int

In [12]:
dataset[:,29]=dataset[:,29].astype(int)

**Seperating into Dependent and Independent Variabels**

In [39]:
X=dataset[:,:-1]
Y=dataset[:,-1]

In [40]:
X=X.astype(int)
Y=Y.astype(int)

**Splitting into Training and Test Set**



In [41]:
from sklearn.model_selection import train_test_split
X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.3,random_state=0)

**Training the Decision Tree Classification Model**

In [42]:
from sklearn.tree import DecisionTreeClassifier
Classifier=DecisionTreeClassifier()
Classifier.fit(X_train,Y_train)

**Prediciting the Results on Test Set**

In [43]:
Y_pred=Classifier.predict(X_test)

**Making the Confusion Matrix**

In [44]:
from sklearn.metrics import confusion_matrix,accuracy_score
cm=confusion_matrix(Y_test,Y_pred)
print(cm)
accuracy_score(Y_test,Y_pred)

[[1096   94]
 [  94   73]]


0.8614591009579956