In [41]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder,LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report , confusion_matrix
import warnings
warnings.filterwarnings('ignore')

In [42]:
df = pd.read_csv("bank-full.csv", sep=';')


In [43]:
df.shape

(45211, 17)

In [44]:
df.columns

Index(['age', 'job', 'marital', 'education', 'default', 'balance', 'housing',
       'loan', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome', 'y'],
      dtype='object')

In [45]:
df.head(5)

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [46]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        45211 non-null  int64 
 1   job        45211 non-null  object
 2   marital    45211 non-null  object
 3   education  45211 non-null  object
 4   default    45211 non-null  object
 5   balance    45211 non-null  int64 
 6   housing    45211 non-null  object
 7   loan       45211 non-null  object
 8   contact    45211 non-null  object
 9   day        45211 non-null  int64 
 10  month      45211 non-null  object
 11  duration   45211 non-null  int64 
 12  campaign   45211 non-null  int64 
 13  pdays      45211 non-null  int64 
 14  previous   45211 non-null  int64 
 15  poutcome   45211 non-null  object
 16  y          45211 non-null  object
dtypes: int64(7), object(10)
memory usage: 5.9+ MB


In [47]:
df.describe()

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous
count,45211.0,45211.0,45211.0,45211.0,45211.0,45211.0,45211.0
mean,40.93621,1362.272058,15.806419,258.16308,2.763841,40.197828,0.580323
std,10.618762,3044.765829,8.322476,257.527812,3.098021,100.128746,2.303441
min,18.0,-8019.0,1.0,0.0,1.0,-1.0,0.0
25%,33.0,72.0,8.0,103.0,1.0,-1.0,0.0
50%,39.0,448.0,16.0,180.0,2.0,-1.0,0.0
75%,48.0,1428.0,21.0,319.0,3.0,-1.0,0.0
max,95.0,102127.0,31.0,4918.0,63.0,871.0,275.0


In [48]:
df.isnull().sum()

Unnamed: 0,0
age,0
job,0
marital,0
education,0
default,0
balance,0
housing,0
loan,0
contact,0
day,0


In [49]:
df.duplicated().sum()

np.int64(0)

In [51]:
df['y'] = df['y'].map({'yes' : 1 , 'no' : 0})

In [52]:
x = df.drop('y',axis=1)
y = df['y']

In [53]:
x_train , x_test , y_train , y_test = train_test_split(x,y , test_size=0.25 , random_state=1)

In [55]:
x_train.shape, x_test.shape

((33908, 16), (11303, 16))

In [60]:
num_col = x.select_dtypes(include=('float','int')).columns.tolist()

In [62]:
cat_col = x.select_dtypes(include=('object')).columns.tolist()

In [64]:
preprocessor = ColumnTransformer(
                 transformers= [('cat',OneHotEncoder(),cat_col)],
                 remainder = 'passthrough'
                 )

In [65]:
preprocessor

In [66]:
pipeline = Pipeline([
              ('preprocessor',preprocessor),
              ('classifier' , DecisionTreeClassifier(random_state=1))
            ])

In [68]:
pipeline

In [83]:
from sklearn.model_selection import GridSearchCV


param_grid = {
    'classifier__criterion': ['gini', 'entropy'],
    'classifier__max_depth': list(range(2, 20)),
    'classifier__min_samples_split': list(range(2, 10))
}

In [84]:
model = GridSearchCV(pipeline, param_grid=param_grid, cv=5 , verbose = 2 , scoring = 'accuracy' , n_jobs=-1)

In [85]:
model.fit(x_train,y_train)

Fitting 5 folds for each of 288 candidates, totalling 1440 fits


In [101]:
print("Best Parameters : " , model.best_params_)
print("Best Score : " , model.best_score_)

Best Parameters :  {'classifier__criterion': 'gini', 'classifier__max_depth': 6, 'classifier__min_samples_split': 7}
Best Score :  0.9002594852453237


In [105]:
y_pred = model.predict(x_test)

print( "Accuracy_score : " , accuracy_score(y_test , y_pred))
print(classification_report(y_test , y_pred))

Accuracy_score :  0.90020348580023
              precision    recall  f1-score   support

           0       0.92      0.97      0.94      9998
           1       0.61      0.38      0.47      1305

    accuracy                           0.90     11303
   macro avg       0.77      0.67      0.71     11303
weighted avg       0.89      0.90      0.89     11303

