# Decision Trees

In [344]:
import numpy as np 
import pandas as pd
from sklearn.tree import DecisionTreeClassifier

In [345]:
my_data = pd.read_csv("bank.csv", delimiter=",")
my_data[0:5]

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,deposit
0,59,admin.,married,secondary,no,2343,yes,no,unknown,5,may,1042,1,-1,0,unknown,yes
1,56,admin.,married,secondary,no,45,no,no,unknown,5,may,1467,1,-1,0,unknown,yes
2,41,technician,married,secondary,no,1270,yes,no,unknown,5,may,1389,1,-1,0,unknown,yes
3,55,services,married,secondary,no,2476,yes,no,unknown,5,may,579,1,-1,0,unknown,yes
4,54,admin.,married,tertiary,no,184,no,no,unknown,5,may,673,2,-1,0,unknown,yes


### Size, Type of Data

In [346]:
print(my_data.shape)

(11162, 17)


In [347]:
my_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11162 entries, 0 to 11161
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        11162 non-null  int64 
 1   job        11162 non-null  object
 2   marital    11162 non-null  object
 3   education  11162 non-null  object
 4   default    11162 non-null  object
 5   balance    11162 non-null  int64 
 6   housing    11162 non-null  object
 7   loan       11162 non-null  object
 8   contact    11162 non-null  object
 9   day        11162 non-null  int64 
 10  month      11162 non-null  object
 11  duration   11162 non-null  int64 
 12  campaign   11162 non-null  int64 
 13  pdays      11162 non-null  int64 
 14  previous   11162 non-null  int64 
 15  poutcome   11162 non-null  object
 16  deposit    11162 non-null  object
dtypes: int64(7), object(10)
memory usage: 1.4+ MB


In [348]:
to_drop = ['job', 'default', 
           'loan', 
           'contact', 
           'day',
           'month',
           'campaign',
           'pdays',
           'previous',
           'poutcome']

#Melakukan drop pada data yang dipilih
my_data.drop(to_drop, inplace=True, axis=1)

In [349]:
my_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11162 entries, 0 to 11161
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        11162 non-null  int64 
 1   marital    11162 non-null  object
 2   education  11162 non-null  object
 3   balance    11162 non-null  int64 
 4   housing    11162 non-null  object
 5   duration   11162 non-null  int64 
 6   deposit    11162 non-null  object
dtypes: int64(3), object(4)
memory usage: 610.5+ KB


### Pre-processing

In [350]:
X = my_data[['age', 'marital', 'education', 'balance', 'housing', 'duration']].values
X[0:5]

array([[59, 'married', 'secondary', 2343, 'yes', 1042],
       [56, 'married', 'secondary', 45, 'no', 1467],
       [41, 'married', 'secondary', 1270, 'yes', 1389],
       [55, 'married', 'secondary', 2476, 'yes', 579],
       [54, 'married', 'tertiary', 184, 'no', 673]], dtype=object)

In [351]:
from sklearn import preprocessing

le_marital = preprocessing.LabelEncoder()
le_marital.fit([ 'married', 'single', 'divorced'])
X[:,1] = le_marital.transform(X[:,1])

le_education = preprocessing.LabelEncoder()
le_education.fit([ 'secondary', 'tertiary', 'primary', 'unknown'])
X[:,2] = le_education.transform(X[:,2]) 

le_housing = preprocessing.LabelEncoder()
le_housing.fit([ 'yes', 'no'])
X[:,4] = le_housing.transform(X[:,4]) 

X[0:5]

array([[59, 1, 1, 2343, 1, 1042],
       [56, 1, 1, 45, 0, 1467],
       [41, 1, 1, 1270, 1, 1389],
       [55, 1, 1, 2476, 1, 579],
       [54, 1, 2, 184, 0, 673]], dtype=object)

In [352]:
y = my_data["deposit"]
y[0:5]

0    yes
1    yes
2    yes
3    yes
4    yes
Name: deposit, dtype: object

### Setting up the Decision Tree

In [353]:
from sklearn.model_selection import train_test_split

In [354]:
X_trainset, X_testset, y_trainset, y_testset = train_test_split(X, y, test_size=0.3, random_state=3)

### The Shape of Trainset and Testset

In [355]:
#The shape of Trainset

print ("Ukuran X_trainset = ", X_trainset.shape)
print ("Ukuran y_trainset = ", y_trainset.shape)

Ukuran X_trainset =  (7813, 6)
Ukuran y_trainset =  (7813,)


In [356]:
#The shape of Testset

print ("Ukuran X_testset = ", X_testset.shape)
print ("Ukuran y_testset = ", y_testset.shape)

Ukuran X_testset =  (3349, 6)
Ukuran y_testset =  (3349,)


### Modeling

In [357]:
depositTree = DecisionTreeClassifier(criterion="entropy", max_depth = 10)
depositTree # it shows the default parameters

DecisionTreeClassifier(criterion='entropy', max_depth=10)

In [358]:
depositTree.fit(X_trainset,y_trainset)

DecisionTreeClassifier(criterion='entropy', max_depth=10)

### Prediction

In [359]:
predTree = depositTree.predict(X_testset)

In [360]:
print (predTree [0:5])
print (y_testset [0:5])

['no' 'yes' 'no' 'no' 'no']
6522     no
454     yes
8723     no
1592    yes
4293    yes
Name: deposit, dtype: object


### Evaluation

In [361]:
from sklearn import metrics
import matplotlib.pyplot as plt
print("DecisionTrees's Accuracy: ", metrics.accuracy_score(y_testset, predTree))

DecisionTrees's Accuracy:  0.7626157061809495


### Visualiztion

In [362]:
import six
import sys
from six import StringIO
import pydotplus
import matplotlib.pyplot as plt
from sklearn import tree
import numpy as np

In [363]:
dot_data = StringIO()
filename = "deposittree.png"
featureNames = my_data.columns[0:5]
targetNames = my_data["deposit"].unique().tolist()
out=tree.export_graphviz(depositTree,feature_names=featureNames, out_file=dot_data, class_names= np.unique(y_trainset), filled=True,  special_characters=True,rotate=False)  
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())  
graph.write_png(filename)
img = mpimg.imread(filename)
plt.figure(figsize=(100, 200))
plt.imshow(img,interpolation='nearest')

ValueError: Length of feature_names, 5 does not match number of features, 6



