# Decision Tree

In [1]:
# A decision tree is a tree-shaped diagram used to determine a course of action.
# Each branch of the tree represents a possible decision, occurrence, or reaction.

In [2]:
# Decision trees can be used for both
# classification, i.e. predicting a discrete label (True/False, 0/1, Yes/No),
# and regression, i.e. predicting a numeric value such as the next value in a series or group of data.

In [3]:
# A classification tree learns a set of logical if-then conditions to classify observations.
# E.g. discriminating between 3 types of flowers based on certain features.

In [4]:
# A regression tree is used when the target variable is numerical or continuous in nature.
# We fit a regression model to the target variable using each of the independent variables.
# Each split is chosen based on the sum of squared errors.

In [5]:
# Terms in decision trees
# 1. Entropy - A measure of randomness or unpredictability in the dataset
# 2. Information Gain - The decrease in entropy after a dataset is split or classified
# 3. Leaf Node - Each leaf carries the classification or decision
# 4. Root Node - The topmost decision node is called the root node

In [6]:
# E.g. loan repayment prediction - predicting whether a customer will repay the loan or not

In [2]:
import os

import numpy as np
import pandas as pd

In [3]:
# Location of the loan-repayment CSV. The original absolute Windows path is
# kept as the fallback so existing setups keep working; set the
# DECISION_TREE_DATASET environment variable to load the file from elsewhere.
DATASET_PATH = os.environ.get(
    "DECISION_TREE_DATASET",
    "C:\\Users\\sameer\\Desktop\\Datasets\\Decision_Tree_ Dataset.csv",
)

# The file is comma-separated; preview the first rows to sanity-check the load.
dataset = pd.read_csv(DATASET_PATH, delimiter=',')
dataset.head()

Unnamed: 0,1,2,3,4,sum,Unnamed: 5
0,201,10018,250,3046,13515,yes
1,205,10016,395,3044,13660,yes
2,257,10129,109,3251,13746,yes
3,246,10064,324,3137,13771,yes
4,117,10115,496,3094,13822,yes


In [4]:
# Replace the placeholder CSV headers with descriptive names. The assignment
# is positional, so the order here must match the column order in the file.
renamed_columns = [
    'Initial Payment',
    'Last Payment',
    'Credit Score',
    'House Number',
    'sum',
    'Result',
]
dataset.columns = renamed_columns

In [5]:
# Confirm the new column names by previewing the first five rows.
dataset.head(5)

Unnamed: 0,Initial Payment,Last Payment,Credit Score,House Number,sum,Result
0,201,10018,250,3046,13515,yes
1,205,10016,395,3044,13660,yes
2,257,10129,109,3251,13746,yes
3,246,10064,324,3137,13771,yes
4,117,10115,496,3094,13822,yes


In [6]:
# Remove the redundant 'sum' column (in the preview rows it is just the total
# of the four feature columns). Rebinding instead of using inplace=True,
# together with errors='ignore', keeps this cell safe to re-run: a second
# execution is a no-op rather than a KeyError.
dataset = dataset.drop(columns='sum', errors='ignore')

In [7]:
# Preview the first five rows to verify that 'sum' is gone.
dataset.head(5)

Unnamed: 0,Initial Payment,Last Payment,Credit Score,House Number,Result
0,201,10018,250,3046,yes
1,205,10016,395,3044,yes
2,257,10129,109,3251,yes
3,246,10064,324,3137,yes
4,117,10115,496,3094,yes


In [8]:
# Look at the last five rows as well; the end of the file holds the other class.
dataset.tail(5)

Unnamed: 0,Initial Payment,Last Payment,Credit Score,House Number,Result
995,413,14914,523,4683,No
996,359,14423,927,4838,No
997,316,14872,613,4760,No
998,305,14926,897,4572,No
999,168,14798,834,4937,No


In [9]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier

In [11]:
# Separating the target variable.
# Columns are selected by name rather than by position, so the split stays
# correct even if the column order changes or an extra column is present.
feature_columns = ['Initial Payment', 'Last Payment', 'Credit Score', 'House Number']

# Independent variables: one row per record, one column per feature. Selecting
# only the numeric columns avoids the object-dtype array that `.values` on the
# whole mixed-type frame produces.
X = dataset[feature_columns].values

# Target variable as a 1-D label vector. The previous [:, 4:5] slice produced an
# (n_samples, 1) column, whereas accuracy_score documents y_true as 1-D labels.
Y = dataset['Result'].values

In [12]:
# Hold out 30% of the rows for evaluation; the fixed random_state makes the
# partition reproducible across runs.
split_options = {'test_size': 0.3, 'random_state': 100}
X_train, X_test, y_train, y_test = train_test_split(X, Y, **split_options)

In [13]:
# Train a decision tree that chooses splits by entropy (information gain).
# Depth is capped at 3 and every leaf must hold at least 5 training samples;
# random_state is fixed so repeated runs build the same tree.
clf_entropy = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=3,
    min_samples_leaf=5,
    random_state=100,
)
clf_entropy_fit = clf_entropy.fit(X_train, y_train)

# Show the fitted estimator and its hyper-parameters.
clf_entropy_fit

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=3,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=5, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=100, splitter='best')

In [14]:
# Predict the class label for every row of the held-out test features
clf_entropy_predict = clf_entropy.predict(X_test)

In [15]:
# Display the predicted labels for the test set
clf_entropy_predict

array(['yes', 'yes', 'No', 'yes', 'No', 'yes', 'yes', 'yes', 'No', 'No',
       'No', 'No', 'yes', 'No', 'No', 'yes', 'yes', 'No', 'yes', 'No',
       'No', 'yes', 'No', 'yes', 'yes', 'No', 'No', 'yes', 'No', 'No',
       'No', 'yes', 'yes', 'yes', 'yes', 'No', 'No', 'No', 'yes', 'No',
       'yes', 'yes', 'yes', 'No', 'No', 'yes', 'yes', 'yes', 'No', 'No',
       'yes', 'No', 'yes', 'yes', 'yes', 'yes', 'No', 'yes', 'No', 'yes',
       'yes', 'No', 'yes', 'yes', 'No', 'yes', 'yes', 'yes', 'No', 'No',
       'No', 'No', 'No', 'yes', 'No', 'yes', 'yes', 'No', 'yes', 'No',
       'No', 'No', 'No', 'yes', 'No', 'yes', 'No', 'yes', 'yes', 'No',
       'yes', 'yes', 'yes', 'yes', 'yes', 'No', 'yes', 'yes', 'yes',
       'yes', 'No', 'No', 'yes', 'yes', 'No', 'yes', 'yes', 'yes', 'No',
       'yes', 'yes', 'yes', 'yes', 'No', 'No', 'yes', 'yes', 'yes', 'No',
       'No', 'No', 'No', 'yes', 'yes', 'No', 'yes', 'yes', 'yes', 'No',
       'No', 'yes', 'yes', 'No', 'yes', 'yes', 'yes', 'No', 'ye

In [16]:
# Share of held-out rows whose predicted label matches the true label,
# expressed as a percentage.
accuracy = 100 * accuracy_score(y_test, clf_entropy_predict)
accuracy

93.66666666666667