In [1]:
import pandas as pd
from sklearn.datasets import load_iris
# Load the bundled Iris dataset and wrap its measurements in a DataFrame
# whose column names are taken from the dataset's own feature names.
iris = load_iris()
table = pd.DataFrame(data=iris.data, columns=iris.feature_names)

# Preview the first rows to sanity-check the columns.
table.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [2]:
# Attach the class labels as a 'target' column. This mutates `table` in place,
# so the preview shown by the first cell no longer reflects its current columns.
table['target'] = iris.target
table.head()  # preview again, now including the 'target' column

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [3]:
# Labels come straight from the dataset; features are every column of
# `table` except the 'target' column that was added to it.
y = iris.target
x = table.drop(columns=['target'])

In [4]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; random_state pins the shuffle so the
# same split is produced on every run.
X_train, X_test, Y_train, Y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)

In [5]:
from sklearn.tree import DecisionTreeClassifier

In [6]:
# Four decision trees that differ in split criterion, splitter strategy and depth limit.
# random_state is fixed on every tree: scikit-learn's tree builder uses random numbers
# (for splitter='random', and for feature permutation even with splitter='best'),
# so without a seed the scores below change from one run to the next.
dt1 = DecisionTreeClassifier(criterion='gini', splitter='best', max_depth=2, random_state=0)
dt2 = DecisionTreeClassifier(criterion='entropy', splitter='best', max_depth=5, random_state=0)
dt3 = DecisionTreeClassifier(criterion='gini', splitter='random', max_depth=4, random_state=0)
dt4 = DecisionTreeClassifier(criterion='entropy', splitter='random', max_depth=6, random_state=0)

In [7]:
# Train every tree on the same training split.
for model in (dt1, dt2, dt3):
    model.fit(X_train, Y_train)

# The last fit stays as the cell's final expression, so the notebook displays
# only that estimator even though all four have been trained.
dt4.fit(X_train, Y_train)

In [8]:
# Report each tree's score on the held-out test split.
# In the recorded run only dt1 (gini, max_depth=2) falls below 1.0, while dt3
# (also gini) reaches 1.0, so these numbers alone do not favour one criterion.
for label, tree in (
    ('dt1 -> ', dt1),
    ('dt2 -> ', dt2),
    ('dt3 -> ', dt3),
    ('dt4 -> ', dt4),
):
    print(label, tree.score(X_test, Y_test))

dt1 ->  0.9666666666666667
dt2 ->  1.0
dt3 ->  1.0
dt4 ->  1.0


In [None]:
## Hyperparameter tuning (GridSearchCV, then RandomizedSearchCV)

In [9]:
# Base estimator for the grid search. The seed keeps the cross-validated
# scores reproducible, since tree building is otherwise randomised.
dt = DecisionTreeClassifier(random_state=0)

In [10]:
from sklearn.model_selection import GridSearchCV

In [13]:
# Search space: both split criteria, both splitter strategies and depth
# limits 1 through 6, i.e. 2 * 2 * 6 = 24 combinations.
options = {
    'criterion': ('gini', 'entropy'),
    'splitter': ('best', 'random'),
    'max_depth': [depth for depth in range(1, 7)],
}

In [14]:
# Exhaustive search: cross-validate `dt` with every combination in `options`
# using the training split only.
gs = GridSearchCV(estimator=dt, param_grid=options)
gs.fit(X_train, Y_train)

In [15]:
# List every parameter combination the grid search evaluated.
gs.cv_results_['params']

[{'criterion': 'gini', 'max_depth': 1, 'splitter': 'best'},
 {'criterion': 'gini', 'max_depth': 1, 'splitter': 'random'},
 {'criterion': 'gini', 'max_depth': 2, 'splitter': 'best'},
 {'criterion': 'gini', 'max_depth': 2, 'splitter': 'random'},
 {'criterion': 'gini', 'max_depth': 3, 'splitter': 'best'},
 {'criterion': 'gini', 'max_depth': 3, 'splitter': 'random'},
 {'criterion': 'gini', 'max_depth': 4, 'splitter': 'best'},
 {'criterion': 'gini', 'max_depth': 4, 'splitter': 'random'},
 {'criterion': 'gini', 'max_depth': 5, 'splitter': 'best'},
 {'criterion': 'gini', 'max_depth': 5, 'splitter': 'random'},
 {'criterion': 'gini', 'max_depth': 6, 'splitter': 'best'},
 {'criterion': 'gini', 'max_depth': 6, 'splitter': 'random'},
 {'criterion': 'entropy', 'max_depth': 1, 'splitter': 'best'},
 {'criterion': 'entropy', 'max_depth': 1, 'splitter': 'random'},
 {'criterion': 'entropy', 'max_depth': 2, 'splitter': 'best'},
 {'criterion': 'entropy', 'max_depth': 2, 'splitter': 'random'},
 {'criterion

In [16]:
gs.best_params_  # the combination with the highest mean cross-validated score


{'criterion': 'entropy', 'max_depth': 4, 'splitter': 'random'}

In [17]:
# Mean cross-validated score of each combination, one entry per item in
# gs.cv_results_['params'].
scores = gs.cv_results_['mean_test_score']

In [18]:
# Show the grid-search scores.
scores

array([0.69166667, 0.68333333, 0.95      , 0.9       , 0.93333333,
       0.95      , 0.925     , 0.91666667, 0.93333333, 0.95      ,
       0.925     , 0.9       , 0.69166667, 0.66666667, 0.93333333,
       0.85833333, 0.93333333, 0.90833333, 0.925     , 0.95833333,
       0.925     , 0.94166667, 0.93333333, 0.93333333])

In [19]:
import numpy as np
np.argmax(scores)  # position of the highest mean score (19 in the recorded run)

19

In [20]:
# Read the best score from the array itself rather than through an index
# copied from an earlier output, which goes stale as soon as the scores change.
scores.max()

0.9583333333333334

In [21]:
from sklearn.model_selection import RandomizedSearchCV


In [22]:
# Fresh base estimator for the randomized search. The seed keeps the
# cross-validated scores reproducible, since tree building is otherwise randomised.
dt = DecisionTreeClassifier(random_state=0)

In [27]:
# Same search space as the grid search: 2 criteria x 2 splitters x 6 depths.
options = dict(
    criterion=('gini', 'entropy'),
    splitter=('best', 'random'),
    max_depth=[*range(1, 7)],
)

In [28]:
# n_iter is the number of parameter combinations RandomizedSearchCV draws
# from `options`; only 5 of the 24 possible combinations are evaluated.
samples = 5
# random_state fixes which combinations are drawn, so re-running the notebook
# evaluates the same 5 candidates instead of a new random set each time.
rs = RandomizedSearchCV(dt, param_distributions=options, n_iter=samples, random_state=0)

In [29]:
# Run the randomized search on the training split only.
rs.fit(X_train,Y_train)

In [30]:
rs.cv_results_['params']  # the 5 randomly sampled combinations that were evaluated

[{'splitter': 'random', 'max_depth': 1, 'criterion': 'entropy'},
 {'splitter': 'best', 'max_depth': 4, 'criterion': 'entropy'},
 {'splitter': 'random', 'max_depth': 1, 'criterion': 'gini'},
 {'splitter': 'random', 'max_depth': 4, 'criterion': 'gini'},
 {'splitter': 'random', 'max_depth': 2, 'criterion': 'entropy'}]

A drawback of randomized search is that it evaluates only a random subset of the possible combinations, so it can miss the best one.

In [31]:
# Best combination among the sampled candidates only, not the whole search space.
rs.best_params_

{'splitter': 'random', 'max_depth': 4, 'criterion': 'gini'}

In [32]:
# NOTE: this rebinds `scores`, replacing the grid-search scores stored under
# the same name earlier; from here on it holds the randomized-search scores.
scores = rs.cv_results_['mean_test_score']
scores

array([0.68333333, 0.925     , 0.68333333, 0.95      , 0.85833333])

In [33]:
# Position of the highest mean score among the sampled combinations.
np.argmax(scores)

3

In [34]:
# Read the best score from the array itself rather than through an index
# copied from an earlier output; the sampled candidates can differ between runs.
scores.max()

0.95

In [35]:
# Entropy gave the best combination in the grid search, but gini won the randomized search above, so neither criterion is always best.
