In [1]:
from sklearn.datasets import load_iris
import pandas as pd

iris = load_iris()

label = pd.Series(iris.target)
features = pd.DataFrame(iris.data, columns=[iris.feature_names])

In [2]:
# label.head()
features.head()
# features.info()
# features.describe()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [3]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.tree import DecisionTreeClassifier

#Folds
skfold = StratifiedKFold(n_splits=5) # random_state=8)

#Model
model = DecisionTreeClassifier()
result = cross_val_score(estimator=model, X=features, y=label, cv=skfold)

#Accuracy
print(f'Accuracy in Cross Val: {result.mean():^15.3f}')

Accuracy in Cross Val:      0.967     


### Visualizing the Tree

In [4]:
import graphviz
from sklearn.tree import export_graphviz

In [12]:
# First: create the file that will store the tree
tree_file = r"D:\DAY2DAY\Python_MachineLearning_DidaticaTech\ML_python_dt\9.mod01_decision_tree\tree_file.dot" #.dot is the decision tree extension
model.fit(features, label)



In [13]:
# Second: generating the tree's graph
export_graphviz(model, out_file=tree_file, feature_names=iris.feature_names)

with open(tree_file) as file:
    graph_dot = file.read()

h = graphviz.Source(graph_dot)
h.view()

'Source.gv.pdf'

## Fine tunning the Decision Tree Model

In [9]:
#Testing with normalization
from sklearn.preprocessing import StandardScaler

#scaling the features
standardScaler = StandardScaler()
features_norm = standardScaler.fit_transform(features)
print(features_norm)

[[-9.00681170e-01  1.01900435e+00 -1.34022653e+00 -1.31544430e+00]
 [-1.14301691e+00 -1.31979479e-01 -1.34022653e+00 -1.31544430e+00]
 [-1.38535265e+00  3.28414053e-01 -1.39706395e+00 -1.31544430e+00]
 [-1.50652052e+00  9.82172869e-02 -1.28338910e+00 -1.31544430e+00]
 [-1.02184904e+00  1.24920112e+00 -1.34022653e+00 -1.31544430e+00]
 [-5.37177559e-01  1.93979142e+00 -1.16971425e+00 -1.05217993e+00]
 [-1.50652052e+00  7.88807586e-01 -1.34022653e+00 -1.18381211e+00]
 [-1.02184904e+00  7.88807586e-01 -1.28338910e+00 -1.31544430e+00]
 [-1.74885626e+00 -3.62176246e-01 -1.34022653e+00 -1.31544430e+00]
 [-1.14301691e+00  9.82172869e-02 -1.28338910e+00 -1.44707648e+00]
 [-5.37177559e-01  1.47939788e+00 -1.28338910e+00 -1.31544430e+00]
 [-1.26418478e+00  7.88807586e-01 -1.22655167e+00 -1.31544430e+00]
 [-1.26418478e+00 -1.31979479e-01 -1.34022653e+00 -1.44707648e+00]
 [-1.87002413e+00 -1.31979479e-01 -1.51073881e+00 -1.44707648e+00]
 [-5.25060772e-02  2.16998818e+00 -1.45390138e+00 -1.31544430e

In [None]:
import numpy as np
from sklearn.model_selection import GridSearchCV

#Defining testing values
min_split = np.array(list(np.arange(2,10)))
max_depth = np.array([3,4,5,6])
algorithm = ['gini', 'entropy']
grid_values = {'min_samples_split': min_split, 'max_depth': max_depth, 'criterion': algorithm}

#Setting the model
tuned_model = DecisionTreeClassifier()

#Creating the grid
gridDecisionTree = GridSearchCV(tuned_model, grid_values, cv=5)
gridDecisionTree.fit(features_norm, label)

#Results
print(f'Minimum split: {gridDecisionTree.best_estimator_.min_samples_split}')
print(f'Maximum depth: {gridDecisionTree.best_estimator_.max_depth}')
print(f'Chosen Algorithm: {gridDecisionTree.best_estimator_.criterion}')
print(f'Accuracy: {gridDecisionTree.best_score_:10.3f}')

Minimum split: 2
Maximum depth: 3
Chosen Algorithm: gini
Accuracy:      0.973


In [7]:
import numpy as np
from sklearn.model_selection import GridSearchCV

#Defining testing values
min_split = np.array(list(np.arange(2,10)))
max_depth = np.array([3,4,5,6])
algorithm = ['gini', 'entropy']
grid_values = {'min_samples_split': min_split, 'max_depth': max_depth, 'criterion': algorithm}

#Setting the model
tuned_model = DecisionTreeClassifier()

#Creating the grid
gridDecisionTree = GridSearchCV(tuned_model, grid_values, cv=5)
gridDecisionTree.fit(features, label)

#Results
print(f'Minimum split: {gridDecisionTree.best_estimator_.min_samples_split}')
print(f'Maximum depth: {gridDecisionTree.best_estimator_.max_depth}')
print(f'Chosen Algorithm: {gridDecisionTree.best_estimator_.criterion}')
print(f'Accuracy: {gridDecisionTree.best_score_:.3f}')

Minimum split: 4
Maximum depth: 3
Chosen Algorithm: gini
Accuracy: 0.973


In [None]:
# First: create the file that will store the tree
tree_file_tuned = r"E:\DAY2DAY\Python_MachineLearning_DidaticaTech\ML_python_dt\9.mod01_decision_tree\tree_file_tuned.dot" #.dot is the decision tree extension
best_model = DecisionTreeClassifier(criterion='gini', min_samples_split=2, max_depth=3)
best_model.fit(features, label)

# Second: generating the tree's graph
export_graphviz(best_model, out_file=tree_file_tuned, feature_names=iris.feature_names)

with open(tree_file_tuned) as file:
    graph_dot = file.read()

h = graphviz.Source(graph_dot)
h.view()

'Source.gv.pdf'