##### Implementation of decision tree algorithm using sklearn
##### Dr Afraz Syed, Mohawk College 2022

In [39]:
import numpy as np    # for numerical operations
import pandas as pd   # for data analysis and manipulation

# bundled example datasets (load_iris, load_breast_cancer, ...)
from sklearn import datasets
from sklearn.datasets import load_iris

# for scoring predictions against the true labels
from sklearn.metrics import accuracy_score

# for splitting data arrays into two subsets: for training data and for testing data
from sklearn.model_selection import train_test_split

# for creating the classification model by building a decision tree
from sklearn.tree import DecisionTreeClassifier

In [4]:
# load the iris measurements from the CSV file into a DataFrame
df = pd.read_csv(filepath_or_buffer='iris.csv')

In [5]:
df.head()      # preview the first rows: 5 by default, or pass n (e.g. df.head(10)) for the first n rows

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,variety
0,5.1,3.5,1.4,0.2,Setosa
1,4.9,3.0,1.4,0.2,Setosa
2,4.7,3.2,1.3,0.2,Setosa
3,4.6,3.1,1.5,0.2,Setosa
4,5.0,3.6,1.4,0.2,Setosa


In [6]:
df['variety'].unique()    # distinct class labels found in the 'variety' column

array(['Setosa', 'Versicolor', 'Virginica'], dtype=object)

### Separate labels from Features

In [8]:
# features: every column except the 'variety' label column
X = df.drop(columns=['variety'])

# target: the 'variety' labels that the classifier will learn to predict
y = df['variety']

In [9]:
X.head()   # check that only the feature columns remain (the 'variety' column was dropped)

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [28]:
y    # the label column that will be used for classification

0         Setosa
1         Setosa
2         Setosa
3         Setosa
4         Setosa
         ...    
145    Virginica
146    Virginica
147    Virginica
148    Virginica
149    Virginica
Name: variety, Length: 150, dtype: object

### Split the data set into testing and training sets

In [30]:
# https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html

TEST_SIZE = 0.3     # proportion of the dataset held out for testing (0.3 out of 1.0)
RANDOM_STATE = 3    # fixed seed so the same rows land in each subset on every run

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE
)

# every row must end up in exactly one of the two subsets
assert len(X_train) + len(X_test) == len(X)

# Because random_state is an integer, re-running this cell reproduces the same split.
# Use random_state=None to get a different split (of the same size) on each run.

- `random_state` sets the seed of the random number generator that train_test_split uses to shuffle the data
- Fixing the seed ensures that the results we get can be reproduced
- With a fixed seed, the train-test split is deterministic
- If you don't set a seed, the split is different each time
- if random_state = None : calling the function multiple times will produce different results.
- if random_state = Integer : the same split is produced across different calls

In [12]:
X_train   # feature rows selected for training

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width
86,6.7,3.1,4.7,1.5
143,6.8,3.2,5.9,2.3
30,4.8,3.1,1.6,0.2
107,7.3,2.9,6.3,1.8
139,6.9,3.1,5.4,2.1
...,...,...,...,...
63,6.1,2.9,4.7,1.4
60,5.0,2.0,3.5,1.0
38,4.4,3.0,1.3,0.2
56,6.3,3.3,4.7,1.6


In [13]:
X_test   # feature rows held out for testing

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width
5,5.4,3.9,1.7,0.4
124,6.7,3.3,5.7,2.1
140,6.7,3.1,5.6,2.4
79,5.7,2.6,3.5,1.0
13,4.3,3.0,1.1,0.1
88,5.6,3.0,4.1,1.3
111,6.4,2.7,5.3,1.9
136,6.3,3.4,5.6,2.4
2,4.7,3.2,1.3,0.2
80,5.5,2.4,3.8,1.1


In [14]:
y_train   # labels for the training rows (passed to fit together with X_train)

86     Versicolor
143     Virginica
30         Setosa
107     Virginica
139     Virginica
          ...    
63     Versicolor
60     Versicolor
38         Setosa
56     Versicolor
100     Virginica
Name: variety, Length: 105, dtype: object

In [15]:
y_test   # true labels for the test rows; the predictions are scored against these

5          Setosa
124     Virginica
140     Virginica
79     Versicolor
13         Setosa
88     Versicolor
111     Virginica
136     Virginica
2          Setosa
80     Versicolor
61     Versicolor
131     Virginica
24         Setosa
133     Virginica
52     Versicolor
17         Setosa
148     Virginica
59     Versicolor
104     Virginica
32         Setosa
103     Virginica
149     Virginica
35         Setosa
132     Virginica
120     Virginica
68     Versicolor
49         Setosa
81     Versicolor
37         Setosa
99     Versicolor
26         Setosa
113     Virginica
71     Versicolor
28         Setosa
77     Versicolor
142     Virginica
145     Virginica
70     Versicolor
114     Virginica
62     Versicolor
39         Setosa
46         Setosa
8          Setosa
115     Virginica
15         Setosa
Name: variety, dtype: object

In [16]:
X_test.count()
# non-null values per column: 0.3 of the dataset (test_size=0.3) is held out for testing;
# when test_size is not given, train_test_split falls back to its default of 0.25

sepal.length    45
sepal.width     45
petal.length    45
petal.width     45
dtype: int64

In [17]:
X_train.count()    # the remaining (1 - 0.3) = 0.7 of the rows are used for training

sepal.length    105
sepal.width     105
petal.length    105
petal.width     105
dtype: int64

In [18]:
y_test.count()   # number of non-null test labels

45

In [19]:
y_train.count()   # number of non-null training labels

105

In [33]:
# len(X_test) also works: it returns the number of rows as a single integer

### Create a decision tree classifier

In [32]:
# Fix the seed: scikit-learn permutes the candidate features at every split, even with
# splitter='best', so an unseeded tree can differ between runs when two splits tie.
dt_clf = DecisionTreeClassifier(random_state=3)

- https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html

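- Gini impurity and entropy are the criteria used to choose the best split at each node of the tree

- Gini impurity is the probability of misclassifying an observation that is labelled at random according to the class distribution of the node
- For $K$ classes: $G = 1 - \sum_{k=1}^{K} p_k^2$; for two classes (yes/no): $G = 1 - p_{\text{yes}}^2 - p_{\text{no}}^2$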

- Entropy is the criterion used for calculating information gain: $H = -\sum_{k} p_k \log_2 p_k$
- Information gain is the reduction in entropy (or surprise) obtained by splitting a dataset on a feature

- a high max_depth value can lead to overfitting
- a low max_depth value can lead to underfitting

In [34]:
dt_clf   # display the classifier object created in the previous cell

DecisionTreeClassifier()

### Use fit method to train 

In [40]:
# train the decision tree on the training features and their labels
dt_clf.fit(X=X_train, y=y_train)

DecisionTreeClassifier()

### Use predict method to test

In [25]:
# predicted class label for every row of the test features
y_pred = dt_clf.predict(X=X_test)

In [37]:
# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.accuracy_score.html

# fraction of test rows whose predicted label equals the true label
accuracy_score(y_true=y_test, y_pred=y_pred)

0.9777777777777777

### Other Methods

In [None]:
dt_clf.get_depth()   # depth of the fitted tree (longest path from the root to a leaf)

In [None]:
dt_clf.get_n_leaves()   # number of leaf nodes in the fitted tree

In [38]:
# first five predictions, followed by the first five true labels, for a quick eyeball check
print(y_pred[:5], y_test[:5], sep='\n')

['Setosa' 'Setosa' 'Setosa' 'Setosa' 'Setosa']
47    Setosa
3     Setosa
31    Setosa
25    Setosa
15    Setosa
Name: variety, dtype: object


### Hyperparameter tuning

- Hyperparameter tuning means searching for the hyperparameter values that avoid both underfitting and overfitting.
- for example: dt_clf = DecisionTreeClassifier(criterion='entropy', splitter='best', max_depth=2)
- GridSearchCV is a class in sklearn's model_selection module.
- It loops through a predefined grid of hyperparameter values and fits your estimator (model) on your training set for each combination, scoring each one with cross-validation.
- So, in the end, you can select the best parameters from the listed hyperparameters.

## Working on dataset with sklearn

In [None]:
### Dataset in sklearn: https://scikit-learn.org/stable/datasets.html
dataset = load_iris()

# List the available fields rather than dumping the whole object
# (the full repr includes the data array, the targets and the long description text).
dataset.keys()

In [None]:
# inspect the object returned by load_iris (a dict-like sklearn Bunch),
# not the pandas DataFrame `df` read from the CSV file earlier
type(dataset)

In [None]:
# preview the first rows of the feature array instead of printing all of it
dataset.data[:5]

In [None]:
dataset.data.shape   # dimensions of the feature array (rows = samples, columns = features)

In [None]:
dataset.feature_names   # names of the feature columns in dataset.data

In [None]:
dataset.target_names   # names of the target classes

In [None]:
# breast cancer dataset, loaded through the sklearn.datasets module
dataset1 = datasets.load_breast_cancer()

# List the available fields rather than dumping the whole object
# (the full repr includes the data array, the targets and the long description text).
dataset1.keys()

In [None]:
dataset1.feature_names   # names of the feature columns in dataset1.data

In [None]:
dataset1.target_names   # names of the target classes

In [None]:
dataset1.data.shape   # dimensions of the feature array (rows = samples, columns = features)

## Loading data with NumPy

In [None]:
# read the comma-separated values as integers, skipping the first line of the file (skiprows=1)
raw_data = np.loadtxt(fname="example_data.csv", delimiter=",", skiprows=1, dtype=int)

In [None]:
raw_data   # display the array loaded by np.loadtxt