# ML Week 5 - Decision Trees
##### Today we will build our second Machine Learning application from scratch (but we have already built one, so we are pros now)
##### We will be building a supervised algorithm called a "Decision Tree". It can learn to efficiently classify objects using pre-determined labels.

### Dataset
We will again re-use our simple tools dataset from week 3.

Next week we will apply a decision tree to a real architecture dataset.

### How it works:

<div>
<img src="decisionTree.png" width="1000"/>
</div>

## Importing Common Packages
##### You know the routine...

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## Read in our dataset

In [2]:
# Read the tool data from the local tools.csv file into a DataFrame
# (nothing is downloaded here - the file must already sit next to this notebook).
df = pd.read_csv("./tools.csv")

In [3]:
# Display the DataFrame to check the columns and a sample of the rows.
df

Unnamed: 0,Length,Width,Thickness,SharpEdgeWidth,Culture
0,5.1,3.5,1.4,0.2,Bear Culture
1,4.9,3.0,1.4,0.2,Bear Culture
2,4.7,3.2,1.3,0.2,Bear Culture
3,4.6,3.1,1.5,0.2,Bear Culture
4,5.0,3.6,1.4,0.2,Bear Culture
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,Beaver Culture
146,6.3,2.5,5.0,1.9,Beaver Culture
147,6.5,3.0,5.2,2.0,Beaver Culture
148,6.2,3.4,5.4,2.3,Beaver Culture


In [4]:
from sklearn.preprocessing import LabelEncoder

# Feature matrix: the four measurement columns, selected by name.
X = df.loc[:, ["Length", "Width", "Thickness", "SharpEdgeWidth"]].to_numpy()

# Positional alternative. The slice starts at 1 to skip the leading
# "Unnamed: 0" column and stops before the last column, which is the label:
# X = df.iloc[:, 1:-1].to_numpy()
# Y = df.iloc[:, -1].to_numpy()

# Turn the culture names into integer class ids.
le = LabelEncoder()
Y = le.fit_transform(df["Culture"].to_numpy())

Y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [None]:
# Depending on the ML application, it is sometimes better to use a numpy array than a DataFrame.
# Here we only have numeric data, and we don't really care about the meanings of the columns beyond this point,
# so this is a good time to use a numpy array - it's cleaner and we can manipulate it more easily.
# X was already converted with .to_numpy() when it was created, and numpy arrays have no
# .to_numpy() method, so np.asarray is used instead: it converts a DataFrame and
# leaves an existing array unchanged.
X = np.asarray(X)

In [6]:
# Inspect the feature matrix: one row per tool, one column per measurement.
X

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2],
       [5.4, 3.9, 1.7, 0.4],
       [4.6, 3.4, 1.4, 0.3],
       [5. , 3.4, 1.5, 0.2],
       [4.4, 2.9, 1.4, 0.2],
       [4.9, 3.1, 1.5, 0.1],
       [5.4, 3.7, 1.5, 0.2],
       [4.8, 3.4, 1.6, 0.2],
       [4.8, 3. , 1.4, 0.1],
       [4.3, 3. , 1.1, 0.1],
       [5.8, 4. , 1.2, 0.2],
       [5.7, 4.4, 1.5, 0.4],
       [5.4, 3.9, 1.3, 0.4],
       [5.1, 3.5, 1.4, 0.3],
       [5.7, 3.8, 1.7, 0.3],
       [5.1, 3.8, 1.5, 0.3],
       [5.4, 3.4, 1.7, 0.2],
       [5.1, 3.7, 1.5, 0.4],
       [4.6, 3.6, 1. , 0.2],
       [5.1, 3.3, 1.7, 0.5],
       [4.8, 3.4, 1.9, 0.2],
       [5. , 3. , 1.6, 0.2],
       [5. , 3.4, 1.6, 0.4],
       [5.2, 3.5, 1.5, 0.2],
       [5.2, 3.4, 1.4, 0.2],
       [4.7, 3.2, 1.6, 0.2],
       [4.8, 3.1, 1.6, 0.2],
       [5.4, 3.4, 1.5, 0.4],
       [5.2, 4.1, 1.5, 0.1],
       [5.5, 4.2, 1.4, 0.2],
       [4.9, 3

In [None]:
def gini(Y:np.array) -> float:
    """Return the Gini impurity of the label array ``Y``.

    Each distinct label contributes ``p * (1 - p)``, where ``p`` is the
    fraction of entries in ``Y`` carrying that label. The result is 0 when
    ``Y`` holds a single class (or is empty) and grows as the classes become
    more evenly mixed.
    """
    total = len(Y)
    impurity = 0
    for label in set(Y):
        # Fraction of the samples that belong to this class.
        share = np.count_nonzero(Y == label) / total
        # Chance of drawing this class and then labelling it as something else.
        impurity += share * (1 - share)
    return impurity

In [16]:
# Impurity of the full, unsplit label array.
gini(Y)

0.6666666666666667

In [77]:
# The return annotation is quoted so that the ``X | None`` syntax is never
# evaluated on Python versions older than 3.10.
def find_best_split(X:np.array, Y:np.array) -> "tuple[int | None, float | None]":
    """Find the single-column threshold that minimises weighted Gini impurity.

    Every column of ``X`` is scanned, and every distinct value in that column
    except the smallest is tried as a threshold ``t``: rows with
    ``value < t`` go left, rows with ``value >= t`` go right. The smallest
    value is skipped because no row is strictly below it, so the left side
    would be empty.

    Args:
        X: 2-D feature array, one row per sample.
        Y: 1-D label array aligned with the rows of ``X``.

    Returns:
        ``(column_index, threshold)`` of the best split, or ``(None, None)``
        when no candidate split has a strictly lower weighted impurity than
        ``gini(Y)`` (for example when every column holds a single value).
        Ties are resolved in favour of the first candidate encountered.
    """
    best_col = None
    best_split = None
    best_gini = gini(Y)  # a split has to strictly beat the unsplit impurity
    n_samples = len(Y)
    for col in range(X.shape[1]):
        column = X[:, col]  # same for every threshold tried below
        for threshold in sorted(set(column))[1:]:
            left_split = Y[column < threshold]
            right_split = Y[column >= threshold]
            # Impurity of each side, weighted by the share of samples it receives.
            new_gini = (
                (len(left_split) / n_samples) * gini(left_split)
                + (len(right_split) / n_samples) * gini(right_split)
            )
            if new_gini < best_gini:
                best_gini = new_gini
                best_col = col
                best_split = threshold
    return best_col, best_split

            

In [78]:
# Best first split on the full dataset; the result is (column index, threshold).
find_best_split(X,Y)

(2, np.float64(3.0))

In [79]:
# Show the encoded labels again for reference.
Y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [86]:
def decision_tree(X:np.array, Y:np.array, level=0) -> None:
    """Recursively split ``(X, Y)`` and print a trace of the resulting tree.

    Nothing is stored or returned. Each internal node prints
    ``level column threshold`` and then "going left" / "going right" before
    recursing into the matching subset; every leaf prints "leaf node reached".

    Args:
        X: 2-D feature array, one row per sample.
        Y: 1-D label array aligned with the rows of ``X``.
        level: Depth of the current node; the root is 0.
    """
    # Pure node: at most one class is left, so there is nothing to split.
    if len(set(Y)) <= 1:
        print("leaf node reached")
        return None

    best_col, best_split = find_best_split(X,Y)
    if best_col is None:
        # find_best_split returns (None, None) when no split lowers the
        # impurity (e.g. identical rows carrying different labels). There is
        # then no column or threshold to partition on, so stop here and
        # treat this mixed node as a leaf instead of failing on the
        # comparison below.
        print("leaf node reached")
        return None

    print(level, best_col, best_split)
    # Same rule find_best_split used: strictly below goes left, the rest right.
    goes_left = X[:,best_col] < best_split
    goes_right = X[:,best_col] >= best_split
    print("going left")
    decision_tree(X[goes_left], Y[goes_left], level=level+1)
    print("going right")
    decision_tree(X[goes_right], Y[goes_right], level=level+1)
    return None

In [87]:
# Grow the tree on the full dataset and print its trace.
# Each split line reads: depth, column index, threshold.
decision_tree(X,Y)

0 2 3.0
going left
leaf node reached
going right
1 3 1.8
going left
2 2 5.0
going left
3 3 1.7
going left
leaf node reached
going right
leaf node reached
going right
3 3 1.6
going left
leaf node reached
going right
4 0 7.2
going left
leaf node reached
going right
leaf node reached
going right
2 2 4.9
going left
3 0 6.0
going left
leaf node reached
going right
leaf node reached
going right
leaf node reached
