# This is the testing notebook for Ben's decision tree adventure!

In [1]:
! pip install -r requirements.txt



## Import functions

In [2]:
import pandas as pd

from classification_trees.utility.split_data import split_data_on_float
from classification_trees.classification_tree import Classification_tree
from classification_trees.node import Node

## Load in test dataset

Below is a sample of this small 200-row dataset. It describes people's medical history and which drug each of them uses to treat their illness.

In [3]:
# Read the drug dataset from the local data folder into the notebook-level frame `df`.
df = pd.read_csv(filepath_or_buffer='./data/drug200.csv')

In [4]:
# Preview the first three rows of the raw data.
df.head(n=3)

Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K,Drug
0,23,F,HIGH,HIGH,25.355,drugY
1,47,M,LOW,HIGH,13.093,drugC
2,47,M,LOW,HIGH,10.114,drugC


### Make cols lower case

In [5]:
# Normalise every column label to lower case so later cells can refer to them uniformly.
df.columns = df.columns.str.lower()

### Encode non-numeric columns

In [6]:
def encode_column(column:str, column_value: str, data: "pd.DataFrame | None" = None) -> int:
    """Return the integer code of ``column_value`` within ``column``.

    The distinct values counted in the column are sorted and numbered
    0, 1, 2, ... in that order; the number assigned to ``column_value`` is
    returned.

    Args:
        column: Name of the column whose values define the encoding.
        column_value: The value to translate into its integer code.
        data: Frame to read the column from. When omitted, the notebook-level
            ``df`` is used, which keeps existing two-argument calls working.

    Returns:
        The position of ``column_value`` among the sorted distinct values.

    Raises:
        KeyError: If ``column_value`` is not one of the column's counted values.
    """
    # Fall back to the notebook global only when no frame is passed explicitly,
    # so the function can also be used (and tested) on any other frame.
    frame = df if data is None else data

    categories = sorted(frame[column].value_counts().keys())
    encoder = {category: code for code, category in enumerate(categories)}

    return encoder[column_value]

In [7]:
# Columns whose values are replaced by integer codes in the next cell.
# NOTE(review): 'age' looks numeric in the sample rows yet is listed here, so it is
# rank-encoded along with the categorical columns — confirm this is intended.
columns_to_be_encoded = [
    "age",
    "sex",
    "bp",
    "cholesterol",
    "drug",
]

In [8]:
for column in columns_to_be_encoded:
    # Look the code up once per distinct value instead of once per row:
    # encode_column rebuilds its lookup table from the whole column on every
    # call, so calling it per row makes the encoding quadratic in the row count.
    codes = {
        value: encode_column(column=column, column_value=value)
        for value in df[column].unique()
    }
    # The mapping is complete before the column is overwritten, so every row is
    # translated with the same table.
    df[column] = df[column].map(codes)

### Make the target binary (keep only two drug classes)

In [9]:
# Keep only the rows whose encoded drug label is 0 or 1, leaving a two-class target.
df = df[df['drug'].isin([0, 1])]

## Splitting the dataset

This function splits the dataset into two sets based on a value in a chosen column.

In [10]:
# Example call: split `df` using the 'age' column and the value 20; 'drug' is passed
# as the last argument (presumably the label column — confirm in split_data_on_float).
# Note that 'age' was integer-encoded above, so 20 is an encoded code, not an age in years.
# The result `d` is not used again in the cells shown here.
d = split_data_on_float(df, 'age',20,'drug')

### Finding the best column to split the data on

We look through ALL non-classification columns (the features), then run split_data_on_float with the unique values from each column. We choose the column and value that give the greatest reduction in entropy. In other words, we want to maximise the information gain!

$$
\text{information gain} = E(\text{parent}) - \sum_i \omega_i E(\text{child}_i)
$$

Where $\omega_i$ is the weight of child $i$, such that the weights of the two children satisfy $\sum_i \omega_i = 1$. Since $E(\text{parent})$ is fixed for a given node, maximising the information gain is the same as minimising the sum $\sum_i \omega_i E(\text{child}_i)$, i.e. minimising the weighted entropy of the children and hence making each resulting table as ordered as possible.

Remember, we always want entropy to be as low as possible. Entropy is analogous to disorder in this case and we do not want that.

For the non binary case we need to look into **binary cross entropy**.

In [11]:
# Build the tree from the encoded, two-class frame. sex, bp and cholesterol are
# declared as discrete columns (the remaining columns are presumably treated as
# continuous — confirm in Classification_tree).
tree = Classification_tree(df, discrete_columns=['sex', 'bp', 'cholesterol'])
# Keep a handle on the root node for the inspection cells below.
root = tree.root

In [12]:
# Expand the root node. Judging by the cells that follow, this populates
# root.left and root.right; the return value is kept in `children`, which is
# not used again in the cells shown here.
children = root.find_nodes_children()

In [13]:
# Display the data held by the root's left child.
root.left.data

Unnamed: 0,age,sex,bp,cholesterol,na_to_k,drug
17,27,1,0,0,13.972,0
36,16,1,0,1,9.445,0
46,21,0,0,0,13.091,0
61,9,1,0,1,9.475,0
66,13,1,0,0,12.856,0
76,20,0,0,0,11.198,0
78,4,0,0,0,13.313,0
83,22,0,0,1,11.326,0
100,15,1,0,1,11.871,0
101,28,0,0,0,12.854,0


In [14]:
# Display the data held by the root's right child.
root.right.data

Unnamed: 0,age,sex,bp,cholesterol,na_to_k,drug
31,56,1,0,0,9.567,1
41,41,0,0,1,14.239,1
54,51,0,0,1,10.189,1
56,48,1,0,1,11.34,1
64,43,0,0,0,13.303,1
70,53,1,0,0,13.967,1
80,43,1,0,0,13.934,1
85,42,1,0,0,13.935,1
108,54,1,0,1,9.677,1
124,36,0,0,1,12.495,1


## Recursively applying the maximum information gain split until we have a classification tree

The idea now is to apply this logic to each branch of the tree. We assert that the left branch is the "yes" branch and the right branch is the "no" branch: if the question is "is x < 5?", the rows for which the answer is yes go to the left child and the remaining rows go to the right child, and so on for every further split.

### How do we approach the recursion?

We want to continue splitting the data until we are only left with leaves at the end. The idea here is to use a **queue** in place of explicit recursion: we run the initial split, add the resulting child nodes to the queue, and keep processing the queue until all remaining nodes are leaf nodes.


In [15]:
from classification_trees.data_structures.queue import Queue

In [20]:
# Grow the tree iteratively: start from the root and keep expanding nodes taken
# from the queue until it is empty.
# NOTE(review): the root was already expanded in an earlier cell; this assumes
# find_nodes_children() is safe to call on it a second time — confirm.
queue = Queue([tree.root])

while not queue.is_empty():
    node = queue.poll()
    # Try to split this node. The leaf flag is read only after this call,
    # which is presumably where it gets set — confirm in Node.
    node.find_nodes_children()
    if node.leaf:
        # Leaves have nothing further to expand.
        continue
    left, right = node.left, node.right
    # Schedule whichever children exist for their own expansion.
    for child in (left, right):
        if child:
            queue.add(child)

In [21]:
# List the tree's nodes. In the output below each entry carries a split 'value'
# and 'column'; both are None for nodes that have no split.
tree.get_nodes()

[{'value': 33, 'column': 'age'},
 {'value': 27, 'column': 'age'},
 {'value': 56, 'column': 'age'},
 {'value': None, 'column': None},
 {'value': None, 'column': None},
 {'value': None, 'column': None},
 {'value': None, 'column': None}]