# This is the testing notebook for Ben's decision tree adventure!

In [1]:
! pip install -r requirements.txt



## Import functions

In [2]:
import pandas as pd

from classification_trees.utility.split_data import split_data_on_float
from classification_trees.classification_tree import Classification_tree

## Load in test dataset

Below are the first few rows of this small dataset (the processed Cleveland heart-disease data, about 300 rows). It describes the medical history of patients; the last column (13) holds the diagnosis, which is the value we want to classify.

In [3]:
# header=None: the first line of the file is read as data, and the columns are
# labelled by integer position (0, 1, 2, ...).
df = pd.read_csv('data/processed_cleveland_data.csv', header= None)

In [4]:
# Preview the first five rows.
df.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,2
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0


### Make columns binary

In [5]:
def binary_classifier(x: int) -> int:
    """Collapse a class label to 0/1: zero stays 0, any other value becomes 1."""
    return 0 if x == 0 else 1

# Overwrite column 13 (the class label used later as the true value) in place,
# so that it only contains 0 and 1.
df[13] = df[13].apply(binary_classifier)

## Splitting the dataset

This function will split the dataset up into two sets based on some columnar value.

In [6]:
# Example call of the splitting helper.
# NOTE(review): the arguments are presumably (frame, feature column, threshold,
# label column) - confirm against split_data_on_float.
d = split_data_on_float(df, 4,20,13)

### Finding the best column to split the data on

We look through ALL non-classification columns, then run split_data_on_float with the unique values from each column. We choose the column (and value) whose split leaves the least entropy in the children, i.e. gives the largest reduction in entropy. We want to maximise the information gain!

$$
\text{information gain} = E(\text{parent}) - \sum_i \omega_i E(\text{child}_i)
$$

Where $\omega_i$ is the weight of child $i$, such that $\sum_i \omega_i = 1$. Because $E(\text{parent})$ is the same for every candidate split of a node, maximising information gain is the same as minimising the sum $\sum_i \omega_i E(\text{child}_i)$, i.e. minimising the weighted entropy left in the child tables.

Remember, we always want entropy to be as low as possible. Entropy is analogous to disorder in this case and we do not want that.

For the non binary case we need to look into **binary cross entropy**.

In [7]:
def is_binary(column: pd.Series) -> bool:
    """Return True when ``column`` contains exactly the two values 0 and 1.

    The column is cast to int first, so float columns holding 0.0/1.0 count
    as binary. Values that the cast would have to truncate (for example 0.2
    or 1.7) make the column non-binary instead of being silently rounded
    down to 0 or 1.

    Raises:
        Whatever ``Series.astype(int)`` raises for values that cannot be cast
        (for example missing values or non-numeric strings).
    """
    truncated = column.astype(int)

    # astype(int) drops the fractional part; if that changed any value, the
    # column held something other than whole numbers and cannot be binary.
    if not (truncated == column.astype(float)).all():
        return False

    return set(truncated) == {0, 1}

# Collect the label of every column that is_binary accepts.
binary_cols = []
for column in df.columns:
    try:
        column_is_binary = is_binary(df[column])
    except (ValueError, TypeError, OverflowError):
        # The column could not be cast to int (e.g. missing or non-numeric
        # values), so it is treated as non-binary. Anything else, such as a
        # KeyboardInterrupt, is no longer swallowed.
        continue
    if column_is_binary:
        binary_cols.append(column)
print(binary_cols)

[1, 5, 8, 13]


In [8]:
# Build a classification tree from the full DataFrame and keep a handle on
# its root node.
tree = Classification_tree(df)
root = tree.root

## Recursively applying the maximum information gain split until we have a classification tree

The idea now is to apply this logic to each branch of the tree. We adopt the convention that the left branch is the "yes" branch and the right branch is the "no" branch, i.e. if the inequality is true we go left, otherwise we go right.

### How do we approach the recursion?

We simply descend the tree until we meet a leaf.


In [9]:
def assess_tree(tree: "Classification_tree", df: pd.DataFrame, target_column=13) -> None:
    """Print the share of rows in ``df`` that ``tree`` classifies correctly.

    Args:
        tree: Any model exposing ``classify(row)``; this notebook passes both
            a Classification_tree and a Random_forest.
        df: Rows to evaluate. Each row is handed to ``tree.classify`` whole,
            including the label column.
        target_column: Label of the column holding the true class. Defaults
            to 13, the label column used throughout this notebook.

    Returns:
        None. The success rate is printed as a percentage with two decimals.
        An empty ``df`` prints a placeholder instead of dividing by zero.
    """
    total = len(df)
    if total == 0:
        print("""
    Success rate: n/a (no rows to assess)
    """)
        return

    corr_count = sum(
        bool(tree.classify(row) == row[target_column])
        for _, row in df.iterrows()
    )

    print(f"""
    Success rate: {corr_count / total:.2%}
    """)
# `tree` was built from this same `df`, so this is accuracy on the data the
# tree has already seen, not a held-out score.
assess_tree(tree,df)


    Success rate: 100.00%
    


## Testing the model

Running the model on a test and train set.

In [10]:
# Shuffle the rows, then keep the first 90% for training and the rest for testing.
length = len(df)

training_percentage = 0.9  # a fraction of the rows, not a percentage

train_size = int(length * training_percentage)

# A fixed seed makes the shuffle, and therefore the split, repeatable across
# kernel restarts.
shuffled_df = df.sample(frac=1, random_state=42)

# .iloc makes it explicit that the slices are positional.
train = shuffled_df.iloc[:train_size]
test = shuffled_df.iloc[train_size:]

assert len(train) == train_size
assert len(test) == length - train_size

In [11]:
# Build a tree from the training rows only.
testing_tree = Classification_tree(train)

In [12]:
# Score the tree on the held-out rows, which were not passed to its constructor.
assess_tree(testing_tree,test)


    Success rate: 74.19%
    


In [13]:
from classification_trees.random_forest import Random_forest

In [14]:
# Build a Random_forest from the training rows.
# NOTE(review): the second argument (10) is presumably the number of trees -
# confirm against Random_forest.
forest = Random_forest(train, 10)

In [15]:
# assess_tree only calls `.classify(row)`, so the forest is scored the same
# way as a single tree, on the same held-out rows.
assess_tree(forest,test)


    Success rate: 54.84%
    


In [16]:
from model_utilities.model_utilities import save_model,load_model

In [17]:
# Write the forest to ./model.pkl (relative to the working directory).
save_model(forest, './model.pkl')


    Saving model...
    


In [18]:
# Read the saved model back from disk.
# NOTE(review): the .pkl extension suggests pickle; if so, only load files
# from a trusted source - confirm in load_model.
model = load_model('./model.pkl')


    Model size in mb   : 2.05
    Model modifed on   : Saturday, July 29, 2023 11:09:10
    Model created on   : Saturday, July 29, 2023 11:09:10
    
