In [1]:
import numpy as np
import pandas as pd


In [2]:
# a dummy dataset, since the goal of this notebook is to understand the implementation of decision trees, not solve a real world problem

data = {
    'Popcorn': [True, True, False, False, True, True, False],
    'Soda': [True, False, True, True, True, False, False],
    'Age': [7, 12, 18, 35, 38, 50, 83],
    'Likes Some Activity': [False, False, True, True, True, False, False]
}

df = pd.DataFrame(data)

In [3]:
def merge_avg_ages(ages, df):

    # adding averaged ages before any processing to be done
    
    # # make averages
    # ages = df["Age"].to_numpy()
    
    averaged_ages = []
    for i in range(ages.shape[0] - 1):
        averaged_ages.append((ages[i] + ages[i + 1]) / 2)
    
    print('Ages', ages)
    print('Averaged Ages', averaged_ages)

    for avg_age in averaged_ages:
        df[str(avg_age)] = [True if age < avg_age else False for age in ages]

    return df

In [4]:
df = merge_avg_ages(df["Age"].to_numpy(), df)

Ages [ 7 12 18 35 38 50 83]
Averaged Ages [9.5, 15.0, 26.5, 36.5, 44.0, 66.5]


In [5]:
df

Unnamed: 0,Popcorn,Soda,Age,Likes Some Activity,9.5,15.0,26.5,36.5,44.0,66.5
0,True,True,7,False,True,True,True,True,True,True
1,True,False,12,False,False,True,True,True,True,True
2,False,True,18,True,False,False,True,True,True,True
3,False,True,35,True,False,False,False,True,True,True
4,True,True,38,True,False,False,False,False,True,True
5,True,False,50,False,False,False,False,False,False,True
6,False,False,83,False,False,False,False,False,False,False


In [6]:
def calculate_node(col_name, df):
    feature_classes = list(set(df[col_name]))
    
    # a nested dictionary shouldbeb able to solve this (ig xD)
    
    feature_entropies = {}
    for feature_class in feature_classes:
    
        id = {}
        for value in df.loc[df[col_name] == feature_class]['Likes Some Activity']:
            if value not in list(id.keys()):
                id[value] = 0
            id[value] += 1
    
        feature_entropies[feature_class] = id


    get_entropy_vals(feature_entropies, col_name)
    

In [7]:
def get_entropy_vals(feature_entropies, col_name):

    try:
        # finding and storing samples from all classes for each node
        total_samples_per_node = []
        for feature_class_node in list(feature_entropies.keys()):
            total_samples_node = sum(list(feature_entropies[feature_class_node].values()))
            total_samples_per_node.append(total_samples_node)
        
        
        # print(total_samples_per_node)
        
        total_split_entropy = 0
        t = 0
        for feature_class_node in list(feature_entropies.keys()):
            node_entropy = 1
        
            # value is the probability of each class\
            # value = number of samples from a class / total samples from the node
            for value in (list(np.array(list(feature_entropies[feature_class_node].values())) / total_samples_per_node[t])):
        
                # subtracting (pi)^2 impurity from pure node value of 1 for each node
                node_entropy -= (value ** 2)
        
            # printing per node entropy
            print(f'Node entropy for {col_name} {feature_class_node} is => {node_entropy}')
        
            # iteratively adding to total split entropy to make decision about split
            total_split_entropy += ((total_samples_per_node[t] / sum(total_samples_per_node)) * node_entropy)
        
            t += 1
        
        
        print(f'Total split entropy is => {total_split_entropy}\n\n')

    except:
        print(f'Error while calculating for column {col_name}\n\n')

In [8]:
for col in df.columns:

    # print(col)

    calculate_node(col, df)
    

Node entropy for Popcorn False is => 0.4444444444444445
Node entropy for Popcorn True is => 0.375
Total split entropy is => 0.40476190476190477


Node entropy for Soda False is => 0.0
Node entropy for Soda True is => 0.375
Total split entropy is => 0.21428571428571427


Node entropy for Age 35 is => 0.0
Node entropy for Age 38 is => 0.0
Node entropy for Age 7 is => 0.0
Node entropy for Age 12 is => 0.0
Node entropy for Age 18 is => 0.0
Node entropy for Age 50 is => 0.0
Node entropy for Age 83 is => 0.0
Total split entropy is => 0.0


Node entropy for Likes Some Activity False is => 0.0
Node entropy for Likes Some Activity True is => 0.0
Total split entropy is => 0.0


Node entropy for 9.5 False is => 0.5
Node entropy for 9.5 True is => 0.0
Total split entropy is => 0.42857142857142855


Node entropy for 15.0 False is => 0.48
Node entropy for 15.0 True is => 0.0
Total split entropy is => 0.34285714285714286


Node entropy for 26.5 False is => 0.5
Node entropy for 26.5 True is => 0.44444

In [9]:
# we need to split at soda

In [10]:
calculate_node('Soda', df)

Node entropy for Soda False is => 0.0
Node entropy for Soda True is => 0.375
Total split entropy is => 0.21428571428571427




In [11]:
# since Soda False is pure, we further split Soda true
df_split2 = df.loc[df['Soda'] == True]
df_split2 = df_split2.iloc[:, :df_split2.columns.get_loc('Likes Some Activity') + 1]
df_split2

Unnamed: 0,Popcorn,Soda,Age,Likes Some Activity
0,True,True,7,False
2,False,True,18,True
3,False,True,35,True
4,True,True,38,True


In [12]:
df_split2 = merge_avg_ages(df_split2['Age'].to_numpy(), df_split2)

Ages [ 7 18 35 38]
Averaged Ages [12.5, 26.5, 36.5]


In [13]:
for col in df_split2.columns:
    calculate_node(col, df_split2)

Node entropy for Popcorn False is => 0.0
Node entropy for Popcorn True is => 0.5
Total split entropy is => 0.25


Node entropy for Soda True is => 0.375
Total split entropy is => 0.375


Node entropy for Age 18 is => 0.0
Node entropy for Age 35 is => 0.0
Node entropy for Age 38 is => 0.0
Node entropy for Age 7 is => 0.0
Total split entropy is => 0.0


Node entropy for Likes Some Activity False is => 0.0
Node entropy for Likes Some Activity True is => 0.0
Total split entropy is => 0.0


Node entropy for 12.5 False is => 0.0
Node entropy for 12.5 True is => 0.0
Total split entropy is => 0.0


Node entropy for 26.5 False is => 0.0
Node entropy for 26.5 True is => 0.5
Total split entropy is => 0.25


Node entropy for 36.5 False is => 0.0
Node entropy for 36.5 True is => 0.4444444444444444
Total split entropy is => 0.3333333333333333




In [14]:
# based on the above entropy values, we split at an age of 12.5; after which all our nodes are pure