# Decision Tree from Scratch 1

## 1. Importing Libraries

In [2]:
import numpy as np
import pandas as pd

## 2. Reading Data File

In [3]:
# Load the CSV at the path below into a DataFrame.
data_file = pd.read_csv("Datasets/person_max_index/500_Person_Gender_Height_Weight_index.csv")
# Preview the first rows (shown as the cell output).
# NOTE(review): the preview contains an "Unnamed: 0" column, which suggests the
# CSV carries a saved row index -- consider index_col=0; confirm against the file.
data_file.head()

Unnamed: 0,Gender,Height,Weight,Index
0,Male,174,96,4
1,Male,189,87,2
2,Female,185,110,4
3,Female,195,104,3
4,Male,149,61,3


## 3. Feature Creation

We need to predict whether or not a person is obese:
* People with an index of 4 or 5 are obese

In [4]:
# Binary target: 1 when Index is 4 or higher, otherwise 0.
data_file['obese'] = data_file['Index'].ge(4).astype('int')

# The raw Index column is redundant once the target exists. Dropping by
# column name with inplace=True modifies data_file itself rather than
# returning a new DataFrame.
data_file.drop(columns='Index', inplace=True)

## 4. Calculating Impurity using Gini Index
The Gini Index measures the probability that a randomly selected sample would be classified incorrectly if it were labeled at random according to the class distribution.

Formula: Gini = 1 - ∑(*Pi*)^2 
* Pi is the proportion of samples that have the i-th distinct value

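A lower Gini Index indicates more purity. It ranges from 0 (all samples share one value) up to 1 - 1/k for k equally frequent values, so the maximum for two classes is 0.5.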

In [5]:
def gini(y):
    """Return the Gini impurity of the values in ``y``.

    Computed as ``1 - sum(p_i ** 2)``, where ``p_i`` is the relative
    frequency of each distinct value in ``y``.

    Parameters
    ----------
    y : pd.Series
        Values whose impurity is measured.

    Returns
    -------
    float
        0 when every value is identical; larger when the values are more
        evenly mixed.

    Raises
    ------
    TypeError
        If ``y`` is not a pandas Series.
    """
    if not isinstance(y, pd.Series):
        # `raise` needs an exception, not a string: raising a bare string
        # fails with "exceptions must derive from BaseException" and the
        # intended message is lost.
        raise TypeError('Object must be a Pandas Series.')

    # Relative frequency of each distinct value.
    p = y.value_counts() / y.shape[0]
    return 1 - np.sum(p ** 2)
    
# Gini impurity of the Gender column (shown as the cell output).
gini(data_file.Gender)

np.float64(0.4998)

With the Gini Index being around 0.5, which is the maximum possible for two classes, we can see that Gender is almost evenly mixed, i.e. highly impure.

## 5. Calculating Impurity using Entropy
Another way to measure impurity

Formula: $\text{Entropy} = -\sum_i P_i \log_2(P_i)$

As with the Gini Index, a lower Entropy value indicates more purity. For two classes it ranges from 0 to 1; for k classes the maximum is log2(k).

In [6]:
def entropy(y):
    """Return the Shannon entropy (in bits) of the values in ``y``.

    Computed as ``sum(-p_i * log2(p_i))``, where ``p_i`` is the relative
    frequency of each distinct value in ``y``.

    Parameters
    ----------
    y : pd.Series
        Values whose impurity is measured.

    Returns
    -------
    float
        0 when every value is identical; larger when the values are more
        evenly mixed.

    Raises
    ------
    TypeError
        If ``y`` is not a pandas Series.
    """
    if not isinstance(y, pd.Series):
        # `raise` needs an exception, not a string: raising a bare string
        # fails with "exceptions must derive from BaseException" and the
        # intended message is lost.
        raise TypeError('Object must be a Pandas Series')

    # Relative frequency of each distinct value.
    p = y.value_counts() / y.shape[0]
    # Zero-frequency entries (e.g. unused categories of a categorical
    # Series) contribute nothing to the sum; dropping them avoids
    # evaluating log2(0).
    p = p[p > 0]
    return np.sum(-p * np.log2(p))
    
# Entropy of the Gender column (shown as the cell output).
entropy(data_file.Gender)

np.float64(0.9997114417528099)

With the Entropy being extremely close to 1, we can conclude that there is high impurity.

## 6. Information Gain for Classification
Classification Information Gain Formula: 
* InformationGain = Entropy(parent) - ∑wi * Entropy(child_i), where wi is the fraction of the parent's rows that fall into child i

In [None]:
def variance(y):
    """Return the variance of ``y``, treating a single value as zero spread.

    For any length other than one the result is whatever ``y.var()`` gives.
    """
    # One observation: report 0 rather than delegating to y.var().
    return 0 if len(y) == 1 else y.var()
    
def information_gain(y, mask, func=entropy):
    """Return the impurity reduction obtained by splitting ``y`` with ``mask``.

    The gain is the impurity of ``y`` minus the size-weighted impurities of
    the two parts selected by ``mask`` and by its inverse.

    Parameters
    ----------
    y : pd.Series
        Target values being split.
    mask : boolean pd.Series or array
        True for rows sent to the first child, False for the second.
    func : callable, optional
        Impurity measure applied when ``y`` has object dtype. For any other
        dtype the variance is used instead and ``func`` is ignored.

    Returns
    -------
    float or int
        The gain; 0 when the split leaves one side empty.
    """
    n_true = mask.sum()
    n_false = mask.shape[0] - n_true

    # A split that sends every row to the same side separates nothing.
    if n_true == 0 or n_false == 0:
        return 0

    total = n_true + n_false
    # Non-object targets are scored by variance reduction; object-dtype
    # targets use the supplied impurity function.
    impurity = variance if y.dtypes != 'O' else func

    # `~` is the boolean inversion operator; arithmetic negation (`-mask`)
    # is not defined for NumPy boolean arrays.
    return (
        impurity(y)
        - n_true / total * impurity(y[mask])
        - n_false / total * impurity(y[~mask])
    )

In [16]:
def variance(y):
    """Variance of ``y`` along axis 0; a lone value counts as zero spread."""
    if len(y) != 1:
        return y.var(axis=0)

    # Exactly one observation: report 0 instead of calling y.var().
    return 0
    

def information_gain(y, mask, func = entropy): 
    # NOTE(review): this redefinition looks unfinished. As written it never
    # uses `func`, has no branch for object-dtype `y`, and has no return
    # statement, so every call returns None.

    a = sum(mask) # Number of True values in mask
    b = mask.shape[0] - a # Remaining entries of mask (length of mask minus a)

    if(a == 0 or b == 0):
        info_gain = 0
    else:
        # Branch taken when y's dtype is not object
        if  y.dtypes != 'O': 
            # NOTE(review): this binds the `variance` function object itself;
            # the function is not called here.
            info_gain = variance




500
