In [20]:
import numpy as np
import matplotlib.pyplot as plt
from public_tests import *
from utils import *
import pandas as pd


In [36]:
X_train = np.array([[1,1,1],[1,0,1],[1,0,0],[1,0,0],[1,1,1],[0,1,1],[0,0,0],[1,0,1],[0,1,0],[1,0,0]])
y_train = np.array([1,1,0,0,1,0,0,1,1,0])

In [22]:
X_train=pd.DataFrame(X_train)
y_train=pd.DataFrame(y_train)

In [24]:
import math
def calculate_entropy(y):
    entropy=0
    if len(y):
        p1=len(y[y==1])/len(y)
        if p1!=0 and p1!=1:
            entropy=(-p1*(math.log2(p1)))-(1-p1)*(math.log2(1-p1))
    return entropy

In [26]:
print("Entropy at root node: ", calculate_entropy(y_train)) 

# UNIT TESTS
compute_entropy_test(calculate_entropy)

Entropy at root node:  0
[92m All tests passed.


In [38]:
def split_dataset(X, node_indices, feature):
    left_indices = []
    right_indices = []
    for index in node_indices:
        if X[index][feature] == 1:
            left_indices.append(index)
        elif X[index][feature] == 0:
            right_indices.append(index)
    return left_indices, right_indices


In [40]:
root_indices = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

# Feel free to play around with these variables
# The dataset only has three features, so this value can be 0 (Brown Cap), 1 (Tapering Stalk Shape) or 2 (Solitary)
feature = 0

left_indices, right_indices = split_dataset(X_train, root_indices, feature)

print("CASE 1:")
print("Left indices: ", left_indices)
print("Right indices: ", right_indices)

# Visualize the split 
# generate_split_viz(root_indices, left_indices, right_indices, feature)

print()

# Case 2

root_indices_subset = [0, 2, 4, 6, 8]
left_indices, right_indices = split_dataset(X_train, root_indices_subset, feature)

print("CASE 2:")
print("Left indices: ", left_indices)
print("Right indices: ", right_indices)

# # Visualize the split 
# generate_split_viz(root_indices_subset, left_indices, right_indices, feature)

# UNIT TESTS    
split_dataset_test(split_dataset)

CASE 1:
Left indices:  [0, 1, 2, 3, 4, 7, 9]
Right indices:  [5, 6, 8]

CASE 2:
Left indices:  [0, 2, 4]
Right indices:  [6, 8]
[92m All tests passed.


In [46]:
def compute_information_gain(X, y, node_indices, feature):
    
    # Split dataset
    left_indices, right_indices = split_dataset(X, node_indices, feature)
    
    # Some useful variables
    X_node, y_node = X[node_indices], y[node_indices]
    X_left, y_left = X[left_indices], y[left_indices]
    X_right, y_right = X[right_indices], y[right_indices]
    
    # You need to return the following variables correctly
    information_gain = 0
    
    ### START CODE HERE ###calculate_entropy
    node_entropy = calculate_entropy(y_node)
    
    left_entropy = calculate_entropy(y_left)
    w_left = len(X_left) / len(X_node)
    
    right_entropy = calculate_entropy(y_right)
    w_right = len(X_right) / len(X_node)
    
    information_gain = node_entropy - (w_left * left_entropy + w_right * right_entropy)
    ### END CODE HERE ###  
    
    return information_gain

In [48]:
info_gain0 = compute_information_gain(X_train, y_train, root_indices, feature=0)
print("Information Gain from splitting the root on brown cap: ", info_gain0)

info_gain1 = compute_information_gain(X_train, y_train, root_indices, feature=1)
print("Information Gain from splitting the root on tapering stalk shape: ", info_gain1)

info_gain2 = compute_information_gain(X_train, y_train, root_indices, feature=2)
print("Information Gain from splitting the root on solitary: ", info_gain2)

# UNIT TESTS
compute_information_gain_test(compute_information_gain)

Information Gain from splitting the root on brown cap:  0.034851554559677034
Information Gain from splitting the root on tapering stalk shape:  0.12451124978365313
Information Gain from splitting the root on solitary:  0.2780719051126377
[92m All tests passed.


In [50]:
def get_best_split(X, y, node_indices):   
   
    # Some useful variables
    num_features = X.shape[1]
    
    # You need to return the following variables correctly
    best_feature = -1
    
    ### START CODE HERE ###
    max_information_gain = 0.
    
    for feature in range(num_features):
        information_gain = compute_information_gain(X, y, node_indices, feature)
        if max_information_gain < information_gain:
            max_information_gain = information_gain
            best_feature = feature
            
    ### END CODE HERE ##    
   
    return best_feature

In [52]:
best_feature = get_best_split(X_train, y_train, root_indices)
print("Best feature to split on: %d" % best_feature)

# UNIT TESTS
get_best_split_test(get_best_split)

Best feature to split on: 2
[92m All tests passed.
