In [64]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [78]:
# Toy mushroom dataset: the goal is to find the edible mushrooms.
# Every row of x is one mushroom described by three binary (0/1) features,
# in column order:
#   column 0 - cap color   (brown, red)
#   column 1 - stalk shape (tapering, enlarged)
#   column 2 - solitary    (yes, no)
# NOTE(review): which category is encoded as 1 is not stated here; presumably
# the first value listed (brown / tapering / yes) - confirm against the source.
x = np.array(
    [
        [1, 1, 1],
        [1, 0, 1],
        [1, 0, 0],
        [1, 0, 0],
        [1, 1, 1],
        [0, 1, 1],
        [0, 0, 0],
        [1, 0, 1],
        [0, 1, 0],
        [1, 0, 0],
    ]
)
# One label per row of x; presumably 1 = edible and 0 = not edible (TODO confirm).
y = np.array([1, 1, 0, 0, 1, 0, 0, 1, 1, 0])

In [79]:
# Sanity check: x should have 10 rows (samples) and 3 columns (features)
print(x.shape)

(10, 3)


In [67]:
# Hold out part of the data for testing: train_size=0.7 keeps 7 of the 10 rows
# for training and leaves the remaining 3 for testing. random_state is fixed so
# the same rows are selected on every run.
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    train_size=0.7,
    random_state=5,
)

# Report the shape of each feature split
print("x_train shape : ", x_train.shape)
print("x_test shape : ", x_test.shape)

x_train shape :  (7, 3)
x_test shape :  (3, 3)


In [68]:
# print the first 3 rows of x (the full dataset, before the train/test split)
# NOTE(review): this shows x, not x_train; use x_train[:3] if the training split was intended
print(x[:3])

[[1 1 1]
 [1 0 1]
 [1 0 0]]


In [69]:
# Binary entropy helper. It is applied below to the root node, i.e. all of
# y_train, which for the split shown in this notebook holds 5 labels equal to 1
# and 2 labels equal to 0 (presumably 5 edible and 2 non-edible mushrooms).

def entropy(y):
    """Return the binary (base-2) entropy of a collection of 0/1 labels.

    H(p1) = -p1 * log2(p1) - (1 - p1) * log2(1 - p1)

    where p1 is the fraction of labels equal to 1.

    Parameters
    ----------
    y : array-like
        Sequence of 0/1 labels. May be empty.

    Returns
    -------
    float
        Entropy in bits. 0.0 for empty input and for a pure node (all labels
        0 or all labels 1); 1.0 for a perfectly balanced node.
    """
    m = len(y)
    if m == 0:
        # An empty node carries no uncertainty.
        return 0.0

    # Fraction of positive labels (labels are 0/1, so the sum counts the ones).
    p1 = float(np.sum(y)) / m

    if not 0 < p1 < 1:
        # Pure node: log2(0) is undefined, and the entropy is 0 by convention.
        return 0.0

    return float(-p1 * np.log2(p1) - (1 - p1) * np.log2(1 - p1))
    
# Entropy of the root node, i.e. of all training labels before any split
r_entropy = entropy(y_train)
# Debug output, left disabled:
# print("root_entropy : ", r_entropy)

In [70]:
# Partition the rows of a node on one binary feature:
# rows whose feature value is 1 go left, every other row goes right.

def split_dataset(x_train, rows, feature):
    """Split row indices into a left and a right branch by one feature.

    Parameters
    ----------
    x_train : indexable as ``x_train[i][feature]`` (e.g. a 2-D array)
        Feature matrix.
    rows : iterable of int
        Indices of the rows that belong to the node being split.
    feature : int
        Column index of the feature to split on.

    Returns
    -------
    tuple of (list, list)
        ``(left_branch, right_branch)``: indices from ``rows`` in their
        original order. Left holds the rows where the feature equals 1,
        right holds all remaining rows.
    """
    on_left, on_right = [], []

    for idx in rows:
        # Choose the destination list first, then append in one place.
        destination = on_left if x_train[idx][feature] == 1 else on_right
        destination.append(idx)

    return on_left, on_right

# Split the root node on feature 0
feature = 0

# The root node contains every training row, so rows is simply 0 .. len(y_train) - 1
rows = list(range(len(y_train)))

print("rows : ", rows)

left, right = split_dataset(x_train, rows, feature)
print("left : ", left)
print("right : ", right)

rows :  [0, 1, 2, 3, 4, 5, 6]
left :  [0, 1, 2, 3, 6]
right :  [4, 5]


In [71]:
# Repeat the root split for features 1 and 2.
# Each entry: (feature index, left label, right label, extra print arguments);
# only the first report is followed by a blank separator line.
for feature, left_tag, right_tag, trailer in (
    (1, "left1 : ", "right1 : ", ("\n",)),
    (2, "left2 : ", "right2 : ", ()),
):
    left, right = split_dataset(x_train, rows, feature)
    print(left_tag, left)
    print(right_tag, right, *trailer)

left1 :  [0, 3, 4]
right1 :  [1, 2, 5, 6] 

left2 :  [0, 1, 2, 3]
right2 :  [4, 5, 6]


In [72]:
# Print x_train so the left/right branches computed above can be checked
# by eye against each feature column
print(x_train)

[[1 1 1]
 [1 0 1]
 [1 0 1]
 [1 1 1]
 [0 1 0]
 [0 0 0]
 [1 0 0]]


In [73]:
# formula IG = H(node) - (w(left) * H(left) + w(right) * H(right))

def information_gain(feature, x_train, y_train, rows):
    """Return the information gain of splitting a node on one binary feature.

    The gain is the entropy of the node's own labels minus the size-weighted
    entropy of the two branches produced by ``split_dataset``.

    Parameters
    ----------
    feature : int
        Column index of the feature to split on.
    x_train : 2-D array-like
        Feature matrix, indexed as ``x_train[i][feature]``.
    y_train : array-like
        0/1 labels, aligned with the rows of ``x_train``.
    rows : iterable of int
        Indices of the rows that belong to the node being split.

    Returns
    -------
    float
        Information gain in bits; 0.0 when ``rows`` is empty.
    """
    # Materialise rows so it can be measured and used as an index list
    # (a tuple would otherwise be read by numpy as a multi-dimensional index).
    rows = list(rows)
    if not rows:
        # Nothing to split: avoids dividing by zero in the weights below.
        return 0.0

    labels = np.asarray(y_train)
    left, right = split_dataset(x_train, rows, feature)

    w_left = len(left) / len(rows)
    w_right = len(right) / len(rows)

    # The parent entropy must be taken over the rows of THIS node only.
    # Using all of y_train is correct only at the root and gives wrong
    # (possibly negative) gains for any node that holds a subset of the rows.
    node_entropy = entropy(labels[rows])

    # left / right hold row indices into x_train, so the same indices select
    # the matching labels of each branch.
    left_entropy = entropy(labels[left])
    right_entropy = entropy(labels[right])

    return node_entropy - (w_left * left_entropy + w_right * right_entropy)

# Report the gain obtained by splitting the root node on each feature column in turn
for feature in range(x_train.shape[1]):
    ig = information_gain(
        feature=feature,
        x_train=x_train,
        y_train=y_train,
        rows=rows,
    )
    print(f"Information Gain for feature {feature}: {ig}")

Information Gain for feature 0: 0.061743357932800724
Information Gain for feature 1: 0.2916919971380596
Information Gain for feature 2: 0.46956521111470695


In [81]:
# Compare with a library model: fit an XGBoost classifier on the training split
# and predict the labels of the held-out rows.
# NOTE(review): this import sits in the middle of the notebook; consider moving
# it to the import cell at the top so dependencies are visible up front.
from xgboost import XGBClassifier

# Default hyperparameters; no arguments are passed.
# NOTE(review): the training split has only 7 rows, so treat the predictions
# as a demonstration rather than a meaningful evaluation.
model = XGBClassifier()

model.fit(x_train, y_train)

# Predicted label for each row of x_test
y_pred = model.predict(x_test)

print(y_pred)

[1 1 1]
