In [1]:
# Load packages
import pandas as pd
import numpy as np
import math

In [2]:
# Toy data set: ten records with three features and the "Cheat" class label.
# Passing `columns=` fixes the column order at construction time.
df = pd.DataFrame(
    {
        "Refund": ["Yes", "No", "No", "Yes", "No", "No", "Yes", "No", "No", "No"],
        "MartialStatus": ["Single", "Married", "Single", "Married", "Divorced",
                          "Married", "Divorced", "Single", "Married", "Single"],
        "Income": [125, 100, 70, 120, 95, 60, 220, 85, 75, 90],
        "Cheat": ["No", "No", "No", "No", "Yes", "No", "No", "Yes", "No", "Yes"],
    },
    columns=["Refund", "MartialStatus", "Income", "Cheat"],
)

df

Unnamed: 0,Refund,MartialStatus,Income,Cheat
0,Yes,Single,125,No
1,No,Married,100,No
2,No,Single,70,No
3,Yes,Married,120,No
4,No,Divorced,95,Yes
5,No,Married,60,No
6,Yes,Divorced,220,No
7,No,Single,85,Yes
8,No,Married,75,No
9,No,Single,90,Yes


# Question: Use the Gini index to choose the best feature and the best splitting point for the root of the decision tree. You do not need to build the whole tree!

# You only need to consider binary splits.

### For "Refund" feature, two children are Yes and No.
### For "MartialStatus" feature, two children could be Married and Not Married, Divorced and Not Divorced, or Single and Not Single.
### For the "Income" feature, please refer to the picture on slide 40 to choose one splitting point. For example, if the splitting point is 80, the two children are "<= 80" and "> 80".  

In [3]:
# Feature matrix (every column except the last) and class labels (last column)
# as NumPy arrays. `.values` is used instead of `as_matrix()`, which was
# deprecated in pandas 0.23 and removed in pandas 1.0.
data = df.iloc[:, :-1].values
target = df.iloc[:, -1].values
# Best Gini gain found for each feature, keyed by the split that achieved it.
maxgain = {}

# step 1: How to calculate Gini index for one node? This node could be either a parent or a child.
def gini_node(node):
    """Return the Gini index 1 - sum(p_c ** 2) of a single node.

    node: class labels of all instances at the node, as a list or a 1-D
          NumPy array, e.g. ["yes", "yes", "no", "no", "yes"].
    Returns 0 for an empty node.
    """
    # Coerce to an ndarray so plain Python lists work too; comparing a list
    # with a scalar yields a single bool, not an element-wise mask.
    labels = np.asarray(node)
    total = labels.size
    if total == 0:
        return 0

    # One count per distinct class label.
    _, counts = np.unique(labels, return_counts=True)
    gini = 1.0
    for count in counts:
        gini -= (count / float(total)) ** 2
    return gini

# step 2: How to combine Gini index of two children?
def gini_children(leftnode, rightnode):
    """Return the combined Gini index of the two children of a binary split.

    leftnode, rightnode: class labels of the instances in each child.
    Each child's Gini index is weighted by its share of the instances.
    Returns 0.0 when both children are empty instead of dividing by zero.
    """
    leftnum = len(leftnode)
    rightnum = len(rightnode)
    allnum = float(leftnum + rightnum)
    if allnum == 0:
        # No instances at all: there is no impurity to report.
        return 0.0
    return leftnum / allnum * gini_node(leftnode) + rightnum / allnum * gini_node(rightnode)

# step 3: How to calculate value of Gain?
def gain(parent, leftnode, rightnode):
    """Return the Gini gain of a split.

    The gain is the parent's Gini index minus the weighted Gini index of
    the two children produced by the split.
    """
    impurity_before = gini_node(parent)
    impurity_after = gini_children(leftnode, rightnode)
    return impurity_before - impurity_after


# step 4: Gini gain when the root splits on "Refund" (Yes vs. No).
parent = target
leftnode, rightnode = target[data[:, 0] == "Yes"], target[data[:, 0] == "No"]
refundgain = gain(parent, leftnode, rightnode)
maxgain.update({"refund": refundgain})

# step 5: Gini gain of every one-vs-rest split on "MartialStatus"; record the best one.
martialstatus = list(set(data[:, 1]))
parent = target
martialgain = []
status_column = data[:, 1]
for status in martialstatus:
    # Left child: instances with this status; right child: everything else.
    leftnode = target[status_column == status]
    rightnode = target[status_column != status]
    martialgain.append(gain(parent, leftnode, rightnode))
# index() returns the first position of the maximum, so ties go to the earliest status.
best_status_idx = martialgain.index(max(martialgain))
maxgain[martialstatus[best_status_idx]] = martialgain[best_status_idx]

# step 6: Gini gain of every candidate threshold on "Income"; record the best one.
income = np.array(data[:, 2], dtype=float)
income_sorted = sorted(income)
# Candidate thresholds: one below the minimum, the midpoint of each pair of
# adjacent sorted values, and one above the maximum.
splitpoints = (
    [income_sorted[0] - 10]
    + [(low + high) / 2 for low, high in zip(income_sorted, income_sorted[1:])]
    + [income_sorted[-1] + 10]
)

parent = target
incomegain = []
for threshold in splitpoints:
    # Left child: income <= threshold; right child: income > threshold.
    leftnode = target[income <= threshold]
    rightnode = target[income > threshold]
    incomegain.append(gain(parent, leftnode, rightnode))
# index() returns the first position of the maximum, so ties go to the lowest threshold.
best_split_idx = incomegain.index(max(incomegain))
maxgain[splitpoints[best_split_idx]] = incomegain[best_split_idx]

# step 7: Show the best gain found for each feature; the entry with the
# largest gain is the split to use at the root.
# The call form of print works on both Python 2 and Python 3.
print(maxgain)

{'refund': 0.077142857142857235, 97.5: 0.12000000000000011, 'Married': 0.12000000000000011}
