
<h1>CART Decision Tree Program</h1>

In [2]:
import pprint

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [3]:
# Toy "play tennis" weather table: four categorical attributes plus the class
# column `play`, which is deliberately kept as the last column.
outlook = pd.Series('overcast,overcast,overcast,overcast,rainy,rainy,rainy,rainy,rainy,sunny,sunny,sunny,sunny,sunny'.split(','))
temp = pd.Series('hot,cool,mild,hot,mild,cool,cool,mild,mild,hot,hot,mild,cool,mild'.split(','))
humidity = pd.Series('high,normal,high,normal,high,normal,normal,normal,high,high,high,high,normal,normal'.split(','))
windy = pd.Series('FALSE,TRUE,TRUE,FALSE,FALSE,FALSE,TRUE,FALSE,TRUE,FALSE,TRUE,FALSE,FALSE,TRUE'.split(','))
play = pd.Series('yes,yes,yes,yes,yes,yes,no,yes,no,no,no,no,yes,yes'.split(','))

# Assemble the columns side by side; the dict keys become the column names.
df = pd.DataFrame({
    'outlook': outlook,
    'temp': temp,
    'humidity': humidity,
    'windy': windy,
    'play': play,
})

In [4]:
# Inspect the rows whose outlook is 'sunny'.
df[df['outlook'] == 'sunny']

Unnamed: 0,outlook,temp,humidity,windy,play
9,sunny,hot,high,False,no
10,sunny,hot,high,True,no
11,sunny,mild,high,False,no
12,sunny,cool,normal,False,yes
13,sunny,mild,normal,True,yes


In [5]:
def gini_index(df, attr):
    """Weighted Gini impurity of the class column after splitting on ``attr``.

    The class column is taken to be the last column of ``df``. For every
    distinct value of ``attr`` the rows with that value form a partition; the
    partition's impurity is ``1 - sum(p_c ** 2)`` over the class labels ``c``,
    and the result is the sum of those impurities weighted by partition size.

    Works for any number of class labels, including a single one (a pure
    table yields 0).

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose last column holds the class labels.
    attr : str
        Name of the column to split on.

    Returns
    -------
    float
        The weighted Gini impurity; 0.0 for an empty table.
    """
    Class = df.keys()[-1]
    den = len(df[Class])
    if den == 0:
        # Nothing to split: avoid dividing by zero below.
        return 0.0

    class_vals = df[Class].unique()
    gini_fin = 0
    for attribute in df[attr].unique():
        # Class labels of the rows that fall into this partition.
        labels = df.loc[df[attr] == attribute, Class]
        tot = len(labels)
        if tot == 0:
            # e.g. a NaN attribute value, which never compares equal to itself.
            continue
        purity = 0
        for val in class_vals:
            count = int((labels == val).sum())
            purity += (count / tot) ** 2
        gini_fin += (tot / den) * (1 - purity)
    return gini_fin

In [6]:
# Weighted Gini index of each candidate attribute, i.e. every column except
# the trailing class column, in column order.
list_ginis = [gini_index(df, column) for column in df.columns[:-1]]
list_ginis

[0.34285714285714286,
 0.44047619047619047,
 0.3673469387755103,
 0.42857142857142855]

In [7]:
def gini_root(df):
    """Gini impurity of the class column of ``df`` before any split.

    The class column is the last column of ``df``. The impurity is
    ``1 - sum(p_c ** 2)``, where ``p_c`` is the share of rows carrying class
    label ``c``.
    """
    target = df.keys()[-1]
    n_rows = len(df[target])
    # Squared share of each distinct class label, accumulated in order of
    # first appearance.
    squared_shares = sum(
        (len(df[df[target] == label]) / n_rows) ** 2
        for label in df[target].unique()
    )
    return 1 - squared_shares

In [8]:
# Gini impurity of the class column over the full table, before any split.
gini_root(df)

0.4591836734693877

In [9]:
def find_winner(df):
    """Return the name of the attribute column with the largest Gini gain.

    The gain of an attribute is ``gini_root(df) - gini_index(df, attribute)``.
    Every column except the last (the class column) is a candidate. On ties
    the earliest column wins, because ``np.argmax`` returns the first maximum.
    """
    candidates = df.keys()[:-1]
    # The impurity of the unsplit table does not depend on the attribute, so
    # compute it once instead of once per candidate.
    root_impurity = gini_root(df)
    ig = [root_impurity - gini_index(df, key) for key in candidates]
    return candidates[np.argmax(ig)]

In [10]:
# Attribute with the largest Gini gain on the full table.
print(find_winner(df))

outlook


In [11]:
def get_subtable(df,node,value):
    """Return the rows of ``df`` where column ``node`` equals ``value``.

    The result is re-indexed from 0; the old index is discarded.
    """
    matches = df[node] == value
    selected = df[matches]
    return selected.reset_index(drop=True)

In [12]:
def build_tree(df,tree=None):
    """Recursively build a decision tree as nested dictionaries.

    The tree has the shape ``{attribute: {attribute_value: subtree_or_label}}``.
    The split attribute is the one returned by ``find_winner``; the class
    column is the last column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose last column holds the class labels.
    tree : dict, optional
        Dictionary to populate. When omitted a new dictionary is created.
        When given, the entry for the chosen attribute is (re)built in it.

    Returns
    -------
    dict
        The populated tree.
    """
    Class = df.keys()[-1]
    node = find_winner(df)

    att_val = np.unique(df[node])
    if tree is None:
        tree = {}
    tree[node] = {}
    for val in att_val:
        subtable = get_subtable(df,node,val)
        # Use the class column of the table rather than a hard-coded name.
        clValue,counts = np.unique(subtable[Class],return_counts=True)
        if len(counts)==1:
            # Pure partition: emit a leaf with its single label.
            tree[node][val] = clValue[0]
        elif len(subtable) == len(df):
            # The chosen attribute did not separate any rows, so recursing
            # would repeat this exact call forever (e.g. rows with identical
            # attributes but conflicting labels). Fall back to the majority
            # label; ties resolve to the first label in sorted order.
            tree[node][val] = clValue[np.argmax(counts)]
        else:
            tree[node][val] = build_tree(subtable)
    return tree

In [13]:
# Build the decision tree from the full table and pretty-print the nested dict.
t = build_tree(df)
pprint.pprint(t)

{'outlook': {'overcast': 'yes',
             'rainy': {'windy': {'FALSE': 'yes', 'TRUE': 'no'}},
             'sunny': {'humidity': {'high': 'no', 'normal': 'yes'}}}}
