# Entropy

In [1]:
import numpy as np

In [2]:
# Binary sample with four 1s and four 0s (two equally frequent classes).
Y = np.array([1,0,0,1,0,1,0,1])
# Constant sample: every element is 1, i.e. a single class.
X = np.array([1,1,1,1,1,1,1])

In [3]:
def entropy(var):
    """Return the Shannon entropy (in bits) of the values in ``var``.

    Parameters
    ----------
    var : array-like
        Observed class labels (NumPy array, pandas Series, list, ...).

    Returns
    -------
    float
        ``-sum(p * log2(p))`` over the relative frequency ``p`` of each
        distinct value. Empty or single-class input yields ``0.0``.
    """
    _, counts = np.unique(np.asarray(var), return_counts=True)

    total = counts.sum()
    if total == 0:
        # No observations: there is no uncertainty to measure.
        return 0.0

    # np.unique only reports values that occur, so every p is > 0 and
    # log2(p) is always finite.
    probs = counts / total
    ent = -np.sum(probs * np.log2(probs))

    # Adding 0.0 turns IEEE negative zero (-0.0) into +0.0, so a pure
    # sample reports 0.0 rather than -0.0.
    return float(ent) + 0.0

In [4]:
entropy(Y)

1.0

In [5]:
entropy(X)

-0.0

# Split Data

In [6]:
import pandas as pd

In [7]:
df = pd.read_csv('golf.csv')

In [8]:
df.head()

Unnamed: 0,Outlook,Temperature,Humidity,Windy,Play
0,sunny,hot,high,False,no
1,sunny,hot,high,True,no
2,overcast,hot,high,False,yes
3,rainy,mild,high,False,yes
4,rainy,cool,normal,False,yes


In [9]:
def divide_data(data, feature):
    """Partition ``data`` by the distinct values of column ``feature``.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows to split.
    feature : str
        Name of the column whose values define the groups.

    Returns
    -------
    dict
        Maps each distinct feature value to
        ``{'data': <rows with that value, index reset to 0..n-1>,
           'len': <number of those rows>}``.
        Keys appear in the order produced by ``value_counts()``
        (most frequent value first).

    Notes
    -----
    Each group is selected with a single boolean mask instead of being
    grown row by row with ``pd.concat``; this keeps the original column
    dtypes and avoids repeated copying. Rows whose feature value is
    missing (NaN) are not reported by ``value_counts()`` and therefore
    do not land in any group.
    """
    column = data[feature]
    counts = column.value_counts()

    DATA = {}
    for val, count in counts.items():
        subset = data[column == val].reset_index(drop=True)
        DATA[val] = {'data': subset, 'len': int(count)}

    return DATA


In [13]:
divide_data(df,'Outlook')

{'sunny': {'data':   Outlook Temperature Humidity  Windy Play
  0   sunny         hot     high  False   no
  1   sunny         hot     high   True   no
  2   sunny        mild     high  False   no
  3   sunny        cool   normal  False  yes
  4   sunny        mild   normal   True  yes,
  'len': 5},
 'rainy': {'data':   Outlook Temperature Humidity  Windy Play
  0   rainy        mild     high  False  yes
  1   rainy        cool   normal  False  yes
  2   rainy        cool   normal   True   no
  3   rainy        mild   normal  False  yes
  4   rainy        mild     high   True   no,
  'len': 5},
 'overcast': {'data':     Outlook Temperature Humidity  Windy Play
  0  overcast         hot     high  False  yes
  1  overcast        cool   normal   True  yes
  2  overcast        mild     high   True  yes
  3  overcast         hot   normal  False  yes,
  'len': 4}}

In [14]:
# Same split as divide_data(df, 'Outlook'), done with pandas' built-in groupby:
# each iteration yields the group key (i) and the matching sub-frame (j).
for i,j in df.groupby('Outlook'):
    print(i)
    print(j)
    
    print("---")

overcast
     Outlook Temperature Humidity  Windy Play
2   overcast         hot     high  False  yes
6   overcast        cool   normal   True  yes
11  overcast        mild     high   True  yes
12  overcast         hot   normal  False  yes
---
rainy
   Outlook Temperature Humidity  Windy Play
3    rainy        mild     high  False  yes
4    rainy        cool   normal  False  yes
5    rainy        cool   normal   True   no
9    rainy        mild   normal  False  yes
13   rainy        mild     high   True   no
---
sunny
   Outlook Temperature Humidity  Windy Play
0    sunny         hot     high  False   no
1    sunny         hot     high   True   no
7    sunny        mild     high  False   no
8    sunny        cool   normal  False  yes
10   sunny        mild   normal   True  yes
---


# Information Gain 

In [15]:
def information_gain(data, feature, target='Play'):
    """Return the information gain obtained by splitting ``data`` on ``feature``.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows containing both the ``feature`` column and the ``target`` column.
    feature : str
        Column to split on.
    target : str, default 'Play'
        Column holding the class labels whose entropy is measured.

    Returns
    -------
    float
        Entropy of ``data[target]`` minus the size-weighted average entropy
        of ``target`` within each group produced by ``divide_data``.
        ``0.0`` when ``data`` has no rows.
    """
    examples = data.shape[0]
    if examples == 0:
        # Nothing to split, hence nothing to gain.
        return 0.0

    groups = divide_data(data, feature)

    # Each child's entropy is weighted by the share of rows it received.
    ent_of_children = sum(
        ((group['len'] / examples) * entropy(group['data'][target])
         for group in groups.values()),
        0.0,
    )

    return entropy(data[target]) - ent_of_children

In [16]:
information_gain(df, 'Outlook')

0.24674981977443933

In [17]:
information_gain(df, 'Windy')

0.04812703040826949

In [18]:
information_gain(df, 'Temperature')

0.02922256565895487

In [19]:
information_gain(df, 'Humidity')

0.15183550136234159

# Decision Tree Implementation

In [20]:
class DecisionTree:
    """A node of a categorical decision tree built by information gain.

    Each node stores the majority 'Play' label of the rows it was trained
    on (``target``). An internal node additionally stores the feature it
    splits on (``fkey``) and one child node per observed value of that
    feature (``children``). Leaves have no children and ``fkey`` is None.
    """

    def __init__(self, depth=0, max_depth=5):
        """Create an untrained node.

        Parameters
        ----------
        depth : int, default 0
            Depth of this node in the tree (the root is 0).
        max_depth : int, default 5
            Nodes at this depth or deeper are never split.
        """
        self.children = {}           # feature value -> child DecisionTree
        self.fkey = None             # feature this node splits on
        self.max_depth = max_depth
        self.depth = depth
        self.target = None           # majority label seen during training

    def train(self, data):
        """Fit this node (and, recursively, its subtree) to ``data``.

        Parameters
        ----------
        data : pandas.DataFrame
            Must contain the columns 'Outlook', 'Temperature', 'Humidity',
            'Windy' and the label column 'Play'.

        Raises
        ------
        ValueError
            If ``data`` has no rows.
        """
        if data.shape[0] == 0:
            raise ValueError("cannot train a DecisionTree node on an empty data set")

        # Start from a clean state so calling train() again does not keep
        # children from a previous fit.
        self.children = {}
        self.fkey = None

        label_counts = data['Play'].value_counts()
        self.target = label_counts.idxmax()

        # A pure node cannot be improved by splitting, and a node at the
        # depth limit must not be split; both become leaves.
        if (label_counts > 0).sum() < 2:
            return
        if self.depth >= self.max_depth:
            return

        features = ['Outlook', 'Temperature', 'Humidity', 'Windy']
        info_gains = [information_gain(data, f) for f in features]
        self.fkey = features[np.argmax(info_gains)]

        DATA = divide_data(data, self.fkey)

        # Only groups that actually received rows can become children.
        groups = {key: grp for key, grp in DATA.items() if grp['len'] > 0}
        if len(groups) < 2:
            return

        print("\t"*self.depth + "Making tree with - ", self.fkey)

        for key, grp in groups.items():
            # Pass max_depth down so the limit chosen for the root applies
            # to the whole tree instead of resetting to the default.
            child = DecisionTree(depth=self.depth + 1, max_depth=self.max_depth)
            child.train(grp['data'])
            self.children[key] = child

        return

    def predict(self, test):
        """Predict the label for the first row of ``test``.

        Parameters
        ----------
        test : pandas.DataFrame (or mapping of column name -> sequence)
            Holds the feature values of the example to classify.

        Returns
        -------
        The ``target`` of the leaf reached by following the example's
        feature values. If a feature value was never seen during training,
        the majority label of the last node reached is returned.
        """
        if not self.children:
            return self.target

        column = test[self.fkey]
        # Use positional access for pandas objects so a test frame whose
        # index does not contain the label 0 still works.
        value = column.iloc[0] if hasattr(column, 'iloc') else column[0]

        child = self.children.get(value)
        if child is None:
            # Unseen feature value: fall back to this node's majority label.
            return self.target
        return child.predict(test)


In [21]:
model = DecisionTree()

In [22]:
model.train(df)

Making tree with -  Outlook
	Making tree with -  Humidity
	Making tree with -  Windy


In [23]:
model.target

'yes'

In [24]:
model.fkey

'Outlook'

In [25]:
model.children

{'sunny': <__main__.DecisionTree at 0x7f7ae8341950>,
 'rainy': <__main__.DecisionTree at 0x7f7ad2123790>,
 'overcast': <__main__.DecisionTree at 0x7f7ad212f6d0>}

In [26]:
model.children['sunny']

<__main__.DecisionTree at 0x7f7ae8341950>

In [27]:
model.children['sunny'].fkey

'Humidity'

In [28]:
model.children['sunny'].children

{'high': <__main__.DecisionTree at 0x7f7ad2119b50>,
 'normal': <__main__.DecisionTree at 0x7f7ad212ef10>}

In [29]:
model.children['sunny'].children['high'].children

{}

In [30]:
x_test = pd.DataFrame([['sunny', 'hot', 'normal', False]], columns=list(df.columns.values[:-1]))

In [31]:
x_test

Unnamed: 0,Outlook,Temperature,Humidity,Windy
0,sunny,hot,normal,False


In [32]:
model.predict(x_test)

'yes'