In [1]:
import pandas as pd
import numpy as np

##### Sample Dataset

![](img/SampleData.png)

-------------------------
##### Summary of the Data

![](img/summary.png)

#### How Decision Tree works? -- Iterative Dichotomiser 3 (ID3)

##### 1) Creating DataSet

In [12]:
#Lets create a python dictionary to store our attributes and their respected values as follows
dataset = {'Name':['Person 1','Person 2','Person 3','Person 4',
                   'Person5','Person 6','Person 7','Person 8','Person 9','Person 10'],
'Salary':['Low','Med','Med','Med','Med','High','Low','High','Med','Low'],
'Sex':['Male','Male','Male','Female','Male','Female','Female','Male','Female','Male'],
'Marital':['Unmarried','Unmarried','Married','Married','Married','Unmarried'
           ,'Unmarried','Unmarried','Unmarried','Married'],
'Class':['No','No','Yes','No','Yes','Yes','No','Yes','Yes','Yes']}

#Now we will create a Data frame out of the preceding dataset(dictionary)
df = pd.DataFrame(dataset)
#df.to_csv("data/samp.csv", index=False)
df

Unnamed: 0,Name,Salary,Sex,Marital,Class
0,Person 1,Low,Male,Unmarried,No
1,Person 2,Med,Male,Unmarried,No
2,Person 3,Med,Male,Married,Yes
3,Person 4,Med,Female,Married,No
4,Person5,Med,Male,Married,Yes
5,Person 6,High,Female,Unmarried,Yes
6,Person 7,Low,Female,Unmarried,No
7,Person 8,High,Male,Unmarried,Yes
8,Person 9,Med,Female,Unmarried,Yes
9,Person 10,Low,Male,Married,Yes


##### 2) Identify the root node

###### Entropy or Impurity

![](img/entropy.PNG)

In [8]:
def getClassEntropy(classAttributes):
    #Get distinct classes and how many time they occure
    _,counts = np.unique(classAttributes,return_counts=True)
    denom = len(classAttributes)
    entropy = 0 #Initialize entropy variable
    #Run a loop to calculate entropy of dataset
    for count in counts:
        fraction = count/denom
        entropy += -fraction*np.log2(fraction) #Equation 2.1
    return entropy

#### Entropy of Class
![](img/entropy1.PNG)

In [58]:
classE = getClassEntropy(df.Class)

#### Entropy of Attributes


In [15]:
###### Salary Variable
pd.crosstab(df.Salary, df.Class)

Class,No,Yes
Salary,Unnamed: 1_level_1,Unnamed: 2_level_1
High,0,2
Low,2,1
Med,2,3


In [25]:
#### Elow 
- 1/3 * np.log2(1/3) -2/3 * np.log2(2/3) 

0.9182958340544896

In [54]:
0.91 * (3/10 ) 

0.273

In [26]:
#### EMedium
- 3/5 * np.log2(3/5) -2/5 * np.log2(2/5) 

0.9709505944546686

In [55]:
0.97 * (5/10)

0.485

In [56]:
.485 + .273

0.758

In [31]:
getClassEntropy(df.Class) - .7624

0.20855059445466861

In [38]:
def getHistTable(df,attribute):
    #This function create a subtable for the given attribute
    #Get values for the attribute
    value = df[attribute]
    #Extract class
    classes = df['Class']
    #Get distinct classes
    classunique = df['Class'].unique()
    #Get distinct values from attribute for example, Low, High and Med for Salary
    valunique = df[attribute].unique()
    #Create an empty table to store attribute value and their respective     class occurance
    temp = np.zeros((len(classunique),len(valunique)),dtype='uint8')
    histTable = pd.DataFrame(temp,index=classunique,columns=valunique)
    #Calculate class occurance for each value for Med salary how many time class attribute is Yes
    for i in range(len(classes)):
        histTable[value[i]][classes[i]]+= 1
    return histTable

In [41]:
a = getHistTable(df, 'Salary')

In [44]:
a

Unnamed: 0,Low,Med,High
No,2,2,0
Yes,1,3,2


In [52]:
attribute = a['Low']

for value in attribute: 
    print(value)

2
1


In [50]:
np.sum(np.sum(a))

10

In [57]:
def getInformationGain(histTable,classEntropy):
    #Initialize a variable for storing probability of Classes
    fraction = 0
    #Calculate total number of instances
    denom = np.sum(np.sum(histTable))
    #Initialize variable for storing total entropies of attrribute values
    EntropyAtt = 0
    #Now we will run a loop to access each attribute and its information gain
    for key in histTable.keys():
        #Extract Attribute
        attribute = histTable[key]
        entropy = 0
        #Initialize variable for entropy calculation
        coeff = 0
        #Initialize variable to store coefficient
        #Find out sum of class attributes(in our case Yes and No)
        denom2 = np.sum(attribute)
        #Run a loop to get entropy of distinct values of attribute
        for value in attribute:
            #Calculate coeff
            coeff+= value/denom
            #Calculate probability of the attribute value
            fraction = value/denom2

            #Calculate Entropy
            eps = np.finfo(float).eps
            entropy+= -fraction*np.log2(fraction+eps)
        EntropyAtt+= coeff*entropy
        
        #Calculate Information Gain using class entropy
    InfGain = classEntropy - EntropyAtt
    return InfGain,EntropyAtt

In [59]:
getInformationGain(a, classE)

(0.20998654701098796, 0.7609640474436806)