In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the PlayTennis table into a pandas DataFrame.
data = pd.read_csv('PlayTennis.csv')
# A bare name as the last expression makes the notebook render the frame.
data

Unnamed: 0,Day,Outlook,Temperature,Humidity,Wind,PlayTennis
0,D1,Sunny,Hot,High,Weak,No
1,D2,Sunny,Hot,High,Strong,No
2,D3,Overcast,Hot,High,Weak,Yes
3,D4,Rain,Mild,High,Weak,Yes
4,D5,Rain,Cool,Normal,Weak,Yes
5,D6,Rain,Cool,Normal,Strong,No
6,D7,Overcast,Cool,Normal,Strong,Yes
7,D8,Sunny,Mild,High,Weak,No
8,D9,Sunny,Cool,Normal,Weak,Yes
9,D10,Rain,Mild,Normal,Weak,Yes


In [3]:
import csv

In [4]:
def load_csv(filename):
    """Read a CSV file and separate its header row from its data rows.

    Parameters
    ----------
    filename : str
        Path of the CSV file to read.

    Returns
    -------
    dataset : list of list of str
        Every row after the first, each as a list of field strings.
    headers : list of str
        The first row of the file.

    Raises
    ------
    IndexError
        If the file contains no rows at all (there is no header to pop).
    """
    # The context manager closes the file even if parsing fails; newline=""
    # lets the csv module do its own line-ending handling, as its docs require.
    with open(filename,"r",newline="") as handle:
        dataset=list(csv.reader(handle))
    headers=dataset.pop(0)
    return dataset,headers

# Parse the CSV with load_csv into plain Python lists: data rows and header row.
dataset, features = load_csv('PlayTennis.csv')
# Bare tuple as the last expression so the notebook echoes both values.
dataset, features

([['D1', 'Sunny', 'Hot', 'High', 'Weak', 'No'],
  ['D2', 'Sunny', 'Hot', 'High', 'Strong', 'No'],
  ['D3', 'Overcast', 'Hot', 'High', 'Weak', 'Yes'],
  ['D4', 'Rain', 'Mild', 'High', 'Weak', 'Yes'],
  ['D5', 'Rain', 'Cool', 'Normal', 'Weak', 'Yes'],
  ['D6', 'Rain', 'Cool', 'Normal', 'Strong', 'No'],
  ['D7', 'Overcast', 'Cool', 'Normal', 'Strong', 'Yes'],
  ['D8', 'Sunny', 'Mild', 'High', 'Weak', 'No'],
  ['D9', 'Sunny', 'Cool', 'Normal', 'Weak', 'Yes'],
  ['D10', 'Rain', 'Mild', 'Normal', 'Weak', 'Yes'],
  ['D11', 'Sunny', 'Mild', 'Normal', 'Strong', 'Yes'],
  ['D12', 'Overcast', 'Mild', 'High', 'Strong', 'Yes'],
  ['D13', 'Overcast', 'Hot', 'Normal', 'Weak', 'Yes'],
  ['D14', 'Rain', 'Mild', 'High', 'Strong', 'No']],
 ['Day', 'Outlook', 'Temperature', 'Humidity', 'Wind', 'PlayTennis'])

In [5]:
class Node:
    """One vertex of the decision tree.

    Attributes
    ----------
    attribute
        Value passed to the constructor; build_tree passes the name of the
        feature tested at this node, or "" for a leaf.
    children : list
        Starts empty; build_tree appends ``(value, child_node)`` tuples.
    answer : str
        Starts as ""; build_tree sets it to the class label on leaves.
        A non-empty ``answer`` is how the rest of the code recognises a leaf.
    """
    def __init__(self,attribute):
        # Leaf marker first: "" means "not a leaf (yet)".
        self.answer=""
        # Filled in by the tree builder with (branch value, subtree) pairs.
        self.children=[]
        self.attribute=attribute

In [6]:
def subtables(data,col,delete):
    """Partition the rows of ``data`` by the value found in column ``col``.

    Parameters
    ----------
    data : list of list
        Table rows; every row must be indexable at ``col`` and the values in
        that column must be hashable.
    col : int
        Index of the column whose values define the groups.
    delete : bool
        When true, each returned row is a *copy* of the input row with column
        ``col`` removed.  When false, the original row objects are returned
        as they are.

    Returns
    -------
    attr : list
        Distinct values of column ``col`` in order of first appearance.
    dic : dict
        Maps each value in ``attr`` to the list of rows carrying that value,
        in their original relative order.

    Notes
    -----
    The input rows are never modified.  Deleting the column from the caller's
    rows in place would shift the next column into position ``col``; a later
    comparison against another split value could then match the same row a
    second time.  Empty ``data`` yields ``([], {})``.
    """
    dic={}
    for row in data:
        value=row[col]
        if delete:
            # Remove the column on a copy so the caller's table stays intact.
            kept=list(row)
            del kept[col]
        else:
            kept=row
        dic.setdefault(value,[]).append(kept)
    # dicts keep insertion order, so this is first-appearance order and the
    # result no longer depends on set/hash ordering.
    attr=list(dic)
    return attr,dic

In [7]:
import math
def entropy(S):
    """Return the Shannon entropy (in bits) of the labels in ``S``.

    Parameters
    ----------
    S : sequence
        Class labels (hashable values).  Any number of distinct labels is
        supported.

    Returns
    -------
    int or float
        ``0`` when ``S`` is empty or holds a single distinct label; otherwise
        ``-sum(p * log2(p))`` over the relative frequency ``p`` of each label.
    """
    total=len(S)
    # Count how often each label occurs.
    tally={}
    for label in S:
        tally[label]=tally.get(label,0)+1
    # Nothing to be uncertain about with zero or one class.
    if len(tally)<=1:
        return 0
    result=0.0
    for count in tally.values():
        p=count/total
        # Every counted label occurs at least once, so p > 0 and log2 is safe.
        result-=p*math.log2(p)
    return result

In [8]:
def compute_gain(data,col):
    """Information gain obtained by splitting ``data`` on column ``col``.

    The class label is read from the last field of every row.  The result is
    the entropy of all labels minus, for each distinct value of the column,
    the entropy of that value's rows weighted by their share of the table.

    Parameters
    ----------
    data : list of list
        Table rows with the class label in the last position.
    col : int
        Index of the candidate split column.

    Returns
    -------
    float or int
        The gain; whatever numeric type the entropy arithmetic produces.
    """
    # Group rows by the column's value without touching the rows themselves.
    values,groups=subtables(data,col,delete=False)
    n_rows=len(data)*1.0
    # Start from the entropy of the whole table and peel off each group's share.
    gain=entropy([record[-1] for record in data])
    for value in values:
        subset=groups[value]
        weight=len(subset)/n_rows
        gain-=weight*entropy([record[-1] for record in subset])
    return gain

In [9]:
def build_tree(data,features):
    """Recursively build an ID3 decision tree.

    Parameters
    ----------
    data : list of list
        Non-empty table of rows; the last field of each row is the class
        label and the fields before it are attribute values.
    features : list
        Names of the columns; ``features[i]`` names attribute column ``i``.

    Returns
    -------
    Node
        Root of the (sub)tree.  Leaves carry the label in ``answer``;
        internal nodes carry the split feature name in ``attribute`` and one
        ``(value, subtree)`` entry in ``children`` per distinct value.

    Notes
    -----
    ``data`` and its rows are left unmodified: each recursion works on
    copies of the rows with the split column removed.
    """
    lastcol=[row[-1] for row in data]
    # All rows agree on the label: this is a leaf.
    if len(set(lastcol))==1:
        node=Node("")
        node.answer=lastcol[0]
        return node
    n=len(data[0])-1
    if n==0:
        # No attribute is left to split on but the labels still disagree
        # (contradictory rows).  Fall back to the most frequent label; on a
        # tie the label that appears first wins, so the result is stable.
        node=Node("")
        node.answer=max(lastcol,key=lastcol.count)
        return node
    # Pick the attribute column with the highest information gain
    # (first one on ties, as list.index returns the first match).
    gains=[compute_gain(data,col) for col in range(n)]
    split=gains.index(max(gains))
    node=Node(features[split])
    fea=features[:split]+features[split+1:]

    # Group without deletion, then drop the split column on fresh copies so
    # the caller's rows are never mutated.
    attr,dic=subtables(data,split,delete=False)
    for value in attr:
        subset=[row[:split]+row[split+1:] for row in dic[value]]
        child=build_tree(subset,fea)
        node.children.append((value,child))
    return node

In [10]:
def print_tree(node,level):
    """Print the tree rooted at ``node`` as an indented outline.

    A leaf (non-empty ``answer``) prints its answer.  Any other node prints
    its ``attribute``, then for every ``(value, subtree)`` in ``children``
    prints the value one step deeper and the subtree two steps deeper.

    Parameters
    ----------
    node
        Object exposing ``answer``, ``attribute`` and ``children``.
    level : int
        Number of spaces in the indent prefix for this node's own line.
    """
    indent=" "*level
    if node.answer=="":
        # Internal node: heading first, then each branch beneath it.
        print(indent,node.attribute)
        branch_indent=" "*(level+1)
        for branch_value,subtree in node.children:
            print(branch_indent,branch_value)
            print_tree(subtree,level+2)
    else:
        print(indent,node.answer)

In [11]:
def classify(node,x_test,features):
    """Classify one instance with the decision tree and print the result.

    Parameters
    ----------
    node
        Root of the tree; nodes expose ``answer``, ``attribute`` and
        ``children`` (a list of ``(value, subtree)`` pairs).
    x_test : sequence
        Field values of the instance to classify.
    features : list
        Column names for ``x_test``; ``features.index(name)`` gives the
        position of attribute ``name`` inside ``x_test``.

    Returns
    -------
    The predicted label, or ``None`` when the instance carries a value for
    which the tree has no branch.  The label (or a short notice in the
    no-branch case) is also printed, so existing callers that rely on the
    printed output keep working.

    Raises
    ------
    ValueError
        If a tested attribute name is missing from ``features``.
    """
    current=node
    # A non-empty answer marks a leaf; keep descending until one is reached.
    while current.answer=="":
        pos=features.index(current.attribute)
        observed=x_test[pos]
        for value,child in current.children:
            if value==observed:
                current=child
                break
        else:
            # Value never seen for this attribute during training: say so
            # instead of silently printing nothing.
            print("Unknown (no branch for %s=%r)" % (current.attribute,observed))
            return None
    print(current.answer)
    return current.answer

In [12]:
'''Main program'''
# Load the training rows and their header from the CSV.
dataset,features=load_csv("PlayTennis.csv")
# Drop the first header entry so only the remaining column names are kept.
features = features[1:]

# Drop the first field of every row as well, keeping rows aligned with `features`.
dataset = [ele[1:] for ele in dataset]
print("\n Features: ", features)
print("\n Dataset: ", dataset)
# Build the tree from the trimmed rows; the last field of each row is the label.
node1=build_tree(dataset,features)
print("\n The decision tree for the dataset using ID3 algorithm is")
print_tree(node1,0)
# NOTE(review): `features` is rebound here to the header of the test file.
# classify() looks each tested attribute up by name in this list, so the test
# header must contain every attribute name used in the tree - confirm against
# testdata.csv.
testdata,features=load_csv("testdata.csv")
for xtest in testdata:
    print("\n The test instance:",xtest)
    # end=" " keeps the cursor on this line so classify()'s printed label follows it.
    print("\n The label for test instance:",end=" ")
    classify(node1,xtest,features)


 Features:  ['Outlook', 'Temperature', 'Humidity', 'Wind', 'PlayTennis']

 Dataset:  [['Sunny', 'Hot', 'High', 'Weak', 'No'], ['Sunny', 'Hot', 'High', 'Strong', 'No'], ['Overcast', 'Hot', 'High', 'Weak', 'Yes'], ['Rain', 'Mild', 'High', 'Weak', 'Yes'], ['Rain', 'Cool', 'Normal', 'Weak', 'Yes'], ['Rain', 'Cool', 'Normal', 'Strong', 'No'], ['Overcast', 'Cool', 'Normal', 'Strong', 'Yes'], ['Sunny', 'Mild', 'High', 'Weak', 'No'], ['Sunny', 'Cool', 'Normal', 'Weak', 'Yes'], ['Rain', 'Mild', 'Normal', 'Weak', 'Yes'], ['Sunny', 'Mild', 'Normal', 'Strong', 'Yes'], ['Overcast', 'Mild', 'High', 'Strong', 'Yes'], ['Overcast', 'Hot', 'Normal', 'Weak', 'Yes'], ['Rain', 'Mild', 'High', 'Strong', 'No']]

 The decision tree for the dataset using ID3 algorithm is
 Outlook
  Overcast
   Yes
  Rain
   Wind
    Strong
     No
    Weak
     Yes
  Sunny
   Humidity
    High
     No
    Normal
     Yes

 The test instance: ['T1', 'Rain', 'Cool', 'Normal', 'Strong']

 The label for test instance: No

 The te