In [1]:
import pandas as pd
from collections import Counter
import math

In [2]:
# Read the example table from 'buy_computer.csv' (path is relative, so it is
# resolved against the current working directory) and display it.
df_buy = pd.read_csv('buy_computer.csv')
df_buy

Unnamed: 0,Age,Income,Student,Credit_rating,Buy_computer
0,<= 30,high,no,fair,no
1,<= 30,high,no,excellent,no
2,31…40,high,no,fair,yes
3,> 40,medium,no,fair,yes
4,> 40,low,yes,fair,yes
5,> 40,low,yes,excellent,no
6,31…40,low,yes,excellent,yes
7,<= 30,medium,no,fair,no
8,<= 30,low,yes,fair,yes
9,> 40,medium,yes,fair,yes


In [3]:
# Column holding the class label the tree should predict.
target_attribute = 'Buy_computer'
# Start from every column name, then drop the label so that only candidate
# split attributes remain (remove() raises ValueError if the label is absent).
attribute_names = df_buy.columns.tolist()
attribute_names.remove(target_attribute)
attribute_names

['Age', 'Income', 'Student', 'Credit_rating']

In [4]:
def entropy_list(a_list):
  """Return the base-2 Shannon entropy of the values in `a_list`.

  Each distinct value contributes -p*log2(p), where p is its relative
  frequency. An empty input yields 0 because there is nothing to sum.
  """
  total = float(len(a_list))
  frequencies = Counter(item for item in a_list)
  entropy = 0
  for freq in frequencies.values():
    p = freq/total
    entropy += -p*math.log(p,2)
  return entropy

In [5]:
def info_gain(df,split,target):
  """Return the information gain of splitting `df` on column `split`.

  The gain is the entropy of `df[target]` minus the sum, over the groups
  produced by `df.groupby(split)`, of each group's entropy multiplied by
  the fraction of rows that fall in that group.
  """
  n_rows = len(df.index)*1.0
  baseline = entropy_list(df[target])
  # Two aggregates per group: label entropy and the group's share of all rows.
  per_group = df.groupby(split).agg({target:[entropy_list, lambda grp: len(grp)/n_rows]})
  group_entropy = per_group.iloc[:,0]
  group_weight = per_group.iloc[:,1]
  weighted = sum(group_entropy*group_weight)
  return baseline - weighted

In [6]:
def id3(df,target,attribute_name,default_class = None):
  """Recursively build an ID3 decision tree as nested dicts.

  Args:
    df: DataFrame holding the examples that reached the current node.
    target: name of the class-label column.
    attribute_name: list of column names still available for splitting.
    default_class: label returned when `df` contains no rows.

  Returns:
    A class label for a leaf; otherwise ``{attribute: {value: subtree}}``
    with one entry per value of the chosen attribute that occurs in `df`.
  """
  count = Counter(x for x in df[target])
  if len(count)==1:
    # Pure node: every example carries the same label.
    return next(iter(count))
  if df.empty:
    # No examples to vote with; fall back to the caller-supplied label.
    return default_class
  # Most frequent label at this node; ties go to the label seen first.
  # (max() over the keys would pick the alphabetically last label instead.)
  majority_class = count.most_common(1)[0][0]
  if not attribute_name:
    # Nothing left to split on, so this node becomes a majority-vote leaf.
    return majority_class
  # max() keeps the first attribute among those with equal gain.
  best_attr = max(attribute_name, key=lambda attr: info_gain(df,attr,target))
  remaining_attr = [x for x in attribute_name if x!=best_attr]
  tree = { best_attr:{ } }
  for attr_val, data_subset in df.groupby(best_attr):
    tree[best_attr][attr_val] = id3(data_subset,target,remaining_attr,majority_class)
  return tree

In [7]:
# Grow a tree from the full table, then print it under a banner line.
tree = id3(df_buy, target_attribute, attribute_names)
print("\nThe Resultant Decision Tree is :\n", tree, sep="\n")


The Resultant Decision Tree is :

{'Age': {'31…40': 'yes', '<= 30': {'Student': {'no': 'no', 'yes': 'yes'}}, '> 40': {'Credit_rating': {'excellent': 'no', 'fair': 'yes'}}}}


In [8]:
# Hold out the last n_test rows for evaluation and train on every row before
# them. The training slice starts at the first row: starting at 1 would leave
# row 0 in neither partition.
n_test = 4
train_data = df_buy.iloc[:-n_test]
test_data = df_buy.iloc[-n_test:]
# Every example must land in exactly one of the two partitions.
assert len(train_data) + len(test_data) == len(df_buy)
train_tree = id3(train_data, target_attribute, attribute_names)
print("\nThe Resultant Decision train_tree is :\n")
print(train_tree)


The Resultant Decision train_tree is :

{'Age': {'31…40': 'yes', '<= 30': {'Income': {'high': 'no', 'low': 'yes', 'medium': 'no'}}, '> 40': {'Credit_rating': {'excellent': 'no', 'fair': 'yes'}}}}


In [9]:
def classify(record,tree):
  """Return the leaf label that `record` reaches in the nested-dict `tree`.

  Each level of `tree` is a dict whose first key names the attribute to
  test; the record's value for that attribute selects the branch to follow.
  Descent stops at the first branch that is not a dict. A record value with
  no matching branch raises KeyError.
  """
  node = tree
  while True:
    attr = next(iter(node))
    node = node[attr][record[attr]]
    if not isinstance(node,dict):
      return node

In [10]:
# Bounds of the held-out rows, read from the .start/.stop attributes of the
# frame's index.
start, stop = test_data.index.start, test_data.index.stop
# Look each held-out row up by label and walk it down the trained tree.
test = [
  classify(test_data.loc[row_label], train_tree)
  for row_label in range(start, stop)
]
print(test)

['no', 'yes', 'yes', 'no']


In [11]:
# True labels of the held-out rows, in the same row order as the predictions
# in `test`. The label column is selected by name: an integer key on a
# string-labelled row (`row[4]`) is a deprecated positional lookup in recent
# pandas and also hard-codes the column position.
actual = test_data[target_attribute].tolist()
num = sum(1 for predicted, expected in zip(test, actual) if predicted == expected)
# Fraction of held-out rows classified correctly.
per = num*1.0/(len(test))
print(per)

0.75
