In [1]:
import pandas as pd  
import csv 

In [2]:
train_data = pd.read_csv("transport.csv")

In [3]:
cols = train_data.shape[1]
rows = train_data.shape[0]
print("Rows:", rows)
print("Columns:", cols)

Rows: 10
Columns: 5


In [4]:
train_data.head()

Unnamed: 0,Gender,Car_Ownership,Travel_Cost,Income_Level,Transportation
0,Male,0,Cheap,Low,Bus
1,Male,1,Cheap,Medium,Bus
2,Female,1,Cheap,Medium,Train
3,Female,0,Cheap,Low,Bus
4,Male,1,Cheap,Medium,Bus


In [5]:
print("Training Data:\n", train_data)

Training Data:
    Gender  Car_Ownership Travel_Cost Income_Level Transportation
0    Male              0       Cheap          Low            Bus
1    Male              1       Cheap       Medium            Bus
2  Female              1       Cheap       Medium          Train
3  Female              0       Cheap          Low            Bus
4    Male              1       Cheap       Medium            Bus
5    Male              0    Standard       Medium          Train
6  Female              1    Standard       Medium          Train
7  Female              1   Expensive         High            Car
8    Male              2   Expensive       Medium            Car
9  Female              2   Expensive         High            Car


In [6]:
def entropy(probs):  
    import math
    return sum( [-prob*math.log(prob, 2) for prob in probs] )
def entropy_of_list(a_list):  
    from collections import Counter
    cnt = Counter(x for x in a_list)   
    print("\nClasses:",cnt)
    num_instances = len(a_list) 
    print("\n Number of Instances of the Current Sub Class is {0}:".format(num_instances ))
    probs = [x / num_instances for x in cnt.values()]
    print(probs)
    print("\n Classes:",list(cnt.keys()))
    print(" \n Probabilities of Class {0} is {1}:".format(min(cnt),min(probs)))
    print(" \n Probabilities of Class {0} is {1}:".format(max(cnt),max(probs)))
    print(" \n Probabilities of Class {0} is {1}:".format(max(cnt),max(probs)))
    return entropy(probs) 
print("\n  INPUT DATA SET FOR ENTROPY CALCULATION:\n", train_data['Transportation'])
total_entropy = entropy_of_list(train_data['Transportation'])
print("\n Total Entropy of Play Tennis Data Set:",total_entropy)


  INPUT DATA SET FOR ENTROPY CALCULATION:
 0      Bus
1      Bus
2    Train
3      Bus
4      Bus
5    Train
6    Train
7      Car
8      Car
9      Car
Name: Transportation, dtype: object

Classes: Counter({'Bus': 4, 'Train': 3, 'Car': 3})

 Number of Instances of the Current Sub Class is 10:
[0.4, 0.3, 0.3]

 Classes: ['Bus', 'Train', 'Car']
 
 Probabilities of Class Bus is 0.3:
 
 Probabilities of Class Train is 0.4:
 
 Probabilities of Class Train is 0.4:

 Total Entropy of Play Tennis Data Set: 1.5709505944546684


In [7]:
def information_gain(df, split_attribute_name, target_attribute_name, trace=0):
    print("Information Gain Calculation of ",split_attribute_name)
    df_split = df.groupby(split_attribute_name)
    print("split:",type(df_split))
    for name,group in df_split:
        print("Name:\n",name)
        print("Group:\n",group)
    nobs = len(df.index) 
    print("NOBS",nobs) 
    df_ent_prob = df_split.agg({target_attribute_name : [entropy_of_list, lambda x: len(x)/nobs] })[target_attribute_name]
    print(df_ent_prob.columns)
    print("the entropy and the probability value for each attribute is",df_ent_prob)
    df_ent_prob.columns = ['Entropy', 'PropObservations']
    new_entropy = sum( df_ent_prob['Entropy'] * df_ent_prob['PropObservations'] )
    overall_entropy = entropy_of_list(df[target_attribute_name])
    return overall_entropy - new_entropy


print('Info-gain for Gender is :'+str( information_gain(train_data, 'Gender', 'Transportation')),"\n")
print('\n Info-gain for Car_Ownership is: ' + str( information_gain(train_data, 'Car_Ownership', 'Transportation')),"\n")
print('\n Info-gain for Travel_Cost is:' + str( information_gain(train_data, 'Travel_Cost', 'Transportation')),"\n")
print('\n Info-gain for Income_Level is:' + str( information_gain(train_data, 'Income_Level','Transportation')),"\n")

Information Gain Calculation of  Gender
split: <class 'pandas.core.groupby.generic.DataFrameGroupBy'>
Name:
 Female
Group:
    Gender  Car_Ownership Travel_Cost Income_Level Transportation
2  Female              1       Cheap       Medium          Train
3  Female              0       Cheap          Low            Bus
6  Female              1    Standard       Medium          Train
7  Female              1   Expensive         High            Car
9  Female              2   Expensive         High            Car
Name:
 Male
Group:
   Gender  Car_Ownership Travel_Cost Income_Level Transportation
0   Male              0       Cheap          Low            Bus
1   Male              1       Cheap       Medium            Bus
4   Male              1       Cheap       Medium            Bus
5   Male              0    Standard       Medium          Train
8   Male              2   Expensive       Medium            Car
NOBS 10

Classes: Counter({'Train': 2, 'Car': 2, 'Bus': 1})

 Number of Instances 

In [8]:
def id3(df, target_attribute_name, attribute_names, default_class=None):
    from collections import Counter
    cnt = Counter(x for x in df[target_attribute_name])
    if len(cnt) == 1:
        return next(iter(cnt)) 
    elif df.empty or (not attribute_names):
        return default_class
    else:
        default_class = max(cnt.keys())
        gainz = [information_gain(df, attr, target_attribute_name) for attr in attribute_names] 
        index_of_max = gainz.index(max(gainz)) 
        best_attr = attribute_names[index_of_max]
        tree = {best_attr:{}} 
        remaining_attribute_names = [i for i in attribute_names if i != best_attr]
        for attr_val, data_subset in df.groupby(best_attr):
            subtree = id3(data_subset,
                        target_attribute_name,
                        remaining_attribute_names,
                        default_class)
            tree[best_attr][attr_val] = subtree
        return tree

In [9]:
attribute_names = list(train_data.columns)
print("List of Attributes:", attribute_names) 
attribute_names.remove('Transportation') 
print("Predicting Attributes:", attribute_names)

List of Attributes: ['Gender', 'Car_Ownership', 'Travel_Cost', 'Income_Level', 'Transportation']
Predicting Attributes: ['Gender', 'Car_Ownership', 'Travel_Cost', 'Income_Level']


In [10]:
from pprint import pprint
tree = id3(train_data,'Transportation',attribute_names)
print("\n\nThe Resultant Decision Tree is :\n")
pprint(tree)

Information Gain Calculation of  Gender
split: <class 'pandas.core.groupby.generic.DataFrameGroupBy'>
Name:
 Female
Group:
    Gender  Car_Ownership Travel_Cost Income_Level Transportation
2  Female              1       Cheap       Medium          Train
3  Female              0       Cheap          Low            Bus
6  Female              1    Standard       Medium          Train
7  Female              1   Expensive         High            Car
9  Female              2   Expensive         High            Car
Name:
 Male
Group:
   Gender  Car_Ownership Travel_Cost Income_Level Transportation
0   Male              0       Cheap          Low            Bus
1   Male              1       Cheap       Medium            Bus
4   Male              1       Cheap       Medium            Bus
5   Male              0    Standard       Medium          Train
8   Male              2   Expensive       Medium            Car
NOBS 10

Classes: Counter({'Train': 2, 'Car': 2, 'Bus': 1})

 Number of Instances 

In [11]:
attribute = next(iter(tree))
print("Best Attribute :\n",attribute)
print("Tree Keys:\n",tree[attribute].keys())

Best Attribute :
 Travel_Cost
Tree Keys:
 dict_keys(['Cheap', 'Expensive', 'Standard'])
