In [2]:
import pandas as pd
import math
from collections import Counter, defaultdict
from functools import partial
# Load the dataset; the path is relative to the notebook's working directory.
train = pd.read_csv('data/gender_classification.csv')
# Preview the first rows (last expression of the cell, so it is displayed).
train.head()

Unnamed: 0,Favorite Color,Favorite Music Genre,Favorite Beverage,Favorite Soft Drink,Gender
0,Cool,Rock,Vodka,7UP/Sprite,F
1,Neutral,Hip hop,Vodka,Coca Cola/Pepsi,F
2,Warm,Rock,Wine,Coca Cola/Pepsi,F
3,Warm,Folk/Traditional,Whiskey,Fanta,F
4,Cool,Rock,Vodka,Coca Cola/Pepsi,F


In [3]:
def entropy(class_probabilities):
    """Return the base-2 entropy of a collection of class probabilities.

    Falsy (zero) probabilities are skipped, so they never reach the
    logarithm. An empty collection yields 0.
    """
    total = 0
    for p in class_probabilities:
        if not p:
            continue
        total += -p * math.log(p, 2)
    return total

In [4]:
def class_probabilities(labels):
    """Return the relative frequency of each distinct label.

    The result has one entry per distinct label, in the order the
    Counter yields them. An empty `labels` gives an empty list.
    """
    n = len(labels)
    tallies = Counter(labels)
    return [tallies[value] / n for value in tallies]

In [5]:
def data_entropy(labeled_data):
    """Return the entropy of the labels in `labeled_data`.

    Each element must unpack into exactly two items; the second one is
    taken as the label and the first is ignored.
    """
    labels = []
    for _features, label in labeled_data:
        labels.append(label)
    return entropy(class_probabilities(labels))

In [6]:
def partition_entropy(subsets):
    """Return the size-weighted average entropy of a partition.

    Args:
        subsets: iterable of labeled-data subsets (each supporting `len`
            and accepted by `data_entropy`).

    Returns:
        Sum over subsets of `data_entropy(subset) * len(subset) / total`,
        or 0 when the partition contains no data at all.
    """
    # Materialize once: the partition is traversed twice below, and a
    # one-shot iterator (e.g. a generator) would be exhausted after the
    # first pass, silently producing 0.
    subsets = list(subsets)
    total_count = sum(len(subset) for subset in subsets)
    if total_count == 0:
        # No data in any subset; avoid dividing by zero.
        return 0
    return sum(data_entropy(subset) * len(subset) / total_count for subset in subsets)

In [7]:
def partition_by(inputs, attribute):
    """Group examples by the value of one attribute.

    Each example is indexed as `example[0][attribute]` to find its group.
    Returns a `defaultdict(list)` mapping each observed value to the
    examples carrying it, in their original order.
    """
    buckets = defaultdict(list)
    for example in inputs:
        features = example[0]
        buckets[features[attribute]].append(example)
    return buckets

In [8]:
def partition_entropy_by(inputs, attribute):
    """Return the partition entropy obtained by splitting `inputs` on `attribute`."""
    groups = partition_by(inputs, attribute).values()
    return partition_entropy(groups)

In [9]:
# Convert each row of `train` into an (attributes, label) pair.
# The attribute dict is built under the name `features` rather than
# `input`, so the built-in `input()` is not rebound at notebook scope.
inputs = []
for row in train.values:
    features = {
        'color': row[0],
        'music': row[1],
        'beverage': row[2],
        'drink': row[3],
    }
    # 'F' maps to False; every other value maps to True.
    label = not (row[4] == 'F')
    inputs.append((features, label))
print(inputs)

[({'color': 'Cool', 'drink': '7UP/Sprite', 'music': 'Rock', 'beverage': 'Vodka'}, False), ({'color': 'Neutral', 'drink': 'Coca Cola/Pepsi', 'music': 'Hip hop', 'beverage': 'Vodka'}, False), ({'color': 'Warm', 'drink': 'Coca Cola/Pepsi', 'music': 'Rock', 'beverage': 'Wine'}, False), ({'color': 'Warm', 'drink': 'Fanta', 'music': 'Folk/Traditional', 'beverage': 'Whiskey'}, False), ({'color': 'Cool', 'drink': 'Coca Cola/Pepsi', 'music': 'Rock', 'beverage': 'Vodka'}, False), ({'color': 'Warm', 'drink': 'Fanta', 'music': 'Jazz/Blues', 'beverage': "Doesn't drink"}, False), ({'color': 'Cool', 'drink': 'Coca Cola/Pepsi', 'music': 'Pop', 'beverage': 'Beer'}, False), ({'color': 'Warm', 'drink': 'Fanta', 'music': 'Pop', 'beverage': 'Whiskey'}, False), ({'color': 'Warm', 'drink': '7UP/Sprite', 'music': 'Rock', 'beverage': 'Other'}, False), ({'color': 'Neutral', 'drink': 'Coca Cola/Pepsi', 'music': 'Pop', 'beverage': 'Wine'}, False), ({'color': 'Cool', 'drink': '7UP/Sprite', 'music': 'Pop', 'beverag

In [10]:
# Partition entropy of the full dataset for each candidate attribute.
for key in ('color', 'music', 'beverage', 'drink'):
    print(key, partition_entropy_by(inputs, key))

color 0.9877787203441428
music 0.8475272242792208
beverage 0.9705999798596151
drink 0.9733467653099966


In [11]:
# Keep only the examples whose 'music' attribute is "Rock".
music_inputs = [
    (features, label)
    for features, label in inputs
    if features['music'] == "Rock"
]
print(music_inputs)

[({'color': 'Cool', 'drink': '7UP/Sprite', 'music': 'Rock', 'beverage': 'Vodka'}, False), ({'color': 'Warm', 'drink': 'Coca Cola/Pepsi', 'music': 'Rock', 'beverage': 'Wine'}, False), ({'color': 'Cool', 'drink': 'Coca Cola/Pepsi', 'music': 'Rock', 'beverage': 'Vodka'}, False), ({'color': 'Warm', 'drink': '7UP/Sprite', 'music': 'Rock', 'beverage': 'Other'}, False), ({'color': 'Cool', 'drink': 'Coca Cola/Pepsi', 'music': 'Rock', 'beverage': 'Beer'}, False), ({'color': 'Cool', 'drink': 'Coca Cola/Pepsi', 'music': 'Rock', 'beverage': 'Other'}, False), ({'color': 'Cool', 'drink': 'Coca Cola/Pepsi', 'music': 'Rock', 'beverage': 'Other'}, False), ({'color': 'Cool', 'drink': 'Coca Cola/Pepsi', 'music': 'Rock', 'beverage': 'Vodka'}, False), ({'color': 'Warm', 'drink': 'Coca Cola/Pepsi', 'music': 'Rock', 'beverage': 'Other'}, False), ({'color': 'Neutral', 'drink': 'Coca Cola/Pepsi', 'music': 'Rock', 'beverage': 'Beer'}, False), ({'color': 'Neutral', 'drink': 'Coca Cola/Pepsi', 'music': 'Rock', 'b

In [12]:
# Partition entropy of the Rock-only subset for each remaining attribute.
for key in ('color', 'beverage', 'drink'):
    print(key, partition_entropy_by(music_inputs, key))

color 0.9923554195933337
beverage 0.6057776318066809
drink 0.8682145190209152


In [13]:
def classify(tree, input):
    """Classify `input` by walking down the decision tree.

    A tree is either a leaf (True/False) or a pair
    `(attribute, subtree_dict)`. At each node the input's value for the
    attribute selects the branch; a missing or unseen value falls back
    to the branch stored under the `None` key.
    """
    node = tree
    while node not in (True, False):
        attribute, branches = node
        value = input.get(attribute)
        # Unknown (or absent) attribute values take the None branch.
        node = branches[value] if value in branches else branches[None]
    return node

In [14]:
def build_tree_id3(inputs, split_people=None):
    """Build a decision tree from labeled examples using ID3.

    Args:
        inputs: sequence of `(attributes_dict, label)` pairs; labels are
            counted by truthiness.
        split_people: attributes still available for splitting. When
            None, the keys of the first example's attribute dict are used.

    Returns:
        Either a leaf (`True` / `False`) or a tuple
        `(attribute, subtrees)`, where `subtrees` maps each observed
        value of `attribute` to a subtree and `None` to a default leaf
        for values not observed at this node.
    """
    num_inputs = len(inputs)
    num_trues = sum(1 for _, label in inputs if label)
    num_falses = num_inputs - num_trues

    # Pure (or empty) data needs no split. These checks run before the
    # default attribute list is derived, so an empty `inputs` returns
    # False instead of failing on `inputs[0]`.
    if num_trues == 0:
        return False
    if num_falses == 0:
        return True

    if split_people is None:
        split_people = list(inputs[0][0].keys())

    # Nothing left to split on: return the majority label.
    # NOTE: a tie resolves to True here (>=) but to False for the None
    # fallback below (>); that existing behavior is kept unchanged.
    if not split_people:
        return num_trues >= num_falses

    # Split on the attribute whose partition has the lowest entropy.
    best_attribute = min(split_people, key=partial(partition_entropy_by, inputs))
    partitions = partition_by(inputs, best_attribute)
    new_people = [a for a in split_people if a != best_attribute]

    subtrees = {attribute_value: build_tree_id3(subset, new_people)
                for attribute_value, subset in partitions.items()}

    # Default branch for attribute values not observed at this node.
    subtrees[None] = num_trues > num_falses
    return (best_attribute, subtrees)

In [15]:
# Build the decision tree from every example in `inputs`, then print its
# nested (attribute, subtrees) structure.
tree = build_tree_id3(inputs)
print(tree)

('music', {'R&B and soul': ('beverage', {None: True, "Doesn't drink": True, 'Wine': True, 'Whiskey': ('color', {None: False, 'Cool': True, 'Warm': False}), 'Beer': False}), None: False, 'Folk/Traditional': ('beverage', {None: False, 'Whiskey': False, 'Other': True, 'Beer': True}), 'Jazz/Blues': ('beverage', {'Wine': False, "Doesn't drink": False, 'Vodka': True, 'Whiskey': False, None: False}), 'Rock': ('beverage', {"Doesn't drink": True, None: False, 'Wine': ('color', {None: True, 'Cool': True, 'Warm': False}), 'Vodka': ('color', {None: False, 'Cool': ('drink', {None: False, '7UP/Sprite': False, 'Coca Cola/Pepsi': True}), 'Warm': True}), 'Other': False, 'Beer': ('color', {None: False, 'Cool': False, 'Neutral': False, 'Warm': True})}), 'Hip hop': ('beverage', {'Vodka': False, None: True, "Doesn't drink": True, 'Wine': True, 'Beer': True}), 'Pop': ('beverage', {"Doesn't drink": ('color', {None: False, 'Cool': False, 'Neutral': True}), None: False, 'Wine': False, 'Whiskey': ('color', {Non

In [16]:
# Classify one hand-written example; as the cell's last expression the result is displayed.
classify(tree, {"color": "Cool", "music": "Hip hop", "beverage": "Beer", "drink": "7UP/Sprite" })

True

In [34]:
# Percentage of examples whose predicted label equals the stored label.
# NOTE: this is measured on the same `inputs` the tree was built from, so
# it describes fit to the training data, not held-out performance.
accuracy = sum(
    classify(tree, features) == label for features, label in inputs
) / len(inputs) * 100

print('accuracy: {}%'.format(accuracy))
    

accuracy: 95.45454545454545%
