In [94]:
import pandas as pd
import math
from collections import Counter, defaultdict
from functools import partial
# Load the survey data; the `inputs` list built further down is derived from
# the rows of this DataFrame.
train = pd.read_csv('data/gender_classification.csv')
# Preview the first rows as the cell's output.
train.head()

Unnamed: 0,Favorite Color,Favorite Music Genre,Favorite Beverage,Favorite Soft Drink,Gender
0,Cool,Rock,Vodka,7UP/Sprite,F
1,Neutral,Hip hop,Vodka,Coca Cola/Pepsi,F
2,Warm,Rock,Wine,Coca Cola/Pepsi,F
3,Warm,Folk/Traditional,Whiskey,Fanta,F
4,Cool,Rock,Vodka,Coca Cola/Pepsi,F


In [63]:
def entropy(class_probabilities):
    """Return the entropy, in bits, of a discrete distribution.

    Args:
        class_probabilities: iterable of class probabilities. Falsy entries
            (such as 0) are skipped, so they contribute nothing.

    Returns:
        The sum of ``-p * log2(p)`` over the remaining probabilities.
    """
    nonzero = (p for p in class_probabilities if p)
    return sum(-p * math.log(p, 2) for p in nonzero)

In [73]:
def class_probabilities(labels):
    """Return the relative frequency of each distinct label.

    Args:
        labels: sized collection of hashable labels.

    Returns:
        A list with one fraction per distinct label, in the order the
        ``Counter`` yields them. Empty input gives an empty list.
    """
    n_labels = len(labels)
    tally = Counter(labels)
    return [tally[label] / n_labels for label in tally]

In [75]:
def data_entropy(labeled_data):
    """Return the entropy of the labels in a labeled data set.

    Args:
        labeled_data: iterable of two-element ``(input, label)`` pairs; only
            the label of each pair is used.

    Returns:
        ``entropy`` applied to the class probabilities of the labels.
    """
    targets = [target for _features, target in labeled_data]
    return entropy(class_probabilities(targets))

In [66]:
def partition_entropy(subsets):
    """Return the size-weighted average entropy of a partition.

    Args:
        subsets: iterable of subsets, each a sized collection of
            ``(input, label)`` pairs as accepted by ``data_entropy``.
            One-shot iterables such as generators are supported.

    Returns:
        The sum over subsets of ``data_entropy(subset) * len(subset) / total``
        where ``total`` is the combined size of all subsets. Returns 0 when
        there is no data at all.
    """
    # Materialise once: the subsets are walked twice (sizes, then entropies)
    # and a generator would already be exhausted on the second pass.
    subsets = list(subsets)
    total_count = sum(len(subset) for subset in subsets)
    if total_count == 0:
        # Nothing to weigh; also avoids 0 / 0 when every subset is empty.
        return 0
    return sum(data_entropy(subset) * len(subset) / total_count for subset in subsets)

In [67]:
def partition_by(inputs, attribute):
    """Group labeled inputs by the value of one attribute.

    Args:
        inputs: iterable of pairs whose first element is a mapping of
            attribute name to value.
        attribute: key to look up in each pair's mapping.

    Returns:
        A ``defaultdict(list)`` mapping each attribute value to the list of
        pairs having that value, in their original order.
    """
    groups = defaultdict(list)
    for pair in inputs:
        features = pair[0]
        groups[features[attribute]].append(pair)
    return groups

In [68]:
def partition_entropy_by(inputs, attribute):
    """Return the entropy of the partition induced by one attribute.

    Args:
        inputs: labeled inputs, as accepted by ``partition_by``.
        attribute: attribute whose values define the partition.

    Returns:
        ``partition_entropy`` of the groups produced by ``partition_by``.
    """
    return partition_entropy(partition_by(inputs, attribute).values())

In [69]:
# Turn each DataFrame row into a (features, label) pair.
# Columns are read by position: this assumes the order is color, music,
# beverage, soft drink, gender. TODO confirm against the CSV header.
inputs = []
for row in train.values:
    # Named `features` rather than `input` so the builtin input() is not
    # clobbered in the notebook's global namespace.
    features = {
        'color': row[0],
        'music': row[1],
        'beverage': row[2],
        'drink': row[3],
    }
    label = row[4] != 'F'  # False for 'F', True for any other value
    inputs.append((features, label))
print(inputs)

[({'beverage': 'Vodka', 'drink': '7UP/Sprite', 'music': 'Rock', 'color': 'Cool'}, False), ({'beverage': 'Vodka', 'drink': 'Coca Cola/Pepsi', 'music': 'Hip hop', 'color': 'Neutral'}, False), ({'beverage': 'Wine', 'drink': 'Coca Cola/Pepsi', 'music': 'Rock', 'color': 'Warm'}, False), ({'beverage': 'Whiskey', 'drink': 'Fanta', 'music': 'Folk/Traditional', 'color': 'Warm'}, False), ({'beverage': 'Vodka', 'drink': 'Coca Cola/Pepsi', 'music': 'Rock', 'color': 'Cool'}, False), ({'beverage': "Doesn't drink", 'drink': 'Fanta', 'music': 'Jazz/Blues', 'color': 'Warm'}, False), ({'beverage': 'Beer', 'drink': 'Coca Cola/Pepsi', 'music': 'Pop', 'color': 'Cool'}, False), ({'beverage': 'Whiskey', 'drink': 'Fanta', 'music': 'Pop', 'color': 'Warm'}, False), ({'beverage': 'Other', 'drink': '7UP/Sprite', 'music': 'Rock', 'color': 'Warm'}, False), ({'beverage': 'Wine', 'drink': 'Coca Cola/Pepsi', 'music': 'Pop', 'color': 'Neutral'}, False), ({'beverage': 'Other', 'drink': '7UP/Sprite', 'music': 'Pop', 'col

In [76]:
# Weighted label entropy after splitting the full data set on each attribute
# in turn.
for key in ('color', 'music', 'beverage', 'drink'):
    print(key, partition_entropy_by(inputs, key))

color 0.9877787203441428
music 0.8475272242792208
beverage 0.9705999798596151
drink 0.9733467653099968


In [79]:
# Keep only the labeled inputs whose 'music' attribute is "Rock".
music_inputs = [(features, label) for features, label in inputs if features['music'] == "Rock"]
print(music_inputs)

[({'beverage': 'Vodka', 'drink': '7UP/Sprite', 'music': 'Rock', 'color': 'Cool'}, False), ({'beverage': 'Wine', 'drink': 'Coca Cola/Pepsi', 'music': 'Rock', 'color': 'Warm'}, False), ({'beverage': 'Vodka', 'drink': 'Coca Cola/Pepsi', 'music': 'Rock', 'color': 'Cool'}, False), ({'beverage': 'Other', 'drink': '7UP/Sprite', 'music': 'Rock', 'color': 'Warm'}, False), ({'beverage': 'Beer', 'drink': 'Coca Cola/Pepsi', 'music': 'Rock', 'color': 'Cool'}, False), ({'beverage': 'Other', 'drink': 'Coca Cola/Pepsi', 'music': 'Rock', 'color': 'Cool'}, False), ({'beverage': 'Other', 'drink': 'Coca Cola/Pepsi', 'music': 'Rock', 'color': 'Cool'}, False), ({'beverage': 'Vodka', 'drink': 'Coca Cola/Pepsi', 'music': 'Rock', 'color': 'Cool'}, False), ({'beverage': 'Other', 'drink': 'Coca Cola/Pepsi', 'music': 'Rock', 'color': 'Warm'}, False), ({'beverage': 'Beer', 'drink': 'Coca Cola/Pepsi', 'music': 'Rock', 'color': 'Neutral'}, False), ({'beverage': "Doesn't drink", 'drink': 'Coca Cola/Pepsi', 'music': '

In [81]:
# Weighted label entropy of the "Rock" subset when split on each of the
# remaining attributes.
for key in ('color', 'beverage', 'drink'):
    print(key, partition_entropy_by(music_inputs, key))

color 0.9923554195933337
beverage 0.6057776318066809
drink 0.8682145190209152


In [106]:
def classify(tree, input):
    """Classify one input with a decision tree.

    Args:
        tree: either a leaf (``True`` / ``False``) or a tuple
            ``(attribute, subtree_dict)`` where ``subtree_dict`` maps
            attribute values to subtrees.
        input: mapping of attribute name to value for the item to classify.

    Returns:
        The leaf reached by following the input's attribute values. When the
        input's value for an attribute (or ``None`` if the attribute is
        missing) is not a key of ``subtree_dict``, the ``None`` branch is
        taken.
    """
    node = tree
    # Walk down until a leaf is reached.
    while node not in (True, False):
        attribute, branches = node
        value = input.get(attribute)
        node = branches[value] if value in branches else branches[None]
    return node

In [102]:
def build_tree_id3(inputs, split_people=None):
    """Build a decision tree with the ID3 algorithm.

    Args:
        inputs: sized collection of ``(attributes_dict, label)`` pairs;
            labels are evaluated for truthiness.
        split_people: attributes still available for splitting. When None,
            the keys of the first input's attribute dict are used.

    Returns:
        ``True`` or ``False`` for a leaf, otherwise a tuple
        ``(attribute, subtrees)`` where ``subtrees`` maps each observed value
        of ``attribute`` to a subtree and ``None`` to the default answer.
        Empty ``inputs`` yields ``False``.
    """
    num_inputs = len(inputs)
    num_trues = sum(1 for _, label in inputs if label)
    num_falses = num_inputs - num_trues

    # Pure (or empty) node: no split needed. This is checked before touching
    # inputs[0] so an empty data set yields a False leaf, not an IndexError.
    if num_trues == 0:
        return False
    if num_falses == 0:
        return True

    if split_people is None:
        # First call: every attribute of the first input is a candidate.
        split_people = list(inputs[0][0].keys())

    if not split_people:
        # No attributes left: majority vote, a tie resolves to True.
        return num_trues >= num_falses

    # Split on the attribute whose partition has the lowest entropy.
    best_attribute = min(split_people, key=partial(partition_entropy_by, inputs))
    partitions = partition_by(inputs, best_attribute)
    new_people = [a for a in split_people if a != best_attribute]

    subtrees = {attribute_value: build_tree_id3(subset, new_people)
                for attribute_value, subset in partitions.items()}

    # Default branch stored under None: strict majority, so a tie resolves to
    # False here (unlike the leaf rule above, where a tie resolves to True).
    subtrees[None] = num_trues > num_falses
    return (best_attribute, subtrees)

In [104]:
# Fit a decision tree on the whole data set and print its nested
# (attribute, subtrees) structure.
tree = build_tree_id3(inputs)
print(tree)

('music', {'Pop': ('beverage', {'Wine': False, None: False, "Doesn't drink": ('color', {'Neutral': True, None: False, 'Cool': False}), 'Beer': False, 'Whiskey': ('color', {None: True, 'Cool': True, 'Warm': False}), 'Other': ('drink', {'Fanta': True, 'Coca Cola/Pepsi': False, '7UP/Sprite': False, None: False})}), 'R&B and soul': ('beverage', {None: True, 'Wine': True, "Doesn't drink": True, 'Beer': False, 'Whiskey': ('drink', {None: False, 'Coca Cola/Pepsi': False, '7UP/Sprite': True})}), 'Folk/Traditional': ('beverage', {None: False, 'Other': True, 'Beer': True, 'Whiskey': False}), 'Electronic': ('beverage', {'Wine': False, None: True, 'Vodka': True, "Doesn't drink": ('drink', {'Fanta': ('color', {None: False, 'Cool': True}), None: False}), 'Beer': True, 'Whiskey': True, 'Other': True}), None: False, 'Hip hop': ('beverage', {None: True, 'Wine': True, "Doesn't drink": True, 'Beer': True, 'Vodka': False}), 'Jazz/Blues': ('beverage', {None: False, 'Wine': False, "Doesn't drink": False, 'W

In [112]:
# Classify one hand-written example; in the label encoding used when `inputs`
# was built, False corresponds to gender 'F' and True to any other value.
classify(tree, {"color": "Cool", "music": "Hip hop", "beverage": "Beer", "drink": "7UP/Sprite" })

True