In [1]:
import pandas as pd
import math
from collections import Counter
from pprint import pprint

# Column layout of every record below; the last column ("Play") is the class label.
columns = ["Outlook", "Temperature", "Humidity", "Wind", "Play"]

# The 14 observations of the toy "play tennis" dataset, one row per day.
data = [
    ["Sunny", "Hot", "High", "Weak", "No"],
    ["Sunny", "Hot", "High", "Strong", "No"],
    ["Overcast", "Hot", "High", "Weak", "Yes"],
    ["Rainy", "Mild", "High", "Weak", "Yes"],
    ["Rainy", "Cool", "Normal", "Weak", "Yes"],
    ["Rainy", "Cool", "Normal", "Strong", "No"],
    ["Overcast", "Cool", "Normal", "Strong", "Yes"],
    ["Sunny", "Mild", "High", "Weak", "No"],
    ["Sunny", "Cool", "Normal", "Weak", "Yes"],
    ["Rainy", "Mild", "Normal", "Weak", "Yes"],
    ["Sunny", "Mild", "Normal", "Strong", "Yes"],
    ["Overcast", "Mild", "High", "Strong", "Yes"],
    ["Overcast", "Hot", "Normal", "Weak", "Yes"],
    ["Rainy", "Mild", "High", "Strong", "No"],
]

# Tabular view used by every later cell.
df = pd.DataFrame(data=data, columns=columns)

# Header line followed by the plain-text rendering of the frame.
print("Dataset:", df, sep="\n")


Dataset:
     Outlook Temperature Humidity    Wind Play
0      Sunny         Hot     High    Weak   No
1      Sunny         Hot     High  Strong   No
2   Overcast         Hot     High    Weak  Yes
3      Rainy        Mild     High    Weak  Yes
4      Rainy        Cool   Normal    Weak  Yes
5      Rainy        Cool   Normal  Strong   No
6   Overcast        Cool   Normal  Strong  Yes
7      Sunny        Mild     High    Weak   No
8      Sunny        Cool   Normal    Weak  Yes
9      Rainy        Mild   Normal    Weak  Yes
10     Sunny        Mild   Normal  Strong  Yes
11  Overcast        Mild     High  Strong  Yes
12  Overcast         Hot   Normal    Weak  Yes
13     Rainy        Mild     High  Strong   No


In [2]:
def entropy(data, target="Play"):
    """Return the Shannon entropy, in bits, of the label column of ``data``.

    Parameters
    ----------
    data : table-like
        Object that supports ``len(data)`` (number of rows) and
        ``data[target]`` (iterable of labels); in this notebook a
        ``pandas.DataFrame``.
    target : str, optional
        Name of the label column. Defaults to ``"Play"``, the column the
        original implementation was hard-wired to.

    Returns
    -------
    float
        ``-sum(p * log2(p))`` over the distinct labels, where ``p`` is each
        label's share of the rows. ``0.0`` when ``data`` has no rows or all
        rows carry the same label.
    """
    counts = Counter(data[target])
    total = len(data)

    # Float accumulator so the return type does not depend on the input
    # (an empty table never enters the loop below).
    ent = 0.0
    for c in counts.values():
        # c >= 1 for every counted label, so p > 0 and log2(p) is defined.
        p = c / total
        ent -= p * math.log2(p)
    return ent

In [3]:

def information_gain(data, attribute):
    """Return the entropy reduction obtained by splitting ``data`` on ``attribute``.

    The gain is the entropy of the whole table minus the size-weighted
    average entropy of the partitions formed by each distinct value of
    ``attribute``.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows to evaluate; must contain ``attribute`` and whatever label
        column ``entropy`` reads.
    attribute : str
        Name of the column to split on.

    Returns
    -------
    number
        ``entropy(data)`` minus the weighted partition entropy.
    """
    n_rows = len(data)
    column = data[attribute]

    # One partition per distinct value, in order of first appearance.
    partitions = (data[column == level] for level in column.unique())

    # Each partition contributes its entropy scaled by its share of the rows.
    remainder = sum((len(part) / n_rows) * entropy(part) for part in partitions)

    return entropy(data) - remainder

In [None]:
def id3(data, attributes):
    """Recursively build an ID3 decision tree for the ``"Play"`` label.

    Parameters
    ----------
    data : pandas.DataFrame
        Training rows; must contain a ``"Play"`` column and every column
        named in ``attributes``.
    attributes : sequence of str
        Columns still available for splitting at this node.

    Returns
    -------
    object
        Either a single label (leaf) or a one-key dict of the form
        ``{attribute: {value: subtree_or_label, ...}}``.
    """
    labels = data["Play"]

    # Leaf: every remaining row agrees on the label.
    if len(labels.unique()) == 1:
        return labels.iloc[0]

    # Leaf: nothing left to split on, so fall back to the most frequent label.
    if len(attributes) == 0:
        return labels.mode()[0]

    # Split on the attribute with the highest information gain; on ties the
    # attribute that appears first in ``attributes`` wins.
    split_on = max(attributes, key=lambda name: information_gain(data, name))

    # Attributes still usable below this node (same for every branch).
    leftover = [name for name in attributes if name != split_on]

    column = data[split_on]
    branches = {}
    for level in column.unique():
        part = data[column == level]
        if not part.empty:
            branches[level] = id3(part, leftover)
        else:
            # No row matched this value under ``==`` (presumably only
            # possible for values such as NaN - TODO confirm); use the
            # majority label of the current node instead of recursing.
            branches[level] = labels.mode()[0]

    return {split_on: branches}

In [6]:
# Predictor columns the tree is allowed to split on.
attributes = [
    "Outlook",
    "Temperature",
    "Humidity",
    "Wind",
]

# Fit the tree on the full dataset.
decision_tree = id3(data=df, attributes=attributes)

# Show the learned tree as a nested dict.
print("Decision Tree:")
pprint(decision_tree, width=100)


def predict(tree, sample, default=None):
    """Classify ``sample`` by walking a nested-dict decision tree.

    Parameters
    ----------
    tree : dict or label
        Either a leaf label, or a dict whose first key is an attribute name
        mapped to ``{value: subtree_or_label}``.
    sample : mapping
        Attribute name -> value for the instance to classify. It must
        contain every attribute met on the path through the tree; the
        lookup ``sample[attribute]`` is not guarded (a plain dict raises
        ``KeyError``).
    default : label, optional
        Label returned when the sample holds a value the tree has no branch
        for. When omitted (``None``) the function keeps its original
        behavior and uses the most frequent ``"Play"`` label of the
        notebook-level ``df``, so ``df`` must exist in that case.

    Returns
    -------
    label
        The leaf reached for ``sample``, or the fallback label described
        above.
    """
    node = tree

    # Descend until a non-dict (leaf label) is reached.
    while isinstance(node, dict):
        attribute = next(iter(node))
        branches = node[attribute]
        value = sample[attribute]

        if value not in branches:
            # Value unseen at this node: no branch to follow.
            if default is not None:
                return default
            # Backward-compatible fallback that relies on the global ``df``.
            return df["Play"].mode()[0]

        node = branches[value]

    return node

Decision Tree:
{'Outlook': {'Overcast': 'Yes',
             'Rainy': {'Wind': {'Strong': 'No', 'Weak': 'Yes'}},
             'Sunny': {'Humidity': {'High': 'No', 'Normal': 'Yes'}}}}


In [7]:
# One day to classify, described by the four predictor attributes.
test_sample = dict(
    Outlook="Sunny",
    Temperature="Cool",
    Humidity="Normal",
    Wind="Weak",
)

# Echo the input, then the label the learned tree assigns to it.
print("Prediction for:", test_sample)
print("Result:", predict(decision_tree, test_sample))

Prediction for: {'Outlook': 'Sunny', 'Temperature': 'Cool', 'Humidity': 'Normal', 'Wind': 'Weak'}
Result: Yes
