In [1]:
import pandas as pd
import math
import numpy as np

In [2]:
# Build the 15-row example dataset as a dict of equal-length columns.
# Rows are laid out five per line: Young, then middle, then old.
data = {
    'ID': [*range(1, 16)],
    'Age': ['Young'] * 5 + ['middle'] * 5 + ['old'] * 5,
    'Has_job': [
        False, False, True, True, False,
        False, False, True, False, False,
        False, False, True, True, False,
    ],
    'Own_House': [
        False, False, False, True, True,
        False, False, True, True, True,
        True, True, False, False, False,
    ],
    'Credit_Rating': [
        'fair', 'good', 'good', 'fair', 'fair',
        'fair', 'good', 'good', 'excellent', 'excellent',
        'excellent', 'good', 'good', 'excellent', 'fair',
    ],
    'Class': [
        'No', 'No', 'Yes', 'Yes', 'No',
        'No', 'No', 'Yes', 'Yes', 'Yes',
        'Yes', 'Yes', 'Yes', 'Yes', 'No',
    ],
}
df = pd.DataFrame(data)
df

Unnamed: 0,ID,Age,Has_job,Own_House,Credit_Rating,Class
0,1,Young,False,False,fair,No
1,2,Young,False,False,good,No
2,3,Young,True,False,good,Yes
3,4,Young,True,True,fair,Yes
4,5,Young,False,True,fair,No
5,6,middle,False,False,fair,No
6,7,middle,False,False,good,No
7,8,middle,True,True,good,Yes
8,9,middle,False,True,excellent,Yes
9,10,middle,False,True,excellent,Yes


In [3]:
# Calculating the entropy of the dataset
def entropy(target_col):
    """Return the Shannon entropy, in bits, of the values in ``target_col``.

    Parameters
    ----------
    target_col : array-like
        Sequence of labels (anything ``np.unique`` can count).

    Returns
    -------
    numpy.float64
        ``-sum(p * log2(p))`` over the relative frequency ``p`` of each
        distinct value; zero when only one distinct value is present.
    """
    _, frequencies = np.unique(target_col, return_counts=True)
    probabilities = frequencies / np.sum(frequencies)
    return np.sum(-probabilities * np.log2(probabilities))


In [4]:
# Calculating the information gain
def InfoGain(data, split_attribute_name, target_name):
    """Return the information gain from splitting ``data`` on one attribute.

    The gain is the entropy of the target column minus the size-weighted
    average entropy of the target within each distinct value of the split
    attribute.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows to evaluate; must contain both named columns.
    split_attribute_name : str
        Column whose distinct values define the partitions.
    target_name : str
        Column holding the class labels.

    Returns
    -------
    numpy.float64
        ``entropy(target) - sum_v (|D_v| / |D|) * entropy(target | v)``.
    """
    split_column = data[split_attribute_name]
    target_column = data[target_name]

    total_entropy = entropy(target_column)

    vals, counts = np.unique(split_column, return_counts=True)
    total_count = np.sum(counts)  # hoisted: invariant across partitions

    # Select each partition with a boolean mask on the target column only.
    # The previous ``data.where(mask).dropna()`` also discarded rows that had
    # a missing value in *any* unrelated column and upcast column dtypes.
    Weighted_Entropy = np.sum(
        [
            count / total_count * entropy(target_column[split_column == val])
            for val, count in zip(vals, counts)
        ]
    )

    Information_Gain = total_entropy - Weighted_Entropy
    return Information_Gain




In [5]:
def _majority_class(frame, target_attribute_name):
    """Return the most frequent target value in ``frame`` (first in sorted order on ties)."""
    classes, counts = np.unique(frame[target_attribute_name], return_counts=True)
    return classes[np.argmax(counts)]


def ID3(data, originaldata, features, target_attribute_name, parent_node_class=None):
    """Recursively build an ID3 decision tree as nested dictionaries.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows that reach the current node.
    originaldata : pandas.DataFrame
        Rows of the parent node (the recursion passes the parent's ``data``);
        used to pick a label when ``data`` is empty.
    features : sequence of str
        Candidate column names still available for splitting.
    target_attribute_name : str
        Column holding the class labels.
    parent_node_class : optional
        Majority class of the parent node. Only used as a last-resort label
        when both ``data`` and ``originaldata`` are empty.

    Returns
    -------
    dict or scalar
        A class label for a leaf, otherwise ``{feature: {value: subtree}}``.
    """
    # An empty partition must be handled first: np.unique on it is empty, so
    # the single-class check below would otherwise fail with an IndexError.
    if len(data) == 0:
        if len(originaldata) == 0:
            return parent_node_class
        return _majority_class(originaldata, target_attribute_name)

    # All rows share one class: this node is a pure leaf.
    classes = np.unique(data[target_attribute_name])
    if len(classes) == 1:
        return classes[0]

    node_class = _majority_class(data, target_attribute_name)

    # No attribute left to split on: label the leaf with the majority class of
    # the rows that actually reached this node (not the parent's majority,
    # which may disagree and is None on a top-level call).
    if len(features) == 0:
        return node_class

    gains = [InfoGain(data, feature, target_attribute_name) for feature in features]
    best_feature = features[int(np.argmax(gains))]
    remaining_features = [name for name in features if name != best_feature]

    tree = {best_feature: {}}
    for value in np.unique(data[best_feature]):
        # Boolean-mask selection keeps column dtypes intact and does not drop
        # rows that merely have missing values in unrelated columns, unlike
        # ``data.where(mask).dropna()``.
        sub_data = data[data[best_feature] == value]

        # Use native Python scalars as branch keys so the tree prints and
        # serialises the same way at every depth and across numpy versions.
        key = value.item() if isinstance(value, np.generic) else value

        tree[best_feature][key] = ID3(
            sub_data, data, remaining_features, target_attribute_name, node_class
        )

    return tree



In [6]:
# Run the ID3 algorithm on the dataset
def run_ID3(df):
    """Build an ID3 decision tree for ``df`` with ``'Class'`` as the target.

    Candidate features are every column except the first and the last one
    (in this notebook's dataset those are ``ID`` and ``Class``). A copy of
    ``df`` is passed as the reference data for the root call.

    Returns whatever ``ID3`` returns: a nested dict, or a single label.
    """
    candidate_features = df.columns[1:-1]
    return ID3(df, df.copy(), candidate_features, 'Class')

In [7]:
# Build the decision tree for `df` via run_ID3 and print the resulting
# nested structure (feature -> value -> subtree or class label).
# NOTE(review): numpy is already imported as `np` in the first cell, so the
# re-import below is redundant (harmless); prefer keeping all imports there.
import numpy as np

decision_tree = run_ID3(df)
print(decision_tree)


{'Credit_Rating': {'excellent': 'Yes', 'fair': {'Has_job': {False: 'No', True: 'Yes'}}, 'good': {'Has_job': {False: {'Age': {'Young': 'No', 'middle': 'No', 'old': 'Yes'}}, True: 'Yes'}}}}
