# **Classification problem using decision trees**

In [1]:
import pprint

import numpy as np
import pandas as pd

# Toy loan-approval dataset: two categorical feature columns plus the
# 'Loan Approval' label column, one list entry per applicant.
data = dict([
    ('Income', ['High', 'Low', 'High', 'Low']),
    ('Credit Score', ['Good', 'Bad', 'Bad', 'Good']),
    ('Loan Approval', ['Yes', 'No', 'Yes', 'No']),
])

# Column-oriented construction: every dict key becomes a DataFrame column.
df = pd.DataFrame.from_dict(data)


calculate entropy

In [2]:
def calculate_entropy(labels):
    """Return the Shannon entropy, in bits, of a column of class labels.

    Parameters
    ----------
    labels : pandas.Series
        Class labels; each distinct value counted by ``value_counts`` is
        treated as one class.

    Returns
    -------
    numpy.float64
        ``-sum(p * log2(p))`` over the classes that actually occur. The result
        is exactly ``0.0`` for an empty or single-class series.
    """
    counts = labels.value_counts().to_numpy()
    # Drop zero counts (value_counts can report them for unused categories of
    # a categorical column). Since p * log2(p) -> 0 as p -> 0 they contribute
    # nothing, and removing them avoids log2(0) without an epsilon that would
    # bias every result.
    counts = counts[counts > 0]
    probabilities = counts / counts.sum()
    entropy = -np.sum(probabilities * np.log2(probabilities))
    # Adding 0.0 turns a possible -0.0 (pure or empty input) into 0.0.
    return entropy + 0.0

weighted entropy for the information gain

In [3]:
def calculate_weighted_entropy(feature, target):
    """Return the entropy of ``target`` after partitioning rows by ``feature``.

    The rows are grouped by feature value; each group's label entropy is
    weighted by the fraction of rows that fall in the group. Rows whose
    feature value is missing form one additional group, so the weights always
    sum to 1 and missing values cannot inflate the information gain.

    Parameters
    ----------
    feature : pandas.Series
        Feature column to partition on.
    target : pandas.Series
        Label column; it is indexed with boolean masks derived from
        ``feature``, so it is expected to share ``feature``'s index.

    Returns
    -------
    float
        Weighted sum of the per-group entropies (``0`` when ``feature`` has
        no rows).
    """
    total_count = len(feature)

    # One boolean mask per observed value. NaN never compares equal to
    # anything, so missing rows need an explicit isna() mask instead.
    group_masks = [feature == value for value in feature.dropna().unique()]
    missing = feature.isna()
    if missing.any():
        group_masks.append(missing)

    total_entropy = 0
    for mask in group_masks:
        subset = target[mask]
        weight = len(subset) / total_count
        total_entropy += weight * calculate_entropy(subset)

    return total_entropy

calculate information gain

In [4]:
def calculate_information_gain(feature, target):
    """Return how much splitting on ``feature`` reduces the entropy of ``target``.

    The gain is the entropy of ``target`` as a whole minus the weighted
    entropy of ``target`` once the rows are grouped by ``feature``.
    """
    return calculate_entropy(target) - calculate_weighted_entropy(feature, target)

Find the best feature to split on

In [5]:
def find_best_split(data, target):
    """Return the name of the column in ``data`` with the highest information gain.

    Columns are scanned in order and a later column only replaces the current
    winner when its gain is strictly larger, so ties go to the earliest
    column. Returns ``None`` when ``data`` has no columns.
    """
    winner, top_gain = None, -1

    for column_name in data.columns:
        candidate_gain = calculate_information_gain(data[column_name], target)
        if not candidate_gain > top_gain:
            continue
        winner, top_gain = column_name, candidate_gain

    return winner

Build the tree

In [6]:
def build_tree(data, target):
    """Recursively build a decision tree as nested dictionaries.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature columns still available for splitting.
    target : pandas.Series
        Labels for the rows of ``data``; it is indexed with boolean masks
        built from ``data``, so it is expected to share ``data``'s index.

    Returns
    -------
    dict, label, or None
        * ``{feature: {value: subtree, ...}}`` for an internal node, where each
          subtree is again a dict or a label. Rows whose split value is
          missing are collected under the key ``numpy.nan``.
        * a single label for a leaf: the common label of a pure node, or the
          most frequent label when no feature is left to split on.
        * ``None`` when ``target`` has no rows, since there is nothing to
          predict.
    """
    # No rows at all: mode().iloc[0] below would raise IndexError.
    if target.empty:
        return None

    # Pure node: every remaining row carries the same label.
    if len(target.unique()) == 1:
        return target.iloc[0]

    # Nothing left to split on (DataFrame.empty is also True when rows remain
    # but every column has been consumed): fall back to the majority label.
    if data.empty:
        return target.mode().iloc[0]

    # Find the best feature to split on
    best_feature = find_best_split(data, target)
    if best_feature is None:
        return target.mode().iloc[0]

    split_column = data[best_feature]

    # One mask per observed value, each computed once. NaN never compares
    # equal to anything, so `split_column == nan` would select no rows and the
    # recursion would hit an empty subset; missing rows get their own mask.
    branch_masks = {
        value: split_column == value
        for value in split_column.dropna().unique()
    }
    missing = split_column.isna()
    if missing.any():
        branch_masks[np.nan] = missing

    branches = {}
    for value, mask in branch_masks.items():
        # The chosen feature is removed so each recursion level has one
        # column fewer, which guarantees termination.
        subset_data = data[mask].drop(columns=[best_feature])
        subset_target = target[mask]
        branches[value] = build_tree(subset_data, subset_target)

    return {best_feature: branches}

# Train on every column except the label, then pretty-print the nested-dict
# tree. `pprint` is imported in the notebook's import cell at the top.
target_column = 'Loan Approval'
tree = build_tree(df.drop(columns=[target_column]), df[target_column])

pprint.pprint(tree)


{'Income': {'High': 'Yes', 'Low': 'No'}}
