<a href="https://colab.research.google.com/github/AsraniSanjana/All_Codes/blob/main/All_Semester_Codes/ML_sem7/models/practice_id3_c4_5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [66]:
class Node:
    """A single node of the decision tree.

    The same class is used for three roles, distinguished by which fields
    the tree builder fills in:

    * decision node: ``value`` is a feature name, ``childs`` lists one
      branch node per observed feature value;
    * branch node: ``value`` is a feature value, ``next`` is the sub-tree
      reached by following that branch;
    * leaf node: ``value`` is a class label, ``childs`` stays ``None``.
    """

    def __init__(self):
        self.value = None   # feature name, feature value or class label
        self.next = None    # sub-tree below a branch node
        self.childs = None  # branch nodes below a decision node


class DecisionTreeClassifier:
    """Decision Tree Classifier using ID3 algorithm."""

    def __init__(self, X, feature_names, labels):
        """Store the training data and compute the entropy of the labels.

        :param X: indexable of rows, each row indexable by feature id.
            May be ``None`` when only the entropy helpers are used.
        :param feature_names: list of feature names, aligned with the
            columns of ``X`` (may be ``None`` together with ``X``).
        :param labels: class label of every row; indexed by position.
        """
        self.X = X  # features or predictors
        self.feature_names = feature_names  # name of the features
        self.labels = labels  # categories
        labels_list = list(labels)
        # unique categories, in first-seen order so results are reproducible
        self.labelCategories = list(dict.fromkeys(labels_list))
        # number of instances of each category
        self.labelCategoriesCount = [labels_list.count(x) for x in self.labelCategories]
        self.node = None  # root of the tree, set by id3()
        # entropy of the whole label set
        self.entropy = self._get_entropy(list(range(len(self.labels))))

    @staticmethod
    def _majority_label(labels):
        """Return the most frequent label; ties go to the first one seen."""
        return max(dict.fromkeys(labels), key=labels.count)

    def _get_entropy(self, x_ids):
        """Calculate the base-2 entropy of the labels of the given instances.

        :param x_ids: iterable of instance ids (keys into ``self.labels``)
        :return: float, the entropy; 0.0 for an empty selection
        """
        labels = [self.labels[i] for i in x_ids]
        total = len(labels)
        entropy = 0.0
        for category in self.labelCategories:
            count = labels.count(category)
            if count:  # categories that do not occur contribute nothing
                probability = count / total
                entropy -= probability * math.log2(probability)
        return entropy

    def _get_information_gain(self, x_ids, feature_id):
        """Calculate the information gain obtained by splitting on a feature.

        :param x_ids: list of instance ids
        :param feature_id: int, column of ``self.X`` to split on
        :return: float, entropy before the split minus the weighted entropy
            of the partitions after it
        """
        # partition the instance ids by the value they take for the feature
        partitions = {}
        for x in x_ids:
            partitions.setdefault(self.X[x][feature_id], []).append(x)

        total = len(x_ids)
        weighted_entropy = sum(
            len(ids) / total * self._get_entropy(ids)
            for ids in partitions.values()
        )
        return self._get_entropy(x_ids) - weighted_entropy

    def _get_feature_max_information_gain(self, x_ids, feature_ids):
        """Find the feature that maximizes the information gain.

        :param x_ids: list of instance ids
        :param feature_ids: non-empty list of candidate feature ids
        :returns: (feature name, feature id) of the best feature; on ties
            the candidate listed first in ``feature_ids`` wins
        """
        max_id = max(
            feature_ids,
            key=lambda feature_id: self._get_information_gain(x_ids, feature_id),
        )
        return self.feature_names[max_id], max_id

    def id3(self):
        """Build the decision tree with ID3 and store its root in ``self.node``.

        :return: None
        """
        x_ids = list(range(len(self.X)))
        feature_ids = list(range(len(self.feature_names)))
        # always start from a fresh root so a second call rebuilds the tree
        # instead of writing into the nodes of the previous one
        self.node = self._id3_recv(x_ids, feature_ids, None)

    def _id3_recv(self, x_ids, feature_ids, node):
        """Recursive step of ID3.

        :param x_ids: list of instance ids reaching this node
        :param feature_ids: list of feature ids still available for
            splitting; it is not modified
        :param node: Node to fill in, or ``None`` to create a new one
        :returns: the filled-in Node (root of the sub-tree)
        """
        if not node:
            node = Node()
        labels_in_features = [self.labels[x] for x in x_ids]
        # nothing reaches this node: leave it empty
        if not labels_in_features:
            return node
        # pure node: every instance has the same class
        if len(set(labels_in_features)) == 1:
            node.value = self.labels[x_ids[0]]
            return node
        # no feature left to split on: predict the most frequent class
        if len(feature_ids) == 0:
            node.value = self._majority_label(labels_in_features)
            return node

        # split on the feature with the highest information gain
        best_feature_name, best_feature_id = self._get_feature_max_information_gain(x_ids, feature_ids)
        node.value = best_feature_name
        node.childs = []
        # Each branch gets its own copy of the remaining features; removing
        # the feature from the shared list would hide it from siblings and
        # from the caller.
        remaining_feature_ids = [f for f in feature_ids if f != best_feature_id]

        # group the instances by the value of the chosen feature
        # (first-seen order keeps the tree layout deterministic)
        partitions = {}
        for x in x_ids:
            partitions.setdefault(self.X[x][best_feature_id], []).append(x)

        # Every value comes from an instance in x_ids, so no partition is
        # ever empty and no "empty branch" fallback is needed.
        for value, child_x_ids in partitions.items():
            child = Node()
            child.value = value  # branch labelled with the feature value
            child.next = self._id3_recv(child_x_ids, remaining_feature_ids, None)
            node.childs.append(child)
        return node

In [56]:
import pandas as pd
import math

# Define the Node class and DecisionTreeClassifier class here

# Read the training data from disk
data_df = pd.read_csv("Buy_Computer.csv")

# Predictor columns, in the order the classifier indexes them
feature_names = ['age', 'income', 'student', 'credit_rating']

# Feature matrix and class-label vector as plain arrays
X = data_df[feature_names].values
labels = data_df['Buy_Computer'].values

# Build the classifier, then grow the tree with ID3
dt_classifier = DecisionTreeClassifier(X, feature_names, labels)
dt_classifier.id3()


In [52]:
import pandas as pd
import math

# Read the happy/sad dataset from disk
data_df = pd.read_csv("happy_sad.csv")

# 'emotions' is the class-label column of this dataset
class_labels = data_df['emotions']

# Only the labels are supplied here: X and feature_names are left as None,
# so the instance is usable for entropy calculations but not for id3().
dt_classifier = DecisionTreeClassifier(X=None, feature_names=None, labels=class_labels)


**FOR ID3:**


1.   ENTROPY
2.   INFO GAIN
3.   GAIN

**FOR C4.5:**

1.   GAIN RATIO
2.   SPLIT INFO


**FOR CART:**

1.   GINI INDEX
2.   DELTA GINI

To check intermediate calculations:

In [58]:

from tabulate import tabulate
def _label_entropy(labels):
    """Return the base-2 Shannon entropy of a list of class labels."""
    total = len(labels)
    entropy = 0.0
    for label in dict.fromkeys(labels):
        probability = labels.count(label) / total
        entropy -= probability * math.log2(probability)
    return entropy


def getGain(attribute_name, data=None, target_column='Buy_Computer'):
    """Print the ID3 / C4.5 walk-through for one attribute.

    Prints a table with, for every value of the attribute, the per-class
    counts, the subset size, its entropy and its weight, followed by the
    information gain, the split info and the gain ratio of the attribute.

    :param attribute_name: name of the column to evaluate
    :param data: DataFrame to analyse; defaults to the notebook-level
        ``data_df`` loaded by an earlier cell
    :param target_column: name of the class-label column
    :return: None
    """
    if data is None:
        data = data_df
    attribute_column = data[attribute_name]
    class_labels = data[target_column]
    total_tuples = len(data)
    # class labels in first-seen order, so the table columns are stable
    label_names = list(dict.fromkeys(class_labels))

    # One pass over the attribute values; every figure printed below is
    # derived from these rows, so nothing is computed twice.
    rows = []
    for value in attribute_column.unique():
        subset_labels = list(class_labels[attribute_column == value])
        total_count = len(subset_labels)
        rows.append((
            value,
            [subset_labels.count(label) for label in label_names],
            total_count,
            _label_entropy(subset_labels),
            total_count / total_tuples,
        ))

    # Weighted entropy after the split, and the resulting gain
    dataset_entropy = _label_entropy(list(class_labels))
    info_gain = sum(entropy * weight for _, _, _, entropy, weight in rows)
    final_gain = dataset_entropy - info_gain

    # Split info: entropy of the attribute's own value distribution
    split_info = 0.0
    for _, _, _, _, weight in rows:
        if weight > 0:  # an empty subset (e.g. a NaN value) contributes nothing
            split_info -= weight * math.log2(weight)

    # An attribute with a single value has zero split info (and zero gain);
    # report a ratio of 0 instead of dividing by zero.
    gain_ratio = final_gain / split_info if split_info else 0.0

    header = [attribute_name] + label_names + ['Total', 'Entropy', 'Weight']
    column_width = max(len(str(name)) for name in header) + 2
    table_format = '|'.join('{{:<{}}}'.format(column_width) for _ in header)

    # Print the header
    print("Values Counts, Entropies, and Weights for '{}' attribute:".format(attribute_name))
    print(table_format.format(*header))
    print('|' + '=' * (column_width * len(header) + len(header) - 1) + '|')

    # Print the rows
    for value, class_counts, total_count, entropy, weight in rows:
        print(table_format.format(value, *class_counts, total_count, entropy, weight))

    print(f"\nHence, Gain({attribute_name}) becomes {dataset_entropy:.4f} - {info_gain:.4f} = {final_gain:.4f}")
    print(f"Split Info({attribute_name}) = {split_info:.4f}")
    print(f"Gain Ratio({attribute_name}) = {final_gain:.4f} / {split_info:.4f} = {gain_ratio:.4f}")
    print("\n_____________________________________________________________________________________________________________________\n")

# A single attribute can be inspected on its own, for example:
# getGain('color')
# getGain('wig')
# getGain('num_of_ears')
# NOTE(review): these column names presumably belong to a different dataset;
# adjust them to the columns of the data that is currently loaded.
# Otherwise, print the walk-through for every feature in feature_names:
for attribute_name in feature_names:
    getGain(attribute_name)

Values Counts, Entropies, and Weights for 'age' attribute:
age      |no       |yes      |Total    |Entropy  |Weight   
youth    |3        |2        |5        |0.9709505944546686|0.35714285714285715
middle_age|0        |4        |4        |-0.0     |0.2857142857142857
senior   |2        |3        |5        |0.9709505944546686|0.35714285714285715

Hence, Gain(age) becomes 0.9403 - 0.6935 = 0.2467
Split Info(age) = 1.5774
Gain Ratio(age) = 0.2467 / 1.5774 = 0.1564

_____________________________________________________________________________________________________________________

Values Counts, Entropies, and Weights for 'income' attribute:
income   |no       |yes      |Total    |Entropy  |Weight   
high     |2        |2        |4        |1.0      |0.2857142857142857
medium   |2        |4        |6        |0.9182958340544896|0.42857142857142855
low      |1        |3        |4        |0.8112781244591328|0.2857142857142857

Hence, Gain(income) becomes 0.9403 - 0.9111 = 0.0292
Split Info(i

In [84]:
def process_subdataset(data, attribute_name):
    """Print the partition of `data` for each distinct value of one column.

    For every value of `attribute_name` (in order of first appearance) a
    heading, the matching rows with a fresh 0-based index, and a separator
    line are printed. Nothing is returned.
    """
    column = data[attribute_name]
    separator = "=" * 60 + "\n"

    for level in column.unique():
        matching_rows = data.loc[column == level]
        print(f"Subdataset with '{attribute_name}' = '{level}':")
        # renumber the rows so each partition reads as its own table
        print(matching_rows.reset_index(drop=True))
        print(separator)


# Print, for every attribute in turn, the partitions of data_df induced by
# each of that attribute's values.
feature_names = ['age', 'income', 'student', 'credit_rating']
for attribute_name in feature_names:
    process_subdataset(data_df, attribute_name)

Subdataset with 'age' = 'youth':
   id    age  income student credit_rating Buy_Computer
0   1  youth    high      no          fair           no
1   2  youth    high      no     excellent           no
2   8  youth  medium      no          fair           no
3   9  youth     low     yes          fair          yes
4  11  youth  medium     yes     excellent          yes

Subdataset with 'age' = 'middle_age':
   id         age  income student credit_rating Buy_Computer
0   3  middle_age    high      no          fair          yes
1   7  middle_age     low     yes     excellent          yes
2  12  middle_age  medium      no     excellent          yes
3  13  middle_age    high     yes          fair          yes

Subdataset with 'age' = 'senior':
   id     age  income student credit_rating Buy_Computer
0   4  senior  medium      no          fair          yes
1   5  senior     low     yes          fair          yes
2   6  senior     low     yes     excellent           no
3  10  senior  medium   

Final code. You can also call the functions getGain() and process_subdataset() to print the whole walk-through of the algorithm.

In [105]:
import pandas as pd
import math

# Load the training data into a DataFrame; every function below receives it
# (or a subset of it) as an argument.
data_df = pd.read_csv("Buy_Computer.csv")  # Replace with your dataset

# Tree node used by build_decision_tree / print_decision_tree
class Node:
    """Node of the decision tree.

    `value` holds whatever the constructor was given (``None`` by default);
    `branches` is a per-instance dict that starts out empty.
    """

    def __init__(self, value=None):
        self.branches = dict()
        self.value = value

# Entropy (base 2) of the target column of a dataset
def calculate_entropy(data_df, target_column):
    """Return the entropy of `target_column` over the rows of `data_df`.

    Each distinct value reported by ``value_counts()`` contributes
    ``-p * log2(p)``, where ``p`` is its count divided by the number of rows.
    A frame without rows yields 0.
    """
    n_rows = len(data_df)
    result = 0
    for occurrences in data_df[target_column].value_counts():
        share = occurrences / n_rows
        result -= share * math.log2(share)
    return result

# Information gain obtained by splitting the dataset on one attribute
def calculate_information_gain(data_df, attribute, target_column):
    """Return the entropy of the target minus the weighted entropy after the split.

    The rows are partitioned by each distinct value of `attribute`; every
    partition's entropy is weighted by its share of the rows.
    """
    n_rows = len(data_df)
    column = data_df[attribute]

    remainder = 0
    for level in column.unique():
        partition = data_df[column == level]
        share = len(partition) / n_rows
        remainder += share * calculate_entropy(partition, target_column)

    return calculate_entropy(data_df, target_column) - remainder

# Report one split decision while the tree is being built
def print_intermediate_results(attribute, information_gain, selected_tuples):
    """Print the chosen attribute, its gain (4 decimals) and the selected rows,
    followed by a 40-character separator line."""
    print(
        f"Chosen Attribute: {attribute}",
        f"Information Gain: {information_gain:.4f}",
        "Selected Tuples:",
        selected_tuples,
        "=" * 40,
        sep="\n",
    )

# Recursive ID3 tree construction
def build_decision_tree(data_df, target_column, attributes):
    """Build a decision tree for `target_column` and return its root Node.

    A leaf is returned when the target has a single distinct value or when
    no attributes are left (then the most frequent class is used). Otherwise
    the attribute with the highest information gain becomes the node value
    and one branch is built per distinct value of that attribute; the split
    is reported through print_intermediate_results before recursing.
    """
    targets = data_df[target_column]

    # Leaf: all rows agree on the class
    if targets.nunique() == 1:
        return Node(targets.iloc[0])
    # Leaf: nothing left to split on
    if not len(attributes):
        return Node(targets.mode()[0])

    # Pick the attribute with the largest gain (first one wins on ties)
    gains = [calculate_information_gain(data_df, name, target_column) for name in attributes]
    top_gain = max(gains)
    top_idx = gains.index(top_gain)
    split_on = attributes[top_idx]

    tree = Node(split_on)
    leftover = [name for pos, name in enumerate(attributes) if pos != top_idx]

    for level in data_df[split_on].unique():
        partition = data_df[data_df[split_on] == level]
        if len(partition) > 0:
            print_intermediate_results(split_on, top_gain, partition)
            tree.branches[level] = build_decision_tree(partition, target_column, leftover)
        else:
            # no rows for this value: fall back to the majority class
            tree.branches[level] = Node(targets.mode()[0])

    return tree

# Build the tree for the loaded dataset: "Buy_Computer" is the class-label
# column and the four listed columns are the candidate split attributes.
target_column = "Buy_Computer"
attributes = ["age", "income", "student", "credit_rating"]
decision_tree = build_decision_tree(data_df, target_column, attributes)

# Indented text dump of a decision tree
def print_decision_tree(node, depth=0):
    """Print `node` and, recursively, its branches.

    The node's value (when not None) is printed on its own line, indented by
    `depth`; each branch is printed as "<value>: " one level deeper, followed
    on the same line by the sub-tree below it.
    """
    label = node.value
    if label is not None:
        print("  " * depth, label)

    child_depth = depth + 1
    child_indent = "  " * child_depth
    for edge, subtree in node.branches.items():
        print(child_indent, edge, end=": ")
        print_decision_tree(subtree, child_depth)

# Dump the tree built above as indented text, starting at the root
print_decision_tree(decision_tree)


Chosen Attribute: age
Information Gain: 0.2467
Selected Tuples:
    id    age  income student credit_rating Buy_Computer
0    1  youth    high      no          fair           no
1    2  youth    high      no     excellent           no
7    8  youth  medium      no          fair           no
8    9  youth     low     yes          fair          yes
10  11  youth  medium     yes     excellent          yes
Chosen Attribute: student
Information Gain: 0.9710
Selected Tuples:
   id    age  income student credit_rating Buy_Computer
0   1  youth    high      no          fair           no
1   2  youth    high      no     excellent           no
7   8  youth  medium      no          fair           no
Chosen Attribute: student
Information Gain: 0.9710
Selected Tuples:
    id    age  income student credit_rating Buy_Computer
8    9  youth     low     yes          fair          yes
10  11  youth  medium     yes     excellent          yes
Chosen Attribute: age
Information Gain: 0.2467
Selected Tuples: