In [1]:
# importing required libraries
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier, export_graphviz
import numpy as np
import pydotplus
from IPython.display import Image
from sklearn import tree

In [2]:
class DecisionTreeSteps:
    def __init__(self, X, y):
        self.X = X
        self.y = y
        self.clf = DecisionTreeClassifier(criterion='entropy')  # Using entropy criterion
        self.clf.fit(X, y)  # Fit the decision tree to the data

    def calculate_entropy(self, y):
        class_labels, counts = np.unique(y, return_counts=True)
        probabilities = counts / len(y)
        entropy = -np.sum(probabilities * np.log2(probabilities + 1e-9))  # Adding small value to avoid log(0)
        return entropy

    def print_tree(self, X, y, node_id=0, level=0):
        entropy = self.calculate_entropy(y)
        class_labels, counts = np.unique(y, return_counts=True)

        # Print current level information
        print(f"Level {level}")
        for label, count in zip(class_labels, counts):
            print(f"Count of {label} = {count}")
        print(f"Current Entropy is = {entropy}")

        # Check if it's a leaf node (all labels are the same)
        if len(class_labels) == 1:
            print("Reached leaf node")
            return

        # Get the splitting feature and gain ratio at the current node
        if hasattr(self.clf, 'tree_'):
            feature = self.clf.tree_.feature[node_id]
            threshold = self.clf.tree_.threshold[node_id]
            left_child = self.clf.tree_.children_left[node_id]
            right_child = self.clf.tree_.children_right[node_id]

            if feature != -2:  # -2 means it's a leaf node
                gain_ratio = self.clf.tree_.impurity[node_id] - self.clf.tree_.impurity[left_child]
                print(f"Splitting on feature {feature} with threshold {threshold} and gain ratio {gain_ratio}")

                # Recursively print the left and right child nodes
                left_indices = np.where(X[:, feature] <= threshold)[0]
                right_indices = np.where(X[:, feature] > threshold)[0]

                # Recur on the left and right subtrees
                self.print_tree(X[left_indices], y[left_indices], left_child, level + 1)
                self.print_tree(X[right_indices], y[right_indices], right_child, level + 1)

    def export_tree_to_pdf(self, filename="decision_tree.pdf"):
        # Export the decision tree to a DOT format
        dot_data = export_graphviz(self.clf, out_file=None, 
                                   feature_names=iris.feature_names,  
                                   class_names=iris.target_names,  
                                   filled=True, rounded=True,  
                                   special_characters=True)  
        # Convert the DOT data to a graph
        graph = pydotplus.graph_from_dot_data(dot_data)
        # Save the graph as a PDF file
        graph.write_pdf(filename)
        print(f"Decision tree exported as {filename}")

    def run(self):
        # Start from the root node
        self.print_tree(self.X, self.y, node_id=0, level=0)

In [3]:
# Load Iris dataset
iris = load_iris()
x, y = iris.data, iris.target

In [4]:
# Instantiate the class and run
tree_steps = DecisionTreeSteps(x, y)
tree_steps.run()

Level 0
Count of 0 = 50
Count of 1 = 50
Count of 2 = 50
Current Entropy is = 1.584962496393071
Splitting on feature 2 with threshold 2.449999988079071 and gain ratio 1.5849625007211563
Level 1
Count of 0 = 50
Current Entropy is = -1.4426951595367387e-09
Reached leaf node
Level 1
Count of 1 = 50
Count of 2 = 50
Current Entropy is = 0.99999999711461
Splitting on feature 3 with threshold 1.75 and gain ratio 0.5549351429491614
Level 2
Count of 1 = 49
Count of 2 = 5
Current Entropy is = 0.44506485416544855
Splitting on feature 2 with threshold 4.950000047683716 and gain ratio 0.2989706069307023
Level 3
Count of 1 = 47
Count of 2 = 1
Current Entropy is = 0.14609424723474632
Splitting on feature 3 with threshold 1.6500000357627869 and gain ratio 0.14609425012013633
Level 4
Count of 1 = 47
Current Entropy is = -1.4426951595367387e-09
Reached leaf node
Level 4
Count of 2 = 1
Current Entropy is = -1.4426951595367387e-09
Reached leaf node
Level 3
Count of 1 = 2
Count of 2 = 4
Current Entropy is =

In [5]:
# Export the decision tree to a PDF
tree_steps.export_tree_to_pdf("decision_tree.pdf")

Decision tree exported as decision_tree.pdf
