## Printing the steps for creating a decision tree

In [1]:
# Import required libraries
import pandas as pd
from sklearn import datasets
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from sklearn.model_selection import train_test_split
import numpy as np

# Load Iris and assemble the working DataFrame: one column per feature name,
# followed by a trailing 'target' column with the class label of each row.
iris = datasets.load_iris()
data = pd.DataFrame(iris.data, columns=iris.feature_names).assign(target=iris.target)

from sklearn.tree import export_graphviz
import pydotplus


# Train a scikit-learn decision tree on a train/test split of Iris and render
# the fitted tree to a PDF. `iris` is the dataset loaded at the top of this
# cell, so it is not loaded a second time here.
x_train, x_test, y_train, y_test = train_test_split(
    iris.data, iris.target, random_state=1
)

# Pin random_state: scikit-learn permutes the candidate features at every
# split, so equally good splits could otherwise be resolved differently from
# run to run and the exported tree would not be reproducible.
alg = DecisionTreeClassifier(random_state=1)
alg.fit(x_train, y_train)

# out_file=None makes export_graphviz return the DOT source as a string.
dot_data = export_graphviz(
    alg,
    out_file=None,
    feature_names=iris.feature_names,
    class_names=iris.target_names,
)
graph = pydotplus.graph_from_dot_data(dot_data)
graph.write_pdf("iris_dt.pdf")

# Function to calculate entropy
def calculate_entropy(data):
    """Return the Shannon entropy, in bits, of the ``'target'`` column.

    Args:
        data: DataFrame containing a ``'target'`` column of class labels.

    Returns:
        ``-sum(p * log2(p))`` over the classes present in ``data``. A subset
        containing a single class yields zero (possibly as ``-0.0``).
    """
    # Relative frequency of every class label that occurs in this subset.
    probabilities = data['target'].value_counts() / len(data)

    # Per-class contribution p * log2(p); negate the total to get the entropy.
    weighted_logs = probabilities * np.log2(probabilities)
    return -sum(weighted_logs)

# Function to calculate information gain
def calculate_information_gain(data, feature):
    """Return the information gain from partitioning ``data`` on ``feature``.

    The data is partitioned into one subset per distinct value of the feature
    column (an exact-equality split, not a threshold split).

    Args:
        data: DataFrame with a ``'target'`` column and the column ``feature``.
        feature: Name of the column to partition on.

    Returns:
        Entropy of ``data`` minus the size-weighted entropy of the partitions.
    """
    total_rows = len(data)
    entropy_before = calculate_entropy(data)

    # One partition per distinct feature value, visited in order of first
    # appearance; each contributes its entropy weighted by its share of rows.
    partitions = (
        data[data[feature] == value] for value in data[feature].unique()
    )
    entropy_after = sum(
        (len(part) / total_rows) * calculate_entropy(part) for part in partitions
    )

    return entropy_before - entropy_after


# Function to find the best feature to split on
def find_best_split(data):
    """Pick the feature column with the highest gain ratio.

    Every column except the last one is treated as a candidate feature.

    Args:
        data: DataFrame whose last column is not a feature (in this notebook
            it is ``'target'``).

    Returns:
        Tuple ``(best_feature, best_gain_ratio)``. ``best_feature`` is ``None``
        and the ratio is ``-1`` when no candidate scores above ``-1``.
    """
    n_rows = len(data)
    winner, winner_ratio = None, -1

    for candidate in data.columns[:-1]:
        info_gain = calculate_information_gain(data, candidate)

        # Split information: entropy of the distribution of the feature's own values.
        shares = data[candidate].value_counts() / n_rows
        split_info = -sum(shares * np.log2(shares))

        # A zero split information would mean dividing by zero, so such a
        # feature is scored 0 instead.
        ratio = info_gain / split_info if split_info != 0 else 0

        # Strict comparison: on ties the earliest column wins.
        if ratio > winner_ratio:
            winner, winner_ratio = candidate, ratio

    return winner, winner_ratio


# Function to build the decision tree
def build_decision_tree(data, level=0):
    """Recursively print the steps of growing a decision tree on ``data``.

    At every node the function prints the level, the per-class counts and the
    entropy. Impure nodes are split on the feature chosen by
    ``find_best_split``, with one child per distinct value of that feature.

    Args:
        data: DataFrame with feature columns followed by a ``'target'`` column.
        level: Depth of the current node; the root is level 0.

    Returns:
        None. The function only prints.
    """
    # Count the number of instances for each class
    class_counts = data['target'].value_counts()

    # Print the counts and entropy at the current level
    print("Level", level)
    for class_label, count in class_counts.items():
        print("Count of", class_label, "=", count)

    # Calculate and print the current entropy
    entropy = calculate_entropy(data)
    entropy = 0 if entropy == 0 else entropy  # Replace -0 with 0
    print("Current Entropy is =", entropy)

    # Check if entropy is 0 or all instances belong to the same class
    if entropy == 0 or len(class_counts) == 1:
        print("Reached leaf Node")
        print()
        return

    # Find the best feature to split on
    best_feature, gain_ratio = find_best_split(data)

    # Stop when the node cannot actually be partitioned: either there is no
    # candidate feature, or the chosen feature has a single distinct value here
    # (e.g. identical rows with different labels). Splitting in that case would
    # hand the very same subset to the recursive call and never terminate.
    feature_values = [] if best_feature is None else data[best_feature].unique()
    if len(feature_values) < 2:
        print("Reached leaf Node")
        print()
        return

    # Print the best feature and gain ratio
    print("Splitting on feature", best_feature, "with gain ratio", gain_ratio)
    print()

    # Split the data based on the best feature: one child per distinct value
    for value in feature_values:
        subset = data[data[best_feature] == value]
        build_decision_tree(subset, level + 1)

    
   

# Print the tree-building trace for the Iris DataFrame, starting at level 0.
build_decision_tree(data)


Level 0
Count of 0 = 50
Count of 1 = 50
Count of 2 = 50
Current Entropy is = 1.584962500721156
Splitting on feature petal width (cm) with gain ratio 0.3545578136119045

Level 1
Count of 0 = 29
Current Entropy is = 0
Reached leaf Node

Level 1
Count of 0 = 7
Current Entropy is = 0
Reached leaf Node

Level 1
Count of 0 = 7
Current Entropy is = 0
Reached leaf Node

Level 1
Count of 0 = 5
Current Entropy is = 0
Reached leaf Node

Level 1
Count of 0 = 1
Current Entropy is = 0
Reached leaf Node

Level 1
Count of 0 = 1
Current Entropy is = 0
Reached leaf Node

Level 1
Count of 1 = 7
Count of 2 = 1
Current Entropy is = 0.5435644431995964
Splitting on feature petal length (cm) with gain ratio 0.21742577727983856

Level 2
Count of 1 = 2
Current Entropy is = 0
Reached leaf Node

Level 2
Count of 1 = 1
Current Entropy is = 0
Reached leaf Node

Level 2
Count of 1 = 2
Current Entropy is = 0
Reached leaf Node

Level 2
Count of 1 = 1
Current Entropy is = 0
Reached leaf Node

Level 2
Count of 1 = 1
Cur

## Attempt to build the actual tree for the Iris dataset

In [18]:
import numpy as np

class DecisionTree:
    """Binary decision-tree classifier grown with exhaustive Gini-impurity splits.

    Every internal node tests ``sample[feature] < threshold``: samples that
    satisfy the test go to the left child, all others to the right child. The
    tree is grown until every leaf is pure or the remaining samples cannot be
    separated by any threshold.
    """

    def __init__(self):
        # Root node of the fitted tree; stays None until fit() is called.
        self.root = None

    class Node:
        """Tree node: a leaf when ``label`` is set, otherwise an internal split."""

        def __init__(self, feature=None, threshold=None, label=None):
            self.feature = feature      # column index tested at this node
            self.threshold = threshold  # values < threshold go left
            self.label = label          # class returned by a leaf; None for a split
            self.left = None
            self.right = None

    def fit(self, X, y):
        """Grow the tree from training data.

        Args:
            X: 2-D array-like of shape (n_samples, n_features).
            y: 1-D array-like of n_samples class labels.

        Raises:
            ValueError: if X is not a non-empty 2-D array with one row per label.
        """
        # Accept lists (and other array-likes) as well as NumPy arrays; the
        # boolean-mask indexing used while building requires arrays.
        X = np.asarray(X)
        y = np.asarray(y)
        if X.ndim != 2 or len(X) == 0 or len(X) != len(y):
            raise ValueError(
                "X must be a non-empty 2-D array with one row per label in y"
            )
        self.root = self._build_tree(X, y)

    def predict(self, X):
        """Return a list with the predicted class label for every row of ``X``."""
        return [self._traverse_tree(sample, self.root) for sample in np.asarray(X)]

    def _build_tree(self, X, y):
        """Recursively build and return the subtree for samples ``X`` / ``y``."""
        if len(set(y)) == 1:
            # Leaf node: all samples belong to the same class
            return self.Node(label=y[0])

        best_feature, best_threshold = self._find_best_split(X, y)
        if best_feature is None:
            # No threshold separates the samples (identical feature rows with
            # different labels): fall back to a majority-class leaf.
            return self.Node(label=self._majority_label(y))

        left_indices = X[:, best_feature] < best_threshold
        right_indices = ~left_indices

        node = self.Node(feature=best_feature, threshold=best_threshold)
        node.left = self._build_tree(X[left_indices], y[left_indices])
        node.right = self._build_tree(X[right_indices], y[right_indices])
        return node

    def _majority_label(self, y):
        """Return the most frequent label in ``y`` (smallest label on ties)."""
        values, counts = np.unique(y, return_counts=True)
        return values[np.argmax(counts)]

    def _find_best_split(self, X, y):
        """Search every (feature, threshold) pair for the lowest weighted Gini.

        Thresholds are the distinct values of each feature. Candidates that
        would leave one side empty are skipped, because they do not partition
        the data at all.

        Returns:
            ``(feature_index, threshold)`` of the best split, or
            ``(None, None)`` when no candidate separates the samples.
        """
        best_feature = None
        best_threshold = None
        best_gini = float("inf")

        for feature in range(X.shape[1]):
            column = X[:, feature]
            for threshold in np.unique(column):
                left_indices = column < threshold
                # The smallest value of a column puts nothing on the left;
                # such a "split" would recurse on the unchanged sample set.
                if not left_indices.any() or left_indices.all():
                    continue
                right_indices = ~left_indices

                gini = self._calculate_gini(y[left_indices], y[right_indices])
                # Strict comparison keeps the first of several equally good splits.
                if gini < best_gini:
                    best_gini = gini
                    best_feature = feature
                    best_threshold = threshold

        return best_feature, best_threshold

    def _calculate_gini(self, left_labels, right_labels):
        """Return the size-weighted Gini impurity of a left/right partition."""
        total_samples = len(left_labels) + len(right_labels)
        left_weight = len(left_labels) / total_samples
        right_weight = len(right_labels) / total_samples
        return (
            left_weight * self._gini_impurity(left_labels)
            + right_weight * self._gini_impurity(right_labels)
        )

    def _gini_impurity(self, labels):
        """Return ``1 - sum(p_k ** 2)`` over the class frequencies in ``labels``."""
        _, counts = np.unique(labels, return_counts=True)
        probabilities = counts / len(labels)
        return 1 - np.sum(probabilities**2)

    def _traverse_tree(self, sample, node):
        """Follow ``sample`` down from ``node`` and return the leaf's label."""
        # `is not None` rather than truthiness: class label 0 is a valid leaf.
        if node.label is not None:
            return node.label

        if sample[node.feature] < node.threshold:
            return self._traverse_tree(sample, node.left)
        return self._traverse_tree(sample, node.right)

    def _print_node(self, node, indent=""):
        """Print the subtree rooted at ``node``, indenting children by two spaces."""
        if node.label is not None:
            print(indent + "Class:", node.label)
        else:
            print(indent + "Feature:", node.feature)
            print(indent + "Threshold:", node.threshold)
            print(indent + "Left:")
            self._print_node(node.left, indent + "  ")
            print(indent + "Right:")
            self._print_node(node.right, indent + "  ")

    def print_tree(self):
        """Print the fitted tree structure starting at the root."""
        self._print_node(self.root)


In [19]:
# Load the Iris dataset.
# `load_iris` is imported here because nothing visible earlier in this
# notebook brings that bare name into scope (only `sklearn.datasets` is
# imported), so calling it would raise NameError on a fresh kernel.
from sklearn.datasets import load_iris

iris = load_iris()
X = iris.data
y = iris.target

# Create a DecisionTree instance
clf = DecisionTree()

# Fit the decision tree to the data
clf.fit(X, y)

# Output the decision tree structure
clf.print_tree()


Feature: 2
Threshold: 3.0
Left:
  Class: 0
Right:
  Feature: 3
  Threshold: 1.8
  Left:
    Feature: 2
    Threshold: 5.0
    Left:
      Feature: 3
      Threshold: 1.7
      Left:
        Class: 1
      Right:
        Class: 2
    Right:
      Feature: 3
      Threshold: 1.6
      Left:
        Class: 2
      Right:
        Feature: 0
        Threshold: 7.2
        Left:
          Class: 1
        Right:
          Class: 2
  Right:
    Feature: 2
    Threshold: 4.9
    Left:
      Feature: 0
      Threshold: 6.0
      Left:
        Class: 1
      Right:
        Class: 2
    Right:
      Class: 2


## Creating OR Tree

In [22]:
import pydotplus
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz

# Create the OR logic dataset: the output is 1 when at least one input is 1.
X = [[0, 0], [0, 1], [1, 0], [1, 1]]
y = [0, 1, 1, 1]

# Build the decision tree classifier.
# Both inputs are equally informative for OR, so the choice of root feature is
# a tie; pinning random_state makes scikit-learn resolve it the same way on
# every run and keeps the exported tree reproducible.
alg = DecisionTreeClassifier(random_state=1)
alg.fit(X, y)

# Export the fitted tree as DOT source (out_file=None returns it as a string).
dot_data = export_graphviz(
    alg,
    out_file=None,
    feature_names=['Input 1', 'Input 2'],
    class_names=['False', 'True'],
)
graph = pydotplus.graph_from_dot_data(dot_data)

# Render the tree to a PDF file; kept as the last expression of the cell so
# the value returned by write_pdf is displayed as the cell output.
graph.write_pdf("decision_tree_or.pdf")


True