# Decision tree

# Install and import libraries

In [None]:
!pip install -q tfds-nightly tensorflow matplotlib

In [7]:
!pip install tensorflow-datasets

In [9]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.decomposition import PCA
from tensorflow.keras.datasets import mnist

# Define a class for the decision tree node

In [10]:
class Node:
  """A node of a binary decision tree that splits on a feature's median.

  Each internal node picks the feature whose median split yields the largest
  information gain, sends samples with ``value <= median`` to the left child
  and the remaining samples to the right child. A node becomes a leaf when
  its labels are pure, when its remaining depth budget is 0, or when no
  feature's median separates the samples into two non-empty groups.
  """

  def __init__(self, depth):
    """Create an unfitted node.

    Args:
      depth: Number of further split levels allowed below this node. A node
        created with depth 0 is always a leaf. A negative depth never
        reaches 0, so the tree then grows until no further split is possible.
    """
    self.depth = depth # The depth of the node in the tree
    self.split_feature = None # The feature to split on
    self.split_value = None # The value to split on
    self.left = None # The left child node
    self.right = None # The right child node
    self.label = None # The label of the node if it is a leaf

  def _is_leaf(self):
    """Return True if this node holds a label, i.e. it is a leaf."""
    return self.label is not None

  @staticmethod
  def _majority_label(y):
    """Return the most frequent label in y (smallest label wins ties)."""
    # np.unique works for any sortable label type, unlike np.bincount which
    # only accepts non-negative integers.
    labels, counts = np.unique(y, return_counts=True)
    return labels[counts.argmax()]

  def entropy(self, y):
    """Return the Shannon entropy (in bits) of the label array y.

    An empty array has zero entropy.
    """
    if len(y) == 0:
      return 0.0
    _, counts = np.unique(y, return_counts=True)
    probs = counts / len(y)
    return -np.sum(probs * np.log2(probs))

  def information_gain(self, x, y):
    """Return the entropy reduction from splitting y at the median of x.

    Args:
      x: 1-D array with one feature value per sample.
      y: 1-D label array aligned with x.

    Returns:
      Entropy of y minus the size-weighted entropy of the two subsets
      ``x <= median`` and ``x > median``. Returns 0.0 for empty input.
    """
    n_samples = len(y)
    if n_samples == 0:
      # Nothing to split; avoids dividing by zero below.
      return 0.0
    entropy_before = self.entropy(y)
    # Compute the threshold once and reuse it for both sides
    threshold = np.median(x)
    y_left = y[x <= threshold]
    y_right = y[x > threshold]
    # Weighted average of the entropy after splitting
    entropy_after = (
      (len(y_left) / n_samples) * self.entropy(y_left)
      + (len(y_right) / n_samples) * self.entropy(y_right)
    )
    return entropy_before - entropy_after

  def information_gains(self, X, y):
    """Return a numpy array holding the information gain of each column of X."""
    return np.array([self.information_gain(X[:, i], y) for i in range(X.shape[1])])

  def fit(self, X_train, y_train):
    """Fit this node, and recursively its children, to the training data.

    Args:
      X_train: 2-D array-like of shape (n_samples, n_features).
      y_train: 1-D array-like of n_samples labels.

    Raises:
      ValueError: If y_train is empty.
    """
    X_train = np.asarray(X_train)
    y_train = np.asarray(y_train)
    if len(y_train) == 0:
      raise ValueError("cannot fit a Node on an empty training set")

    # Reset any state left over from a previous fit so that a node which used
    # to be a leaf does not keep reporting itself as one.
    self.split_feature = None
    self.split_value = None
    self.left = None
    self.right = None
    self.label = None

    # Pure node or depth budget exhausted: become a leaf with the majority label
    if len(np.unique(y_train)) == 1 or self.depth == 0:
      self.label = self._majority_label(y_train)
      return

    # Otherwise, find the best feature to split on based on information gain
    gains = self.information_gains(X_train, y_train)
    if gains.size == 0:
      # No features to split on
      self.label = self._majority_label(y_train)
      return
    best_feature = gains.argmax()
    column = X_train[:, best_feature]
    threshold = np.median(column)
    left_mask = column <= threshold
    right_mask = column > threshold

    # With tied or constant values the median can leave one side empty.
    # Recursing on an empty subset would crash, so stop here instead.
    if not left_mask.any() or not right_mask.any():
      self.label = self._majority_label(y_train)
      return

    self.split_feature = best_feature
    self.split_value = threshold
    # Create two child nodes and recursively fit them to the subsets
    self.left = Node(self.depth - 1)
    self.left.fit(X_train[left_mask], y_train[left_mask])
    self.right = Node(self.depth - 1)
    self.right.fit(X_train[right_mask], y_train[right_mask])

  def predict(self, X):
    """Predict the label of a single sample.

    Args:
      X: 1-D array of feature values for one sample (not a batch).

    Returns:
      The label stored in the leaf reached by following the splits.
    """
    if self._is_leaf():
      return self.label
    # Descend to the child on the matching side of the threshold
    if X[self.split_feature] <= self.split_value:
      return self.left.predict(X)
    return self.right.predict(X)

# Training & Testing

In [11]:
# Fetch MNIST, rescale the pixel intensities to [0, 1] and flatten every
# 28x28 image into a 784-element row
(x_train, y_train), (x_test, y_test) = mnist.load_data()
x_train = (x_train / 255.0).reshape(-1, 784)
x_test = (x_test / 255.0).reshape(-1, 784)

# Learn a 10-component PCA projection from the training images only
pca = PCA(n_components=10).fit(x_train)

# Project both splits onto the principal components and wrap them as DataFrames
x_train_pca = pd.DataFrame(pca.transform(x_train))
x_test_pca = pd.DataFrame(pca.transform(x_test))

# Root node of the tree; at most 10 levels of splits below it
dt = Node(depth=10)

# Grow the tree on the projected training data
dt.fit(x_train_pca.to_numpy(), y_train)

# Classify the test samples one row at a time
y_pred = list(map(dt.predict, x_test_pca.to_numpy()))

# Report the fraction of correctly classified test samples
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")


Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz
Accuracy: 0.8019
