In [1]:
!pip install keras
!pip install tensorflow
!pip install tensorflow-datasets

In [4]:
# Decision tree + import library
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score
from keras.datasets import mnist
max_depth = 5 # You can change this value as you like
# def entropy(y: pd.Series):
def entropy(y: pd.Series):
  # Calculate the entropy of a series of labels
  counts = y.value_counts(normalize=True)
  return -np.sum(counts * np.log2(counts))

# def information_gain(x: pd.Series, y: pd.Series): return info_gain
def information_gain(x: pd.Series, y: pd.Series):
  # Calculate the information gain of a feature x given the labels y
  info_gain = entropy(y)
  for value in x.unique():
    y_subset = y[x == value]
    info_gain -= len(y_subset) / len(y) * entropy(y_subset)
  return info_gain

# def information_gains(X: pd.DataFrame, y: pd.Series): return the information gain of all features
def information_gains(X: pd.DataFrame, y: pd.Series):
  # Calculate the information gain of all features in X given the labels y
  info_gains = {}
  for feature in X.columns:
    info_gains[feature] = information_gain(X[feature], y)
  return info_gains

# class Node:
class Node:
  def __init__(self, depth):
    # Each node in the tree is an instance of class `Node` which is capable of predicting and fitting.
    self.depth = depth
    self.best_feature = ''
    self.children = []
    self.threshold = None
    self.choice = None

  def _is_leaf(self):
    # Check if the node is a leaf node
    return len(self.children) == 0

  def fit(self, X_train, y_train):
    # Fit the node to the training data and split into children if necessary
    # If the node is pure or reaches the maximum depth, make it a leaf node
    if len(y_train.unique()) == 1 or self.depth == max_depth:
      self.choice = y_train.mode()[0]
      return

    # Find the best feature to split on based on information gain
    info_gains = information_gains(X_train, y_train)
    self.best_feature = max(info_gains, key=info_gains.get)

    # Find the best threshold to split on based on entropy
    x_best = X_train[self.best_feature]
    thresholds = np.linspace(x_best.min(), x_best.max(), num=10)
    entropies = []
    for t in thresholds:
      y_left = y_train[x_best <= t]
      y_right = y_train[x_best > t]
      entropy_left = entropy(y_left) if len(y_left) > 0 else 0
      entropy_right = entropy(y_right) if len(y_right) > 0 else 0
      weighted_entropy = (len(y_left) * entropy_left + len(y_right) * entropy_right) / len(y_train)
      entropies.append(weighted_entropy)
    self.threshold = thresholds[np.argmin(entropies)]

    # Split the data into left and right subsets based on the best feature and threshold
    X_left = X_train[x_best <= self.threshold]
    y_left = y_train[x_best <= self.threshold]
    X_right = X_train[x_best > self.threshold]
    y_right = y_train[x_best > self.threshold]

    # Create two child nodes and fit them recursively
    left_child = Node(self.depth + 1)
    right_child = Node(self.depth + 1)
    left_child.fit(X_left, y_left)
    right_child.fit(X_right, y_right)
    self.children = [left_child, right_child]

  def predict(self, X):
    # Predict the label for a given input
    # If the node is a leaf node, return its choice
    if self._is_leaf():
      return self.choice

    # Otherwise, traverse to the left or right child based on the best feature and threshold
    x_best = X[self.best_feature]
    if x_best <= self.threshold:
      return self.children[0].predict(X)
    else:
      return self.children[1].predict(X)

# load mnist.keras.dataset
(x_train, y_train), (x_test, y_test) = mnist.load_data()

# scale data to [0 ,1]
x_train = x_train / 255.0
x_test = x_test / 255.0

# reshape data from 28*28 matrix to 784 array
x_train_flat = x_train.reshape(-1, 784)
x_test_flat = x_test.reshape(-1, 784)

# initializing the pca
pca = PCA(n_components=10)

# implement pca on our data with 10 component
pca.fit(x_train_flat)

# select 10 components for train and test data
x_train_pca = pca.transform(x_train_flat)
x_test_pca = pca.transform(x_test_flat)

# convert reduced datasets types to dataframe using pd
x_train_pca = pd.DataFrame(x_train_pca)
x_test_pca = pd.DataFrame(x_test_pca)
y_train = pd.Series(y_train)
y_test = pd.Series(y_test)

# dt = Node(depth=0)
dt = Node(depth=0)

# train dt on mnist
dt.fit(x_train_pca, y_train)

# report model accuracy
y_pred = x_test_pca.apply(dt.predict, axis=1)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy of decision tree on MNIST with PCA: {accuracy:.4f}")


Accuracy of decision tree on MNIST with PCA: 0.3098


روند آموزش یک ساعت و سه دقیقه طول کشید و به خاط زمان طولانی آموزش، از کم بودن دقت صرف نظر می شود