### Decision Tree

In [40]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [96]:
class Node:
    """A single vertex of the decision tree.

    The object is a plain attribute container: each constructor argument is
    stored under the attribute of the same name, and every argument defaults
    to ``None``. In this file a leaf is created with only ``val`` set, while
    a decision node is created with the split description and both children
    and leaves ``val`` as ``None``.
    """

    def __init__(
        self,
        feature_id: int = None,
        threshold: float = None,
        info_gain: float = None,
        val: any = None,
        left: 'Node' = None,
        right: 'Node' = None,
    ) -> None:
        # Value carried by a leaf.
        self.val = val

        # Description of the test applied at a decision node.
        self.feature_id = feature_id
        self.threshold = threshold
        self.info_gain = info_gain

        # Child subtrees.
        self.left = left
        self.right = right

class DecisionTree:
    """Binary classification tree grown by greedy impurity reduction.

    Each decision node tests ``x[feature_id] <= threshold``: rows that pass go
    to the left child, rows with ``x[feature_id] > threshold`` go to the right
    child. A leaf stores the most frequent label among the training rows that
    reached it.

    Args:
        min_splits: A node holding fewer rows than this becomes a leaf.
        max_depth: A node whose depth (root is 0) is greater than this becomes
            a leaf, so nodes at depths ``0 .. max_depth`` may still split.
        mode: ``'entropy'`` scores splits by entropy reduction; any other
            value scores them by Gini impurity reduction.
    """

    def __init__(self, min_splits: int, max_depth: int, mode: str):
        self.root = None  # set by fit()
        self.min_splits = min_splits
        self.max_depth = max_depth
        self.mode = mode

    def build(self, dataset: np.ndarray, cur_depth: int = 0) -> 'Node':
        """Recursively grow the subtree for ``dataset``.

        Args:
            dataset: 2-D array whose last column holds the labels and whose
                remaining columns hold the features.
            cur_depth: Depth of the node being built (the root is 0).

        Returns:
            The root ``Node`` of the subtree.

        Raises:
            ValueError: If a leaf has to be created from zero rows.
        """
        labels = dataset[:, -1]
        n_samples = dataset.shape[0]
        n_features = dataset.shape[1] - 1

        must_stop = (
            n_samples < self.min_splits
            or cur_depth > self.max_depth
            # Every descendant of a pure node would predict the same label,
            # so splitting it further only adds nodes, never accuracy.
            or np.unique(labels).size <= 1
        )
        if must_stop:
            return Node(val=self.cal_leaf_val(labels))

        best_split = self.get_best_split(dataset, n_features)
        if not best_split:
            # No threshold produced two non-empty sides (e.g. every feature
            # column is constant), so the node cannot be divided.
            return Node(val=self.cal_leaf_val(labels))

        left_tree = self.build(best_split['left_data'], cur_depth + 1)
        right_tree = self.build(best_split['right_data'], cur_depth + 1)
        return Node(
            best_split['feature_id'],
            best_split['threshold'],
            best_split['info_gain'],
            None,
            left_tree,
            right_tree,
        )

    def get_best_split(self, dataset: np.ndarray, n_features: int) -> dict:
        """Search every feature/threshold pair for the largest gain.

        Candidate thresholds are the distinct values of each feature column.
        Candidates that leave one side empty are ignored, and on equal gain
        the candidate found first is kept.

        Args:
            dataset: 2-D array with the labels in the last column.
            n_features: Number of leading columns to treat as features.

        Returns:
            A dict with keys ``'info_gain'``, ``'threshold'``,
            ``'feature_id'``, ``'left_data'`` and ``'right_data'``, or an
            empty dict when no candidate yields two non-empty sides.
        """
        best_split: dict = {}
        max_info_gain = -float('inf')
        labels = dataset[:, -1]  # loop-invariant, so sliced once

        for feature_id in range(n_features):
            for threshold in np.unique(dataset[:, feature_id]):
                left_data, right_data = self.split(dataset, feature_id, threshold)
                if len(left_data) == 0 or len(right_data) == 0:
                    continue

                info_gain = self.cal_info_gain(labels, left_data[:, -1], right_data[:, -1])
                if info_gain > max_info_gain:
                    max_info_gain = info_gain
                    best_split = {
                        'info_gain': info_gain,
                        'threshold': threshold,
                        'feature_id': feature_id,
                        'left_data': left_data,
                        'right_data': right_data,
                    }

        return best_split

    def split(self, dataset: np.ndarray, feature_id: int, threshold: float):
        """Partition the rows of ``dataset`` on one feature.

        Args:
            dataset: 2-D array of rows.
            feature_id: Index of the column to compare.
            threshold: Value the column is compared against.

        Returns:
            ``(left_data, right_data)``: the rows with
            ``row[feature_id] <= threshold`` and the rows with
            ``row[feature_id] > threshold``. Both are 2-D arrays; an empty
            side has zero rows.
        """
        column = dataset[:, feature_id]
        # Two explicit masks (rather than one mask and its negation) so that a
        # value satisfying neither comparison lands on neither side.
        left_mask = np.asarray(column <= threshold, dtype=bool)
        right_mask = np.asarray(column > threshold, dtype=bool)
        return dataset[left_mask], dataset[right_mask]

    def cal_info_gain(self, dataset: np.ndarray, left: np.ndarray, right: np.ndarray) -> float:
        """Return the impurity reduction achieved by a split.

        Args:
            dataset: Labels of the parent node (despite the name, callers in
                this class pass the label column only).
            left: Labels sent to the left child.
            right: Labels sent to the right child.

        Returns:
            Parent impurity minus the size-weighted impurity of the children,
            using entropy when ``self.mode == 'entropy'`` and Gini otherwise.
        """
        wl = len(left) / len(dataset)
        wr = len(right) / len(dataset)
        impurity = self.cal_entropy if self.mode == 'entropy' else self.cal_gini
        return impurity(dataset) - wl * impurity(left) - wr * impurity(right)

    def cal_entropy(self, Y: np.ndarray) -> float:
        """Return the Shannon entropy (base 2) of the labels in ``Y``."""
        total = len(Y)
        # One counting pass instead of one boolean mask per class.
        _, counts = np.unique(Y, return_counts=True)
        entropy = 0
        for count in counts.tolist():
            p = count / total
            entropy += -p * np.log2(p)
        return entropy

    def cal_gini(self, Y: np.ndarray) -> float:
        """Return the Gini impurity of the labels in ``Y``."""
        total = len(Y)
        _, counts = np.unique(Y, return_counts=True)
        gini = 1
        for count in counts.tolist():
            p = count / total
            gini -= p ** 2
        return gini

    def cal_leaf_val(self, Y: np.ndarray) -> object:
        """Return the most frequent label in ``Y``.

        Ties are resolved in favour of the label that appears first in ``Y``.

        Raises:
            ValueError: If ``Y`` is empty.
        """
        from collections import Counter

        tally = Counter(Y)
        if not tally:
            raise ValueError("cannot compute a leaf value from an empty label array")
        # most_common keeps first-encountered order among equal counts.
        return tally.most_common(1)[0][0]

    def fit(self, X, y):
        """Grow the tree from features ``X`` and labels ``y``.

        Args:
            X: 2-D array-like of features, one row per sample.
            y: Array-like of labels; reshaped to a single column and appended
                to ``X`` as the last column of the training matrix.
        """
        features = np.asarray(X)
        labels = np.asarray(y).reshape(-1, 1)
        dataset = np.concatenate((features, labels), axis=1)
        self.root = self.build(dataset)

    def predict(self, X):
        """Return an array holding the predicted label for every row of ``X``.

        ``fit`` must have been called first, since prediction starts at
        ``self.root``.
        """
        return np.array([self.make_pred(x, self.root) for x in X])

    def make_pred(self, x, node: 'Node'):
        """Walk from ``node`` down to a leaf for sample ``x``; return its value.

        A node counts as a leaf when its ``val`` is not ``None``.
        """
        while node.val is None:
            if x[node.feature_id] <= node.threshold:
                node = node.left
            else:
                node = node.right
        return node.val


In [133]:
df = pd.read_csv('Iris.csv')
# dropna() returns a new frame rather than modifying df, so the result has to
# be rebound for rows with missing values to actually be removed.
df = df.dropna()
# Features: every column except the first and the last.
# NOTE(review): the first column is presumably a row id - confirm against the CSV.
X = df.iloc[:, 1:-1].values
# Labels: the last column, shaped as a single column.
Y = df.iloc[:, -1].values.reshape(-1, 1)

# Positional arguments are min_splits=3, max_depth=5, mode='gini'.
classifier = DecisionTree(3, 5, 'gini')

In [134]:
# Grow the tree from the feature matrix X and the label column Y.
classifier.fit(X, Y)

In [135]:
# Predict on the same X that was passed to fit, i.e. on the training rows.
y_pred=classifier.predict(X)

In [136]:
# Reshape the predictions to a single column, the same layout Y was given.
y_pred=y_pred.reshape(-1, 1)

In [138]:
from sklearn.metrics import accuracy_score, confusion_matrix

# Y and y_pred both come from the X/Y passed to fit, so these figures are
# training-set scores, not an estimate of performance on unseen data.
print(confusion_matrix(Y, y_pred))
# accuracy_score is multiplied by 100 to print it as a percentage.
print(accuracy_score(Y, y_pred)*100, '%')


[[50  0  0]
 [ 0 50  0]
 [ 0  0 50]]
100.0 %
