In [281]:
import pandas as pd
import numpy as np

In [282]:
class Node:
    def __init__(self, label=None, attribute=None, children=None):
        self.label = label
        self.attribute = attribute
        self.children = children

In [283]:
class DecisionTree:
    def __init__(self, attributes, max_depth=3):
        self.max_depth = max_depth
        self.attributes = attributes
        self.tree = None
        self.labels = []

    def fit(self, X, y):
        self.tree = self._build_tree(X, y)

    def predict(self, X):
        return [self._predict(inputs) for inputs in X]

    def print_tree(self):
        self._print_tree(self.tree)

    def _build_tree(self, X, y, depth=0, used_attributes=[]):
        num_samples, num_attributes = X.shape
        num_classes = len(np.unique(y))

        # base case
        if depth >= self.max_depth or num_classes == 1 or num_samples < 2 or num_attributes == 0:
            label = max(y, key=list(y).count)
            self.labels.append(label)
            return Node(label=label)

        # greedy search
        best_attribute = self._best_attribute(X, y, used_attributes)

        # partition
        children = {}
        for value in np.unique(X[:, best_attribute]):
            X_subset = X[X[:, best_attribute] == value]
            y_subset = y[X[:, best_attribute] == value]
            children[value] = self._build_tree(X_subset, y_subset, depth + 1, used_attributes + [best_attribute])
        
        return Node(attribute=best_attribute, children=children)
    
    def _best_attribute(self, X, y, used_attributes):
        best_attribute = None
        min_entropy = float('inf')

        for attribute in range(X.shape[1]):
            if attribute in used_attributes:
                continue
            entropy = self._weighted_sum(attribute, y)
            if entropy < min_entropy:
                min_entropy = entropy
                best_attribute = attribute

        return best_attribute

    def _entropy(self, y):
        num_samples = len(y)
        entropy = 0

        for class_ in np.unique(y):
            p = len(y[y == class_]) / num_samples
            entropy += -p * np.log2(p)

        return entropy

    def _weighted_sum(self, attribute, y):
        num_samples = len(y)
        weighted_sum = 0

        for value in np.unique(attribute):
            y_subset = y[attribute == value]
            weighted_sum += (len(y_subset) / num_samples) * self._entropy(y_subset)

        return weighted_sum

    def _predict(self, inputs):
        node = self.tree

        while node.attribute is not None:
            attribute = node.attribute
            if inputs[attribute] not in node.children:
                # if the value is not in the tree, return the most common label
                return max(self.labels, key=self.labels.count)
            node = node.children[inputs[attribute]]

        return node.label

    def _print_tree(self, node, spacing=""):
        if node.attribute is not None:
            print(spacing + "Attribute:", self.attributes[node.attribute])
            for value, child in node.children.items():
                print(spacing + '--> Value:', value)
                self._print_tree(child, spacing + "  ")
        else:
            print(spacing + "Predict", node.label)


In [284]:
# load data
df = pd.read_csv('iris.csv')
df.head()

Unnamed: 0,sepallength,sepalwidth,petallength,petalwidth,class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [285]:
# bin data
df['sepallength'] = pd.cut(df['sepallength'], bins=3, labels=['short', 'medium', 'long'])
df['sepalwidth'] = pd.cut(df['sepalwidth'], bins=3, labels=['short', 'medium', 'long'])
df['petallength'] = pd.cut(df['petallength'], bins=3, labels=['short', 'medium', 'long'])
df['petalwidth'] = pd.cut(df['petalwidth'], bins=3, labels=['short', 'medium', 'long'])

df.head()

df.to_csv('preprocessed_iris.csv', index=False)

In [286]:
train = df.sample(frac=0.8, random_state=20)
test = df.drop(train.index)

train.to_csv('train.csv', index=False)
test.to_csv('test.csv', index=False)

train_X = train[['sepallength', 'sepalwidth', 'petallength', 'petalwidth']].values
train_y = train['class'].values

test_X = test[['sepallength', 'sepalwidth', 'petallength', 'petalwidth']].values
test_y = test['class'].values

In [287]:
# model
dt = DecisionTree(attributes=['sepallength', 'sepalwidth', 'petallength', 'petalwidth'])
dt.fit(train_X, train_y)
dt.print_tree()

Attribute: sepallength
--> Value: long
  Attribute: sepalwidth
  --> Value: long
    Predict Iris-virginica
  --> Value: medium
    Attribute: petallength
    --> Value: long
      Predict Iris-virginica
    --> Value: medium
      Predict Iris-versicolor
  --> Value: short
    Attribute: petallength
    --> Value: long
      Predict Iris-virginica
    --> Value: medium
      Predict Iris-versicolor
--> Value: medium
  Attribute: sepalwidth
  --> Value: long
    Predict Iris-setosa
  --> Value: medium
    Attribute: petallength
    --> Value: long
      Predict Iris-virginica
    --> Value: medium
      Predict Iris-versicolor
  --> Value: short
    Attribute: petallength
    --> Value: long
      Predict Iris-virginica
    --> Value: medium
      Predict Iris-versicolor
--> Value: short
  Attribute: sepalwidth
  --> Value: long
    Predict Iris-setosa
  --> Value: medium
    Attribute: petallength
    --> Value: medium
      Predict Iris-versicolor
    --> Value: short
      Predict I

In [288]:
# predict
predictions = dt.predict(test_X)

# accuracy
accuracy = np.sum(predictions == test_y) / len(test_y)
print('Accuracy:', accuracy)

# confusion matrix
pd.crosstab(test_y, predictions, rownames=['Actual'], colnames=['Predicted'])

Accuracy: 0.9333333333333333


Predicted,Iris-setosa,Iris-versicolor,Iris-virginica
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Iris-setosa,13,0,0
Iris-versicolor,0,4,2
Iris-virginica,0,0,11


In [289]:
from sklearn.tree import DecisionTreeClassifier

In [290]:
# replace labels with ints
train_X = train[['sepallength', 'sepalwidth', 'petallength', 'petalwidth']].replace({'short': 0, 'medium': 1, 'long': 2}).values
test_X = test[['sepallength', 'sepalwidth', 'petallength', 'petalwidth']].replace({'short': 0, 'medium': 1, 'long': 2}).values

In [291]:
model = DecisionTreeClassifier(max_depth=3, criterion='entropy')
model.fit(train_X, train_y)

In [292]:
predictions = model.predict(test_X)

# accuracy
accuracy = np.sum(predictions == test_y) / len(test_y)
print('Accuracy:', accuracy)

# confusion matrix
pd.crosstab(test_y, predictions, rownames=['Actual'], colnames=['Predicted'])

Accuracy: 0.9333333333333333


Predicted,Iris-setosa,Iris-versicolor,Iris-virginica
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Iris-setosa,13,0,0
Iris-versicolor,0,4,2
Iris-virginica,0,0,11
