In [1]:
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np
import pandas as pd
import math
from matplotlib import pyplot as plt

In [2]:
class Node():
    def __init__(self, feature_name = None, threshold = None, left = None, right = None, is_leaf = False, gain = None, value = None):
        self.feature_name = feature_name
        self.threshold = threshold
        self.left = left
        self.right = right
        self.is_leaf = is_leaf
        self.gain = gain
        self.value = value

In [3]:
class DecisionTree():
    def __init__(self, n_min = 5):
        self.n_min = n_min
        self.root = None

    def compute_entropy(self, y):
        counts = np.bincount(y.astype(int))
        probabilities = counts / len(y)
        entropy = 0

        for p in probabilities:
            if p > 0:
                entropy += -p * math.log2(p)

        return entropy

    def compute_information_gain(self, feature_column, y, threshold):
        original_entropy = self.compute_entropy(y)

        left_mask = (feature_column <= threshold)
        right_mask = (feature_column > threshold)

        y_left = y[left_mask]
        y_right = y[right_mask]

        n = len(y)
        n_left = len(y_left)
        n_right = len(y_right)

        if n_left == 0 or n_right == 0:
            return 0  # If one child is empty, no gain.

        child_entropy = ((n_left / n) * self.compute_entropy(y_left)) + ((n_right / n) * self.compute_entropy(y_right))
        return original_entropy - child_entropy

    def choose_best_split(self, X, y):
        best_gain = -1
        best_feature_index = None
        best_threshold = None

        n_samples, n_features = X.shape  

        for feature_index in range(n_features):
            values = np.unique(X[:, feature_index])
            values.sort()

            if len(values) < 2:
                continue
            
            for i in range(len(values) - 1):
                threshold = (values[i] + values[i + 1]) / 2  
                gain = self.compute_information_gain(X[:, feature_index], y, threshold)

                if gain > best_gain:
                    best_gain = gain
                    best_feature_index = feature_index
                    best_threshold = threshold

        return best_feature_index, best_threshold

    def create_tree(self, X, y):
        if len(np.unique(y)) == 1 or len(y) < self.n_min:
            counts = np.bincount(y.astype(int))
            majority_label = np.argmax(counts)
            return Node(value=majority_label, is_leaf=True)

        best_feature, best_threshold = self.choose_best_split(X, y)

        if best_feature is None:
            counts = np.bincount(y.astype(int))
            majority_label = np.argmax(counts)
            return Node(value = majority_label, is_leaf = True)

        node = Node(feature_name = best_feature, threshold = best_threshold, is_leaf = False)

        left_indices = X[:, best_feature] <= best_threshold
        right_indices = X[:, best_feature] > best_threshold

        X_left, y_left = X[left_indices], y[left_indices]
        X_right, y_right = X[right_indices], y[right_indices]

        if len(y_left) == 0 or len(y_right) == 0:
            counts = np.bincount(y.astype(int))
            majority_label = np.argmax(counts)
            return Node(value=majority_label, is_leaf=True)

        node.left = self.create_tree(X_left, y_left)
        node.right = self.create_tree(X_right, y_right)

        return node

    def fit(self, X, y):
        self.root = self.create_tree(X, y)

    def predict_sample(self, x, node):
        if node.is_leaf:
            return node.value
        else:
            if x[node.feature_name] <= node.threshold:
                return self.predict_sample(x, node.left)
            else:
                return self.predict_sample(x, node.right)

    def predict(self, X):
        n = X.shape[0]
        predictions = []
        
        for i in range(n):
            sample = X[i]
            prediction = self.predict_sample(sample, self.root)
            predictions.append(prediction)
        
        return np.array(predictions)

In [4]:
iris_df = pd.read_csv("iris.csv", names=["sepal_length", "sepal_width", "petal_length", "petal_width", "class"])
with pd.option_context('future.no_silent_downcasting', True):
    iris_df= iris_df.replace({'Iris-setosa':0, 'Iris-versicolor': 1, 'Iris-virginica': 2}).infer_objects()

X = iris_df.iloc[:, :-1].values  
y = iris_df.iloc[:, -1].values  

In [8]:
kf = KFold(n_splits = 10)
n_min_vals = {5, 10, 15, 20}
results = []

for n_min in n_min_vals:
    accuracy_vals = []
    for train_index, test_index in kf.split(X):
        # Splitting the dataset into train set and test set
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # Train Decision Tree with current n_min
        tree = DecisionTree(n_min = n_min)
        tree.fit(X_train, y_train)

        # Make predictions
        y_pred = tree.predict(X_test)

        # Compute accuracy
        acc = accuracy_score(y_test, y_pred)
        accuracy_vals.append(acc)

    # Compute mean and standard deviation of accuracy
    avg_accuracy = np.mean(accuracy_vals)
    std_accuracy = np.std(accuracy_vals)

    results.append({"n_min": n_min, "Mean Accuracy": avg_accuracy, "Std Dev": std_accuracy})

In [9]:
results_df = pd.DataFrame(results)
results_df

Unnamed: 0,n_min,Mean Accuracy,Std Dev
0,10,0.933333,0.078881
1,20,0.933333,0.078881
2,5,0.913333,0.123108
3,15,0.933333,0.078881


In [10]:
spambase_df = pd.read_csv("spambase.csv")

X = spambase_df.iloc[:, :-1].values  
y = spambase_df.iloc[:, -1].values

In [None]:
kf = KFold(n_splits = 10)
n_min_vals = {5, 10, 15, 20, 25}
results = []

for n_min in n_min_vals:
    accuracy_vals = []
    for train_index, test_index in kf.split(X):
        # Splitting the dataset into train set and test set
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # Train Decision Tree with current n_min
        tree = DecisionTree(n_min = n_min)
        tree.fit(X_train, y_train)

        # Make predictions
        y_pred = tree.predict(X_test)

        # Compute accuracy
        acc = accuracy_score(y_test, y_pred)
        accuracy_vals.append(acc)

    # Compute mean and standard deviation of accuracy
    avg_accuracy = np.mean(accuracy_vals)
    std_accuracy = np.std(accuracy_vals)

    results.append({"n_min": n_min, "Mean Accuracy": avg_accuracy, "Std Dev": std_accuracy})

In [None]:
results_df = pd.DataFrame(results)
results_df