In [106]:
# Import necessary libraries
import pandas as pd
import numpy as np

# Load the dataset from a CSV file
data = pd.read_csv('data_banknote_authentication.csv')

# Split the data into training and testing sets
train_data = data.sample(frac=0.7)
test_data = data.drop(train_data.index)

# Convert DataFrames to NumPy arrays
train_array = train_data.values
test_array = test_data.values

# Separate features and labels for training and testing sets
features_train = train_array[:, 0:4]
labels_train = train_array[:, -1]
features_test = test_array[:, 0:4]
labels_test = test_array[:, -1]


In [107]:
from collections import Counter

def count_occurrences(lst):
    return Counter(lst)

def gini_index(regions, classes):
    total_gini = 0

    for region in regions:
        region_labels = [point[-1] for point in region]
        label_counts = count_occurrences(region_labels)
        n = len(region_labels)

        if n == 0:
            continue

        gini = 1
        for class_label in classes:
            class_count = label_counts.get(class_label, 0)
            gini -= (class_count / n) ** 2

        total_gini += n * gini

    return total_gini

In [108]:
def split_at_root_node(feature_index, value, dataset):
    dataset = np.array(dataset)
    
    mask = dataset[:, feature_index] > value
    L = dataset[~mask]
    R = dataset[mask]

    return list(map(list, L)), list(map(list, R))

In [109]:
def get_best_split(dataset):
    # Find the different classes present in the dataset
    classes = list(set(row[-1] for row in dataset))

    # Initialize variables for the best split
    best_split = {'index': None, 'value': None, 'gini_score': float('inf'), 'regions': None}

    # Loop over each input feature index
    for feature_index in range(len(dataset[0]) - 1):
        # Extract unique values of the current feature
        unique_values = set(row[feature_index] for row in dataset)

        # Loop over each unique value of the current feature
        for value in unique_values:
            left, right = split_at_root_node(feature_index, value, dataset)
            
            # Calculate Gini index for this split
            regions = [left, right]
            gini_score = gini_index(regions, classes)

            # Check if this split is better than the current best split
            if gini_score < best_split['gini_score']:
                best_split = {'index': feature_index, 'value': value, 'gini_score': gini_score, 'regions': regions}

    # Return the best split
    return best_split



In [110]:
class TreeNode:
    def __init__(self, index=None, value=None, is_leaf=False, outcome=None):
        self.index = index
        self.value = value
        self.is_leaf = is_leaf
        self.outcome = outcome
        self.left = None
        self.right = None

def leaf_output(region):
    if not region:
        return None
    
    outcomes = [row[-1] for row in region]
    
    if not outcomes:
        return None
    
    return max(set(outcomes), key=outcomes.count)

def build_tree(train, max_depth, min_size):
    root = create_node(train, max_depth, min_size, 1)
    return root

def create_node(dataset, max_depth, min_size, depth):
    if depth >= max_depth or len(dataset) <= min_size:
        return TreeNode(is_leaf=True, outcome=leaf_output(dataset))

    best_split = get_best_split(dataset)
    node = TreeNode(index=best_split['index'], value=best_split['value'])
    
    left, right = split_at_root_node(best_split['index'], best_split['value'], dataset)
    node.left = create_node(left, max_depth, min_size, depth + 1)
    node.right = create_node(right, max_depth, min_size, depth + 1)
    
    return node

def print_tree(node, depth=0):
    if node.is_leaf:
        print('%s[%s]' % ((depth*' ', node.outcome)))
    else:
        print('%s[X%d < %.3f]' % ((depth*' ', (node.index + 1), node.value)))
        print_tree(node.left, depth + 1)
        print_tree(node.right, depth + 1)

def predict(node, row):
    if node.is_leaf:
        return node.outcome
    else:
        if row[node.index] < node.value:
            return predict(node.left, row)
        else:
            return predict(node.right, row)



In [111]:
data=np.array(train)
tree = build_tree(data, 5, 10)

In [112]:
print_tree(tree)

[X1 < 0.301]
 [X2 < 7.122]
  [X1 < -0.475]
   [X3 < 6.716]
    [1.0]
    [1.0]
   [X2 < 5.010]
    [1.0]
    [0.0]
  [X1 < -4.478]
   [X1 < -6.368]
    [1.0]
    [1.0]
   [X1 < 0.300]
    [0.0]
    [None]
 [X3 < -4.404]
  [X1 < 2.392]
   [X1 < 0.895]
    [1.0]
    [1.0]
   [0.0]
  [X1 < 1.590]
   [X3 < -1.786]
    [1.0]
    [0.0]
   [X1 < 1.742]
    [0.0]
    [0.0]


In [113]:
def calculate_accuracy(tree, X, y):
    correct_predictions = sum(predict(tree, row) == label for row, label in zip(X, y))
    accuracy = correct_predictions / len(X) * 100
    return accuracy
print("Accuracy at training dataset: ",accuracy(tree,features_train,labels_train))
print("Accuracy at test dataset: ",accuracy(tree,features_test,labels_test)) 

Accuracy at training dataset:  94.58333333333333
Accuracy at test dataset:  96.83698296836984
