# Extremely Fast Decision Tree Implementation

In [5]:
import numpy as np

class Node:
    def __init__(self, is_leaf=True, prediction=None):
        self.is_leaf = is_leaf
        self.prediction = prediction
        self.split_feature = None
        self.split_value = None
        self.children = {}
        self.class_counts = np.zeros(2)  # For binary classification

def hoeffding_bound(R, n, delta=0.05):
    return np.sqrt((R**2 * np.log(1/delta)) / (2 * n))

def best_split(data, labels):
    features = data.shape[1]
    best_split_feature = None
    best_split_value = None
    best_split_score = -np.inf
    n = len(data)
    
    for feature in range(features):
        values = np.sort(np.unique(data[:, feature]))
        for i in range(len(values) - 1):
            split_value = (values[i] + values[i+1]) / 2
            left_mask = data[:, feature] <= split_value
            right_mask = ~left_mask
            
            left_count = np.bincount(labels[left_mask], minlength=2)
            right_count = np.bincount(labels[right_mask], minlength=2)
            score = -np.sum(left_count**2) / np.sum(left_count) - np.sum(right_count**2) / np.sum(right_count)
            
            if score > best_split_score:
                best_split_feature = feature
                best_split_value = split_value
                best_split_score = score
                
    return best_split_feature, best_split_value

class EFDT:
    def __init__(self, delta=0.05):
        self.root = Node(is_leaf=True, prediction=0)
        self.delta = delta
    
    def fit(self, X, y):
        #Adjust to handle 0 and 1
        y_adjusted = y - 1

        for xi, yi in zip(X, y_adjusted):
            self._fit_single(xi, yi)
    
    def _fit_single(self, x, y):
        node = self.root
        while not node.is_leaf:
            if x[node.split_feature] <= node.split_value:
                node = node.children['left']
            else:
                node = node.children['right']
        
        node.class_counts[y] += 1
        total = np.sum(node.class_counts)
        node.prediction = np.argmax(node.class_counts)
        
        if total > 20:  # Arbitrary threshold to start considering splits
            self._attempt_to_split(node, x, y)
    
    def _attempt_to_split(self, node, x, y):
        X_sub = np.array([x])  # Placeholder for actual subset for the node
        y_sub = np.array([y])  # Placeholder for actual subset labels for the node
        
        feature, value = best_split(X_sub, y_sub)
        if feature is not None:
            R = 1  # Binary classification
            n = np.sum(node.class_counts)
            epsilon = hoeffding_bound(R, n, self.delta)
            
            # Check if the split is significant
            if epsilon < 0.1: 
                node.is_leaf = False
                node.split_feature = feature
                node.split_value = value
                node.children['left'] = Node(is_leaf=True, prediction=np.argmax(node.class_counts))
                node.children['right'] = Node(is_leaf=True, prediction=np.argmax(node.class_counts))
    
    def predict(self, X):
        preds = []
        for x in X:
            node = self.root
            while not node.is_leaf:
                if x[node.split_feature] <= node.split_value:
                    node = node.children['left']
                else:
                    node = node.children['right']
                    
            # Turn it back to 1 and 2
            preds.append(node.prediction + 1)
        return np.array(preds)



## Load the Dataset

In [6]:
import numpy as np

# Assuming your file is named 'data.txt' and located in the current directory
file_path = 'Skin_NonSkin 2.txt'

# Load the data
data = np.loadtxt(file_path, delimiter='\t')

# Split the data into features and target variable
X = data[:, :-1]
y = data[:, -1].astype(int) 



## Evaluate the Results

In [7]:
from sklearn.metrics import accuracy_score

model = EFDT()
model.fit(X, y)
predictions = model.predict(X)
actual_labels = y

# Calculate accuracy
accuracy = accuracy_score(actual_labels, predictions)

print(f'Accuracy: {accuracy * 100:.2f}%')


Accuracy: 79.25%
