In [20]:
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Load Iris with as_frame=True so the data comes back as pandas objects.
iris = load_iris(as_frame=True)
# `frame` is a single DataFrame holding the four measurement columns plus `target`.
df = iris.frame
# Preview the first rows as the cell's rich output.
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [21]:
class TreeNode:
    """One node of a binary decision tree.

    An internal node carries the split description (``feature_index`` and
    ``threshold``) together with its ``left`` and ``right`` children.  A leaf
    carries only ``value``, the class label it predicts.  ``value`` is
    keyword-only so a leaf is always created explicitly as
    ``TreeNode(value=...)``.
    """

    def __init__(self, feature_index=None, threshold=None, left=None, right=None, *, value=None):
        # Split rule: which feature column to test and the cut-off to compare with.
        self.feature_index, self.threshold = feature_index, threshold
        # Subtrees followed on either side of the split.
        self.left, self.right = left, right
        # Class label for a leaf; stays None on internal nodes.
        self.value = value

In [22]:
def gini_impurity(y):
    """Return the Gini impurity of a collection of class labels.

    The impurity is ``1 - sum(p_k ** 2)``, where ``p_k`` is the fraction of
    labels that belong to class ``k``.

    Parameters
    ----------
    y : array-like
        Labels (target values) of the data points.  Anything ``pd.Series``
        accepts works: a Series, a NumPy array, a list or a tuple.

    Returns
    -------
    float
        ``0.0`` when every label is the same (or when ``y`` is empty), larger
        values the more evenly the classes are mixed.
    """
    # An empty node contains no mixture of classes; report it as pure instead
    # of letting the "1 minus nothing" arithmetic claim maximum impurity.
    if len(y) == 0:
        return 0.0

    # value_counts does the per-class counting in one vectorised pass, and
    # wrapping in a Series lets plain lists/tuples be used as well.
    proportions = pd.Series(y).value_counts(normalize=True)
    return float(1.0 - (proportions ** 2).sum())


In [23]:
def best_split(x, y):
    """Find the single-feature threshold split with the lowest weighted Gini impurity.

    Every distinct value of every feature column is tried as a threshold; rows
    with ``value <= threshold`` go left and all remaining rows go right.

    Parameters
    ----------
    x : pandas.DataFrame
        Feature matrix, one row per sample and one column per feature.
    y : array-like
        Class labels, paired with the rows of ``x`` by position.

    Returns
    -------
    tuple
        ``(feature_index, threshold)`` of the best split, or ``(None, None)``
        when no threshold separates the rows into two non-empty groups.
        Ties are resolved in favour of the lowest feature index and then the
        smallest threshold.
    """
    best_gini = float('inf')
    best_idx = None
    best_threshold = None

    n_sample, n_features = x.shape
    # Work on plain arrays so rows and labels are matched by position; this
    # also lets `y` be a list or carry an index different from `x`'s.
    labels = pd.Series(y).to_numpy()

    for feat in range(n_features):
        # Pull the column out once instead of on every threshold.
        column = x.iloc[:, feat].to_numpy()
        # Sorted candidates give a well-defined scan order, so equal-scoring
        # splits are always resolved the same way.
        for threshold in sorted(set(column.tolist())):
            goes_left = column <= threshold
            n_left = int(goes_left.sum())
            n_right = n_sample - n_left
            # A split that leaves one side empty separates nothing.
            if n_left == 0 or n_right == 0:
                continue

            # The right side is the complement of the left mask, the same
            # partition build_tree applies when it actually splits the data.
            gini_left = gini_impurity(labels[goes_left])
            gini_right = gini_impurity(labels[~goes_left])
            weighted_gini = (n_left * gini_left + n_right * gini_right) / n_sample

            if weighted_gini < best_gini:
                best_gini = weighted_gini
                best_idx = feat
                best_threshold = threshold
    return best_idx, best_threshold

In [24]:
def build_tree(X, y, depth=0, max_depth=None, min_samples_split=2):
    """Recursively grow a decision tree and return its root ``TreeNode``.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature rows that reach this node.
    y : pandas.Series
        Labels for those rows.
    depth : int
        Depth of the node being built; the root is 0.
    max_depth : int or None
        Deepest level at which a split may still be made; ``None`` means no limit.
    min_samples_split : int
        Fewest samples a node must hold to be considered for splitting.

    Returns
    -------
    TreeNode
        A leaf holding a class label, or an internal node with two subtrees.
    """
    # A pure node becomes a leaf carrying its only label.
    if len(set(y)) == 1:
        return TreeNode(value=y.iloc[0])

    # Depth limit reached or too few samples: stop and predict the majority class.
    if (max_depth is not None and depth >= max_depth) or len(y) < min_samples_split:
        return TreeNode(value=y.mode()[0])

    split_feature, split_value = best_split(X, y)
    # No usable split exists, so fall back to the majority class.
    if split_feature is None:
        return TreeNode(value=y.mode()[0])

    # Partition the rows and grow each side one level deeper (left first).
    goes_left = X.iloc[:, split_feature] <= split_value
    left_child, right_child = (
        build_tree(X[side], y[side], depth + 1, max_depth, min_samples_split)
        for side in (goes_left, ~goes_left)
    )

    return TreeNode(
        feature_index=split_feature,
        threshold=split_value,
        left=left_child,
        right=right_child,
    )

In [None]:
def predict_one(node, x):
    """Return the class label the tree rooted at ``node`` assigns to one sample.

    Parameters
    ----------
    node : TreeNode
        Root of the (sub)tree to evaluate.
    x : pandas.Series or sequence
        Feature values of a single sample, addressed by position.  A Series is
        read through ``.iloc``; lists, tuples and arrays are indexed directly.

    Returns
    -------
    The ``value`` stored on the leaf the sample ends up in.
    """
    # Position-based accessor: .iloc for pandas rows, plain indexing otherwise.
    features = x.iloc if hasattr(x, "iloc") else x

    # Walk down iteratively so a very deep tree cannot exhaust the recursion
    # limit.  A node is a leaf as soon as it carries a value.
    while node.value is None:
        if features[node.feature_index] <= node.threshold:
            node = node.left
        else:
            node = node.right
    return node.value

def predict(tree, X):
    """Predict a class label for every sample in ``X``.

    Parameters
    ----------
    tree : TreeNode
        Root of a fitted decision tree.
    X : pandas.DataFrame or 2D array-like
        Samples to classify, one per row.  Anything that is not already a
        DataFrame (NumPy array, list of lists, ...) is wrapped in one.

    Returns
    -------
    list
        One predicted label per row, in row order.
    """
    # iterrows only exists on DataFrames, so wrap other 2D inputs first; each
    # row then reaches predict_one as a pandas Series either way.
    frame = X if isinstance(X, pd.DataFrame) else pd.DataFrame(X)
    return [predict_one(tree, row) for _, row in frame.iterrows()]

In [26]:
# Experiment settings, named so they are easy to find and tune.
TEST_SIZE = 0.2     # fraction of the samples held out for evaluation
RANDOM_STATE = 42   # fixed seed so the train/test split is reproducible
MAX_DEPTH = 3       # depth limit passed to build_tree

# Prepare data: feature columns in X, class labels in y.
X = df.drop(columns='target')
y = df['target']

# Hold out part of the data so accuracy is measured on samples the tree never saw.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE
)

# Build the tree on the training portion only.
tree = build_tree(X_train, y_train, max_depth=MAX_DEPTH)

# Predict on the held-out test set.
y_pred = predict(tree, X_test)

# Evaluate accuracy: the fraction of test samples labelled correctly.
print("Accuracy:", accuracy_score(y_test, y_pred))

Accuracy: 1.0
