In [17]:
import numpy as np
import pandas as pd

In [18]:
def gini_impurity(y):
    """Return the Gini impurity of a collection of class labels.

    The impurity is ``1 - sum(p_k ** 2)``, where ``p_k`` is the fraction of
    entries in ``y`` that carry the k-th distinct label.

    Parameters
    ----------
    y : array-like
        Class labels; anything ``np.unique`` accepts.

    Returns
    -------
    float
        ``0.0`` for an empty or single-class input; larger values mean the
        labels are more evenly mixed.
    """
    # Only the per-class counts matter; the distinct label values are unused.
    _, counts = np.unique(y, return_counts=True)
    total = counts.sum()
    if total == 0:
        # No labels means nothing is mixed. Without this guard the formula
        # below would report 1.0, because the sum over an empty array is 0.
        return 0.0
    prob = counts / total
    return 1 - np.sum(prob**2)

In [19]:
def gini_split(feature, threshold, y):
    """Score the split ``feature < threshold`` by its weighted Gini impurity.

    Samples whose feature value is below ``threshold`` go left and samples
    whose value is greater than or equal to it go right. Each side's impurity
    is weighted by that side's size divided by ``len(y)``.

    Parameters
    ----------
    feature : np.ndarray
        Feature values, one per sample (compared elementwise to ``threshold``).
    threshold : scalar
        Cut point for the split.
    y : np.ndarray
        Labels aligned with ``feature``; indexed with boolean masks.

    Returns
    -------
    float or int
        The weighted impurity, or the sentinel ``999`` when either side of
        the split receives no samples.
    """
    goes_left = feature < threshold
    goes_right = feature >= threshold

    left_labels, right_labels = y[goes_left], y[goes_right]
    n_left, n_right = len(left_labels), len(right_labels)

    # A split that leaves one side empty separates nothing: report the
    # "invalid split" sentinel instead of an impurity.
    if not (n_left and n_right):
        return 999

    n_total = len(y)
    weighted_left = (n_left / n_total) * gini_impurity(left_labels)
    weighted_right = (n_right / n_total) * gini_impurity(right_labels)
    return weighted_left + weighted_right

In [20]:
def best_split(X, y):
    """Find the feature/threshold pair with the lowest weighted Gini impurity.

    Every distinct value of every column, except the column minimum, is tried
    as a threshold for the rule ``X[:, col] < threshold``.

    Parameters
    ----------
    X : np.ndarray
        2-D array of samples (rows) by features (columns).
    y : np.ndarray
        Labels aligned with the rows of ``X``.

    Returns
    -------
    tuple
        ``(feature_index, threshold)`` of the best split, or ``(None, None)``
        when no threshold separates the samples into two non-empty groups.
        On ties the first candidate found wins (lowest column index, then
        smallest threshold), because the comparison below is strict.
    """
    best_feature = None
    best_threshold = None
    best_gini = 999  # same sentinel gini_split returns for an invalid split

    for col in range(X.shape[1]):
        column = X[:, col]  # slice once per feature rather than once per threshold
        # np.unique returns sorted values, so the first one is the column
        # minimum. `column < minimum` is always empty, i.e. an invalid split
        # that can never be selected, so skip it.
        for t in np.unique(column)[1:]:
            gini = gini_split(column, t, y)
            if gini < best_gini:
                best_gini = gini
                best_feature = col
                best_threshold = t

    return best_feature, best_threshold

In [21]:
class Node:
    """One node of the decision tree.

    A node built with only ``label`` acts as a leaf. A node built with
    ``feature``, ``threshold``, ``left`` and ``right`` acts as an internal
    decision node whose ``label`` stays ``None``.
    """

    def __init__(self, feature=None, threshold=None, left=None, right=None, label=None):
        # Split rule: column index and the cut point compared against it.
        self.feature, self.threshold = feature, threshold
        # Child nodes for the two outcomes of the split rule.
        self.left, self.right = left, right
        # Predicted class for a leaf; None on internal nodes.
        self.label = label

In [22]:
def _majority_label(y):
    """Return the most frequent label in ``y`` (the smallest label wins ties)."""
    values, counts = np.unique(y, return_counts=True)
    return values[np.argmax(counts)]


def build_tree(X, y, depth=0, max_depth=5):
    """Recursively grow a decision tree using Gini-based splits.

    Parameters
    ----------
    X : np.ndarray
        2-D array of samples (rows) by features (columns).
    y : np.ndarray
        Labels aligned with the rows of ``X``.
    depth : int, optional
        Depth of the node being built; callers normally leave this at 0.
    max_depth : int, optional
        Depth at which growth stops and a majority-vote leaf is emitted.

    Returns
    -------
    Node
        Root of the (sub)tree fitted to ``X`` and ``y``.

    Raises
    ------
    ValueError
        If ``y`` is empty, since no label can be assigned to a leaf.
    """
    if len(y) == 0:
        # Fail with a clear message instead of an opaque argmax error later.
        raise ValueError("build_tree requires at least one sample")

    # Pure node: every sample carries the same label.
    if len(np.unique(y)) == 1:
        return Node(label=y[0])

    # `>=` rather than `==` so growth still stops when the starting depth
    # is already beyond max_depth (or max_depth is negative).
    if depth >= max_depth:
        return Node(label=_majority_label(y))

    feature, threshold = best_split(X, y)
    if feature is None:
        # No threshold separates the samples; fall back to a majority leaf.
        return Node(label=_majority_label(y))

    left_idx = X[:, feature] < threshold
    right_idx = X[:, feature] >= threshold

    left_child = build_tree(X[left_idx], y[left_idx], depth + 1, max_depth)
    right_child = build_tree(X[right_idx], y[right_idx], depth + 1, max_depth)

    return Node(feature, threshold, left_child, right_child)

In [23]:
def predict_one(node, x):
    """Walk the tree from ``node`` and return the label predicted for ``x``.

    At each node whose ``label`` is ``None`` the sample moves to ``left``
    when ``x[node.feature] < node.threshold`` and to ``right`` otherwise.
    The walk ends at the first node with a non-``None`` label.
    """
    current = node
    while current.label is None:
        goes_left = x[current.feature] < current.threshold
        current = current.left if goes_left else current.right
    return current.label

def predict(tree, X):
    """Predict a label for every row of ``X`` and return them as an array.

    Each row is routed through ``tree`` with ``predict_one``; the results
    are collected in row order.
    """
    labels = []
    for row in X:
        labels.append(predict_one(tree, row))
    return np.array(labels)

In [24]:
# Load the semicolon-delimited dataset and keep only its first 1000 rows.
df = pd.read_csv("cardio_train.csv", sep=";").head(1000)

# "cardio" is the target; every remaining column is used as a feature.
# NOTE(review): that includes any identifier-like column the CSV may
# contain; confirm this is intended.
y = df["cardio"].values
X = df.drop("cardio", axis=1).values

# Ordered (unshuffled) hold-out: the first 80% of rows train, the rest test.
split = int(0.8 * len(df))
X_train, y_train = X[:split], y[:split]
X_test, y_test = X[split:], y[split:]

# Fit the tree on the training rows, then label the held-out rows.
tree = build_tree(X_train, y_train, max_depth=5)
y_pred = predict(tree, X_test)

# Accuracy = share of held-out rows whose predicted label matches the truth.
n_correct = np.sum(y_pred == y_test)
accuracy = n_correct / len(y_test)
print("Accuracy:", accuracy)


Accuracy: 0.755
