# 1. Load Dataset (Using Only CSV Reading, No ML Libraries)

In [1]:
import csv

def load_csv(path):
    """Read a semicolon-delimited numeric CSV file into a list of rows.

    The first line is treated as a header and discarded. Every remaining
    non-blank line is converted to a list of floats.

    Args:
        path: Path of the CSV file to read.

    Returns:
        list[list[float]]: One inner list per data row, in file order.
        Empty if the file contains no data rows.

    Raises:
        ValueError: If a field cannot be converted to ``float``.
    """
    data = []
    # newline='' lets the csv module do its own line-ending handling.
    with open(path, 'r', newline='') as f:
        reader = csv.reader(f, delimiter=';')   # fields are separated by ';', not ','
        next(reader, None)                      # skip the header; tolerate an empty file
        for row in reader:
            if not row:
                # csv.reader yields [] for blank lines; keeping them would
                # put empty rows into the dataset and break row[-1] later.
                continue
            data.append([float(x) for x in row])
    return data

# Load the whole file once; the training cell further down slices `dataset`.
dataset = load_csv("cardio_train.csv")
# Shape check: number of data rows, and number of columns in the first row.
print("Rows:", len(dataset))
print("Cols:", len(dataset[0]))

Rows: 70000
Cols: 13


# 2. How Decision Tree Works (Simple Explanation)

# A Decision Tree:

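- Chooses the feature and threshold that split the data into the purest groups
- Uses Gini impurity or entropy to measure split quality
- Repeats the split on each resulting group until one of the following holds:
  - The maximum depth is reached
  - A group has no more than the minimum number of samples
  - The group is pure (all of its rows belong to one class)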

Formula (Gini):
$$G = 1 - \left(p_0^2 + p_1^2\right)$$

Where p0 and p1 are class proportions.

# 3. Decision Tree Implementation (From Scratch)
# Step 1: Gini impurity

In [2]:
def gini(groups):
    """Weighted Gini impurity of a candidate split.

    For each non-empty group the impurity ``1 - sum(p_k * p_k)`` is computed
    from the proportions ``p_k`` of the class labels found in the last column
    of its rows. The per-group values are then combined with weights
    ``len(group) / total``.

    Any hashable label values are supported, not only 0 and 1. For 0/1 labels
    the result is the same as ``1 - (p0*p0 + p1*p1)``.

    Args:
        groups: Sequence of groups (it is iterated twice); each group is a
            list of rows whose last element is the class label.

    Returns:
        float: The weighted impurity; ``0.0`` when every group is empty.
    """
    total = sum(len(g) for g in groups)
    score = 0.0
    for group in groups:
        size = len(group)
        if size == 0:
            # An empty group contributes nothing (and would divide by zero).
            continue
        # Tally how often each label occurs instead of assuming labels 0/1.
        counts = {}
        for row in group:
            label = row[-1]
            counts[label] = counts.get(label, 0) + 1
        squared = sum((c / size) * (c / size) for c in counts.values())
        score += (size / total) * (1 - squared)
    return score


# Step 2: Split by feature + threshold

In [3]:
def split(index, value, dataset):
    """Partition rows by comparing one column against a threshold.

    Args:
        index: Position of the column to test in each row.
        value: Threshold for the comparison.
        dataset: Iterable of indexable rows.

    Returns:
        tuple: ``(left, right)`` where ``left`` holds the rows with
        ``row[index] < value`` and ``right`` holds all other rows, each in
        their original order.
    """
    below, at_or_above = [], []
    for record in dataset:
        bucket = below if record[index] < value else at_or_above
        bucket.append(record)
    return below, at_or_above


# Step 3: Find best split

In [4]:
def best_split(dataset):
    """Search every feature and threshold for the lowest weighted Gini split.

    Each distinct value of each feature column (every column except the last,
    which holds the label) is used as a threshold for ``split`` and scored
    with ``gini``. The first candidate that reaches the lowest score wins.

    Args:
        dataset: Non-empty list of rows. Feature values are put in a set to
            skip repeated thresholds, so they must be hashable.

    Returns:
        dict: ``{"index": column, "value": threshold, "groups": (left, right)}``
        for the best split found. All three are ``None`` if the rows have no
        feature columns.

    Raises:
        IndexError: If ``dataset`` is empty.
    """
    best_index, best_value, best_groups = None, None, None
    best_score = float("inf")
    n_features = len(dataset[0]) - 1

    for index in range(n_features):
        # A repeated value produces exactly the same groups and score as its
        # first occurrence and can never be strictly better, so it is skipped.
        tried = set()
        for row in dataset:
            value = row[index]
            if value in tried:
                continue
            tried.add(value)

            groups = split(index, value, dataset)
            score = gini(groups)
            if score < best_score:
                best_index, best_value, best_score, best_groups = index, value, score, groups

    return {
        "index": best_index,
        "value": best_value,
        "groups": best_groups
    }


# Step 4: Create a leaf node

In [5]:
def leaf(group):
    """Return the most frequent class label among the rows of ``group``.

    The label is read from the last element of each row. ``max`` raises
    ``ValueError`` when ``group`` is empty.
    """
    outcomes = []
    for row in group:
        outcomes.append(row[-1])
    distinct = set(outcomes)
    return max(distinct, key=lambda label: outcomes.count(label))


# Step 5: Build the full tree (recursive)

In [6]:
def build_tree(node, max_depth, min_size, depth):
    """Grow the subtree below ``node`` in place.

    ``node`` is a dict as returned by ``best_split``. Its ``'groups'`` entry is
    removed and replaced by ``'left'`` and ``'right'`` entries, each either a
    class label (leaf) or another node dict.

    Args:
        node: Split node to expand; modified in place.
        max_depth: Depth at which both children are forced to be leaves.
        min_size: A group with at most this many rows becomes a leaf.
        depth: Depth of ``node`` itself.

    Returns:
        None.
    """
    lower, upper = node['groups']
    del node['groups']

    # The split did not separate anything: both children share one leaf label.
    if not (lower and upper):
        shared = leaf(lower + upper)
        node['left'] = shared
        node['right'] = shared
        return

    # Depth limit reached: stop splitting and label both sides.
    if depth >= max_depth:
        lower_label, upper_label = leaf(lower), leaf(upper)
        node['left'] = lower_label
        node['right'] = upper_label
        return

    # Left child first, then right; each is a leaf or a recursively grown node.
    for side, rows in (('left', lower), ('right', upper)):
        if len(rows) <= min_size:
            node[side] = leaf(rows)
        else:
            child = best_split(rows)
            node[side] = child
            build_tree(child, max_depth, min_size, depth + 1)


# Step 6: Prediction

In [7]:
def predict(node, row):
    """Walk the tree from ``node`` and return the leaf label reached by ``row``.

    At every node the row goes left when ``row[node['index']] < node['value']``
    and right otherwise. A child that is a dict is another node; anything else
    is returned as the prediction.
    """
    current = node
    while True:
        goes_left = row[current['index']] < current['value']
        child = current['left'] if goes_left else current['right']
        if not isinstance(child, dict):
            return child
        current = child


# 4. Train + Test Split + Accuracy

In [8]:
def train_test_split(data, split=0.8):
    """Cut ``data`` into a leading and a trailing part without shuffling.

    Args:
        data: Sliceable sequence of rows.
        split: Fraction of ``data`` that goes into the first part.

    Returns:
        tuple: ``(first, rest)`` where ``first`` is the first
        ``int(len(data) * split)`` items and ``rest`` is the remainder.
    """
    boundary = int(len(data) * split)
    first = data[:boundary]
    rest = data[boundary:]
    return first, rest

# Work on the first 1000 rows only; the split is positional (no shuffling).
# NOTE(review): every column except the last is treated as a feature. If
# column 0 is a row identifier, it is being used as a feature too. Confirm.
train, test = train_test_split(dataset[:1000])

# Root split on the training rows, then grow the rest of the tree in place.
root = best_split(train)
build_tree(root, max_depth=5, min_size=10, depth=1)

# Count the held-out rows whose predicted label equals the stored label.
correct = 0
for row in test:
    y_true, y_pred = row[-1], predict(root, row)
    correct += int(y_true == y_pred)

accuracy = correct / len(test)
print("Accuracy:", accuracy)


Accuracy: 0.54


# 5. Check Overfitting / Underfitting

In [None]:
➤ Overfitting if:

Training accuracy = 95%+

Test accuracy is very low (e.g., < 70%)

➤ Underfitting if:

Training accuracy and Test accuracy both low (e.g., < 60%)

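Compute training accuracy the same way as test accuracy: run `predict` on every row of `train` instead of `test` and divide the number of correct predictions by `len(train)`.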

# 6. Hyperparameter Tuning

In [None]:
# The root split depends only on `train`, so the exhaustive search is done
# once instead of once per hyperparameter combination.
first_split = best_split(train)

# NOTE(review): the combinations are compared on `test`, the same rows used
# for the reported accuracy, so the best score here is an optimistic estimate.
for depth in [3, 5, 7, 10]:
    for min_size in [5, 10, 20]:
        # build_tree modifies the node it is given (drops 'groups', adds the
        # children), so each run starts from its own shallow copy. The row
        # lists inside 'groups' are only read, never modified.
        root = dict(first_split)
        build_tree(root, max_depth=depth, min_size=min_size, depth=1)

        correct = 0
        for row in test:
            if predict(root, row) == row[-1]:
                correct += 1

        print(depth, min_size, "Acc:", correct/len(test))


In [9]:
import pandas as pd
# Imported under an alias so it does not replace the notebook's own
# train_test_split(data, split=0.8) defined in the from-scratch section.
from sklearn.model_selection import train_test_split as sk_train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# ----------------------------------------------------
# 1. Load dataset (first 1000 rows)
# ----------------------------------------------------
df = pd.read_csv("cardio_train.csv", sep=";")
df_1000 = df.head(1000)

# ----------------------------------------------------
# 2. Split features/label
# ----------------------------------------------------
X = df_1000.drop("cardio", axis=1)
y = df_1000["cardio"]

# ----------------------------------------------------
# 3. Train-test split (20% test)
#    Unlike the positional split used for the from-scratch tree, this one
#    shuffles the rows (reproducibly, via random_state).
# ----------------------------------------------------
X_train, X_test, y_train, y_test = sk_train_test_split(
    X, y, test_size=0.20, random_state=42
)

# ----------------------------------------------------
# 4. Create Decision Tree Model
#    Same settings as the from-scratch run (max_depth=5, min_size=10).
#    NOTE(review): min_samples_split may not be the exact equivalent of the
#    from-scratch `min_size` stopping rule; confirm before comparing scores.
# ----------------------------------------------------
model = DecisionTreeClassifier(
    max_depth=5,            # controls overfitting
    min_samples_split=10,   # stable splitting
    random_state=42
)

# ----------------------------------------------------
# 5. Train model
# ----------------------------------------------------
model.fit(X_train, y_train)

# ----------------------------------------------------
# 6. Predict on test data
# ----------------------------------------------------
y_pred = model.predict(X_test)

# ----------------------------------------------------
# 7. Compute accuracy
# ----------------------------------------------------
accuracy = accuracy_score(y_test, y_pred)
print("Final Accuracy on 1000 rows:", accuracy)


Final Accuracy on 1000 rows: 0.7
