>Imports, data loading and preprocessing, DecisionTree class

In [None]:
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.model_selection import train_test_split

# Read the CSV and replace the textual 'Sex' column with a numeric code
# (male -> 0, female -> 1); any other value maps to NaN.
df = pd.read_csv('titanic.csv').assign(
    Sex=lambda frame: frame['Sex'].map({'male': 0, 'female': 1})
)

# Feature matrix: one row per CSV record, columns in the order listed here.
X = df.loc[:, ['Pclass',
               'Sex',
               'Age',
               'Siblings/Spouses Aboard',
               'Parents/Children Aboard',
               'Fare']].values
# Target vector taken from the 'Survived' column.
y = df['Survived'].values

class DecisionTree:
    """Classification tree grown by greedy Gini-impurity splits.

    Class labels must be non-negative integers (0, 1, ..., k-1) because they
    are used directly as indices into per-class count lists.  Two classes
    (0 and 1) are assumed until ``fit`` sees a larger label.

    Parameters
    ----------
    min_samples_split : int, default 2
        A node holding fewer samples than this is never split.
    max_depth : int, default 5
        Nodes at this depth are always leaves (the root has depth 0).
    """

    def __init__(self, min_samples_split=2, max_depth=5):
        self.min_samples_split = min_samples_split
        self.max_depth = max_depth

    class Node:
        """One tree node; a leaf keeps ``left`` and ``right`` set to ``None``."""

        def __init__(self, gini, n_samples, n_per_class, pred_class):
            self.gini = gini                    # impurity of the samples at this node
            self.n_samples = n_samples          # number of samples that reached the node
            self.n_per_class = n_per_class      # sample count per class label
            self.predicted_class = pred_class   # majority class; returned when the node is a leaf
            self.feature_index = None           # column tested by an internal node
            self.threshold = None               # samples with feature < threshold go left
            self.left = None
            self.right = None

    def _n_classes(self):
        """Return how many class labels to count (2 until ``fit`` has run)."""
        return getattr(self, "n_classes_", 2)

    def _gini(self, y):
        """Return the Gini impurity of label array ``y`` (0.0 when ``y`` is empty)."""
        m = len(y)
        if m == 0:
            # An empty node has no impurity; avoids a 0/0 division.
            return 0.0
        return 1.0 - sum((np.sum(y == c) / m) ** 2 for c in range(self._n_classes()))

    def _best_split(self, X, y):
        """Find the (feature index, threshold) with the lowest weighted child Gini.

        Returns ``(None, None)`` when the node is too small, already pure, or
        when no feature has two distinct values to split between.
        """
        m, n = X.shape
        # Fewer than two samples can never be partitioned, whatever
        # min_samples_split is set to.
        if m < max(self.min_samples_split, 2):
            return None, None
        # A pure node cannot be improved: splitting it would only create
        # children that all predict the same class.
        if np.all(y == y[0]):
            return None, None

        n_classes = self._n_classes()
        class_ids = range(n_classes)
        # Start at 1.0 (above any achievable weighted Gini) so the first
        # valid candidate is always accepted.
        best_gini, best_idx, best_thr = 1.0, None, None
        for idx in range(n):
            # Sort samples by the value of feature idx.
            thresholds, classes = zip(*sorted(zip(X[:, idx], y)))
            left_counts = [0] * n_classes
            right_counts = Counter(classes)
            for i in range(1, m):
                # Move sample i-1 from the right partition to the left one.
                c = classes[i - 1]
                left_counts[c] += 1
                right_counts[c] -= 1
                # Equal neighbouring values cannot be separated by a threshold.
                if thresholds[i] == thresholds[i - 1]:
                    continue
                gini_l = 1.0 - sum((left_counts[x] / i) ** 2 for x in class_ids)
                gini_r = 1.0 - sum((right_counts[x] / (m - i)) ** 2 for x in class_ids)
                g = (i * gini_l + (m - i) * gini_r) / m
                if g < best_gini:
                    best_gini, best_idx = g, idx
                    # Midpoint between the two neighbouring distinct values.
                    best_thr = (thresholds[i] + thresholds[i - 1]) / 2
        return best_idx, best_thr

    def _build_tree(self, X, y, depth=0):
        """Recursively grow and return the subtree for samples ``X``/``y``."""
        counts = [np.sum(y == i) for i in range(self._n_classes())]
        pred = np.argmax(counts)
        node = DecisionTree.Node(self._gini(y), y.size, counts, pred)
        if depth < self.max_depth:
            idx, thr = self._best_split(X, y)
            if idx is not None:
                mask = X[:, idx] < thr
                X_l, y_l = X[mask], y[mask]
                X_r, y_r = X[~mask], y[~mask]
                # Only keep the split when both sides actually received samples.
                if len(y_l) and len(y_r):
                    node.feature_index, node.threshold = idx, thr
                    node.left = self._build_tree(X_l, y_l, depth + 1)
                    node.right = self._build_tree(X_r, y_r, depth + 1)
        return node

    def fit(self, X, y):
        """Grow the tree from features ``X`` (2-D) and integer labels ``y``.

        Accepts arrays or nested sequences.  Stores the root in ``tree_`` and
        the number of counted classes in ``n_classes_``; returns ``self``.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        # At least two classes are always counted, so binary data keeps
        # two-element per-class counts even if only one label is present.
        self.n_classes_ = max(2, int(y.max()) + 1) if y.size else 2
        self.tree_ = self._build_tree(X, y)
        return self

    def _predict_one(self, x):
        """Walk from the root to a leaf for sample ``x`` and return its class."""
        node = self.tree_
        while node.left is not None:
            node = node.left if x[node.feature_index] < node.threshold else node.right
        return node.predicted_class

    def predict(self, X):
        """Return a list with the predicted class for each row of ``X``."""
        return [self._predict_one(x) for x in X]


>Train models for each split and collect predictions

In [3]:
# Fraction of the data used for training, keyed by a "train:test" label.
splits = {'50:50': 0.50, '70:30': 0.70, '80:20': 0.80}
# Filled below: label -> (held-out true labels, predicted labels).
results = {}

for name in splits:
    train_size = splits[name]
    # Fixed seed so each partition is reproducible between runs.
    X_tr, X_te, y_tr, y_te = train_test_split(
        X, y,
        train_size=train_size,
        random_state=42,
    )
    clf = DecisionTree(max_depth=5)
    clf.fit(X_tr, y_tr)
    y_pr = clf.predict(X_te)
    results[name] = (y_te, y_pr)

>Compute Accuracy, Precision, Recall, F1 for each split

In [4]:


from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Print the four classification scores for every entry in `results`,
# followed by a blank line between splits.
for name in results:
    y_true, y_pred = results[name]
    print(f"--- {name} Split Metrics ---")
    for label, score_fn in (
        ("Accuracy:", accuracy_score),
        ("Precision:", precision_score),
        ("Recall:", recall_score),
        ("F1 Score:", f1_score),
    ):
        print(label, score_fn(y_true, y_pred))
    print()


--- 50:50 Split Metrics ---
Accuracy: 0.7927927927927928
Precision: 0.8524590163934426
Recall: 0.5842696629213483
F1 Score: 0.6933333333333334

--- 70:30 Split Metrics ---
Accuracy: 0.797752808988764
Precision: 0.7831325301204819
Recall: 0.6435643564356436
F1 Score: 0.7065217391304348

--- 80:20 Split Metrics ---
Accuracy: 0.797752808988764
Precision: 0.7818181818181819
Recall: 0.6417910447761194
F1 Score: 0.7049180327868853



>Print confusion matrix for each split

In [5]:

from sklearn.metrics import confusion_matrix
# Show the confusion matrix of each split under its own header line.
for name in results:
    y_true, y_pred = results[name]
    header = f"--- {name} Split Confusion Matrix ---"
    matrix = confusion_matrix(y_true, y_pred)
    # Header, matrix and a trailing blank line, each on its own line.
    print(header, matrix, "", sep="\n")


--- 50:50 Split Confusion Matrix ---
[[248  18]
 [ 74 104]]

--- 70:30 Split Confusion Matrix ---
[[148  18]
 [ 36  65]]

--- 80:20 Split Confusion Matrix ---
[[99 12]
 [24 43]]

