In [180]:

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split


# Load the dataset, drop the 'zero' column together with 'zero.1' ... 'zero.18',
# and expose the '2urvived' column under the name 'Survived'.
unused_columns = ['zero'] + [f'zero.{suffix}' for suffix in range(1, 19)]
data = (
    pd.read_csv('train_and_test2.csv')
    .drop(columns=unused_columns)
    .rename(columns={'2urvived': 'Survived'})
)


class DecisionTree:
    """Classification tree grown by greedy, impurity-reducing threshold splits.

    Each internal node tests ``x[feature_index] <= threshold``: rows that pass
    go to the left child, the rest to the right child. Split quality is the
    reduction in Gini impurity or entropy, and the data is split recursively
    until a stopping rule applies.

    At every node only a subset of the features is examined (see
    ``max_features``). The subset is drawn with the global ``np.random``
    generator, so seed it (``np.random.seed``) beforehand for reproducible trees.

    Parameters
    ----------
    criterion : {'gini', 'entropy'}
        Impurity measure used to score candidate splits.
    max_depth : int or None
        Maximum depth of the tree; ``None`` means unlimited.
    min_samples_leaf : int
        Minimum number of samples on each side of a split (clamped to >= 1).
    max_features : 'sqrt', int or None
        Number of features examined per split. ``'sqrt'`` (default) uses
        ``max(1, int(sqrt(n_features)))`` randomly chosen features, an int
        uses that many (capped at ``n_features``), ``None`` uses all features
        in column order without touching the random generator.
    """

    class Node:
        """Tree node: a split (feature_index, threshold, left, right) or a leaf (value)."""

        def __init__(self, *, feature_index=None, threshold=None, left=None, right=None, value=None):
            self.feature_index = feature_index
            self.threshold = threshold
            self.left = left
            self.right = right
            # `value` is the predicted label for a leaf and stays None on split nodes.
            self.value = value

    def __init__(self, criterion='gini', max_depth=None, min_samples_leaf=1, max_features='sqrt'):
        """Validate and store the hyper-parameters.

        Raises
        ------
        ValueError
            If ``criterion`` or ``max_features`` is not one of the accepted values.
        """
        if criterion not in ('gini', 'entropy'):
            raise ValueError("criterion must be 'gini' or 'entropy'")
        if max_features is not None and not (isinstance(max_features, str) and max_features == 'sqrt'):
            is_int = isinstance(max_features, (int, np.integer)) and not isinstance(max_features, bool)
            if not is_int or max_features < 1:
                raise ValueError("max_features must be 'sqrt', None or a positive integer")
            max_features = int(max_features)
        self.criterion = criterion
        self.max_depth = float('inf') if max_depth is None else int(max_depth)
        self.min_samples_leaf = max(1, int(min_samples_leaf))
        self.max_features = max_features
        self.tree_ = None

    def _gini(self, y):
        """Return the Gini impurity ``1 - sum(p_k ** 2)`` of the labels in ``y``."""
        _, counts = np.unique(y, return_counts=True)
        probs = counts / counts.sum()
        return 1.0 - np.sum(probs ** 2)

    def _entropy(self, y):
        """Return the Shannon entropy (base 2) of the labels in ``y``."""
        _, counts = np.unique(y, return_counts=True)
        # Counts reported by np.unique are strictly positive, so log2 is always defined.
        probs = counts / counts.sum()
        return -np.sum(probs * np.log2(probs))

    def _impurity(self, y):
        """Return the impurity of ``y`` under the configured criterion."""
        return self._gini(y) if self.criterion == 'gini' else self._entropy(y)

    def _majority_class(self, y):
        """Return the most frequent label in ``y``; ties go to the smallest label.

        Uses ``np.unique`` rather than ``np.bincount`` so that float, string or
        negative labels work as well as non-negative integers.
        """
        labels, counts = np.unique(y, return_counts=True)
        return labels[np.argmax(counts)]

    def _candidate_features(self, n_features):
        """Return the feature indices to examine for one split."""
        if self.max_features is None:
            return list(range(n_features))
        if self.max_features == 'sqrt':
            n_candidates = max(1, int(np.sqrt(n_features)))
        else:
            n_candidates = min(n_features, self.max_features)
        return np.random.choice(n_features, n_candidates, replace=False)

    def _best_split(self, X, y):
        """Find the split with the largest impurity reduction.

        Candidate thresholds are the midpoints between consecutive distinct
        sorted values of each examined feature.

        Returns
        -------
        tuple
            ``(feature_index, threshold, left_mask, right_mask, gain)``. The
            first four entries are ``None`` and ``gain`` is ``0.0`` when no
            admissible split with positive gain exists.
        """
        m, n_features = X.shape
        if m < 2 * self.min_samples_leaf:
            return None, None, None, None, 0.0

        parent_impurity = self._impurity(y)
        best_gain = 0.0
        best_feat = None
        best_thresh = None
        best_left_mask = None
        best_right_mask = None

        for feat in self._candidate_features(n_features):
            values = X[:, feat]
            sorted_vals = values[np.argsort(values)]

            for i in range(1, m):
                if sorted_vals[i] == sorted_vals[i - 1]:
                    continue
                thresh = (sorted_vals[i] + sorted_vals[i - 1]) / 2.0
                left_mask = values <= thresh
                right_mask = ~left_mask

                # Count each side once instead of re-summing the masks below.
                n_left = int(left_mask.sum())
                n_right = m - n_left
                if n_left < self.min_samples_leaf or n_right < self.min_samples_leaf:
                    continue

                gain = parent_impurity - (
                    (n_left / m) * self._impurity(y[left_mask]) +
                    (n_right / m) * self._impurity(y[right_mask])
                )

                # Strict comparison: the first split reaching a given gain is kept.
                if gain > best_gain:
                    best_gain = gain
                    best_feat = feat
                    best_thresh = thresh
                    best_left_mask = left_mask
                    best_right_mask = right_mask

        return best_feat, best_thresh, best_left_mask, best_right_mask, best_gain

    def _build_tree(self, X, y, depth=0):
        """Recursively grow the subtree for ``(X, y)`` and return its root node.

        A leaf is produced when the depth limit is reached, the node holds at
        most ``min_samples_leaf`` samples, all labels agree, or no split
        yields a positive gain.
        """
        if depth >= self.max_depth or len(y) <= self.min_samples_leaf or np.unique(y).size == 1:
            return DecisionTree.Node(value=self._majority_class(y))

        feat, thresh, left_mask, right_mask, gain = self._best_split(X, y)
        if feat is None or gain <= 0.0:
            return DecisionTree.Node(value=self._majority_class(y))

        left = self._build_tree(X[left_mask], y[left_mask], depth + 1)
        right = self._build_tree(X[right_mask], y[right_mask], depth + 1)
        return DecisionTree.Node(feature_index=feat, threshold=thresh, left=left, right=right)

    def fit(self, X, y):
        """Grow the tree on ``X`` (DataFrame or 2-D array-like) and labels ``y``.

        Returns
        -------
        DecisionTree
            ``self``, to allow call chaining.

        Raises
        ------
        ValueError
            If the data is empty or ``X`` and ``y`` have different lengths.
        """
        if isinstance(X, pd.DataFrame):
            X = X.values
        X = np.array(X)
        y = np.array(y)
        if y.size == 0:
            raise ValueError("cannot fit a DecisionTree on an empty dataset")
        if len(X) != len(y):
            raise ValueError("X and y must contain the same number of samples")
        self.tree_ = self._build_tree(X, y)
        return self

    def _predict_one(self, x, node):
        """Walk from ``node`` down to a leaf for the single sample ``x``; return its label."""
        while node.value is None:
            node = node.left if x[node.feature_index] <= node.threshold else node.right
        return node.value

    def predict(self, X):
        """Return an array with the predicted label for every row of ``X``."""
        if isinstance(X, pd.DataFrame):
            X = X.values
        return np.array([self._predict_one(row, self.tree_) for row in np.array(X)])

    def confusion_matrix(self, y_true, y_pred):
        """Return the confusion matrix of ``y_true`` against ``y_pred``.

        Rows are actual classes and columns are predicted classes, both in the
        sorted order of the labels found in either input.
        """
        classes = np.unique(np.concatenate((y_true, y_pred)))
        class_to_index = {cls: idx for idx, cls in enumerate(classes)}
        matrix = np.zeros((len(classes), len(classes)), dtype=int)
        for actual, pred in zip(y_true, y_pred):
            matrix[class_to_index[actual], class_to_index[pred]] += 1
        return matrix

    def classification_metrics(self, cm):
        """Compute accuracy, precision, recall and F1 from a binary confusion matrix.

        ``cm`` must be 2x2 with rows = actual and columns = predicted; index 1
        is treated as the positive class. Any metric whose denominator is zero
        is reported as 0.

        Raises
        ------
        ValueError
            If ``cm`` is not 2x2 (e.g. only one class was present, or the
            problem is multi-class).
        """
        cm = np.asarray(cm)
        if cm.shape != (2, 2):
            raise ValueError("classification_metrics expects a 2x2 confusion matrix")

        TP = cm[1, 1]
        TN = cm[0, 0]
        FP = cm[0, 1]
        FN = cm[1, 0]

        total = TP + TN + FP + FN
        accuracy = (TP + TN) / total if total > 0 else 0
        precision = TP / (TP + FP) if TP + FP > 0 else 0
        recall = TP / (TP + FN) if TP + FN > 0 else 0
        f1_score = 2 * ((precision * recall) / (precision + recall)) if precision + recall > 0 else 0

        return {
            "accuracy": accuracy,
            "precision": precision,
            "recall": recall,
            "f1_score": f1_score
        }

In [181]:
def bootstrap_sampling(X, y, n_bootstraps=10, random_state=None):
    """Draw ``n_bootstraps`` bootstrap resamples of ``(X, y)``.

    Every resample has as many rows as ``X`` and is drawn with replacement;
    the same row positions are taken from ``X`` and ``y`` so they stay aligned.

    Parameters
    ----------
    X, y : pandas object or array-like
        pandas objects are indexed by position and keep their type; anything
        else is converted with ``np.asarray`` and returned as an ndarray.
    n_bootstraps : int
        Number of resamples to generate.
    random_state : int or None
        Passed to ``np.random.seed``. Note that this re-seeds the *global*
        NumPy generator, so code that runs afterwards and uses ``np.random``
        is affected by the seed as well.

    Returns
    -------
    list of (X_boot, y_boot) tuples

    Raises
    ------
    ValueError
        If ``X`` and ``y`` do not have the same number of rows.
    """
    def take_rows(obj, rows):
        # Plain [] on a DataFrame selects columns, so pandas objects must go through .iloc.
        if hasattr(obj, 'iloc'):
            return obj.iloc[rows]
        return np.asarray(obj)[rows]

    n_samples = len(X)
    if len(y) != n_samples:
        raise ValueError("X and y must contain the same number of samples")

    np.random.seed(random_state)
    bootstraps = []
    for _ in range(n_bootstraps):
        indices = np.random.choice(n_samples, size=n_samples, replace=True)
        bootstraps.append((take_rows(X, indices), take_rows(y, indices)))
    return bootstraps


def majority_vote(predictions):
    """Combine per-tree predictions into one label per sample by plurality.

    ``predictions`` is a 2-D array with one row per estimator and one column
    per sample. Votes are cast to ``int`` and counted with ``np.bincount``;
    the most common vote wins and ties go to the smallest label. Returns an
    integer array with one entry per sample.
    """
    n_estimators, n_samples = predictions.shape
    votes = predictions.astype(int)
    winners = [np.argmax(np.bincount(votes[:, col])) for col in range(n_samples)]
    return np.array(winners, dtype=int)


In [182]:

# Predictor columns and target taken from the prepared `data` frame.
features = ['Age', 'Fare', 'Sex', 'sibsp', 'Parch', 'Pclass', 'Embarked']
X, Y = data[features], data['Survived']

# Keep 20% of the rows aside for evaluation.
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

results = []

# One entry per bagged ensemble:
# (heading printed above its confusion matrix, row label in the summary, split criterion)
ensemble_specs = [
    ('confusion matrix for model1:', "Model 1 - Gini", 'gini'),
    ('confusion matrix for model2:', "Model 2 - Entropy", 'entropy'),
]

for heading, model_name, criterion in ensemble_specs:
    # Both ensembles request 50 resamples with the same random_state.
    bootstraps = bootstrap_sampling(x_train, y_train, n_bootstraps=50, random_state=42)

    # Fit one tree per resample and collect its predictions on the test rows.
    all_pred = []
    for boot_x, boot_y in bootstraps:
        tree_model = DecisionTree(criterion, 25, 25)
        tree_model.fit(boot_x, boot_y)
        pred = tree_model.predict(x_test)
        all_pred.append(pred)
    all_pred = np.array(all_pred)

    final_y_pred = majority_vote(all_pred)

    # The metric helpers are methods of DecisionTree; the last fitted tree is used to call them.
    cm = tree_model.confusion_matrix(y_test, final_y_pred)
    metrics = tree_model.classification_metrics(cm)
    results.append((model_name, metrics))

    print(heading)
    print(cm)

# Summary table: one row per ensemble, one column per metric.
metric_columns = [
    ("Accuracy", "accuracy"),
    ("Precision", "precision"),
    ("Recall", "recall"),
    ("F1 Score", "f1_score"),
]
df_results = pd.DataFrame([
    {"Model": name, **{column: m[key] for column, key in metric_columns}}
    for name, m in results
])

print('-----------answer table-----------')
print(df_results.to_string(index=False))




confusion matrix for model1:
[[175  14]
 [ 42  31]]
confusion matrix for model2:
[[174  15]
 [ 43  30]]
-----------answer table-----------
            Model  Accuracy  Precision   Recall  F1 Score
   Model 1 - Gini  0.786260   0.688889 0.424658  0.525424
Model 2 - Entropy  0.778626   0.666667 0.410959  0.508475
