In [91]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from collections import Counter
import math
%matplotlib inline


In [92]:
df = pd.read_csv('/Users/ashutoshregmi/ML_Prac/Titanic_Decision_Tree/Titanic-Dataset.csv')
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [93]:
print("=== TITANIC DATASET ANALYSIS ===")
print(f"Dataset shape: {df.shape}")
print(f"Survival rate: {df['Survived'].mean():.3f}")

print("\n=== SURVIVAL PATTERNS ===")
print("\nBy Sex:")
sex_survival = df.groupby('Sex')['Survived'].agg(['count', 'sum', lambda x: x.mean()]).round(3)
sex_survival.columns = ['Total', 'Survived', 'Rate']
print(sex_survival)

print("\nBy Class:")
class_survival = df.groupby('Pclass')['Survived'].agg(['count', 'sum', lambda x: x.mean()]).round(3)
class_survival.columns = ['Total', 'Survived', 'Rate']
print(class_survival)

print("\nBy Sex and Class:")
sex_class_survival = df.groupby(['Sex', 'Pclass'])['Survived'].agg(['count', 'sum', lambda x: x.mean()]).round(3)
sex_class_survival.columns = ['Total', 'Survived', 'Rate']
print(sex_class_survival)

=== TITANIC DATASET ANALYSIS ===
Dataset shape: (891, 12)
Survival rate: 0.384

=== SURVIVAL PATTERNS ===

By Sex:
        Total  Survived   Rate
Sex                           
female    314       233  0.742
male      577       109  0.189

By Class:
        Total  Survived   Rate
Pclass                        
1         216       136  0.630
2         184        87  0.473
3         491       119  0.242

By Sex and Class:
               Total  Survived   Rate
Sex    Pclass                        
female 1          94        91  0.968
       2          76        70  0.921
       3         144        72  0.500
male   1         122        45  0.369
       2         108        17  0.157
       3         347        47  0.135


In [94]:
# Age analysis
df_age = df.dropna(subset=['Age'])
children = df_age[df_age['Age'] < 18]
adults = df_age[df_age['Age'] >= 18]
print(f"\nChildren (<18): {children['Survived'].sum()}/{len(children)} survived ({children['Survived'].mean():.3f})")
print(f"Adults (18+): {adults['Survived'].sum()}/{len(adults)} survived ({adults['Survived'].mean():.3f})")


Children (<18): 61/113 survived (0.540)
Adults (18+): 229/601 survived (0.381)


In [95]:
# Decision Tree Node Class
class DecisionNode:
    def __init__(self, feature=None, threshold=None, left=None, right=None, value=None, samples=0):
        self.feature = feature      # Feature to split on
        self.threshold = threshold  # Threshold value for split
        self.left = left           # Left subtree
        self.right = right         # Right subtree  
        self.value = value         # Prediction value (for leaf nodes)
        self.samples = samples     # Number of samples in this node

In [96]:
# Decision Tree Implementation
class HandMadeDecisionTree:
    def __init__(self, max_depth=5, min_samples_split=10):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.root = None
        
    def entropy(self, y):
        """Calculate entropy of target variable"""
        if len(y) == 0:
            return 0
        
        counts = Counter(y)
        probs = [count / len(y) for count in counts.values()]
        entropy = -sum(p * math.log2(p) for p in probs if p > 0)
        return entropy
    
    def information_gain(self, X, y, feature, threshold):
        """Calculate information gain for a split"""
        # Parent entropy
        parent_entropy = self.entropy(y)
        
        # Split the data
        if X[feature].dtype in ['object', 'category']:
            # Categorical split
            left_mask = X[feature] == threshold
        else:
            # Numerical split
            left_mask = X[feature] <= threshold
            
        right_mask = ~left_mask
        
        # Check if split is valid
        if sum(left_mask) == 0 or sum(right_mask) == 0:
            return 0
        
        # Calculate weighted entropy of children
        n = len(y)
        left_entropy = self.entropy(y[left_mask])
        right_entropy = self.entropy(y[right_mask])
        
        weighted_entropy = (sum(left_mask) / n) * left_entropy + (sum(right_mask) / n) * right_entropy
        
        return parent_entropy - weighted_entropy
    
    def find_best_split(self, X, y):
        """Find the best feature and threshold to split on"""
        best_gain = -1
        best_feature = None
        best_threshold = None
        
        # Try each feature
        for feature in X.columns:
            if feature in ['PassengerId', 'Name', 'Ticket', 'Cabin']:
                continue  # Skip non-predictive features
                
            if X[feature].dtype in ['object', 'category']:
                # Categorical feature
                unique_values = X[feature].dropna().unique()
                for value in unique_values:
                    gain = self.information_gain(X, y, feature, value)
                    if gain > best_gain:
                        best_gain = gain
                        best_feature = feature
                        best_threshold = value
            else:
                # Numerical feature
                sorted_values = sorted(X[feature].dropna().unique())
                for i in range(len(sorted_values) - 1):
                    threshold = (sorted_values[i] + sorted_values[i + 1]) / 2
                    gain = self.information_gain(X, y, feature, threshold)
                    if gain > best_gain:
                        best_gain = gain
                        best_feature = feature
                        best_threshold = threshold
        
        return best_feature, best_threshold, best_gain
    
    def build_tree(self, X, y, depth=0):
        """Recursively build the decision tree"""
        n_samples = len(y)
        
        # Stopping conditions
        if (depth >= self.max_depth or 
            n_samples < self.min_samples_split or 
            len(set(y)) == 1):
            # Create leaf node
            most_common = Counter(y).most_common(1)[0][0]
            return DecisionNode(value=most_common, samples=n_samples)
        
        # Find best split
        best_feature, best_threshold, best_gain = self.find_best_split(X, y)
        
        if best_gain <= 0:
            # No good split found, create leaf
            most_common = Counter(y).most_common(1)[0][0]
            return DecisionNode(value=most_common, samples=n_samples)
        
        # Split the data
        if X[best_feature].dtype in ['object', 'category']:
            left_mask = X[best_feature] == best_threshold
        else:
            left_mask = X[best_feature] <= best_threshold
            
        right_mask = ~left_mask
        
        # Create child nodes
        left_child = self.build_tree(X[left_mask], y[left_mask], depth + 1)
        right_child = self.build_tree(X[right_mask], y[right_mask], depth + 1)
        
        return DecisionNode(
            feature=best_feature,
            threshold=best_threshold,
            left=left_child,
            right=right_child,
            samples=n_samples
        )
    
    def fit(self, X, y):
        """Train the decision tree"""
        self.root = self.build_tree(X, y.values)
    
    def predict_single(self, x, node):
        """Make prediction for a single sample"""
        if node.value is not None:
            return node.value
        
        feature_value = x[node.feature]
        
        # Handle missing values
        if pd.isna(feature_value):
            # Go to the child with more samples (majority)
            if node.left.samples >= node.right.samples:
                return self.predict_single(x, node.left)
            else:
                return self.predict_single(x, node.right)
        
        # Navigate tree
        if isinstance(node.threshold, str):
            # Categorical
            if feature_value == node.threshold:
                return self.predict_single(x, node.left)
            else:
                return self.predict_single(x, node.right)
        else:
            # Numerical
            if feature_value <= node.threshold:
                return self.predict_single(x, node.left)
            else:
                return self.predict_single(x, node.right)
    
    def predict(self, X):
        """Make predictions for multiple samples"""
        predictions = []
        for _, row in X.iterrows():
            pred = self.predict_single(row, self.root)
            predictions.append(pred)
        return np.array(predictions)
    
    def print_tree(self, node=None, depth=0):
        """Print the tree structure"""
        if node is None:
            node = self.root
            
        if node.value is not None:
            print("  " * depth + f"Predict: {node.value} (samples: {node.samples})")
        else:
            if isinstance(node.threshold, str):
                print("  " * depth + f"If {node.feature} == '{node.threshold}' (samples: {node.samples}):")
            else:
                print("  " * depth + f"If {node.feature} <= {node.threshold:.2f} (samples: {node.samples}):")
            
            self.print_tree(node.left, depth + 1)
            
            if isinstance(node.threshold, str):
                print("  " * depth + f"Else {node.feature} != '{node.threshold}':")
            else:
                print("  " * depth + f"Else {node.feature} > {node.threshold:.2f}:")
            
            self.print_tree(node.right, depth + 1)


In [97]:
# Hand-Crafted Rules Decision Tree (Based on Analysis)
class SimpleTitanicRules:
    """
    Simple rule-based classifier based on observed patterns:
    - Women have 74.2% survival rate
    - Men have 18.9% survival rate  
    - Children have better survival rates
    - First class has better survival rates
    """
    
    def predict(self, X):
        predictions = []
        for _, row in X.iterrows():
            # Rule 1: Women more likely to survive
            if row['Sex'] == 'female':
                predictions.append(1)
            # Rule 2: Male children more likely to survive
            elif row['Sex'] == 'male' and pd.notna(row.get('Age')) and row['Age'] < 18:
                predictions.append(1)
            # Rule 3: First class males have better chances
            elif row['Sex'] == 'male' and row['Pclass'] == 1:
                predictions.append(1)
            else:
                predictions.append(0)
        
        return np.array(predictions)
    
    def print_rules(self):
        print("\n=== HAND-CRAFTED DECISION RULES ===")
        print("1. If passenger is female → SURVIVED")
        print("2. If passenger is male AND age < 18 → SURVIVED") 
        print("3. If passenger is male AND class = 1 → SURVIVED")
        print("4. Otherwise → NOT SURVIVED")

In [98]:
# Prepare data for modeling
# Fill missing ages with median
df['Age'].fillna(df['Age'].median(), inplace=True)

# Select features for the tree
features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
X = df[features].copy()

# Fill missing values
X['Embarked'].fillna(X['Embarked'].mode()[0], inplace=True)
X['Fare'].fillna(X['Fare'].median(), inplace=True)

y = df['Survived']

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(df['Age'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X['Embarked'].fillna(X['Embarked'].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we a

In [99]:
# Split data manually (80-20 split)
np.random.seed(42)
indices = np.random.permutation(len(X))
train_size = int(0.8 * len(X))
train_indices = indices[:train_size]
test_indices = indices[train_size:]

X_train = X.iloc[train_indices].reset_index(drop=True)
X_test = X.iloc[test_indices].reset_index(drop=True)
y_train = y.iloc[train_indices].reset_index(drop=True)
y_test = y.iloc[test_indices].reset_index(drop=True)

print("\n=== MODEL TRAINING ===")
print(f"Training set size: {len(X_train)}")
print(f"Test set size: {len(X_test)}")

# Train the hand-made decision tree
print("\nTraining hand-made decision tree...")
tree = HandMadeDecisionTree(max_depth=5, min_samples_split=10)
tree.fit(X_train, y_train)

# Train the simple rules classifier
rules_classifier = SimpleTitanicRules()
rules_classifier.print_rules()

# Make predictions
tree_pred_train = tree.predict(X_train)
tree_pred_test = tree.predict(X_test)

rules_pred_train = rules_classifier.predict(X_train)
rules_pred_test = rules_classifier.predict(X_test)

# Calculate accuracy manually
def accuracy(y_true, y_pred):
    return np.mean(y_true == y_pred)

def confusion_matrix_manual(y_true, y_pred):
    tp = np.sum((y_true == 1) & (y_pred == 1))
    tn = np.sum((y_true == 0) & (y_pred == 0))
    fp = np.sum((y_true == 0) & (y_pred == 1))
    fn = np.sum((y_true == 1) & (y_pred == 0))
    return np.array([[tn, fp], [fn, tp]])



=== MODEL TRAINING ===
Training set size: 712
Test set size: 179

Training hand-made decision tree...

=== HAND-CRAFTED DECISION RULES ===
1. If passenger is female → SURVIVED
2. If passenger is male AND age < 18 → SURVIVED
3. If passenger is male AND class = 1 → SURVIVED
4. Otherwise → NOT SURVIVED


In [100]:
print("\n=== RESULTS ===")
print("\nDecision Tree Performance:")
print(f"Training Accuracy: {accuracy(y_train, tree_pred_train):.3f}")
print(f"Test Accuracy: {accuracy(y_test, tree_pred_test):.3f}")

print("\nSimple Rules Performance:")
print(f"Training Accuracy: {accuracy(y_train, rules_pred_train):.3f}")
print(f"Test Accuracy: {accuracy(y_test, rules_pred_test):.3f}")

print("\n=== DECISION TREE STRUCTURE ===")
tree.print_tree()

# Confusion matrices
print("\n=== CONFUSION MATRICES ===")
print("\nDecision Tree - Test Set:")
cm_tree = confusion_matrix_manual(y_test, tree_pred_test)
print(f"True Negatives: {cm_tree[0,0]}, False Positives: {cm_tree[0,1]}")
print(f"False Negatives: {cm_tree[1,0]}, True Positives: {cm_tree[1,1]}")

print("\nSimple Rules - Test Set:")
cm_rules = confusion_matrix_manual(y_test, rules_pred_test)
print(f"True Negatives: {cm_rules[0,0]}, False Positives: {cm_rules[0,1]}")
print(f"False Negatives: {cm_rules[1,0]}, True Positives: {cm_rules[1,1]}")

# Feature importance (manual calculation)
print("\n=== FEATURE ANALYSIS ===")
print("Most important patterns observed:")
print("1. Sex is the strongest predictor (74.2% vs 18.9% survival)")
print("2. Class matters significantly (63.0% vs 47.3% vs 24.2%)")  
print("3. Age affects survival (children have 54.0% vs adults 38.1%)")
print("4. Combined effects: Female + First Class = 96.8% survival")


=== RESULTS ===

Decision Tree Performance:
Training Accuracy: 0.844
Test Accuracy: 0.844

Simple Rules Performance:
Training Accuracy: 0.730
Test Accuracy: 0.743

=== DECISION TREE STRUCTURE ===
If Sex == 'male' (samples: 712):
  If Fare <= 26.27 (samples: 459):
    If Age <= 13.50 (samples: 334):
      If SibSp <= 2.00 (samples: 12):
        If Age <= 10.00 (samples: 11):
          Predict: 1 (samples: 9)
        Else Age > 10.00:
          Predict: 1 (samples: 2)
      Else SibSp > 2.00:
        Predict: 0 (samples: 1)
    Else Age > 13.50:
      If Age <= 32.50 (samples: 322):
        If Age <= 30.75 (samples: 243):
          Predict: 0 (samples: 229)
        Else Age > 30.75:
          Predict: 0 (samples: 14)
      Else Age > 32.50:
        If Fare <= 11.39 (samples: 79):
          Predict: 0 (samples: 45)
        Else Fare > 11.39:
          Predict: 0 (samples: 34)
  Else Fare > 26.27:
    If SibSp <= 2.50 (samples: 125):
      If Age <= 13.50 (samples: 108):
        Predict: 