# Session 15-16 Decision Tree and Ensemble Learning

# Exercise: Titanic Survival Prediction

You are a data scientist tasked with analyzing the Titanic dataset:

https://www.kaggle.com/competitions/titanic

The goal of this exercise is to build and compare classification models that predict whether a passenger survived the Titanic disaster.

# Step 1. Data Preparation

## Goal: Load data, select features, handle missing values, encode categorical variables.

In [None]:
# Step 1: Data Preparation

import pandas as pd
import numpy as np

In [None]:
# Load the Titanic training set; the relative path means train.csv must sit
# in the current working directory.
data = pd.read_csv("train.csv")

# Inspect dataset: first rows, then the per-column summary
print(data.head())
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would add a stray "None" line after the report.
data.info()

In [None]:
# Predictor columns used by every model in this exercise
features = ["Pclass", "Sex", "Age", "Fare", "SibSp", "Parch"]

# Numeric codes for the only categorical predictor
sex_codes = {"male": 0, "female": 1}

# Target: the "Survived" column
y = data["Survived"]

# Work on an independent copy so the edits below never touch `data`
# (and pandas does not raise SettingWithCopyWarning)
X = data[features].copy()

# Fill missing ages with the median age.
# NOTE(review): the median is taken over all rows, i.e. before the
# train/test split is made.
median_age = X["Age"].median()
X["Age"] = X["Age"].fillna(median_age)

# Replace the "male"/"female" strings with their numeric codes
X["Sex"] = X["Sex"].map(sex_codes)

# Step 2. Train–Test Split

## Goal: Split data into 70% training and 30% testing.

In [None]:
# Step 2: Train-Test Split

from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both partitions; the fixed seed makes the split
# reproducible.
split = train_test_split(X, y, test_size=0.30, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = split

print("Training set size:", X_train.shape)
print("Testing set size:", X_test.shape)

# Step 3. Model Development

## Goal: Train Decision Tree, Random Forest, and AdaBoost using the same feature set.

In [None]:
# Step 3: Model Development

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier

# One single tree and two 100-estimator ensembles, all seeded identically
# so that repeated runs give the same fitted models.
decision_tree = DecisionTreeClassifier(random_state=42)
random_forest = RandomForestClassifier(n_estimators=100, random_state=42)
ada_boost = AdaBoostClassifier(n_estimators=100, random_state=42)

# Display name -> estimator; later steps iterate over this mapping
models = {
    "Decision Tree": decision_tree,
    "Random Forest": random_forest,
    "AdaBoost": ada_boost,
}

# Fit every estimator on the same training partition
for model in models.values():
    model.fit(X_train, y_train)

# Step 4. Model Evaluation

## Goal: Evaluate each model using Accuracy, Precision, Recall, ROC-AUC.

In [None]:
# Step 4: Model Evaluation

from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score

# Scorers that compare the true labels against hard class predictions
label_scorers = {
    "Accuracy": accuracy_score,
    "Precision": precision_score,
    "Recall": recall_score,
}

# Model name -> {metric name -> score on the test partition}
results = {}

for name, model in models.items():
    # Hard class labels, plus the probability column for the second class
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)[:, 1]

    scores = {label: scorer(y_test, y_pred) for label, scorer in label_scorers.items()}
    # ROC-AUC is computed from the probabilities rather than the hard labels
    scores["ROC-AUC"] = roc_auc_score(y_test, y_pred_proba)
    results[name] = scores

# Report every metric to four decimals, one section per model
for model_name, metrics in results.items():
    report = [f"\n{model_name} Performance:"]
    report += [f"{metric}: {value:.4f}" for metric, value in metrics.items()]
    print("\n".join(report))

## Drawing ROC curve

In [None]:
# (1): Import required functions
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt

# (2)-(4): Draw one ROC curve per trained model on a shared figure.
# Probabilities are recomputed here for each model instead of reusing a
# variable left over from an earlier loop, which would only ever hold the
# scores of the last model evaluated.
plt.figure()

for name, model in models.items():
    # Probability column for the second class, used as the ranking score
    model_proba = model.predict_proba(X_test)[:, 1]

    # False Positive Rate (FPR) and True Positive Rate (TPR) per threshold
    fpr, tpr, _thresholds = roc_curve(y_test, model_proba)

    # Area Under the Curve (AUC)
    roc_auc = auc(fpr, tpr)

    plt.plot(fpr, tpr, label=f"{name} (AUC = {roc_auc:.3f})")

# Diagonal line = random classifier
plt.plot([0, 1], [0, 1], linestyle="--", label="Random classifier")

# Labels and title
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curves - Titanic Survival Prediction")
plt.legend()
plt.show()

# Step 5. Confusion Matrix Analysis

## Goal: Compute and interpret TP, FP, TN, FN for each model.

In [None]:
# Step 5: Confusion Matrix Analysis

from sklearn.metrics import confusion_matrix

for name, model in models.items():
    y_pred = model.predict(X_test)
    cm = confusion_matrix(y_test, y_pred)
    # Row-major flattening of the 2x2 matrix (rows = true class,
    # columns = predicted class) yields TN, FP, FN, TP in that order.
    TN, FP, FN, TP = cm.flatten()

    print(f"\n{name} Confusion Matrix:")
    print(cm)

    # Each cell count with a plain-language reading of what it means
    breakdown = (
        ("TP:", TP, "- Correctly predicted survivors"),
        ("FP:", FP, "- Predicted survived but did not"),
        ("TN:", TN, "- Correctly predicted non-survivors"),
        ("FN:", FN, "- Predicted non-survivor but survived"),
    )
    for tag, count, meaning in breakdown:
        print(tag, count, meaning)

# Step 6. Model Comparison

## Goal: Compare models and discuss which performs best.

In [None]:
# Step 6: Model Comparison

# `results` maps model -> {metric -> score}. Building a frame from it puts
# models in the columns, so transpose to get one row per model.
metrics_by_model = pd.DataFrame(results)
comparison_df = metrics_by_model.transpose()

print("\nModel Comparison Table:")
print(comparison_df)

### Discussion:
- Decision Tree is simple but prone to overfitting.
- Random Forest reduces variance by averaging many trees.
- AdaBoost focuses on hard-to-classify samples.
- Typically, Random Forest or AdaBoost achieves the best ROC-AUC.