In [1]:
from pathlib import Path

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

In [3]:
# Load Data
# Single place to point the notebook at the competition CSVs; change this one
# line to run the notebook against a different data directory.
DATA_DIR = Path("/kaggle/input/titanic")

train_df = pd.read_csv(DATA_DIR / "train.csv")
test_df  = pd.read_csv(DATA_DIR / "test.csv")

In [4]:
#Feature engineering

# Titles that occur only a handful of times are pooled into a single 'Rare'
# category so the one-hot encoder does not create near-empty columns that the
# model could overfit to.
# 'Dona' is the feminine form of 'Don'; it is listed so the same mapping can be
# reused on test_df -- NOTE(review): confirm it actually occurs in test.csv.
rare_titles = ['Don', 'Dona', 'Rev', 'Dr', 'Major', 'Lady', 'Sir', 'Col', 'Capt',
               'the Countess', 'Jonkheer']

# Spelling variants folded into their common equivalent.
title_aliases = {'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'}


def engineer_features(df):
    """Return a new frame with FamilySize, IsAlone and Title derived from `df`.

    The input frame is not modified. The raw 'Name', 'SibSp' and 'Parch'
    columns are dropped from the result because their information is carried
    by the derived columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'Name', 'SibSp' and 'Parch' unless it has already been
        processed by this function.

    Returns
    -------
    pandas.DataFrame
        A frame with the three derived columns appended and the three raw
        columns removed. A frame that already has 'FamilySize' is returned
        unchanged, which makes re-running this cell a no-op instead of a
        KeyError on the dropped columns.
    """
    if "FamilySize" in df.columns:
        return df

    out = df.copy()

    # 1. Family Size: siblings/spouses + parents/children + the passenger
    out["FamilySize"] = out["SibSp"] + out["Parch"] + 1

    # 2. Is Alone: 1 if the passenger has no family on board, 0 otherwise
    out["IsAlone"] = (out["FamilySize"] == 1).astype(int)

    # 3. Title: the text between the comma and the first period of Name,
    #    e.g. "Mr", "Mrs", "Master". expand=False makes str.extract return a
    #    Series (one capture group) rather than a one-column DataFrame.
    out["Title"] = (
        out["Name"]
        .str.extract(r",\s*([^\.]+)\.", expand=False)
        .str.strip()
        .replace(rare_titles, 'Rare')
        .replace(title_aliases)
    )

    # The raw columns are no longer needed; dropping 'Name' also keeps the
    # model from trying to encode the free-text name string.
    return out.drop(columns=['Name', 'SibSp', 'Parch'])


# Apply the identical transformation to both frames so that test_df has the
# same columns the pipelines are fitted on.
train_df = engineer_features(train_df)
test_df = engineer_features(test_df)


In [5]:
# Split the frame into the label (y) and the predictor columns (X)
y = train_df["Survived"]
X = train_df.drop("Survived", axis=1)

In [6]:
# Hold out 20% of the rows as a validation set.
# Stratifying on y preserves the original class distribution in both parts,
# which matters because the Survived classes are imbalanced.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

In [7]:
#Pre-processing
#Identify column types
# "number" matches every numeric dtype, not only int64/float64. A column that
# ends up as e.g. int32 would otherwise land in neither list and be dropped
# silently by the ColumnTransformer (its default is remainder="drop").
numeric_features = X_train.select_dtypes(include="number").columns
# Everything that is not numeric is treated as categorical. Taking the
# complement (instead of include=["object"]) also covers string/category
# dtypes, so the two lists together always span every column of X_train.
categorical_features = X_train.select_dtypes(exclude="number").columns

In [31]:
# Per-type preprocessing used by the linear model.

# Numeric columns: fill missing values with the median, then standardise.
# The scaling step is there to fix the ConvergenceWarning.
numeric_transformer1 = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Categorical columns: fill missing values with the most frequent category,
# then one-hot encode. handle_unknown="ignore" is passed so categories not
# seen during fit do not raise at transform time.
categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore")),
])

In [32]:
# Route each group of columns through its own transformer (scaled variant)
preprocessor1 = ColumnTransformer([
    ("num", numeric_transformer1, numeric_features),
    ("cat", categorical_transformer, categorical_features),
])

In [None]:
# Cross-Validation: 5 stratified folds, shuffled with a fixed seed
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

1. Logistic Regression

In [33]:
# Logistic regression model; max_iter puts a hard limit on the number of
# optimization steps
lg = LogisticRegression(max_iter=1000)

# Full pipeline: scaled preprocessing followed by the classifier
lg_pipeline = Pipeline([
    ("preprocessor", preprocessor1),
    ("model", lg),
])

# Tree-based Models
Since tree-based models like Gradient Boosting or Random Forests split data based on relative value rankings rather than absolute distances, they are invariant to feature scaling.

In [34]:
# Transformers for the tree-based models (no scaling step)

# Numeric columns: median imputation only
numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
])

# Categorical columns: most-frequent imputation, then a dense one-hot encoding
categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
])

# Preprocessing pipeline: route each group of columns through its transformer
preprocessor = ColumnTransformer([
    ("num", numeric_transformer, numeric_features),
    ("cat", categorical_transformer, categorical_features),
])

2. Random Forest
   Captures non-linear interactions

In [15]:
# Random Forest classifier. RandomForestClassifier is imported in the import
# cell at the top of the notebook rather than here, so all dependencies are
# visible in one place.
rf = RandomForestClassifier(
    n_estimators=200,   # number of trees in the forest
    random_state=42,    # fixed seed so results are repeatable
    n_jobs=-1,          # -1 = use all available processors
)

In [35]:
# Full pipeline: unscaled preprocessing followed by the random forest
rf_pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("model", rf),
])

3. Gradient Boosting

In [36]:
# Gradient Boosting classifier. GradientBoostingClassifier is imported in the
# import cell at the top of the notebook rather than here, so all dependencies
# are visible in one place.
gb_classifier = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    random_state=42,  # fixed seed so results are repeatable
)

# Full pipeline: unscaled preprocessing followed by the boosted model
gb_pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("model", gb_classifier),
    ]
)

In [37]:
# Candidate pipelines keyed by the label used in the printed report
models = dict(
    LogReg=lg_pipeline,
    RandomForest=rf_pipeline,
    GradientBoosting=gb_pipeline,
)

# Score every pipeline with the same folds and report the mean and the
# standard deviation of the per-fold accuracy
for name, model in models.items():
    scores = cross_val_score(estimator=model, X=X, y=y, scoring="accuracy", cv=cv)
    mean_accuracy = scores.mean()
    std_accuracy = scores.std()
    print(f"{name}: {mean_accuracy:.4f} (+/- {std_accuracy:.4f})")



LogReg: 0.8384 (+/- 0.0067)
RandomForest: 0.8372 (+/- 0.0143)
GradientBoosting: 0.8271 (+/- 0.0085)


## Model Comparison Results

Models evaluated:
- Logistic Regression
- Random Forest
- Gradient Boosting

Results:
- LogReg CV accuracy: 0.8384 (+/- 0.0067)
- RF CV accuracy: 0.8372 (+/- 0.0143)
- GB CV accuracy: 0.8271 (+/- 0.0085)

Observations:
- Which model performed best?
  Logistic Regression (LogReg) has the highest mean CV accuracy, 0.8384.
It numerically outperformed both Random Forest (0.8372) and Gradient Boosting (0.8271), although its 0.0012 lead over Random Forest is smaller than either model's fold-to-fold standard deviation, so those two are effectively tied.
The fact that a simple linear model matched or beat the more complex ensembles suggests the data may have a predominantly linear structure, or that the sample is small enough for complex models to be prone to overfitting.

- Did variance increase?
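  Yes. The fold-to-fold spread (the standard deviation of CV accuracy) is higher for both ensemble models than for Logistic Regression, although it does not rise uniformly with model complexity: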
LogReg has the lowest variance (+/- 0.0067), indicating high stability across different data folds.
Random Forest shows the highest variance (+/- 0.0143), more than double that of Logistic Regression. This suggests the RF model is highly sensitive to the specific training samples in each fold.
Gradient Boosting has moderate variance (+/- 0.0085) but the lowest overall accuracy, likely due to under-tuning or a lack of complex non-linear patterns to exploit.


- Does the performance gain justify complexity?
No. In this specific case, there is no performance gain to justify the added complexity.
Negative Gain: Moving from LogReg to GB actually results in a decrease of about 1.1 percentage points in accuracy (0.8384 → 0.8271) while increasing computational costs.
Operational Efficiency: Logistic Regression is significantly faster for inference and easier to interpret.
Stability: LogReg offers the most "reliable" predictions given its superior accuracy and lowest standard deviation.
