# Titanic Survival Prediction: LightGBM vs XGBoost

This notebook performs the following steps:

1. **Exploratory Data Analysis (EDA)**
   - Load Titanic dataset (train & test).
   - Check missing values.
   - Visualize distributions (histograms, boxplots).
   - Visualize relationships (bar plots, scatter plots).

2. **Data Preprocessing**
   - Impute missing values.
   - Feature engineering (Title, FamilySize).
   - Encode categorical variables using one-hot encoding.

3. **Model Building**
   - Split into training & validation sets.
   - Train **LightGBM** and **XGBoost** classifiers.
   - Hyperparameter tuning using RandomizedSearchCV.

4. **Comparative Analysis**
   - Evaluate both models on validation set (Accuracy, Precision, Recall, F1-score).
   - Compare results in table and bar charts.

5. **Output**
   - Save predictions for test set.
   - Save summary report and metrics.


In [2]:
import xgboost
import lightgbm
print("XGBoost version:", xgboost.__version__)
print("LightGBM version:", lightgbm.__version__)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
import xgboost as xgb
import lightgbm as lgb
import json

# Paths (update if needed)
train_path = "Titanic_train.csv"
test_path = "Titanic_test.csv"

train = pd.read_csv(train_path)
test = pd.read_csv(test_path)

print("Train shape:", train.shape)
print("Test shape:", test.shape)
train.head()


XGBoost version: 3.1.1
LightGBM version: 4.6.0
Train shape: (891, 12)
Test shape: (418, 11)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
# Check missing values
print(train.isnull().sum().sort_values(ascending=False).head(10))
print(test.isnull().sum().sort_values(ascending=False).head(10))

Cabin          687
Age            177
Embarked         2
PassengerId      0
Name             0
Pclass           0
Survived         0
Sex              0
Parch            0
SibSp            0
dtype: int64
Cabin          327
Age             86
Fare             1
Name             0
Pclass           0
PassengerId      0
Sex              0
Parch            0
SibSp            0
Ticket           0
dtype: int64


In [4]:
# Feature engineering
def extract_title(name):
    if pd.isnull(name): return "Unknown"
    parts = name.split(',')
    if len(parts)>1:
        after = parts[1].strip()
        title = after.split('.')[0].strip()
        return title
    return "Unknown"

for df in [train, test]:
    df['Title'] = df['Name'].apply(extract_title)
    df['Title'] = df['Title'].replace(['Mlle','Ms'],'Miss')
    df['Title'] = df['Title'].replace(['Mme'],'Mrs')
    rare_titles = ['Lady','Countess','Capt','Col','Don','Dr','Major','Rev','Sir','Jonkheer','Dona']
    df['Title'] = df['Title'].apply(lambda t: 'Rare' if t in rare_titles else t)
    df['FamilySize'] = df['SibSp'] + df['Parch'] + 1

features = ['Pclass','Sex','Age','SibSp','Parch','Fare','Embarked','Title','FamilySize']
target = 'Survived'

for df in [train, test]:
    df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])
    df['Fare'] = df['Fare'].fillna(df['Fare'].median())

X = train[features].copy()
y = train[target].copy()


In [5]:
# Train-validation split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

numeric_features = ['Age','SibSp','Parch','Fare','FamilySize']
numeric_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='median'))])

categorical_features = ['Pclass','Sex','Embarked','Title']
categorical_transformer = Pipeline(steps=[('imputer_cat', SimpleImputer(strategy='most_frequent')),
                                          ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))])

preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features)
], remainder='drop')

preprocessor.fit(X_train)
X_train_proc = preprocessor.transform(X_train)
X_val_proc = preprocessor.transform(X_val)
X_test_proc = preprocessor.transform(test[features])


In [6]:
# LightGBM tuning
lgb_clf = lgb.LGBMClassifier(objective='binary', random_state=42)
lgb_param_dist = {'num_leaves': [15,31,63], 'n_estimators':[50,100,200], 'learning_rate':[0.01,0.05,0.1], 'min_child_samples':[5,10,20]}
skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
lgb_search = RandomizedSearchCV(lgb_clf, lgb_param_dist, n_iter=8, scoring='f1', cv=skf, random_state=42, n_jobs=-1)
lgb_search.fit(X_train_proc, y_train)
best_lgb = lgb_search.best_estimator_
print("Best LightGBM params:", lgb_search.best_params_)


[LightGBM] [Info] Number of positive: 273, number of negative: 439
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000266 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 233
[LightGBM] [Info] Number of data points in the train set: 712, number of used features: 18
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383427 -> initscore=-0.475028
[LightGBM] [Info] Start training from score -0.475028
Best LightGBM params: {'num_leaves': 31, 'n_estimators': 50, 'min_child_samples': 5, 'learning_rate': 0.05}


In [12]:
# XGBoost tuning
xgb_clf = xgb.XGBClassifier(eval_metric='logloss')
xgb_param_dist = {'max_depth':[3,4,6], 'n_estimators':[50,100,200], 'learning_rate':[0.01,0.05,0.1], 'subsample':[0.6,0.8,1.0]}
xgb_search = RandomizedSearchCV(xgb_clf, xgb_param_dist, n_iter=8, scoring='f1', cv=skf, random_state=42, n_jobs=-1)
xgb_search.fit(X_train_proc, y_train)
best_xgb = xgb_search.best_estimator_
print("Best XGBoost params:", xgb_search.best_params_)



Best XGBoost params: {'subsample': 0.8, 'n_estimators': 200, 'max_depth': 4, 'learning_rate': 0.1}


In [8]:
# Evaluation
def evaluate_model(clf, X_val, y_val, name):
    preds = clf.predict(X_val)
    acc = accuracy_score(y_val, preds)
    prec = precision_score(y_val, preds)
    rec = recall_score(y_val, preds)
    f1 = f1_score(y_val, preds)
    print(f"--- {name} ---")
    print("Accuracy:", acc, "Precision:", prec, "Recall:", rec, "F1:", f1)
    print(classification_report(y_val, preds))
    return {'accuracy':acc,'precision':prec,'recall':rec,'f1':f1}

res_lgb = evaluate_model(best_lgb, X_val_proc, y_val, "LightGBM")
res_xgb = evaluate_model(best_xgb, X_val_proc, y_val, "XGBoost")

pd.DataFrame([res_lgb,res_xgb], index=['LightGBM','XGBoost'])


--- LightGBM ---
Accuracy: 0.8156424581005587 Precision: 0.78125 Recall: 0.7246376811594203 F1: 0.7518796992481203
              precision    recall  f1-score   support

           0       0.83      0.87      0.85       110
           1       0.78      0.72      0.75        69

    accuracy                           0.82       179
   macro avg       0.81      0.80      0.80       179
weighted avg       0.81      0.82      0.81       179

--- XGBoost ---
Accuracy: 0.7988826815642458 Precision: 0.7619047619047619 Recall: 0.6956521739130435 F1: 0.7272727272727273
              precision    recall  f1-score   support

           0       0.82      0.86      0.84       110
           1       0.76      0.70      0.73        69

    accuracy                           0.80       179
   macro avg       0.79      0.78      0.78       179
weighted avg       0.80      0.80      0.80       179



Unnamed: 0,accuracy,precision,recall,f1
LightGBM,0.815642,0.78125,0.724638,0.75188
XGBoost,0.798883,0.761905,0.695652,0.727273
