# Titanic - Modeling

Multi-model approach: Logistic Regression, Random Forest, Gradient Boosting, LightGBM, XGBoost, CatBoost, SVM  
**Metric:** Accuracy | **CV:** 5-Fold Stratified

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostClassifier
import optuna
import warnings
import os
import pickle

# Shared constants: one seed and one fold count for every model below.
SEED = 42
N_FOLDS = 5
np.random.seed(SEED)

# Keep cell output readable: only WARNING and above from Optuna,
# and suppress every Python warning.
optuna.logging.set_verbosity(optuna.logging.WARNING)
warnings.filterwarnings('ignore')

In [None]:
# Read the feature-engineered train/test tables
train_fe = pd.read_csv('../data/train_fe.csv')
test_fe = pd.read_csv('../data/test_fe.csv')

# Every column except the passenger id and the label is a model input.
non_feature_cols = ('PassengerId', 'Survived')
features = [col for col in train_fe.columns if col not in non_feature_cols]

# Plain NumPy arrays: the CV code below indexes rows positionally.
test_ids = test_fe['PassengerId'].values
y = train_fe['Survived'].values.astype(int)
X, X_test = train_fe[features].values, test_fe[features].values

# Quick sanity report: shapes, feature count, class balance.
print(f'X: {X.shape}, y: {y.shape}, X_test: {X_test.shape}')
print(f'Features: {len(features)}')
print(f'Target: {np.bincount(y)} (0={np.mean(y==0):.2%}, 1={np.mean(y==1):.2%})')

In [None]:
# CV helper
kf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=SEED)

def evaluate_model(model, X, y, model_name='Model', X_test_data=None):
    """Evaluate ``model`` with the shared stratified K-fold splitter ``kf``.

    The same ``model`` object is re-fitted on each training split. After each
    fit it predicts the held-out split (out-of-fold predictions) and the test
    matrix; test outputs are averaged over ``N_FOLDS``.

    Args:
        model: Estimator exposing ``fit`` and ``predict``; ``predict_proba``
            is used when present.
        X: Training feature matrix, indexed positionally by row.
        y: Training labels aligned with ``X``.
        model_name: Label used in the printed summary.
        X_test_data: Test feature matrix to predict on. Defaults to the
            notebook-level ``X_test``. If ``X`` was transformed outside this
            function (e.g. scaled), pass the identically transformed test
            matrix here; otherwise test predictions are computed on features
            the model was not trained on.

    Returns:
        Tuple ``(oof_preds, oof_probs, test_preds, test_probs, mean_acc)``:
        out-of-fold predicted labels, out-of-fold positive-class
        probabilities, mean of the per-fold test label predictions, mean of
        the per-fold test positive-class probabilities, and the mean fold
        accuracy. Both probability arrays stay all-zero when ``model`` has no
        ``predict_proba``.
    """
    # Fall back to the notebook-level test matrix so existing calls keep working.
    if X_test_data is None:
        X_test_data = X_test

    n_test = X_test_data.shape[0]
    oof_preds = np.zeros(len(y))
    oof_probs = np.zeros(len(y))
    test_preds = np.zeros(n_test)
    test_probs = np.zeros(n_test)
    fold_scores = []

    for fold, (train_idx, val_idx) in enumerate(kf.split(X, y)):
        X_tr, X_val = X[train_idx], X[val_idx]
        y_tr, y_val = y[train_idx], y[val_idx]

        model.fit(X_tr, y_tr)

        val_pred = model.predict(X_val)
        oof_preds[val_idx] = val_pred

        # Probabilities are optional: not every estimator exposes them.
        if hasattr(model, 'predict_proba'):
            oof_probs[val_idx] = model.predict_proba(X_val)[:, 1]
            test_probs += model.predict_proba(X_test_data)[:, 1] / N_FOLDS

        # Each fold contributes 1/N_FOLDS of the averaged test prediction.
        test_preds += model.predict(X_test_data) / N_FOLDS

        fold_scores.append(accuracy_score(y_val, val_pred))

    mean_acc = np.mean(fold_scores)
    std_acc = np.std(fold_scores)
    print(f'{model_name}: CV Accuracy = {mean_acc:.5f} (+/- {std_acc:.5f})')
    print(f'  Folds: {[f"{s:.4f}" for s in fold_scores]}')

    return oof_preds, oof_probs, test_preds, test_probs, mean_acc

## 1. Baseline Models

In [None]:
results = {}

# Logistic Regression (needs scaling)
# Local import keeps this cell self-contained.
from sklearn.pipeline import make_pipeline

# Globally scaled copies are kept so any cell that reads
# X_scaled / X_test_scaled keeps working.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_test_scaled = scaler.transform(X_test)

# Scale inside a pipeline and pass the raw X. evaluate_model predicts on the
# notebook-level (unscaled) X_test, so training on X_scaled would score test
# rows on features the model never saw. With the pipeline, the scaler is
# fitted on each training fold only and applied to both validation and test
# rows.
lr = LogisticRegression(C=1.0, max_iter=1000, random_state=SEED)
lr_pipeline = make_pipeline(StandardScaler(), lr)
oof_lr, probs_lr, test_lr, tprobs_lr, acc_lr = evaluate_model(lr_pipeline, X, y, 'LogisticRegression')
results['LR'] = {'oof': probs_lr, 'test': tprobs_lr, 'acc': acc_lr}

In [None]:
# Random Forest baseline (tree model: uses the unscaled features)
rf_settings = dict(
    n_estimators=500,
    max_depth=8,
    min_samples_split=5,
    min_samples_leaf=2,
    max_features='sqrt',
    random_state=SEED,
    n_jobs=-1,
)
rf = RandomForestClassifier(**rf_settings)
oof_rf, probs_rf, test_rf, tprobs_rf, acc_rf = evaluate_model(rf, X, y, 'RandomForest')
# Only the probabilities and the CV score are kept for later cells.
results['RF'] = dict(oof=probs_rf, test=tprobs_rf, acc=acc_rf)

In [None]:
# scikit-learn Gradient Boosting baseline (fixed, untuned settings)
gb_settings = dict(
    n_estimators=500,
    max_depth=4,
    learning_rate=0.05,
    subsample=0.8,
    min_samples_split=5,
    random_state=SEED,
)
gb = GradientBoostingClassifier(**gb_settings)
oof_gb, probs_gb, test_gb, tprobs_gb, acc_gb = evaluate_model(gb, X, y, 'GradientBoosting')
# Only the probabilities and the CV score are kept for later cells.
results['GB'] = dict(oof=probs_gb, test=tprobs_gb, acc=acc_gb)

## 2. LightGBM with Optuna Tuning

In [None]:
def lgb_objective(trial):
    """Optuna objective: mean CV accuracy of a LightGBM classifier.

    Samples one hyper-parameter set from ``trial``, fits a fresh
    ``LGBMClassifier`` on every split of the shared ``kf`` splitter with
    early stopping (patience 50), and returns the mean validation accuracy.
    Each fold's validation split is used both as the early-stopping
    ``eval_set`` and as the scoring set.
    """
    # Hyper-parameters sampled by Optuna for this trial.
    tuned = {
        'num_leaves': trial.suggest_int('num_leaves', 15, 63),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.5, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.5, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 50),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 10.0, log=True),
    }
    # Settings held constant across all trials.
    fixed = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'boosting_type': 'gbdt',
        'n_estimators': 1000,
        'verbose': -1,
        'random_state': SEED,
    }
    params = {**fixed, **tuned}

    def fold_accuracy(fit_rows, holdout_rows):
        # Fit on one training split and score accuracy on its validation split.
        X_fit, y_fit = X[fit_rows], y[fit_rows]
        X_hold, y_hold = X[holdout_rows], y[holdout_rows]
        clf = lgb.LGBMClassifier(**params)
        clf.fit(
            X_fit, y_fit,
            eval_set=[(X_hold, y_hold)],
            callbacks=[lgb.early_stopping(50, verbose=False), lgb.log_evaluation(0)],
        )
        return accuracy_score(y_hold, clf.predict(X_hold))

    return np.mean([fold_accuracy(tr, va) for tr, va in kf.split(X, y)])

# Seed the sampler so the 100-trial search is reproducible across runs.
# TPESampler is Optuna's default sampler, so only the seed changes here.
study_lgb = optuna.create_study(
    direction='maximize',
    study_name='lgb',
    sampler=optuna.samplers.TPESampler(seed=SEED),
)
study_lgb.optimize(lgb_objective, n_trials=100, show_progress_bar=True)

print(f'\nBest LGB CV: {study_lgb.best_value:.5f}')
print(f'Best params: {study_lgb.best_params}')

In [None]:
# Refit LightGBM with the winning Optuna trial's hyper-parameters
# NOTE(review): lgb_objective fits with early stopping, while evaluate_model
# calls plain fit(X_tr, y_tr) with n_estimators=1000 — confirm that training
# the full 1000 rounds here is intended.
best_lgb_params = {
    **study_lgb.best_params,
    # Fixed settings, same values as used during tuning.
    'objective': 'binary',
    'metric': 'binary_logloss',
    'boosting_type': 'gbdt',
    'n_estimators': 1000,
    'verbose': -1,
    'random_state': SEED,
}

lgb_model = lgb.LGBMClassifier(**best_lgb_params)
oof_lgb, probs_lgb, test_lgb, tprobs_lgb, acc_lgb = evaluate_model(lgb_model, X, y, 'LightGBM (tuned)')
results['LGB'] = dict(oof=probs_lgb, test=tprobs_lgb, acc=acc_lgb)

## 3. XGBoost with Optuna Tuning

In [None]:
def xgb_objective(trial):
    """Optuna objective: mean CV accuracy of an XGBoost classifier.

    Samples one hyper-parameter set from ``trial``, fits a fresh
    ``XGBClassifier`` on every split of the shared ``kf`` splitter, and
    returns the mean validation accuracy.
    """
    # Hyper-parameters sampled by Optuna for this trial.
    tuned = {
        'max_depth': trial.suggest_int('max_depth', 3, 9),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 10.0, log=True),
        'gamma': trial.suggest_float('gamma', 1e-8, 5.0, log=True),
    }
    # Settings held constant across all trials.
    fixed = {
        'objective': 'binary:logistic',
        'eval_metric': 'logloss',
        'n_estimators': 1000,
        'random_state': SEED,
        'verbosity': 0,
    }
    params = {**fixed, **tuned}

    fold_accuracies = []
    for fit_rows, holdout_rows in kf.split(X, y):
        X_fit, y_fit = X[fit_rows], y[fit_rows]
        X_hold, y_hold = X[holdout_rows], y[holdout_rows]

        booster = xgb.XGBClassifier(**params)
        # NOTE(review): eval_set is passed but no early stopping is configured
        # in params or fit — presumably all 1000 rounds are trained; confirm.
        booster.fit(X_fit, y_fit, eval_set=[(X_hold, y_hold)], verbose=False)

        fold_accuracies.append(accuracy_score(y_hold, booster.predict(X_hold)))

    return np.mean(fold_accuracies)

# Seed the sampler so the 100-trial search is reproducible across runs.
# TPESampler is Optuna's default sampler, so only the seed changes here.
study_xgb = optuna.create_study(
    direction='maximize',
    study_name='xgb',
    sampler=optuna.samplers.TPESampler(seed=SEED),
)
study_xgb.optimize(xgb_objective, n_trials=100, show_progress_bar=True)

print(f'\nBest XGB CV: {study_xgb.best_value:.5f}')
print(f'Best params: {study_xgb.best_params}')

In [None]:
# Refit XGBoost with the winning Optuna trial's hyper-parameters
best_xgb_params = {
    **study_xgb.best_params,
    # Fixed settings, same values as used during tuning.
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'n_estimators': 1000,
    'random_state': SEED,
    'verbosity': 0,
}

xgb_model = xgb.XGBClassifier(**best_xgb_params)
oof_xgb, probs_xgb, test_xgb, tprobs_xgb, acc_xgb = evaluate_model(xgb_model, X, y, 'XGBoost (tuned)')
results['XGB'] = dict(oof=probs_xgb, test=tprobs_xgb, acc=acc_xgb)

## 4. CatBoost with Optuna Tuning

In [None]:
def cb_objective(trial):
    """Optuna objective: mean CV accuracy of a CatBoost classifier.

    Samples one hyper-parameter set from ``trial``, fits a fresh
    ``CatBoostClassifier`` on every split of the shared ``kf`` splitter with
    the validation split passed as ``eval_set``, and returns the mean
    validation accuracy.
    """
    # 'iterations' is fixed; it is listed first to mirror the search space layout.
    params = {'iterations': 1000}
    # Hyper-parameters sampled by Optuna for this trial.
    params.update({
        'depth': trial.suggest_int('depth', 3, 8),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1e-8, 10.0, log=True),
        'bagging_temperature': trial.suggest_float('bagging_temperature', 0.0, 1.0),
        'random_strength': trial.suggest_float('random_strength', 1e-8, 10.0, log=True),
        'border_count': trial.suggest_int('border_count', 32, 255),
    })
    # Remaining settings held constant across all trials.
    params.update({
        'random_seed': SEED,
        'verbose': 0,
        'early_stopping_rounds': 50,
    })

    def fold_accuracy(fit_rows, holdout_rows):
        # Fit on one training split and score accuracy on its validation split.
        X_hold, y_hold = X[holdout_rows], y[holdout_rows]
        clf = CatBoostClassifier(**params)
        clf.fit(X[fit_rows], y[fit_rows], eval_set=(X_hold, y_hold), verbose=0)
        return accuracy_score(y_hold, clf.predict(X_hold))

    return np.mean([fold_accuracy(tr, va) for tr, va in kf.split(X, y)])

# Seed the sampler so the 100-trial search is reproducible across runs.
# TPESampler is Optuna's default sampler, so only the seed changes here.
study_cb = optuna.create_study(
    direction='maximize',
    study_name='cb',
    sampler=optuna.samplers.TPESampler(seed=SEED),
)
study_cb.optimize(cb_objective, n_trials=100, show_progress_bar=True)

print(f'\nBest CatBoost CV: {study_cb.best_value:.5f}')
print(f'Best params: {study_cb.best_params}')

In [None]:
# Refit CatBoost with the winning Optuna trial's hyper-parameters
# NOTE(review): 'early_stopping_rounds' is set, but evaluate_model calls
# fit(X_tr, y_tr) without an eval_set — presumably early stopping has nothing
# to monitor and all 1000 iterations run; confirm against the tuning setup.
best_cb_params = {
    **study_cb.best_params,
    # Fixed settings, same values as used during tuning.
    'iterations': 1000,
    'random_seed': SEED,
    'verbose': 0,
    'early_stopping_rounds': 50,
}

cb_model = CatBoostClassifier(**best_cb_params)
oof_cb, probs_cb, test_cb, tprobs_cb, acc_cb = evaluate_model(cb_model, X, y, 'CatBoost (tuned)')
results['CB'] = dict(oof=probs_cb, test=tprobs_cb, acc=acc_cb)

## 5. SVM

In [None]:
# Local import keeps this cell self-contained.
from sklearn.pipeline import make_pipeline

# Scale inside a pipeline and pass the raw X. evaluate_model predicts on the
# notebook-level (unscaled) X_test, so training on a pre-scaled matrix would
# score test rows on features the model never saw. With the pipeline, the
# scaler is fitted on each training fold only and applied to both validation
# and test rows.
svm = SVC(C=1.0, kernel='rbf', gamma='scale', probability=True, random_state=SEED)
svm_pipeline = make_pipeline(StandardScaler(), svm)
oof_svm, probs_svm, test_svm, tprobs_svm, acc_svm = evaluate_model(svm_pipeline, X, y, 'SVM')
results['SVM'] = {'oof': probs_svm, 'test': tprobs_svm, 'acc': acc_svm}

## 6. Model Comparison

In [None]:
# Rank every evaluated model by its mean CV accuracy
comparison = pd.DataFrame({
    'Model': list(results.keys()),
    'CV Accuracy': [results[m]['acc'] for m in results],
})
comparison = comparison.sort_values('CV Accuracy', ascending=False)

print('=== Model Comparison ===')
print(comparison.to_string(index=False))

# Horizontal bar chart, with the x-axis zoomed to the observed score range
# so small differences between models stay visible.
cv_scores = comparison['CV Accuracy']
fig, ax = plt.subplots(figsize=(10, 5))
bars = ax.barh(comparison['Model'], cv_scores, color='#3498db')
ax.set_xlabel('CV Accuracy')
ax.set_title('Model Comparison')
ax.set_xlim(cv_scores.min() - 0.02, cv_scores.max() + 0.01)
for bar, acc in zip(bars, cv_scores):
    # Value label just to the right of each bar, vertically centred on it.
    label_y = bar.get_y() + bar.get_height()/2
    ax.text(acc + 0.001, label_y, f'{acc:.4f}', va='center')
plt.tight_layout()
plt.show()

In [None]:
# LightGBM feature importance, from a single fit on the full training set
lgb_final = lgb.LGBMClassifier(**best_lgb_params)
lgb_final.fit(X, y)

# One row per feature, most important first.
imp = pd.DataFrame({
    'feature': features,
    'importance': lgb_final.feature_importances_,
})
imp = imp.sort_values('importance', ascending=False)

fig, ax = plt.subplots(figsize=(10, 8))
sns.barplot(data=imp, x='importance', y='feature', ax=ax, color='#3498db')
ax.set_title('LightGBM Feature Importance')
plt.tight_layout()
plt.show()

# Same ranking as plain text.
print(imp.to_string(index=False))

In [None]:
# Persist per-model OOF and test probabilities for the ensemble step
oof_df = pd.DataFrame({'PassengerId': train_fe['PassengerId'], 'Survived': y})
test_df = pd.DataFrame({'PassengerId': test_ids})

# One 'prob_<model>' column per entry in results, in insertion order.
for name, model_outputs in results.items():
    column = f'prob_{name}'
    oof_df[column] = model_outputs['oof']
    test_df[column] = model_outputs['test']

oof_df.to_csv('../data/oof_predictions.csv', index=False)
test_df.to_csv('../data/test_predictions.csv', index=False)

print('Saved OOF and test predictions for ensemble.')
print(f'Models: {list(results.keys())}')