# Spaceship Titanic - V6 (Optuna Hyperparameter Optimization)

**Base:** V2 features (29 features, best LB 0.80710)  
**Goal:** Optuna 200 trials per model on LightGBM, XGBoost, CatBoost  
**Strategy:**
- 5-fold CV for Optuna (speed)
- 10-fold CV for final training (stability)
- Early stopping in each trial
- Simple average ensemble

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostClassifier
import optuna
import warnings, os, time

# Keep notebook output readable: Optuna only reports WARNING and above,
# and Python warnings from every library are suppressed.
optuna.logging.set_verbosity(optuna.logging.WARNING)
warnings.filterwarnings('ignore')

# Global experiment configuration.
SEED = 42
TARGET = 'Transported'
N_TRIALS = 200
N_FOLDS_OPTUNA = 5   # fewer folds keep each Optuna trial cheap
N_FOLDS_FINAL = 10   # more folds for the final fit


def seed_everything(seed=SEED):
    """Seed Python's ``random`` module and NumPy's global RNG.

    Also writes ``seed`` to the ``PYTHONHASHSEED`` environment variable.

    Args:
        seed: Integer seed; defaults to the module-level ``SEED``.
    """
    import random

    # NOTE(review): setting PYTHONHASHSEED inside an already running
    # interpreter presumably only affects child processes -- confirm.
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seed_fn in (random.seed, np.random.seed):
        seed_fn(seed)


seed_everything()
print(f'V6 Setup: {N_TRIALS} trials per model, {N_FOLDS_OPTUNA}-fold optuna, {N_FOLDS_FINAL}-fold final')

In [None]:
# Load the three competition CSVs in one pass.
train, test, sample_sub = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/train.csv',
        '../data/test.csv',
        '../data/sample_submission.csv',
    )
)

# Tag each row with its origin so the combined frame can be split again
# after feature engineering; the test rows get a placeholder target.
train['is_train'], test['is_train'] = 1, 0
test[TARGET] = np.nan
df = pd.concat([train, test], axis=0, ignore_index=True)

# Raw per-amenity spending columns, reused by the feature engineering.
spend_cols = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
print(f'Combined: {df.shape}')

## V2 Feature Engineering (identical)

In [None]:
# === V2 FE (copy-paste from V2) ===
# Every statement below runs on the combined train+test frame `df`, so group
# counts, medians, modes and frequency encodings are computed over both sets.
# The statements are order-dependent (features derived before the imputation
# see NaNs, features derived after it do not) -- do not reorder.

# Travel group: the part of PassengerId before '_' identifies the group.
df['_Group'] = df['PassengerId'].str.split('_').str[0].astype(int)
df['GroupSize'] = df.groupby('_Group')['PassengerId'].transform('count')
df['IsAlone'] = (df['GroupSize'] == 1).astype(int)

# Cabin is split on '/' into three parts, stored as Deck, CabinNum and Side.
# CabinRegion buckets the cabin number into blocks of 100.
df['Deck'] = df['Cabin'].str.split('/').str[0]
df['CabinNum'] = df['Cabin'].str.split('/').str[1].astype(float)
df['Side'] = df['Cabin'].str.split('/').str[2]
df['CabinRegion'] = (df['CabinNum'] // 100).astype(float)

# Family size: number of passengers sharing the last token of Name.
# Rows without a surname are treated as a family of one.
df['Surname'] = df['Name'].str.split().str[-1]
df['FamilySize'] = df.groupby('Surname')['PassengerId'].transform('count')
df.loc[df['Surname'].isna(), 'FamilySize'] = 1

# Booleans (or their string forms) -> 1/0; any other value becomes NaN.
df['CryoSleep'] = df['CryoSleep'].map({True: 1, False: 0, 'True': 1, 'False': 0})
df['VIP'] = df['VIP'].map({True: 1, False: 0, 'True': 1, 'False': 0})

# Imputation
# Rule 1: a missing spend value for a passenger flagged CryoSleep is set to 0.
for col in spend_cols:
    mask = (df['CryoSleep'] == 1) & (df[col].isna())
    df.loc[mask, col] = 0

# Rule 2: infer a missing CryoSleep from spending -- zero total => 1,
# positive total => 0. The row sum skips NaNs (pandas default), so a row whose
# spend values are all missing also counts as zero total.
mask = (df['CryoSleep'].isna()) & (df[spend_cols].sum(axis=1) == 0)
df.loc[mask, 'CryoSleep'] = 1
mask = (df['CryoSleep'].isna()) & (df[spend_cols].sum(axis=1) > 0)
df.loc[mask, 'CryoSleep'] = 0

# Rule 3: generic fallback -- median for numeric columns, mode for object
# columns. CabinRegion was derived before this step, so it is median-filled
# on its own rather than recomputed from the imputed CabinNum.
# NOTE(review): these loops are not restricted to feature columns. If TARGET
# is object dtype after the concat, test rows get the modal target value here;
# harmless as long as test targets are never read -- confirm.
for col in df.select_dtypes(include=[np.number]).columns:
    if df[col].isnull().any():
        df[col] = df[col].fillna(df[col].median())
for col in df.select_dtypes(include=['object']).columns:
    if df[col].isnull().any():
        df[col] = df[col].fillna(df[col].mode()[0])

# Spending
# Totals and log1p transforms of the (now imputed) spend columns.
df['TotalSpend'] = df[spend_cols].sum(axis=1)
df['TotalSpend_log'] = np.log1p(df['TotalSpend'])
df['NoSpend'] = (df['TotalSpend'] == 0).astype(int)
df['NumServicesUsed'] = (df[spend_cols] > 0).sum(axis=1)
for col in spend_cols:
    df[f'{col}_log'] = np.log1p(df[col])
df['LuxurySpend'] = np.log1p(df['Spa'] + df['VRDeck'] + df['RoomService'])
df['BasicSpend'] = np.log1p(df['FoodCourt'] + df['ShoppingMall'])

# Age
# NOTE(review): pd.cut uses right-closed bins and does not include the lowest
# edge by default, so an Age of exactly 0 (or above 80) falls outside every
# bin and AgeGroup is NaN for it. Left as-is to stay identical to V2;
# `include_lowest=True` would put Age 0 into bin 0.
df['AgeGroup'] = pd.cut(df['Age'], bins=[0, 5, 12, 18, 30, 50, 80], labels=[0,1,2,3,4,5]).astype(float)
df['IsChild'] = (df['Age'] < 18).astype(float)

# Interactions
df['CryoSleep_NoSpend'] = ((df['CryoSleep'] == 1) & (df['TotalSpend'] == 0)).astype(int)

# Group spending
# Mean total spend within the travel group, plus its log1p.
df['GroupSpend_mean'] = df.groupby('_Group')['TotalSpend'].transform('mean')
df['GroupSpend_mean_log'] = np.log1p(df['GroupSpend_mean'])

# Encodings
# Each categorical gets an integer label encoding (`_le`) and a relative
# frequency encoding (`_freq`), both fitted on the combined frame.
for col in ['HomePlanet', 'Destination', 'Deck', 'Side']:
    le = LabelEncoder()
    df[col + '_le'] = le.fit_transform(df[col].astype(str))
for col in ['HomePlanet', 'Destination', 'Deck', 'Side']:
    freq = df[col].value_counts(normalize=True)
    df[col + '_freq'] = df[col].map(freq)

print('V2 FE done.')

In [None]:
# V2 features (identical)
# Columns kept out of the model matrix, grouped by the reason for dropping.
_id_and_meta_cols = ['PassengerId', 'Name', 'Cabin', 'Surname', 'is_train', TARGET]
_raw_categorical_cols = ['HomePlanet', 'Destination', 'Deck', 'Side']  # `_le` / `_freq` versions are kept
_helper_cols = ['_Group', 'CabinNum']
_unlogged_total_cols = ['TotalSpend', 'GroupSpend_mean']  # log1p versions are kept
drop_cols = (
    _id_and_meta_cols
    + _raw_categorical_cols
    + _helper_cols
    + _unlogged_total_cols
    + spend_cols  # raw amounts; `<col>_log` versions are kept
)

# Preserve the frame's column order for the feature list.
_excluded = set(drop_cols)
features = [name for name in df.columns if name not in _excluded]
print(f'V2 features: {len(features)}')

# Split the combined frame back into its train and test parts.
train_df = df.loc[df['is_train'].eq(1)].copy()
test_df = df.loc[df['is_train'].eq(0)].copy()

# Plain NumPy arrays for the model APIs.
X = train_df[features].to_numpy()
y = train_df[TARGET].astype(int).to_numpy()
X_test = test_df[features].to_numpy()

print(f'X: {X.shape}, y: {y.shape}, X_test: {X_test.shape}')

## Optuna: LightGBM (200 trials)

In [None]:
def lgb_objective(trial):
    """Optuna objective for LightGBM: mean cross-validated accuracy.

    Reads the module-level ``X``, ``y``, ``SEED`` and ``N_FOLDS_OPTUNA``.

    Args:
        trial: Optuna trial used to sample the hyper-parameters.

    Returns:
        Mean accuracy (probability threshold 0.5) over the stratified folds.
    """
    # Sampled hyper-parameters; the suggest calls stay in this order so the
    # sampler sees the same sequence of requests.
    searched = {
        'num_leaves': trial.suggest_int('num_leaves', 8, 64),
        'max_depth': trial.suggest_int('max_depth', 3, 8),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.15, log=True),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.4, 0.95),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.4, 0.95),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 10),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-3, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-3, 10.0, log=True),
        'min_split_gain': trial.suggest_float('min_split_gain', 0.0, 1.0),
    }
    params = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'boosting_type': 'gbdt',
        **searched,
        # Upper bound on trees; early stopping below ends training sooner.
        'n_estimators': 5000,
        'verbose': -1,
        'n_jobs': -1,
        'random_state': SEED,
    }

    splitter = StratifiedKFold(n_splits=N_FOLDS_OPTUNA, shuffle=True, random_state=SEED)
    fold_accuracies = []

    for fit_rows, holdout_rows in splitter.split(X, y):
        X_fit, y_fit = X[fit_rows], y[fit_rows]
        X_holdout, y_holdout = X[holdout_rows], y[holdout_rows]

        clf = lgb.LGBMClassifier(**params)
        # The held-out fold serves both as the early-stopping monitor
        # (patience 50) and as the scoring set.
        clf.fit(
            X_fit, y_fit,
            eval_set=[(X_holdout, y_holdout)],
            callbacks=[lgb.early_stopping(50), lgb.log_evaluation(0)]
        )

        holdout_proba = clf.predict_proba(X_holdout)[:, 1]
        holdout_labels = (holdout_proba > 0.5).astype(int)
        fold_accuracies.append(accuracy_score(y_holdout, holdout_labels))

    return np.mean(fold_accuracies)

print(f'Starting LightGBM Optuna: {N_TRIALS} trials...')
t0 = time.time()

# Seed the sampler explicitly. Optuna's sampler keeps its own random state,
# which seed_everything() does not touch, so an unseeded study proposes
# different hyper-parameters on every run. TPESampler is Optuna's default
# single-objective sampler, so only the seeding changes here.
lgb_study = optuna.create_study(
    direction='maximize',
    study_name='lgb',
    sampler=optuna.samplers.TPESampler(seed=SEED),
)
lgb_study.optimize(lgb_objective, n_trials=N_TRIALS, show_progress_bar=False)

# Report wall-clock time, the best mean CV accuracy and its parameters.
print(f'\nLightGBM Optuna done in {time.time()-t0:.0f}s')
print(f'Best CV Accuracy: {lgb_study.best_value:.5f}')
print(f'Best params:')
for k, v in lgb_study.best_params.items():
    print(f'  {k}: {v}')

## Optuna: XGBoost (200 trials)

In [None]:
def xgb_objective(trial):
    """Optuna objective for XGBoost: mean cross-validated accuracy.

    Reads the module-level ``X``, ``y``, ``SEED`` and ``N_FOLDS_OPTUNA``.

    Args:
        trial: Optuna trial used to sample the hyper-parameters.

    Returns:
        Mean accuracy (probability threshold 0.5) over the stratified folds.
    """
    # Keyword arguments are evaluated left to right, so the suggest calls
    # reach the sampler in the order written here.
    params = dict(
        objective='binary:logistic',
        eval_metric='logloss',
        max_depth=trial.suggest_int('max_depth', 3, 8),
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.15, log=True),
        subsample=trial.suggest_float('subsample', 0.4, 0.95),
        colsample_bytree=trial.suggest_float('colsample_bytree', 0.4, 0.95),
        min_child_weight=trial.suggest_int('min_child_weight', 1, 50),
        reg_alpha=trial.suggest_float('reg_alpha', 1e-3, 10.0, log=True),
        reg_lambda=trial.suggest_float('reg_lambda', 1e-3, 10.0, log=True),
        gamma=trial.suggest_float('gamma', 0.0, 2.0),
        max_delta_step=trial.suggest_int('max_delta_step', 0, 5),
        # Upper bound on boosting rounds; early stopping (patience 50) on the
        # eval_set passed to fit() ends training sooner.
        n_estimators=5000,
        early_stopping_rounds=50,
        tree_method='hist',
        random_state=SEED,
        verbosity=0,
    )

    splitter = StratifiedKFold(n_splits=N_FOLDS_OPTUNA, shuffle=True, random_state=SEED)
    fold_accuracies = []

    for fit_rows, holdout_rows in splitter.split(X, y):
        X_fit, y_fit = X[fit_rows], y[fit_rows]
        X_holdout, y_holdout = X[holdout_rows], y[holdout_rows]

        booster = xgb.XGBClassifier(**params)
        # The held-out fold is both the early-stopping monitor and the
        # scoring set.
        booster.fit(X_fit, y_fit, eval_set=[(X_holdout, y_holdout)], verbose=0)

        holdout_proba = booster.predict_proba(X_holdout)[:, 1]
        holdout_labels = (holdout_proba > 0.5).astype(int)
        fold_accuracies.append(accuracy_score(y_holdout, holdout_labels))

    return np.mean(fold_accuracies)

print(f'Starting XGBoost Optuna: {N_TRIALS} trials...')
t0 = time.time()

# Seed the sampler explicitly. Optuna's sampler keeps its own random state,
# which seed_everything() does not touch, so an unseeded study proposes
# different hyper-parameters on every run. TPESampler is Optuna's default
# single-objective sampler, so only the seeding changes here.
xgb_study = optuna.create_study(
    direction='maximize',
    study_name='xgb',
    sampler=optuna.samplers.TPESampler(seed=SEED),
)
xgb_study.optimize(xgb_objective, n_trials=N_TRIALS, show_progress_bar=False)

# Report wall-clock time, the best mean CV accuracy and its parameters.
print(f'\nXGBoost Optuna done in {time.time()-t0:.0f}s')
print(f'Best CV Accuracy: {xgb_study.best_value:.5f}')
print(f'Best params:')
for k, v in xgb_study.best_params.items():
    print(f'  {k}: {v}')

## Optuna: CatBoost (200 trials)

In [None]:
def cb_objective(trial):
    """Optuna objective for CatBoost: mean cross-validated accuracy.

    Reads the module-level ``X``, ``y``, ``SEED`` and ``N_FOLDS_OPTUNA``.

    The bootstrap type is part of the search space. Per the CatBoost
    parameter reference, ``bagging_temperature`` applies only to the Bayesian
    bootstrap and ``subsample`` only to the Bernoulli/Poisson/MVS bootstraps,
    so passing both at once (as this function used to) is an invalid
    combination. Only the parameter matching the sampled type is suggested;
    ``study.best_params`` therefore contains ``bootstrap_type`` plus exactly
    one of the two and can be unpacked straight into ``CatBoostClassifier``.

    Args:
        trial: Optuna trial used to sample the hyper-parameters.

    Returns:
        Mean accuracy (probability threshold 0.5) over the stratified folds.
    """
    params = {
        # Upper bound on trees; early stopping (patience 50) on the eval_set
        # passed to fit() ends training sooner.
        'iterations': 5000,
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.15, log=True),
        'depth': trial.suggest_int('depth', 3, 8),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 0.1, 10.0, log=True),
        'colsample_bylevel': trial.suggest_float('colsample_bylevel', 0.4, 0.95),
        # NOTE(review): the CatBoost docs describe min_data_in_leaf as specific
        # to the Depthwise/Lossguide grow policies; with the default policy it
        # may have no effect -- confirm.
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 5, 100),
        'random_strength': trial.suggest_float('random_strength', 0.0, 5.0),
        'bootstrap_type': trial.suggest_categorical(
            'bootstrap_type', ['Bayesian', 'Bernoulli', 'MVS']
        ),
        'random_seed': SEED,
        'verbose': 0,
        'early_stopping_rounds': 50,
        'task_type': 'CPU',
    }

    # Suggest only the sampling parameter that is valid for the chosen type.
    if params['bootstrap_type'] == 'Bayesian':
        params['bagging_temperature'] = trial.suggest_float('bagging_temperature', 0.0, 5.0)
    else:
        params['subsample'] = trial.suggest_float('subsample', 0.4, 0.95)

    kf = StratifiedKFold(n_splits=N_FOLDS_OPTUNA, shuffle=True, random_state=SEED)
    scores = []

    for train_idx, val_idx in kf.split(X, y):
        X_tr, X_val = X[train_idx], X[val_idx]
        y_tr, y_val = y[train_idx], y[val_idx]

        model = CatBoostClassifier(**params)
        # The held-out fold is both the early-stopping monitor and the
        # scoring set.
        model.fit(X_tr, y_tr, eval_set=(X_val, y_val))

        preds = model.predict_proba(X_val)[:, 1]
        acc = accuracy_score(y_val, (preds > 0.5).astype(int))
        scores.append(acc)

    return np.mean(scores)

print(f'Starting CatBoost Optuna: {N_TRIALS} trials...')
t0 = time.time()

# Seed the sampler explicitly. Optuna's sampler keeps its own random state,
# which seed_everything() does not touch, so an unseeded study proposes
# different hyper-parameters on every run. TPESampler is Optuna's default
# single-objective sampler, so only the seeding changes here.
cb_study = optuna.create_study(
    direction='maximize',
    study_name='catboost',
    sampler=optuna.samplers.TPESampler(seed=SEED),
)
cb_study.optimize(cb_objective, n_trials=N_TRIALS, show_progress_bar=False)

# Report wall-clock time, the best mean CV accuracy and its parameters.
print(f'\nCatBoost Optuna done in {time.time()-t0:.0f}s')
print(f'Best CV Accuracy: {cb_study.best_value:.5f}')
print(f'Best params:')
for k, v in cb_study.best_params.items():
    print(f'  {k}: {v}')

## Optuna Results Summary

In [None]:
print('='*60)
print('OPTUNA OPTIMIZATION SUMMARY')
print('='*60)
# The fold count in the labels follows N_FOLDS_OPTUNA so the summary stays
# correct if the configuration changes (output is unchanged for 5 folds).
print(f'LightGBM best {N_FOLDS_OPTUNA}-fold CV: {lgb_study.best_value:.5f}')
print(f'XGBoost  best {N_FOLDS_OPTUNA}-fold CV: {xgb_study.best_value:.5f}')
print(f'CatBoost best {N_FOLDS_OPTUNA}-fold CV: {cb_study.best_value:.5f}')
# Reference score recorded from the earlier V2 run.
print(f'\nV2 baseline 10-fold CV:  0.81410 (simple avg ensemble)')

# Optimization history plot: one panel per model showing every trial's score
# and the running best.
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

for ax, study, name in zip(axes, [lgb_study, xgb_study, cb_study], ['LightGBM', 'XGBoost', 'CatBoost']):
    # Trials that produced no value are left out of the curve.
    values = [t.value for t in study.trials if t.value is not None]
    best_so_far = np.maximum.accumulate(values)
    ax.plot(values, alpha=0.3, label='Trial score')
    ax.plot(best_so_far, color='red', linewidth=2, label='Best so far')
    ax.set_title(f'{name} - Best: {study.best_value:.5f}')
    ax.set_xlabel('Trial')
    ax.set_ylabel('Accuracy')
    ax.legend()

plt.suptitle(f'Optuna Optimization ({N_TRIALS} trials each)', fontsize=14)
plt.tight_layout()
plt.show()

## Final Training with Optimized Params (10-fold CV)

In [None]:
# === LightGBM with best params ===
# Fixed settings first; the tuned values are unpacked last, so they would win
# on any key present in both.
best_lgb_params = {
    'objective': 'binary',
    'metric': 'binary_logloss',
    'boosting_type': 'gbdt',
    'n_estimators': 5000,
    'verbose': -1,
    'n_jobs': -1,
    'random_state': SEED,
    **lgb_study.best_params
}

oof_lgb = np.zeros(len(X))          # out-of-fold probabilities for the train rows
test_lgb = np.zeros(len(X_test))    # test probabilities, averaged over folds
fi_lgb = np.zeros(len(features))    # feature importances, averaged over folds

kf = StratifiedKFold(n_splits=N_FOLDS_FINAL, shuffle=True, random_state=SEED)

for fold, (train_idx, val_idx) in enumerate(kf.split(X, y)):
    X_tr, X_val = X[train_idx], X[val_idx]
    y_tr, y_val = y[train_idx], y[val_idx]

    model = lgb.LGBMClassifier(**best_lgb_params)
    # Early-stopping patience is 200 here (the Optuna objective uses 50).
    model.fit(
        X_tr, y_tr,
        eval_set=[(X_val, y_val)],
        callbacks=[lgb.early_stopping(200), lgb.log_evaluation(500)]
    )

    oof_lgb[val_idx] = model.predict_proba(X_val)[:, 1]
    # Each fold contributes an equal share to the averaged outputs.
    test_lgb += model.predict_proba(X_test)[:, 1] / N_FOLDS_FINAL
    fi_lgb += model.feature_importances_ / N_FOLDS_FINAL

    fold_acc = accuracy_score(y_val, (oof_lgb[val_idx] > 0.5).astype(int))
    print(f'Fold {fold+1}/{N_FOLDS_FINAL} - Accuracy: {fold_acc:.5f}')

# Accuracy over all out-of-fold predictions. The label follows N_FOLDS_FINAL
# instead of a hard-coded "10" (output is unchanged for 10 folds).
lgb_acc = accuracy_score(y, (oof_lgb > 0.5).astype(int))
print(f'\nLightGBM OPTIMIZED {N_FOLDS_FINAL}-fold CV: {lgb_acc:.5f}')

In [None]:
# === XGBoost with best params ===
# Fixed settings first; the tuned values are unpacked last, so they would win
# on any key present in both.
best_xgb_params = {
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'n_estimators': 5000,
    'early_stopping_rounds': 200,  # the Optuna objective uses 50
    'tree_method': 'hist',
    'random_state': SEED,
    'verbosity': 0,
    **xgb_study.best_params
}

oof_xgb = np.zeros(len(X))          # out-of-fold probabilities for the train rows
test_xgb = np.zeros(len(X_test))    # test probabilities, averaged over folds

# Build the splitter in this cell rather than reusing the one created in the
# LightGBM cell, so the cell also runs on its own. With the same fold count
# and seed it yields the same folds.
kf = StratifiedKFold(n_splits=N_FOLDS_FINAL, shuffle=True, random_state=SEED)

for fold, (train_idx, val_idx) in enumerate(kf.split(X, y)):
    X_tr, X_val = X[train_idx], X[val_idx]
    y_tr, y_val = y[train_idx], y[val_idx]

    model = xgb.XGBClassifier(**best_xgb_params)
    model.fit(X_tr, y_tr, eval_set=[(X_val, y_val)], verbose=500)

    oof_xgb[val_idx] = model.predict_proba(X_val)[:, 1]
    # Each fold contributes an equal share to the averaged test prediction.
    test_xgb += model.predict_proba(X_test)[:, 1] / N_FOLDS_FINAL

    fold_acc = accuracy_score(y_val, (oof_xgb[val_idx] > 0.5).astype(int))
    print(f'Fold {fold+1}/{N_FOLDS_FINAL} - Accuracy: {fold_acc:.5f}')

# Accuracy over all out-of-fold predictions. The label follows N_FOLDS_FINAL
# instead of a hard-coded "10" (output is unchanged for 10 folds).
xgb_acc = accuracy_score(y, (oof_xgb > 0.5).astype(int))
print(f'\nXGBoost OPTIMIZED {N_FOLDS_FINAL}-fold CV: {xgb_acc:.5f}')

In [None]:
# === CatBoost with best params ===
# Fixed settings first; the tuned values are unpacked last, so they would win
# on any key present in both.
best_cb_params = {
    'iterations': 5000,
    'random_seed': SEED,
    'verbose': 500,
    'early_stopping_rounds': 200,  # the Optuna objective uses 50
    'task_type': 'CPU',
    **cb_study.best_params
}

oof_cb = np.zeros(len(X))          # out-of-fold probabilities for the train rows
test_cb = np.zeros(len(X_test))    # test probabilities, averaged over folds

# Build the splitter in this cell rather than reusing the one created in the
# LightGBM cell, so the cell also runs on its own. With the same fold count
# and seed it yields the same folds.
kf = StratifiedKFold(n_splits=N_FOLDS_FINAL, shuffle=True, random_state=SEED)

for fold, (train_idx, val_idx) in enumerate(kf.split(X, y)):
    X_tr, X_val = X[train_idx], X[val_idx]
    y_tr, y_val = y[train_idx], y[val_idx]

    model = CatBoostClassifier(**best_cb_params)
    model.fit(X_tr, y_tr, eval_set=(X_val, y_val))

    oof_cb[val_idx] = model.predict_proba(X_val)[:, 1]
    # Each fold contributes an equal share to the averaged test prediction.
    test_cb += model.predict_proba(X_test)[:, 1] / N_FOLDS_FINAL

    fold_acc = accuracy_score(y_val, (oof_cb[val_idx] > 0.5).astype(int))
    print(f'Fold {fold+1}/{N_FOLDS_FINAL} - Accuracy: {fold_acc:.5f}')

# Accuracy over all out-of-fold predictions. The label follows N_FOLDS_FINAL
# instead of a hard-coded "10" (output is unchanged for 10 folds).
cb_acc = accuracy_score(y, (oof_cb > 0.5).astype(int))
print(f'\nCatBoost OPTIMIZED {N_FOLDS_FINAL}-fold CV: {cb_acc:.5f}')

## Ensemble & Submission

In [None]:
# Per-model out-of-fold accuracy.
print('=== V6 OPTIMIZED Results ===')
for model_label, model_acc in (('LightGBM:', lgb_acc), ('XGBoost: ', xgb_acc), ('CatBoost:', cb_acc)):
    print(f'{model_label} {model_acc:.5f}')

oof_by_model = (oof_lgb, oof_xgb, oof_cb)

# Simple average of the three probability vectors, thresholded at 0.5.
oof_avg = sum(oof_by_model) / 3
avg_acc = accuracy_score(y, (oof_avg > 0.5).astype(int))
print(f'\nSimple Average: {avg_acc:.5f}')

# Majority voting: a row is positive when at least two models say so.
votes = sum((model_oof > 0.5).astype(int) for model_oof in oof_by_model)
vote_acc = accuracy_score(y, (votes >= 2).astype(int))
print(f'Majority Voting: {vote_acc:.5f}')

# Submission labels come from the averaged test probabilities.
final_proba = sum((test_lgb, test_xgb, test_cb)) / 3
final_preds = final_proba > 0.5

n_true = final_preds.sum()
print(f'\nTest: {n_true} True / {len(final_preds) - n_true} False')
print(f'Ratio: {final_preds.mean():.4f}')

# Scores of earlier versions are recorded constants; only the V6 row is
# computed in this notebook.
for report_line in (
    '\n=== VERSION COMPARISON ===',
    'V1:  CV 0.82653 | LB 0.80196 | Gap 0.0246 | 49 feat | manual params',
    'V2:  CV 0.81410 | LB 0.80710 | Gap 0.0070 | 29 feat | manual params',
    'V3:  CV 0.81836 | LB 0.80406 | Gap 0.0143 | 56 feat | manual params',
    'V5:  CV 0.81767 | LB TBD     |            | 32 feat | manual params + TE',
    f'V6:  CV {avg_acc:.5f} | LB TBD     |            | 29 feat | OPTUNA {N_TRIALS} trials',
):
    print(report_line)

In [None]:
# Feature importance
# Fold-averaged LightGBM importances; keep the 20 largest, ordered ascending
# so the most important feature ends up at the top of the horizontal bars.
fi_df = (
    pd.DataFrame({'feature': features, 'importance': fi_lgb})
    .sort_values('importance', ascending=True)
    .tail(20)
)

fi_fig, fi_ax = plt.subplots(figsize=(10, 7))
fi_ax.barh(fi_df['feature'], fi_df['importance'], color='steelblue')
fi_ax.set_title(f'LightGBM V6 (Optuna) Feature Importance - CV: {lgb_acc:.5f}')
fi_ax.set_xlabel('Importance')
fi_fig.tight_layout()
plt.show()

In [None]:
# Assemble the submission frame from the test IDs and the ensemble labels.
submission = pd.DataFrame({
    'PassengerId': test_df['PassengerId'].values,
    'Transported': final_preds
})
submission['Transported'] = submission['Transported'].astype(bool)

# Validate BEFORE writing, so a malformed frame never reaches disk.
assert submission.shape[0] == sample_sub.shape[0], 'row count differs from sample submission'
assert list(submission.columns) == list(sample_sub.columns), 'columns differ from sample submission'
assert submission['Transported'].dtype == bool, 'Transported must be boolean'
# Same IDs in the same order as the sample file.
assert list(submission['PassengerId']) == list(sample_sub['PassengerId']), \
    'PassengerId values/order differ from sample submission'

submission.to_csv('../submissions/submission_v6.csv', index=False)

print('V6 Submission saved: submissions/submission_v6.csv')
print(submission['Transported'].value_counts(normalize=True))
submission.head()

In [None]:
# Save best params for future reference
# Printed as ready-to-paste dict entries, one section per model.
print('\n' + '='*60)
print('BEST HYPERPARAMETERS (save for future use)')
print('='*60)
for model_label, tuned_study in (
    ('LightGBM', lgb_study),
    ('XGBoost', xgb_study),
    ('CatBoost', cb_study),
):
    print(f'\n# {model_label} best params:')
    for param_name, param_value in tuned_study.best_params.items():
        print(f"    '{param_name}': {param_value!r},")