# Spaceship Titanic - V7 (Stacking Ensemble)

**Base:** V2 features (29 features, best LB 0.80710)  
**Params:** Optuna-optimized from V6 (200 trials each)  
**Ensemble:** Stacking with LogisticRegression meta-learner (replaces simple average)  

**Stacking architecture:**
1. Level 0: LightGBM, XGBoost, CatBoost → OOF predictions (10-fold)
2. Level 1: LogisticRegression on OOF probas → final prediction (5-fold)
3. Test predictions: average of level-1 test folds

**Why stacking > simple average:**
- Learns the blending weights from the OOF predictions (one coefficient per base model plus an intercept) instead of assuming equal weights
- Can correct calibration differences between models
- LogReg is linear = low overfit risk as meta-learner

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostClassifier
import warnings, os

# Silence library warnings so the per-fold logs stay readable.
warnings.filterwarnings('ignore')

# Experiment-wide constants.
SEED = 42               # seed shared by the splitters and the models
N_FOLDS = 10            # number of folds used by the level-0 CV loops
TARGET = 'Transported'  # name of the label column


def seed_everything(seed=SEED):
    """Seed Python's ``random``, NumPy's global RNG and set PYTHONHASHSEED.

    Args:
        seed: Value applied to all three; defaults to the module-level SEED.
    """
    import random

    os.environ['PYTHONHASHSEED'] = str(seed)
    for apply_seed in (random.seed, np.random.seed):
        apply_seed(seed)


seed_everything()
print('V7 Setup complete.')

In [None]:
# Read the three competition files (paths are relative to the notebook).
train, test, sample_sub = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/train.csv',
        '../data/test.csv',
        '../data/sample_submission.csv',
    )
)

# Tag each row with its origin and give the test rows an empty label so both
# frames can be stacked and feature-engineered together.
train = train.assign(is_train=1)
test = test.assign(is_train=0, **{TARGET: np.nan})
df = pd.concat([train, test], axis=0, ignore_index=True)

# The five per-service spending columns used throughout feature engineering.
spend_cols = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
print(f'Combined: {df.shape}')

## V2 Feature Engineering (identical)

In [None]:
# === V2 FE ===
# NOTE: new columns are created in a fixed order; the feature list is later
# derived from df.columns, so keep this order when editing.

# Group: the part of PassengerId before '_' identifies the travelling group.
df['_Group'] = df['PassengerId'].str.split('_').str[0].astype(int)
df['GroupSize'] = df.groupby('_Group')['PassengerId'].transform('count')
df['IsAlone'] = df['GroupSize'].eq(1).astype(int)

# Cabin is split on '/' into three parts: deck / number / side.
cabin_parts = df['Cabin'].str.split('/')
df['Deck'] = cabin_parts.str[0]
df['CabinNum'] = cabin_parts.str[1].astype(float)
df['Side'] = cabin_parts.str[2]
df['CabinRegion'] = (df['CabinNum'] // 100).astype(float)  # buckets of 100 cabin numbers

# Family: last whitespace-separated token of Name is taken as the surname.
df['Surname'] = df['Name'].str.split().str[-1]
df['FamilySize'] = df.groupby('Surname')['PassengerId'].transform('count')
# Rows without a surname get a family size of 1.
df['FamilySize'] = df['FamilySize'].where(df['Surname'].notna(), 1)

# Boolean flags -> 0/1; accepts both real booleans and their string form.
flag_to_int = {True: 1, False: 0, 'True': 1, 'False': 0}
for flag_col in ('CryoSleep', 'VIP'):
    df[flag_col] = df[flag_col].map(flag_to_int)

# Imputation
# Columns that must never be imputed: the label (NaN for every test row by
# construction) and the train/test marker. Previously the object-dtype loop
# below filled the test rows of the label column with the most frequent
# value, i.e. it fabricated labels.
no_impute_cols = {TARGET, 'is_train'}

# A passenger flagged as CryoSleep gets 0 for any missing spending amount.
for col in spend_cols:
    mask = (df['CryoSleep'] == 1) & (df[col].isna())
    df.loc[mask, col] = 0

# Infer a missing CryoSleep flag from spending. `sum(axis=1)` skips NaN, so
# amounts that are still missing count as 0 in this check.
mask = (df['CryoSleep'].isna()) & (df[spend_cols].sum(axis=1) == 0)
df.loc[mask, 'CryoSleep'] = 1
mask = (df['CryoSleep'].isna()) & (df[spend_cols].sum(axis=1) > 0)
df.loc[mask, 'CryoSleep'] = 0

# Remaining gaps: median for numeric columns, most frequent value for
# object columns. Statistics are computed over the combined train+test frame.
for col in df.select_dtypes(include=[np.number]).columns:
    if col in no_impute_cols:
        continue
    if df[col].isnull().any():
        df[col] = df[col].fillna(df[col].median())
for col in df.select_dtypes(include=['object']).columns:
    if col in no_impute_cols:
        continue
    if df[col].isnull().any():
        df[col] = df[col].fillna(df[col].mode()[0])

# Spending
# NOTE: columns are created in a fixed order; the feature list is later
# derived from df.columns, so keep this order when editing.
df['TotalSpend'] = df[spend_cols].sum(axis=1)
df['TotalSpend_log'] = np.log1p(df['TotalSpend'])
df['NoSpend'] = (df['TotalSpend'] == 0).astype(int)
df['NumServicesUsed'] = (df[spend_cols] > 0).sum(axis=1)  # count of services with a positive amount
for col in spend_cols:
    df[f'{col}_log'] = np.log1p(df[col])
df['LuxurySpend'] = np.log1p(df['Spa'] + df['VRDeck'] + df['RoomService'])
df['BasicSpend'] = np.log1p(df['FoodCourt'] + df['ShoppingMall'])

# Age buckets. `pd.cut` builds right-closed intervals by default, so without
# `include_lowest=True` the first bin is (0, 5] and every passenger with
# Age == 0 ends up with AgeGroup = NaN. Including the lowest edge puts them
# in bucket 0.
# NOTE: this differs from the original V2 feature for Age == 0 rows only.
age_bins = [0, 5, 12, 18, 30, 50, 80]
age_labels = list(range(len(age_bins) - 1))
df['AgeGroup'] = pd.cut(
    df['Age'], bins=age_bins, labels=age_labels, include_lowest=True
).astype(float)
df['IsChild'] = (df['Age'] < 18).astype(float)
df['CryoSleep_NoSpend'] = ((df['CryoSleep'] == 1) & (df['TotalSpend'] == 0)).astype(int)

# Mean total spend within each travelling group.
df['GroupSpend_mean'] = df.groupby('_Group')['TotalSpend'].transform('mean')
df['GroupSpend_mean_log'] = np.log1p(df['GroupSpend_mean'])

cat_cols = ('HomePlanet', 'Destination', 'Deck', 'Side')

# Pass 1: integer codes. Kept as a separate pass so that all *_le columns
# precede all *_freq columns in df (column order defines feature order).
for cat in cat_cols:
    df[f'{cat}_le'] = LabelEncoder().fit_transform(df[cat].astype(str))

# Pass 2: relative frequency of each category over the combined frame.
for cat in cat_cols:
    share = df[cat].value_counts(normalize=True)
    df[f'{cat}_freq'] = df[cat].map(share)

print('V2 FE done.')

In [None]:
# Columns that are not model inputs: identifiers, raw text, bookkeeping,
# the label, raw categoricals (their encodings are kept), intermediate
# helpers, and the raw spending amounts (their log versions are kept).
drop_cols = [
    'PassengerId', 'Name', 'Cabin', 'Surname', 'is_train', TARGET,
    'HomePlanet', 'Destination', 'Deck', 'Side',
    '_Group', 'CabinNum', 'TotalSpend', 'GroupSpend_mean',
    *spend_cols,
]

# Set lookup for the membership test; the list comprehension keeps the
# original column order.
excluded = set(drop_cols)
features = [name for name in df.columns if name not in excluded]
print(f'V2 features: {len(features)}')

# Split the combined frame back into its train and test parts.
is_train_row = df['is_train'] == 1
train_df = df[is_train_row].copy()
test_df = df[df['is_train'] == 0].copy()

# Plain NumPy arrays for the model loops.
X = train_df[features].to_numpy()
y = train_df[TARGET].astype(int).to_numpy()
X_test = test_df[features].to_numpy()

print(f'X: {X.shape}, y: {y.shape}, X_test: {X_test.shape}')

## Level 0: Base Models with Optuna Params

In [None]:
# Optuna-optimized params from V6
# LightGBM configuration, passed as keyword arguments to LGBMClassifier.
lgb_params = dict(
    # task
    objective='binary',
    metric='binary_logloss',
    boosting_type='gbdt',
    # tree shape and learning rate
    num_leaves=44,
    max_depth=7,
    learning_rate=0.030969372942932262,
    # row / column subsampling
    feature_fraction=0.6576757841672172,
    bagging_fraction=0.835862275982545,
    bagging_freq=3,
    # regularization
    min_child_samples=48,
    reg_alpha=1.196564578792742,
    reg_lambda=0.46422611416669396,
    min_split_gain=0.1325484921651904,
    # number of boosting rounds requested
    n_estimators=5000,
    # runtime
    verbose=-1,
    n_jobs=-1,
    random_state=SEED,
)

# XGBoost configuration, passed as keyword arguments to XGBClassifier.
xgb_params = dict(
    # task
    objective='binary:logistic',
    eval_metric='logloss',
    # tree shape and learning rate
    max_depth=8,
    learning_rate=0.03142745431117881,
    # row / column subsampling
    subsample=0.7765649767697068,
    colsample_bytree=0.6993827867384605,
    # regularization
    min_child_weight=8,
    reg_alpha=0.018176575713448703,
    reg_lambda=0.0015265472174367683,
    gamma=1.722059975175557,
    max_delta_step=3,
    # boosting rounds and early stopping
    n_estimators=5000,
    early_stopping_rounds=200,
    # runtime
    tree_method='hist',
    random_state=SEED,
    verbosity=0,
)

# CatBoost configuration, passed as keyword arguments to CatBoostClassifier.
cb_params = dict(
    # boosting rounds and learning rate
    iterations=5000,
    learning_rate=0.03873109015756435,
    # tree shape and regularization
    depth=7,
    l2_leaf_reg=0.21548007012285542,
    # row / column subsampling
    subsample=0.9396092698948957,
    colsample_bylevel=0.5062767287275375,
    min_data_in_leaf=16,
    # randomization
    random_strength=3.2715423389693354,
    bagging_temperature=0.8108454420073308,
    random_seed=SEED,
    # logging, early stopping, device
    verbose=500,
    early_stopping_rounds=200,
    task_type='CPU',
)

print('Optuna params loaded.')

In [None]:
# === Level 0: LightGBM ===
# Accumulators: out-of-fold probabilities for the training rows, the
# fold-averaged test probabilities, and the fold-averaged feature importances.
oof_lgb = np.zeros(len(X))
test_lgb = np.zeros(len(X_test))
fi_lgb = np.zeros(len(features))

# This splitter is reused by the other level-0 loops, so every base model
# sees the same folds.
kf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=SEED)

for fold_no, (fit_rows, holdout_rows) in enumerate(kf.split(X, y), start=1):
    X_fit, y_fit = X[fit_rows], y[fit_rows]
    X_hold, y_hold = X[holdout_rows], y[holdout_rows]

    model = lgb.LGBMClassifier(**lgb_params)
    model.fit(
        X_fit, y_fit,
        eval_set=[(X_hold, y_hold)],
        # Fresh callbacks per fold: stop after 200 rounds without improvement
        # on the holdout fold, log every 500 rounds.
        callbacks=[lgb.early_stopping(200), lgb.log_evaluation(500)],
    )

    hold_proba = model.predict_proba(X_hold)[:, 1]
    oof_lgb[holdout_rows] = hold_proba
    test_lgb += model.predict_proba(X_test)[:, 1] / N_FOLDS
    fi_lgb += model.feature_importances_ / N_FOLDS

    fold_acc = accuracy_score(y_hold, (hold_proba > 0.5).astype(int))
    print(f'Fold {fold_no}/{N_FOLDS} - LGB Accuracy: {fold_acc:.5f}')

# Accuracy of the thresholded out-of-fold predictions over all training rows.
lgb_acc = accuracy_score(y, (oof_lgb > 0.5).astype(int))
print(f'\nLightGBM 10-fold CV: {lgb_acc:.5f}')

In [None]:
# === Level 0: XGBoost ===
# Out-of-fold probabilities for the training rows and fold-averaged test
# probabilities. Uses the `kf` splitter created in the LightGBM cell.
oof_xgb = np.zeros(len(X))
test_xgb = np.zeros(len(X_test))

for fold_no, (fit_rows, holdout_rows) in enumerate(kf.split(X, y), start=1):
    X_fit, y_fit = X[fit_rows], y[fit_rows]
    X_hold, y_hold = X[holdout_rows], y[holdout_rows]

    # Early stopping is configured through xgb_params; the holdout fold is
    # the evaluation set.
    model = xgb.XGBClassifier(**xgb_params)
    model.fit(X_fit, y_fit, eval_set=[(X_hold, y_hold)], verbose=500)

    hold_proba = model.predict_proba(X_hold)[:, 1]
    oof_xgb[holdout_rows] = hold_proba
    test_xgb += model.predict_proba(X_test)[:, 1] / N_FOLDS

    fold_acc = accuracy_score(y_hold, (hold_proba > 0.5).astype(int))
    print(f'Fold {fold_no}/{N_FOLDS} - XGB Accuracy: {fold_acc:.5f}')

# Accuracy of the thresholded out-of-fold predictions over all training rows.
xgb_acc = accuracy_score(y, (oof_xgb > 0.5).astype(int))
print(f'\nXGBoost 10-fold CV: {xgb_acc:.5f}')

In [None]:
# === Level 0: CatBoost ===
# Out-of-fold probabilities for the training rows and fold-averaged test
# probabilities. Uses the `kf` splitter created in the LightGBM cell.
oof_cb = np.zeros(len(X))
test_cb = np.zeros(len(X_test))

for fold_no, (fit_rows, holdout_rows) in enumerate(kf.split(X, y), start=1):
    X_fit, y_fit = X[fit_rows], y[fit_rows]
    X_hold, y_hold = X[holdout_rows], y[holdout_rows]

    # Early stopping is configured through cb_params; the holdout fold is
    # the evaluation set.
    model = CatBoostClassifier(**cb_params)
    model.fit(X_fit, y_fit, eval_set=(X_hold, y_hold))

    hold_proba = model.predict_proba(X_hold)[:, 1]
    oof_cb[holdout_rows] = hold_proba
    test_cb += model.predict_proba(X_test)[:, 1] / N_FOLDS

    fold_acc = accuracy_score(y_hold, (hold_proba > 0.5).astype(int))
    print(f'Fold {fold_no}/{N_FOLDS} - CB Accuracy: {fold_acc:.5f}')

# Accuracy of the thresholded out-of-fold predictions over all training rows.
cb_acc = accuracy_score(y, (oof_cb > 0.5).astype(int))
print(f'\nCatBoost 10-fold CV: {cb_acc:.5f}')

In [None]:
def _oof_corr(a, b):
    """Return the Pearson correlation between two OOF probability vectors."""
    return np.corrcoef(a, b)[0, 1]


# Per-model out-of-fold accuracies.
print('=== Level 0 Results ===')
print(f'LightGBM: {lgb_acc:.5f}')
print(f'XGBoost:  {xgb_acc:.5f}')
print(f'CatBoost: {cb_acc:.5f}')

# Simple average baseline (for comparison)
oof_avg = (oof_lgb + oof_xgb + oof_cb) / 3
avg_hard = (oof_avg > 0.5).astype(int)
avg_acc = accuracy_score(y, avg_hard)
print(f'\nSimple Average (baseline): {avg_acc:.5f}')

# Check OOF correlation (diversity = better stacking)
corr_lgb_xgb = _oof_corr(oof_lgb, oof_xgb)
corr_lgb_cb = _oof_corr(oof_lgb, oof_cb)
corr_xgb_cb = _oof_corr(oof_xgb, oof_cb)
print(f'\nOOF Correlation (lower = more diverse = better stacking):')
print(f'  LGB-XGB: {corr_lgb_xgb:.4f}')
print(f'  LGB-CB:  {corr_lgb_cb:.4f}')
print(f'  XGB-CB:  {corr_xgb_cb:.4f}')

## Level 1: LogisticRegression Stacking

The meta-learner sees the 3 OOF probability columns and learns optimal blending.  
We use 5-fold CV on the meta-learner to avoid data leakage.  
We test multiple values of C (the inverse regularization strength: smaller C = stronger regularization) and keep the one with the best stacking CV accuracy.

In [None]:
# Build stacking matrices
# One column per base model: OOF probabilities for training, fold-averaged
# probabilities for test.
oof_stack = np.column_stack([oof_lgb, oof_xgb, oof_cb])
test_stack = np.column_stack([test_lgb, test_xgb, test_cb])

print(f'OOF stack shape: {oof_stack.shape}')
print(f'Test stack shape: {test_stack.shape}')

# Try multiple C values
print('\n=== LogReg Meta-Learner: C search ===')
best_c = None
best_stack_acc = 0

# The splitter has a fixed seed, so every C is scored on the same 5 folds;
# the splits are materialized once and reused.
kf_meta = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
meta_splits = list(kf_meta.split(oof_stack, y))

for C in (0.01, 0.05, 0.1, 0.5, 1.0, 5.0, 10.0):
    meta_oof = np.zeros(len(y))

    for fit_rows, holdout_rows in meta_splits:
        meta = LogisticRegression(C=C, random_state=SEED, max_iter=1000)
        meta.fit(oof_stack[fit_rows], y[fit_rows])
        meta_oof[holdout_rows] = meta.predict_proba(oof_stack[holdout_rows])[:, 1]

    acc = accuracy_score(y, (meta_oof > 0.5).astype(int))
    print(f'  C={C:6.2f} -> Stacking CV: {acc:.5f}')

    # Strict comparison: on ties the earlier (smaller) C is kept.
    if acc > best_stack_acc:
        best_stack_acc, best_c = acc, C

print(f'\nBest C: {best_c}, Best Stacking CV: {best_stack_acc:.5f}')

In [None]:
# === Final stacking with best C ===
kf_meta = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
n_meta_folds = kf_meta.get_n_splits()  # 5; used for averaging and logging

# Out-of-fold meta predictions for training rows, fold-averaged for test rows.
meta_oof_final = np.zeros(len(y))
meta_test_final = np.zeros(len(X_test))

for fold_no, (fit_rows, holdout_rows) in enumerate(kf_meta.split(oof_stack, y), start=1):
    meta = LogisticRegression(C=best_c, random_state=SEED, max_iter=1000)
    meta.fit(oof_stack[fit_rows], y[fit_rows])

    hold_proba = meta.predict_proba(oof_stack[holdout_rows])[:, 1]
    meta_oof_final[holdout_rows] = hold_proba
    meta_test_final += meta.predict_proba(test_stack)[:, 1] / n_meta_folds

    fold_acc = accuracy_score(y[holdout_rows], (hold_proba > 0.5).astype(int))
    print(f'Meta Fold {fold_no}/{n_meta_folds} - Accuracy: {fold_acc:.5f}')

    # Show weights (coefficients), in stacking-column order: LGB, XGB, CB.
    w_lgb, w_xgb, w_cb = meta.coef_[0]
    print(f'  Weights: LGB={w_lgb:.3f}, XGB={w_xgb:.3f}, CB={w_cb:.3f}')

stack_acc = accuracy_score(y, (meta_oof_final > 0.5).astype(int))
print(f'\nStacking CV Accuracy: {stack_acc:.5f}')

## Results Comparison

In [None]:
# Also compute majority voting for reference
# Each model casts a 0/1 vote at the 0.5 threshold; two or more votes win.
hard_votes = [(proba > 0.5).astype(int) for proba in (oof_lgb, oof_xgb, oof_cb)]
votes = hard_votes[0] + hard_votes[1] + hard_votes[2]
vote_acc = accuracy_score(y, (votes >= 2).astype(int))

banner = '='*60
ensemble_report = [
    banner,
    'ENSEMBLE COMPARISON (V7)',
    banner,
    f'\nLevel 0 Individual Models:',
    f'  LightGBM (Optuna):  {lgb_acc:.5f}',
    f'  XGBoost (Optuna):   {xgb_acc:.5f}',
    f'  CatBoost (Optuna):  {cb_acc:.5f}',
    f'\nEnsemble Methods:',
    f'  Simple Average:     {avg_acc:.5f}',
    f'  Majority Voting:    {vote_acc:.5f}',
    f'  Stacking (LogReg):  {stack_acc:.5f}  <-- C={best_c}',
    f'\nStacking vs Average:  {stack_acc - avg_acc:+.5f}',
]
print('\n'.join(ensemble_report))

# Scores of earlier notebook versions are recorded by hand; only the V7 row
# is filled in from this run.
version_report = [
    f'\n=== VERSION COMPARISON ===',
    f'V1:  CV 0.82653 | LB 0.80196 | Gap 0.0246 | simple avg',
    f'V2:  CV 0.81410 | LB 0.80710 | Gap 0.0070 | simple avg',
    f'V3:  CV 0.81836 | LB 0.80406 | Gap 0.0143 | simple avg',
    f'V5:  CV 0.81767 | LB TBD     |            | simple avg + TE',
    f'V6:  CV 0.81652 | LB TBD     |            | simple avg + Optuna',
    f'V7:  CV {stack_acc:.5f} | LB TBD     |            | STACKING + Optuna',
]
print('\n'.join(version_report))

In [None]:
# Generate multiple submissions for comparison
# Both frames are built and validated BEFORE anything is written, so a
# malformed submission never reaches disk, and the output directory is
# created if it does not exist yet.

# Submission 1: Stacking
final_preds_stack = (meta_test_final > 0.5)
sub_stack = pd.DataFrame({
    'PassengerId': test_df['PassengerId'].values,
    'Transported': final_preds_stack
})
sub_stack['Transported'] = sub_stack['Transported'].astype(bool)

# Submission 2: Simple average (same models, for A/B comparison)
final_preds_avg = ((test_lgb + test_xgb + test_cb) / 3 > 0.5)
sub_avg = pd.DataFrame({
    'PassengerId': test_df['PassengerId'].values,
    'Transported': final_preds_avg
})
sub_avg['Transported'] = sub_avg['Transported'].astype(bool)

# Validate against the sample submission: same row count, same columns,
# boolean label column.
for name, sub in [('stacking', sub_stack), ('average', sub_avg)]:
    assert sub.shape[0] == sample_sub.shape[0], f'{name}: unexpected row count'
    assert list(sub.columns) == list(sample_sub.columns), f'{name}: unexpected columns'
    assert sub['Transported'].dtype == bool, f'{name}: Transported is not bool'

# Write only after validation passed.
os.makedirs('../submissions', exist_ok=True)
sub_stack.to_csv('../submissions/submission_v7_stacking.csv', index=False)
sub_avg.to_csv('../submissions/submission_v7_average.csv', index=False)

# Compare predictions
diff = (final_preds_stack != final_preds_avg).sum()
print(f'Stacking vs Average: {diff} different predictions ({diff/len(final_preds_stack)*100:.1f}%)')
print(f'\nStacking: {final_preds_stack.sum()} True ({final_preds_stack.mean():.4f})')
print(f'Average:  {final_preds_avg.sum()} True ({final_preds_avg.mean():.4f})')
print(f'\nSaved: submission_v7_stacking.csv, submission_v7_average.csv')