# Spaceship Titanic - V5 (V2 + Regularized Target Encoding)

**History:**
- V1: CV 0.8265 / LB 0.8020 (49 features, overfit)
- V2: CV 0.8141 / LB 0.8071 (29 features, best LB)
- V3: CV 0.8184 / LB 0.8041 (56 features, TE on interactions = overfit)
- V4: CV 0.8146 / LB TBD (33 features, V2+simple EDA)

**V5 Strategy:** V2 base (29 features) + ONLY target encoding on 3 low-cardinality categoricals:
- HomePlanet (3 cats), Destination (3 cats), Deck (8 cats)

**Why V3 TE failed:**
- Encoded HIGH-cardinality interactions (DeckSide: 16 cats, PlanetDeck: 24 cats)
- Small samples per category → noisy means → overfit
- 56 total features = too much noise

**V5 TE safeguards:**
1. 10-fold OOF (matching main CV) instead of V3's 5-fold
2. Bayesian smoothing: blend cat_mean with global_mean weighted by sample size
3. Only 3 low-cardinality columns → stable means (min ~300 samples per cat)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostClassifier
import warnings, os

# Suppress all warnings for the rest of the notebook run.
warnings.filterwarnings('ignore')

# Experiment-wide configuration.
SEED = 42                  # random seed used for fold splits and models
N_FOLDS = 10               # number of cross-validation folds
TARGET = 'Transported'     # name of the label column


def seed_everything(seed=SEED):
    """Seed Python's `random`, NumPy's global RNG and PYTHONHASHSEED.

    Args:
        seed: Integer seed applied to every generator (defaults to SEED).
    """
    import random

    os.environ['PYTHONHASHSEED'] = str(seed)
    for apply_seed in (random.seed, np.random.seed):
        apply_seed(seed)


seed_everything()
print('V5 Setup complete.')

In [None]:
# Read the three competition files (paths are relative to the notebook).
train, test, sample_sub = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/train.csv',
        '../data/test.csv',
        '../data/sample_submission.csv',
    )
)

print(f'Train: {train.shape}, Test: {test.shape}')

# Tag each row with its origin and give test a placeholder label column so
# both frames can be stacked and feature-engineered together.
train['is_train'], test['is_train'] = 1, 0
test[TARGET] = np.nan
df = pd.concat([train, test], axis=0, ignore_index=True)

# Per-amenity spending columns, reused by the imputation and feature cells.
spend_cols = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
print(f'Combined: {df.shape}')

## Feature Engineering V5 = V2 Base (identical)

In [None]:
# === V2 Feature Engineering (IDENTICAL copy) ===

# PassengerId: the part before '_' is used as the group id.
group_token = df['PassengerId'].str.split('_').str[0]
df['_Group'] = group_token.astype(int)
df['GroupSize'] = df.groupby('_Group')['PassengerId'].transform('count')
df['IsAlone'] = df['GroupSize'].eq(1).astype(int)

# Cabin: split once on '/' and read the three pieces as deck / number / side.
cabin_parts = df['Cabin'].str.split('/')
df['Deck'] = cabin_parts.str[0]
df['CabinNum'] = cabin_parts.str[1].astype(float)
df['Side'] = cabin_parts.str[2]
# Bucket cabin numbers into blocks of 100.
df['CabinRegion'] = (df['CabinNum'] // 100).astype(float)

# Name: last whitespace-separated token is taken as the surname.
df['Surname'] = df['Name'].str.split().str[-1]
df['FamilySize'] = df.groupby('Surname')['PassengerId'].transform('count')
# Rows without a surname get a family size of 1.
surname_missing = df['Surname'].isna()
df.loc[surname_missing, 'FamilySize'] = 1

# Booleans: accept both real bools and their string spellings; anything else
# (including missing values) maps to NaN.
bool_to_int = {True: 1, False: 0, 'True': 1, 'False': 0}
for flag_col in ('CryoSleep', 'VIP'):
    df[flag_col] = df[flag_col].map(bool_to_int)

print('Parsing done.')

In [None]:
# === V2 Imputation (label column excluded from the generic fill) ===

# Step 1: rows flagged CryoSleep == 1 with a missing spend value get 0.
for col in spend_cols:
    mask = (df['CryoSleep'] == 1) & (df[col].isna())
    df.loc[mask, col] = 0

# Step 2: infer missing CryoSleep from the row's total spend
# (DataFrame.sum skips NaN): zero total -> 1, positive total -> 0.
row_spend = df[spend_cols].sum(axis=1)
cryo_missing = df['CryoSleep'].isna()
df.loc[cryo_missing & (row_spend == 0), 'CryoSleep'] = 1
df.loc[cryo_missing & (row_spend > 0), 'CryoSleep'] = 0

# Step 3: generic fill - median for numeric columns, mode for object columns.
# TARGET is deliberately skipped: it is NaN on the appended test rows, and the
# object-dtype fill below can otherwise pick it up and write the most common
# label into rows that have no label at all.
impute_cols = [c for c in df.columns if c != TARGET]

for col in df[impute_cols].select_dtypes(include=[np.number]).columns:
    if df[col].isnull().any():
        df[col] = df[col].fillna(df[col].median())

for col in df[impute_cols].select_dtypes(include=['object']).columns:
    if df[col].isnull().any():
        df[col] = df[col].fillna(df[col].mode()[0])

# Null check covers every column except the (intentionally unfilled) label.
print(f'Imputation done. Nulls: {df[impute_cols].isnull().sum().sum()}')

In [None]:
# === V2 Spending features (V2 copy, except the AgeGroup fix for Age == 0) ===

# Row-level spend aggregates over the amenity columns.
df['TotalSpend'] = df[spend_cols].sum(axis=1)
df['TotalSpend_log'] = np.log1p(df['TotalSpend'])
df['NoSpend'] = (df['TotalSpend'] == 0).astype(int)
df['NumServicesUsed'] = (df[spend_cols] > 0).sum(axis=1)

# log1p of every individual spend column.
for col in spend_cols:
    df[f'{col}_log'] = np.log1p(df[col])

# Two hand-picked spend groupings, log-scaled.
df['LuxurySpend'] = np.log1p(df['Spa'] + df['VRDeck'] + df['RoomService'])
df['BasicSpend'] = np.log1p(df['FoodCourt'] + df['ShoppingMall'])

# Age
# pd.cut builds right-closed bins by default, so the first bin is (0, 5] and
# Age == 0 would fall outside every bin and become NaN. include_lowest=True
# closes the first bin on the left so Age == 0 lands in group 0.
# NOTE: this intentionally differs from V2, which left those rows as NaN.
df['AgeGroup'] = pd.cut(df['Age'], bins=[0, 5, 12, 18, 30, 50, 80],
                        labels=[0, 1, 2, 3, 4, 5],
                        include_lowest=True).astype(float)
df['IsChild'] = (df['Age'] < 18).astype(float)

# Interactions
df['CryoSleep_NoSpend'] = ((df['CryoSleep'] == 1) & (df['TotalSpend'] == 0)).astype(int)

# Group spending: mean TotalSpend over rows sharing the same _Group id.
df['GroupSpend_mean'] = df.groupby('_Group')['TotalSpend'].transform('mean')
df['GroupSpend_mean_log'] = np.log1p(df['GroupSpend_mean'])

# Label encoding (fit on the combined train+test frame).
for col in ['HomePlanet', 'Destination', 'Deck', 'Side']:
    le = LabelEncoder()
    df[col + '_le'] = le.fit_transform(df[col].astype(str))

# Frequency encoding: share of rows (train+test) carrying each category.
# Kept as a separate loop so the resulting column order is unchanged.
for col in ['HomePlanet', 'Destination', 'Deck', 'Side']:
    freq = df[col].value_counts(normalize=True)
    df[col + '_freq'] = df[col].map(freq)

print('V2 features done.')

## V5 Addition: Regularized Target Encoding

**Bayesian smoothed target encoding** on 3 low-cardinality features:
- HomePlanet (3 categories, min ~2000 samples each)
- Destination (3 categories, min ~1500 samples each)
- Deck (8 categories, min ~300 samples each)

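Formula: `TE = (n * cat_mean + m * global_mean) / (n + m)`  
where `n` = number of rows in the category (in the data the encoding is fitted on), `cat_mean` = mean target within that category, `global_mean` = mean target over all training rows, and `m` = smoothing factor (higher = more regularization, i.e. TE is pulled closer to `global_mean`)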

In [None]:
# Print per-category row counts on the training rows only, as a sanity check
# before target-encoding these columns.
train_mask = df['is_train'].eq(1)
train_rows = df.loc[train_mask]

print('Category sizes (train only):')
for col in ('HomePlanet', 'Destination', 'Deck'):
    size_by_category = train_rows[col].value_counts().sort_index()
    print(f'\n{col}:')
    print(size_by_category)

In [None]:
# === REGULARIZED TARGET ENCODING ===

te_cols = ['HomePlanet', 'Destination', 'Deck']
SMOOTHING = 20  # Bayesian smoothing factor

train_idx = df[df['is_train'] == 1].index
test_idx = df[df['is_train'] == 0].index

# Float copy of the label for the train rows; test rows stay NaN in the
# temporary '_target_float' column.
y_for_te = (
    df.loc[train_idx, TARGET]
    .map({True: 1.0, False: 0.0, 1: 1.0, 0: 0.0})
    .astype(float)
)
df.loc[train_idx, '_target_float'] = y_for_te.values
global_mean = float(y_for_te.mean())

print(f'Global mean: {global_mean:.4f}')
print(f'Smoothing factor: {SMOOTHING}')
print(f'TE folds: {N_FOLDS} (matching main CV)')


def _smoothed_category_means(rows, col):
    """Return the smoothed target mean per category of `col` within `rows`.

    Each category's mean of '_target_float' is blended with `global_mean`,
    weighted by the category's row count against SMOOTHING.
    """
    stats = rows.groupby(col)['_target_float'].agg(['mean', 'count'])
    weighted_sum = stats['count'] * stats['mean'] + SMOOTHING * global_mean
    return weighted_sum / (stats['count'] + SMOOTHING)


# 10-fold OOF target encoding with Bayesian smoothing.
# The splitter has a fixed integer seed, so the folds are materialised once
# and reused for every encoded column.
kf_te = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=SEED)
te_folds = list(kf_te.split(train_idx, y_for_te))

for col in te_cols:
    col_te = f'{col}_te'
    df[col_te] = global_mean  # safe initialization

    # Train rows: each fold is encoded from statistics of the other folds.
    for fit_pos, val_pos in te_folds:
        val_labels = train_idx[val_pos]
        fold_encoding = _smoothed_category_means(df.loc[train_idx[fit_pos]], col)
        # Categories unseen in the fitting folds fall back to the global mean.
        encoded = df.loc[val_labels, col].map(fold_encoding).fillna(global_mean)
        df.loc[val_labels, col_te] = encoded.values

    # Test rows: encoded from statistics over ALL train rows.
    full_encoding = _smoothed_category_means(df.loc[train_idx], col)
    encoded = df.loc[test_idx, col].map(full_encoding).fillna(global_mean)
    df.loc[test_idx, col_te] = encoded.values

    # Show the average encoded value per category on the train rows.
    print(f'\n{col}_te values:')
    te_vals = df.loc[train_idx].groupby(col)[col_te].mean()
    print(te_vals.sort_values())

# Cleanup: the temporary float label must not leak into the feature list.
df.drop('_target_float', axis=1, inplace=True)
print('\nTarget encoding done.')

In [None]:
# === DEFINE FEATURES: V2 base + 3 TE features ===
# Columns that must not reach the models: identifiers, raw strings, the label,
# bookkeeping flags, and raw values that are represented by derived features.
drop_cols = [
    'PassengerId', 'Name', 'Cabin', 'Surname', 'is_train', TARGET,
    'HomePlanet', 'Destination', 'Deck', 'Side',
    '_Group', 'CabinNum',
    'TotalSpend', 'GroupSpend_mean',
] + spend_cols

# Everything else is a feature; DataFrame column order is preserved.
excluded = set(drop_cols)
features = [name for name in df.columns if name not in excluded]

print(f'V5 features: {len(features)} (V2: 29 + 3 TE = 32)')
print()
# Alphabetical listing, flagging the target-encoded columns.
for position, name in enumerate(sorted(features), start=1):
    marker = ' *** NEW' if name.endswith('_te') else ''
    print(f'  {position:2d}. {name}{marker}')

In [None]:
# Split the combined frame back into its train and test parts.
is_train_row = df['is_train'] == 1
is_test_row = df['is_train'] == 0
train_df = df[is_train_row].copy()
test_df = df[is_test_row].copy()

# Plain NumPy matrices for the models; columns follow the `features` order.
X = train_df[features].to_numpy()
X_test = test_df[features].to_numpy()
y = train_df[TARGET].astype(int).values

print(f'X: {X.shape}, y: {y.shape}, X_test: {X_test.shape}')
print(f'Target mean: {np.mean(y):.4f}')

## LightGBM V5 (V2 regularization)

In [None]:
# V2 regularization params (identical)
lgb_params = {
    'objective': 'binary',
    'metric': 'binary_logloss',
    'boosting_type': 'gbdt',
    'num_leaves': 20,
    'learning_rate': 0.03,
    'feature_fraction': 0.7,
    'bagging_fraction': 0.7,
    'bagging_freq': 5,
    'min_child_samples': 30,
    'reg_alpha': 0.5,
    'reg_lambda': 2.0,
    'max_depth': 5,
    'n_estimators': 5000,
    'verbose': -1,
    'n_jobs': -1,
    'random_state': SEED,
}

# Accumulators: out-of-fold probabilities, fold-averaged test probabilities
# and fold-averaged feature importances.
oof_lgb = np.zeros(len(X))
test_lgb = np.zeros(len(X_test))
fi_lgb = np.zeros(len(features))

# This splitter is reused by the XGBoost and CatBoost cells below.
kf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=SEED)

for fold, (tr_idx, val_idx) in enumerate(kf.split(X, y)):
    X_tr, y_tr = X[tr_idx], y[tr_idx]
    X_val, y_val = X[val_idx], y[val_idx]

    # The validation fold doubles as the early-stopping set; progress is
    # logged every 500 rounds.
    model = lgb.LGBMClassifier(**lgb_params)
    model.fit(
        X_tr, y_tr,
        eval_set=[(X_val, y_val)],
        callbacks=[lgb.early_stopping(200), lgb.log_evaluation(500)],
    )

    val_proba = model.predict_proba(X_val)[:, 1]
    oof_lgb[val_idx] = val_proba
    test_lgb += model.predict_proba(X_test)[:, 1] / N_FOLDS
    fi_lgb += model.feature_importances_ / N_FOLDS

    fold_acc = accuracy_score(y_val, (val_proba > 0.5).astype(int))
    print(f'Fold {fold+1}/{N_FOLDS} - Accuracy: {fold_acc:.5f}')

# Overall CV accuracy from the out-of-fold predictions at a 0.5 threshold.
lgb_acc = accuracy_score(y, (oof_lgb > 0.5).astype(int))
print(f'\nLightGBM V5 CV: {lgb_acc:.5f}')

In [None]:
# Feature importance - check if TE features are useful.
# Bar chart of the 20 most important features (fold-averaged LightGBM
# importances), sorted ascending so the largest bar is drawn at the top.
fi_df = pd.DataFrame({'feature': features, 'importance': fi_lgb})
fi_df = fi_df.sort_values('importance', ascending=True).tail(20)

plt.figure(figsize=(10, 7))
colors = ['coral' if f.endswith('_te') else 'steelblue' for f in fi_df['feature']]
plt.barh(fi_df['feature'], fi_df['importance'], color=colors)
plt.title(f'LightGBM V5 Feature Importance - CV: {lgb_acc:.5f}\n(coral = target encoded)')
plt.xlabel('Importance')
plt.tight_layout()
plt.show()

# TE features ranking.
# Sort descending and reset the index so the row position is the 0-based rank.
# The rank is now printed; it was previously computed and discarded.
fi_all = (
    pd.DataFrame({'feature': features, 'importance': fi_lgb})
    .sort_values('importance', ascending=False)
    .reset_index(drop=True)
)
print('\nTE features ranking:')
for position, row in fi_all.iterrows():
    if row['feature'].endswith('_te'):
        rank = position + 1
        print(f"  {row['feature']}: importance={row['importance']:.1f} (rank {rank}/{len(fi_all)})")

## XGBoost V5 (V2 regularization)

In [None]:
# XGBoost hyper-parameters; early stopping is configured on the estimator.
xgb_params = {
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'max_depth': 4,
    'learning_rate': 0.03,
    'subsample': 0.7,
    'colsample_bytree': 0.7,
    'min_child_weight': 10,
    'reg_alpha': 0.5,
    'reg_lambda': 2.0,
    'gamma': 0.1,
    'n_estimators': 5000,
    'early_stopping_rounds': 200,
    'tree_method': 'hist',
    'random_state': SEED,
    'verbosity': 0,
}

# Out-of-fold and fold-averaged test probabilities.
oof_xgb = np.zeros(len(X))
test_xgb = np.zeros(len(X_test))

# Reuses the `kf` splitter created in the LightGBM cell.
for fold, (tr_idx, val_idx) in enumerate(kf.split(X, y)):
    X_tr, y_tr = X[tr_idx], y[tr_idx]
    X_val, y_val = X[val_idx], y[val_idx]

    # The validation fold is the evaluation set used for early stopping.
    model = xgb.XGBClassifier(**xgb_params)
    model.fit(X_tr, y_tr, eval_set=[(X_val, y_val)], verbose=500)

    val_proba = model.predict_proba(X_val)[:, 1]
    oof_xgb[val_idx] = val_proba
    test_xgb += model.predict_proba(X_test)[:, 1] / N_FOLDS

    fold_acc = accuracy_score(y_val, (val_proba > 0.5).astype(int))
    print(f'Fold {fold+1}/{N_FOLDS} - Accuracy: {fold_acc:.5f}')

# Overall CV accuracy from the out-of-fold predictions at a 0.5 threshold.
xgb_acc = accuracy_score(y, (oof_xgb > 0.5).astype(int))
print(f'\nXGBoost V5 CV: {xgb_acc:.5f}')

## CatBoost V5 (V2 regularization)

In [None]:
# Out-of-fold and fold-averaged test probabilities.
oof_cb = np.zeros(len(X))
test_cb = np.zeros(len(X_test))

# CatBoost settings, shared by every fold.
# NOTE(review): min_data_in_leaf may only take effect with non-default grow
# policies - confirm against the CatBoost parameter reference.
cb_params = dict(
    iterations=5000,
    learning_rate=0.03,
    depth=4,
    l2_leaf_reg=5.0,
    subsample=0.7,
    colsample_bylevel=0.7,
    min_data_in_leaf=30,
    random_seed=SEED,
    verbose=500,
    early_stopping_rounds=200,
    task_type='CPU',
)

# Reuses the `kf` splitter created in the LightGBM cell.
for fold, (tr_idx, val_idx) in enumerate(kf.split(X, y)):
    X_tr, y_tr = X[tr_idx], y[tr_idx]
    X_val, y_val = X[val_idx], y[val_idx]

    # The validation fold is the evaluation set used for early stopping.
    model = CatBoostClassifier(**cb_params)
    model.fit(X_tr, y_tr, eval_set=(X_val, y_val))

    val_proba = model.predict_proba(X_val)[:, 1]
    oof_cb[val_idx] = val_proba
    test_cb += model.predict_proba(X_test)[:, 1] / N_FOLDS

    fold_acc = accuracy_score(y_val, (val_proba > 0.5).astype(int))
    print(f'Fold {fold+1}/{N_FOLDS} - Accuracy: {fold_acc:.5f}')

# Overall CV accuracy from the out-of-fold predictions at a 0.5 threshold.
cb_acc = accuracy_score(y, (oof_cb > 0.5).astype(int))
print(f'\nCatBoost V5 CV: {cb_acc:.5f}')

## Ensemble & Submission

In [None]:
# Per-model CV accuracy recap.
print('=== V5 Results ===')
print(f'LightGBM: {lgb_acc:.5f}')
print(f'XGBoost:  {xgb_acc:.5f}')
print(f'CatBoost: {cb_acc:.5f}')

# Simple average of the three out-of-fold probability vectors.
oof_avg = (oof_lgb + oof_xgb + oof_cb) / 3
avg_labels = (oof_avg > 0.5).astype(int)
avg_acc = accuracy_score(y, avg_labels)
print(f'\nSimple Average: {avg_acc:.5f}')

# Majority voting: each model votes with its own 0.5-thresholded label and a
# row is positive when at least two of the three models agree.
hard_labels = np.stack([oof_lgb, oof_xgb, oof_cb]) > 0.5
votes = hard_labels.astype(int).sum(axis=0)
vote_acc = accuracy_score(y, (votes >= 2).astype(int))
print(f'Majority Voting: {vote_acc:.5f}')

# Submission predictions: average the test probabilities, threshold at 0.5.
final_proba = (test_lgb + test_xgb + test_cb) / 3
final_preds = (final_proba > 0.5)

n_true = final_preds.sum()
print(f'\nTest: {n_true} True / {len(final_preds) - n_true} False')
print(f'Ratio: {final_preds.mean():.4f}')

# Side-by-side history; V5's CV figure is the simple-average accuracy.
print(f'\n=== VERSION COMPARISON ===')
print(f'V1: CV 0.82653 | LB 0.80196 | Gap 0.0246 | 49 features')
print(f'V2: CV 0.81410 | LB 0.80710 | Gap 0.0070 | 29 features')
print(f'V3: CV 0.81836 | LB 0.80406 | Gap 0.0143 | 56 features (TE on interactions)')
print(f'V4: CV 0.81456 | LB TBD     |            | 33 features')
print(f'V5: CV {avg_acc:.5f} | LB TBD     |            | {len(features)} features (TE low-card only)')

In [None]:
# Build the submission frame: one row per test passenger, boolean prediction.
submission = pd.DataFrame({
    'PassengerId': test_df['PassengerId'].values,
    'Transported': final_preds
})
submission['Transported'] = submission['Transported'].astype(bool)

# Validate against the sample submission BEFORE writing, so a malformed file
# is never left on disk.
assert submission.shape[0] == sample_sub.shape[0]
assert list(submission.columns) == list(sample_sub.columns)
assert submission['Transported'].dtype == bool

# Create the output directory if needed; to_csv does not create it.
submission_path = '../submissions/submission_v5.csv'
os.makedirs(os.path.dirname(submission_path), exist_ok=True)
submission.to_csv(submission_path, index=False)

print('V5 Submission saved: submissions/submission_v5.csv')
print(submission['Transported'].value_counts(normalize=True))
submission.head()