# Spaceship Titanic - V2 (Anti-Overfitting)

**V1 Results:** CV 0.8265 / LB 0.8020 (LB/CV ratio 0.970, i.e. the CV score is optimistic, a sign of overfitting)  
**V2 Goal:** Reduce CV-LB gap by removing leaky/ID-like features  

**Changes from V1:**
- Removed: Group (ID), MemberNum (ID), CabinNum (too granular; replaced by the coarser CabinRegion bucket), Surname_freq (high cardinality)
- Removed: GroupSurvival_loo, FamilySurvival_loo (train/test distribution mismatch)
- Removed: redundant raw spending columns (only the log-transformed and aggregated versions are kept)
- Stronger regularization on models
- Simple average ensemble (no optimized weights)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostClassifier
from scipy.stats import rankdata
import warnings
import os

# --- Notebook-wide configuration ---
SEED = 42                # seed shared by the CV splitter and every model below
N_FOLDS = 10             # number of stratified CV folds
TARGET = 'Transported'   # name of the label column

# Display / warning settings for the notebook session.
pd.set_option('display.max_columns', 100)
warnings.filterwarnings('ignore')


def seed_everything(seed=SEED):
    """Seed Python's ``random`` module and NumPy's global RNG with *seed*.

    The seed is also written to the ``PYTHONHASHSEED`` environment variable.

    NOTE(review): Python reads ``PYTHONHASHSEED`` at interpreter start-up, so
    setting it here presumably only affects child processes -- confirm.
    """
    import random

    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)


seed_everything()
print('V2 Setup complete.')

In [None]:
# Read the three competition CSVs (paths are relative to the notebook's directory).
train, test, sample_sub = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/train.csv',
        '../data/test.csv',
        '../data/sample_submission.csv',
    )
)

print(f'Train: {train.shape}, Test: {test.shape}')

# Tag every row with its origin so the stacked frame can be split again later.
for frame, origin_flag in ((train, 1), (test, 0)):
    frame['is_train'] = origin_flag
test[TARGET] = np.nan  # the test file carries no labels

# Stack train on top of test so feature engineering sees both at once.
df = pd.concat([train, test], axis=0, ignore_index=True)
print(f'Combined: {df.shape}')

## Feature Engineering V2 - Clean & Generalizable

In [None]:
spend_cols = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

def feature_engineering_v2(df):
    """V2: Clean features, no ID-like columns, less redundancy.

    Adds the engineered columns to ``df`` **in place** and returns the same
    object. The frame must contain ``PassengerId``, ``Cabin``, ``Name``,
    ``CryoSleep``, ``VIP``, ``Age``, ``HomePlanet``, ``Destination`` and every
    column listed in ``spend_cols``.

    Columns are created in a fixed order; the caller derives the model's
    feature order from ``df.columns``, so keep that order stable.

    Args:
        df: combined passenger frame (one row per passenger).

    Returns:
        ``df`` itself, with the engineered columns appended.
    """

    # === PassengerId: only GroupSize and IsAlone (NO Group/MemberNum) ===
    # '_Group' is a helper key (the part of PassengerId before '_').
    df['_Group'] = df['PassengerId'].str.split('_').str[0].astype(int)
    df['GroupSize'] = df.groupby('_Group')['PassengerId'].transform('count')
    df['IsAlone'] = (df['GroupSize'] == 1).astype(int)

    # === Cabin: Deck + Side + bucketed CabinNum ===
    cabin_parts = df['Cabin'].str.split('/')  # split once, reuse three times
    df['Deck'] = cabin_parts.str[0]
    df['CabinNum'] = cabin_parts.str[1].astype(float)
    df['Side'] = cabin_parts.str[2]
    df['CabinRegion'] = (df['CabinNum'] // 100).astype(float)  # broad region

    # === Name: only FamilySize (no surname freq) ===
    df['Surname'] = df['Name'].str.split().str[-1]
    df['FamilySize'] = df.groupby('Surname')['PassengerId'].transform('count')
    # Rows without a name get no group count; treat them as a family of one.
    df.loc[df['Surname'].isna(), 'FamilySize'] = 1

    # === Boolean features ===
    # Accept real booleans and their string spellings; anything else -> NaN.
    bool_codes = {True: 1, False: 0, 'True': 1, 'False': 0}
    df['CryoSleep'] = df['CryoSleep'].map(bool_codes)
    df['VIP'] = df['VIP'].map(bool_codes)

    # === Spending: log transforms + aggregated (NO raw duplicates) ===
    # sum(axis=1) skips NaN, i.e. a missing spend counts as 0 in the aggregates.
    df['TotalSpend'] = df[spend_cols].sum(axis=1)
    df['TotalSpend_log'] = np.log1p(df['TotalSpend'])
    df['NoSpend'] = (df['TotalSpend'] == 0).astype(int)
    df['NumServicesUsed'] = (df[spend_cols] > 0).sum(axis=1)

    for col in spend_cols:
        df[f'{col}_log'] = np.log1p(df[col])

    # Luxury vs basic.
    # FIX: use the same NaN-skipping sum as TotalSpend. The previous `a + b + c`
    # form turned the whole aggregate into NaN when a single component was
    # missing, so it disagreed with TotalSpend for the same passenger.
    df['LuxurySpend'] = np.log1p(df[['Spa', 'VRDeck', 'RoomService']].sum(axis=1))
    df['BasicSpend'] = np.log1p(df[['FoodCourt', 'ShoppingMall']].sum(axis=1))

    # === Age features ===
    # FIX: include_lowest=True keeps Age == 0 in the first bucket (it fell
    # outside the right-closed bins before and became NaN); the open-ended last
    # bin keeps ages above 80 in the oldest bucket instead of NaN.
    df['AgeGroup'] = pd.cut(
        df['Age'],
        bins=[0, 5, 12, 18, 30, 50, np.inf],
        labels=[0, 1, 2, 3, 4, 5],
        include_lowest=True,
    ).astype(float)
    # A missing Age compares False, so IsChild is 0.0 for unknown ages.
    df['IsChild'] = (df['Age'] < 18).astype(float)

    # === Interactions ===
    df['CryoSleep_NoSpend'] = ((df['CryoSleep'] == 1) & (df['TotalSpend'] == 0)).astype(int)

    # === Group-level spending (generalizable, not ID-based) ===
    df['GroupSpend_mean'] = df.groupby('_Group')['TotalSpend'].transform('mean')
    df['GroupSpend_mean_log'] = np.log1p(df['GroupSpend_mean'])

    # === Encode categoricals ===
    # Sorted integer codes with missing values as their own 'nan' category.
    # pd.factorize(sort=True) yields the same codes LabelEncoder produced on the
    # stringified column, without using a target encoder on input features.
    for col in ['HomePlanet', 'Destination', 'Deck', 'Side']:
        as_text = df[col].fillna('nan').astype(str)
        df[col + '_le'] = pd.factorize(as_text, sort=True)[0]

    # Frequency encodings (low cardinality only); missing values stay NaN here.
    for col in ['HomePlanet', 'Destination', 'Deck', 'Side']:
        freq = df[col].value_counts(normalize=True)
        df[col + '_freq'] = df[col].map(freq)

    return df

# Run the V2 feature pipeline on the combined train+test frame and report the column count.
df = feature_engineering_v2(df)
print(f'After FE: {df.shape[1]} columns')

In [None]:
# === Smart imputation ===
# The engineered columns were derived from the raw values BEFORE this cell runs,
# and the raw spend columns are excluded from the model features later on.
# Every raw value repaired here is therefore mirrored into the derived columns;
# otherwise the repair would never reach the models.

# CryoSleep passengers must have 0 spending
for col in spend_cols:
    mask = (df['CryoSleep'] == 1) & (df[col].isna())
    df.loc[mask, col] = 0
    # log1p(0) == 0; without this the *_log twin would be median-filled below.
    df.loc[mask, f'{col}_log'] = 0.0

# Aggregates that were NaN only because a component was missing: recompute them
# from the repaired raw values. Rows that already have a value are left alone.
df['LuxurySpend'] = df['LuxurySpend'].fillna(
    np.log1p(df['Spa'] + df['VRDeck'] + df['RoomService'])
)
df['BasicSpend'] = df['BasicSpend'].fillna(
    np.log1p(df['FoodCourt'] + df['ShoppingMall'])
)

# Passengers with 0 total spend might be CryoSleep
missing_cryo = df['CryoSleep'].isna()
df.loc[missing_cryo & (df['TotalSpend'] == 0), 'CryoSleep'] = 1
df.loc[missing_cryo & (df['TotalSpend'] > 0), 'CryoSleep'] = 0

# The interaction was computed while CryoSleep still had gaps; refresh it so it
# agrees with the imputed flag.
df['CryoSleep_NoSpend'] = ((df['CryoSleep'] == 1) & (df['TotalSpend'] == 0)).astype(int)

# The target is NaN for test rows by construction; never impute it.
impute_cols = [c for c in df.columns if c != TARGET]

# Fill numericals with median
for col in df[impute_cols].select_dtypes(include=[np.number]).columns:
    if df[col].isnull().any():
        df[col] = df[col].fillna(df[col].median())

# Fill categoricals with mode
for col in df[impute_cols].select_dtypes(include=['object']).columns:
    if df[col].isnull().any():
        df[col] = df[col].fillna(df[col].mode()[0])

# Count only feature-side nulls (the unknown test labels are expected to be NaN).
print(f'Remaining nulls: {df[impute_cols].isnull().sum().sum()}')

In [None]:
# === Define features (explicit, no ID-like) ===
# Everything listed here is excluded from the model matrix.
drop_cols = [
    # identifiers, free text, bookkeeping and the label
    'PassengerId', 'Name', 'Cabin', 'Surname', 'is_train', TARGET,
    # raw categoricals: keep encoded versions
    'HomePlanet', 'Destination', 'Deck', 'Side',
    # internal, ID-like
    '_Group',
    # too granular, keep CabinRegion
    'CabinNum',
    # keep log versions
    'TotalSpend', 'GroupSpend_mean',
    # raw spend columns: keep log versions only
    *spend_cols,
]

excluded = set(drop_cols)
features = [name for name in df.columns if name not in excluded]

print(f'V2 features: {len(features)}')
for rank, name in enumerate(sorted(features), start=1):
    print(f'  {rank:2d}. {name}')

In [None]:
# Split the combined frame back into its labelled and unlabelled parts.
train_df = df.loc[df['is_train'] == 1].copy()
test_df = df.loc[df['is_train'] == 0].copy()

# Plain NumPy matrices for the model cells below.
X = train_df[features].to_numpy()
X_test = test_df[features].to_numpy()
y = train_df[TARGET].to_numpy().astype(int)

print(f'X: {X.shape}, y: {y.shape}, X_test: {X_test.shape}')
print(f'Target mean: {y.mean():.4f}')

## LightGBM (more regularized)

In [None]:
# LightGBM hyper-parameters; trailing notes record what changed relative to V1.
lgb_params = dict(
    objective='binary',
    metric='binary_logloss',
    boosting_type='gbdt',
    num_leaves=20,           # V1: 31 -> reduced
    learning_rate=0.03,
    feature_fraction=0.7,    # V1: 0.8 -> more regularization
    bagging_fraction=0.7,    # V1: 0.8 -> more regularization
    bagging_freq=5,
    min_child_samples=30,    # V1: 20 -> more regularization
    reg_alpha=0.5,           # V1: 0.1 -> stronger L1
    reg_lambda=2.0,          # V1: 1.0 -> stronger L2
    max_depth=5,             # V1: unlimited -> capped
    n_estimators=5000,
    verbose=-1,
    n_jobs=-1,
    random_state=SEED,
)

# Accumulators: out-of-fold probabilities, fold-averaged test probabilities
# and fold-averaged feature importances.
oof_lgb = np.zeros(len(X))
test_lgb = np.zeros(len(X_test))
fi_lgb = np.zeros(len(features))

# The same splitter object is reused by the later model cells.
kf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=SEED)

for fold_no, (fit_rows, holdout_rows) in enumerate(kf.split(X, y), start=1):
    X_fit, y_fit = X[fit_rows], y[fit_rows]
    X_hold, y_hold = X[holdout_rows], y[holdout_rows]

    booster = lgb.LGBMClassifier(**lgb_params)
    # NOTE(review): early stopping monitors the same fold that is scored below,
    # so the reported CV accuracy is presumably a little optimistic.
    booster.fit(
        X_fit, y_fit,
        eval_set=[(X_hold, y_hold)],
        callbacks=[lgb.early_stopping(200), lgb.log_evaluation(500)],
    )

    hold_proba = booster.predict_proba(X_hold)[:, 1]
    oof_lgb[holdout_rows] = hold_proba
    test_lgb += booster.predict_proba(X_test)[:, 1] / N_FOLDS
    fi_lgb += booster.feature_importances_ / N_FOLDS

    fold_acc = accuracy_score(y_hold, (hold_proba > 0.5).astype(int))
    print(f'Fold {fold_no}/{N_FOLDS} - Accuracy: {fold_acc:.5f}')

# Overall accuracy over all out-of-fold predictions.
lgb_acc = accuracy_score(y, (oof_lgb > 0.5).astype(int))
print(f'\nLightGBM V2 CV Accuracy: {lgb_acc:.5f}')

In [None]:
# Feature importance: the 20 largest fold-averaged LightGBM importances,
# ordered ascending so the biggest bar is drawn at the top.
fi_df = (
    pd.DataFrame({'feature': features, 'importance': fi_lgb})
    .sort_values('importance', ascending=True)
    .tail(20)
)

fig, ax = plt.subplots(figsize=(10, 7))
ax.barh(fi_df['feature'], fi_df['importance'], color='steelblue')
ax.set_title(f'LightGBM V2 Feature Importance - CV Acc: {lgb_acc:.5f}')
ax.set_xlabel('Importance')
fig.tight_layout()
plt.show()

## XGBoost (more regularized)

In [None]:
# XGBoost hyper-parameters; trailing notes record what changed relative to V1.
xgb_params = dict(
    objective='binary:logistic',
    eval_metric='logloss',
    max_depth=4,              # V1: 6 -> reduced
    learning_rate=0.03,
    subsample=0.7,            # V1: 0.8
    colsample_bytree=0.7,     # V1: 0.8
    min_child_weight=10,      # V1: 5 -> more regularization
    reg_alpha=0.5,            # V1: 0.1
    reg_lambda=2.0,           # V1: 1.0
    gamma=0.1,                # NEW: min split loss
    n_estimators=5000,
    early_stopping_rounds=200,
    tree_method='hist',
    random_state=SEED,
    verbosity=0,
)

# Out-of-fold and fold-averaged test probabilities.
oof_xgb = np.zeros(len(X))
test_xgb = np.zeros(len(X_test))

for fold_no, (fit_rows, holdout_rows) in enumerate(kf.split(X, y), start=1):
    X_fit, y_fit = X[fit_rows], y[fit_rows]
    X_hold, y_hold = X[holdout_rows], y[holdout_rows]

    booster = xgb.XGBClassifier(**xgb_params)
    # The held-out fold doubles as the early-stopping monitor.
    booster.fit(X_fit, y_fit, eval_set=[(X_hold, y_hold)], verbose=500)

    hold_proba = booster.predict_proba(X_hold)[:, 1]
    oof_xgb[holdout_rows] = hold_proba
    test_xgb += booster.predict_proba(X_test)[:, 1] / N_FOLDS

    fold_acc = accuracy_score(y_hold, (hold_proba > 0.5).astype(int))
    print(f'Fold {fold_no}/{N_FOLDS} - Accuracy: {fold_acc:.5f}')

# Overall accuracy over all out-of-fold predictions.
xgb_acc = accuracy_score(y, (oof_xgb > 0.5).astype(int))
print(f'\nXGBoost V2 CV Accuracy: {xgb_acc:.5f}')

## CatBoost (more regularized)

In [None]:
# CatBoost hyper-parameters (identical for every fold); trailing notes record
# what changed relative to V1.
cb_params = dict(
    iterations=5000,
    learning_rate=0.03,
    depth=4,                # V1: 6 -> reduced
    l2_leaf_reg=5.0,        # V1: 3.0 -> stronger
    subsample=0.7,          # V1: 0.8
    colsample_bylevel=0.7,  # V1: 0.8
    # NOTE(review): CatBoost documents min_data_in_leaf for the Depthwise and
    # Lossguide grow policies -- confirm it has any effect with the default.
    min_data_in_leaf=30,    # V1: 20 -> more regularization
    random_seed=SEED,
    verbose=500,
    early_stopping_rounds=200,
    task_type='CPU',
)

# Out-of-fold and fold-averaged test probabilities.
oof_cb = np.zeros(len(X))
test_cb = np.zeros(len(X_test))

for fold_no, (fit_rows, holdout_rows) in enumerate(kf.split(X, y), start=1):
    X_fit, y_fit = X[fit_rows], y[fit_rows]
    X_hold, y_hold = X[holdout_rows], y[holdout_rows]

    learner = CatBoostClassifier(**cb_params)
    # The held-out fold doubles as the early-stopping monitor.
    learner.fit(X_fit, y_fit, eval_set=(X_hold, y_hold))

    hold_proba = learner.predict_proba(X_hold)[:, 1]
    oof_cb[holdout_rows] = hold_proba
    test_cb += learner.predict_proba(X_test)[:, 1] / N_FOLDS

    fold_acc = accuracy_score(y_hold, (hold_proba > 0.5).astype(int))
    print(f'Fold {fold_no}/{N_FOLDS} - Accuracy: {fold_acc:.5f}')

# Overall accuracy over all out-of-fold predictions.
cb_acc = accuracy_score(y, (oof_cb > 0.5).astype(int))
print(f'\nCatBoost V2 CV Accuracy: {cb_acc:.5f}')

## Ensemble (simple average, no weight optimization)

In [None]:
# Recap of the three single-model CV accuracies.
print('=== V2 Individual Scores ===')
print(f'LightGBM: {lgb_acc:.5f}')
print(f'XGBoost:  {xgb_acc:.5f}')
print(f'CatBoost: {cb_acc:.5f}')

# Simple average only (no weight optimization to avoid overfitting)
oof_avg = (oof_lgb + oof_xgb + oof_cb) / 3
avg_acc = accuracy_score(y, (oof_avg > 0.5).astype(int))
print(f'\nSimple Average Ensemble: {avg_acc:.5f}')

# Majority voting: count how many of the three models predict the positive
# class per row; at least two votes wins. Reported for comparison only.
votes = sum((proba > 0.5).astype(int) for proba in (oof_lgb, oof_xgb, oof_cb))
vote_acc = accuracy_score(y, (votes >= 2).astype(int))
print(f'Majority Voting: {vote_acc:.5f}')

# Use simple average for submission (safest for LB)
final_proba = (test_lgb + test_xgb + test_cb) / 3
final_preds = final_proba > 0.5

n_true = final_preds.sum()
print(f'\nTest: {n_true} True / {len(final_preds) - n_true} False')
print(f'Ratio: {final_preds.mean():.4f}')

# V1 figures are hard-coded from the earlier run; the V2 LB score is not known here.
print(f'\n=== V1 vs V2 Comparison ===')
print(f'V1 CV: 0.82653 | V1 LB: 0.80196 | Gap: 0.02457')
print(f'V2 CV: {avg_acc:.5f} | V2 LB: TBD     | Expected gap: smaller')

## Submission

In [None]:
# Build the submission frame: one boolean prediction per test passenger.
submission = pd.DataFrame({
    'PassengerId': test_df['PassengerId'].values,
    'Transported': final_preds
})
submission['Transported'] = submission['Transported'].astype(bool)

# Validate format BEFORE writing, so a malformed file never lands on disk.
# Explicit exceptions are used because `assert` is stripped under `python -O`.
if submission.shape[0] != sample_sub.shape[0]:
    raise ValueError(
        f'Submission has {submission.shape[0]} rows, expected {sample_sub.shape[0]}'
    )
if list(submission.columns) != list(sample_sub.columns):
    raise ValueError(
        f'Submission columns {list(submission.columns)} do not match '
        f'{list(sample_sub.columns)}'
    )
if submission['Transported'].dtype != bool:
    raise TypeError(
        f"'Transported' must be bool, got {submission['Transported'].dtype}"
    )

# Create the output directory on first run instead of failing in to_csv.
submission_path = '../submissions/submission_v2.csv'
os.makedirs(os.path.dirname(submission_path), exist_ok=True)
submission.to_csv(submission_path, index=False)

print('V2 Submission saved: submissions/submission_v2.csv')
print(f'Shape: {submission.shape}')
print(submission['Transported'].value_counts(normalize=True))
submission.head()