# Spaceship Titanic - V3 (EDA-Driven Features)

**V1:** CV 0.8265 / LB 0.8020 (overfitting)  
**V2:** CV 0.8141 / LB 0.8071 (clean, good ratio)  
**V3 Goal:** Add EDA-driven features that generalize well  

**New features from EDA:**
- Deck x Side interaction
- HomePlanet x Deck interaction  
- Fine age buckets (baby/child/teen/adult)
- Spending profile (% per category)
- Non-CryoSleep + NoSpend signal
- n_missing as feature
- Target encoding (CV-safe) for key categoricals

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostClassifier
import warnings, os

# Silence library warnings so the training logs stay readable.
warnings.filterwarnings('ignore')

# Global experiment configuration shared by every cell below.
SEED = 42
N_FOLDS = 10
TARGET = 'Transported'


def seed_everything(seed=SEED):
    """Seed Python's `random`, NumPy's global RNG and PYTHONHASHSEED with `seed`."""
    import random

    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)


seed_everything()
print('V3 Setup complete.')

In [None]:
# Read the raw competition files.
train = pd.read_csv('../data/train.csv')
test = pd.read_csv('../data/test.csv')
sample_sub = pd.read_csv('../data/sample_submission.csv')

# Tag each frame with its origin so the combined frame can be split again later.
for frame, origin_flag in ((train, 1), (test, 0)):
    frame['is_train'] = origin_flag

# Test rows have no label; add the column as NaN so both frames share a schema.
test[TARGET] = np.nan
df = pd.concat([train, test], axis=0, ignore_index=True)

# Convert target to float early (True->1.0, False->0.0, NaN stays NaN)
target_as_float = {True: 1.0, False: 0.0}
df[TARGET] = df[TARGET].map(target_as_float)

# The five per-service spending columns used throughout feature engineering.
spend_cols = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
print(f'Combined: {df.shape}')

## Feature Engineering V3

In [None]:
# ============================================================
# STEP 1: Parse raw columns
# ============================================================

# PassengerId -> Group, GroupSize
# The part before the underscore is the numeric group id; GroupSize counts
# how many passengers (train + test) share that id.
passenger_id_parts = df['PassengerId'].str.split('_')
df['_Group'] = passenger_id_parts.str[0].astype(int)
df['GroupSize'] = df.groupby('_Group')['PassengerId'].transform('count')
df['IsAlone'] = df['GroupSize'].eq(1).astype(int)

# Cabin -> Deck, CabinNum, Side
# Split on '/' once and pick the three positions from the same result.
cabin_parts = df['Cabin'].str.split('/')
df['Deck'] = cabin_parts.str[0]
df['CabinNum'] = cabin_parts.str[1].astype(float)
df['Side'] = cabin_parts.str[2]

# Name -> Surname, FamilySize
# Surname is the last whitespace-separated token of Name.
name_tokens = df['Name'].str.split()
df['Surname'] = name_tokens.str[-1]
df['FamilySize'] = df.groupby('Surname')['PassengerId'].transform('count')
# Passengers without a name cannot be matched to a family: treat as size 1.
df.loc[df['Surname'].isna(), 'FamilySize'] = 1

# Booleans
# Accept both real booleans and their string spellings; anything else -> NaN.
bool_to_int = {True: 1, False: 0, 'True': 1, 'False': 0}
for flag_col in ('CryoSleep', 'VIP'):
    df[flag_col] = df[flag_col].map(bool_to_int)

print('Step 1 done: raw parsing')

In [None]:
# ============================================================
# STEP 2: Smart imputation BEFORE feature engineering
# ============================================================

# CryoSleep passengers must have 0 spending
for col in spend_cols:
    mask = (df['CryoSleep'] == 1) & (df[col].isna())
    df.loc[mask, col] = 0

# Infer CryoSleep from spending
# NOTE: sum(axis=1) skips NaN, so a row whose known spends are all 0 is
# treated as "no spending" even if some spend values are still missing.
total_spend_raw = df[spend_cols].sum(axis=1)
cryo_unknown = df['CryoSleep'].isna()
df.loc[cryo_unknown & (total_spend_raw == 0), 'CryoSleep'] = 1
df.loc[cryo_unknown & (total_spend_raw > 0), 'CryoSleep'] = 0

# HomePlanet can be inferred from Deck (EDA insight)
# Europa -> B,C,A,T ; Earth -> F,G,E(partial) ; Mars -> F,D,E(partial)
# Only decks that map to a single planet are listed; other decks stay NaN here.
deck_to_planet = {
    'A': 'Europa', 'B': 'Europa', 'C': 'Europa', 'T': 'Europa',
    'G': 'Earth'
}
df['HomePlanet'] = df['HomePlanet'].fillna(df['Deck'].map(deck_to_planet))

# Fill remaining with median/mode
# TARGET is NaN for every test row by construction and must stay that way:
# median-filling it would fabricate labels for the test set. `is_train` is
# bookkeeping, not a feature, so it is skipped as well.
no_impute = {TARGET, 'is_train'}
for col in df.select_dtypes(include=[np.number]).columns:
    if col in no_impute:
        continue
    if df[col].isnull().any():
        df[col] = df[col].fillna(df[col].median())

for col in df.select_dtypes(include=['object']).columns:
    if df[col].isnull().any():
        df[col] = df[col].fillna(df[col].mode()[0])

# Report nulls over the feature columns only (the test target is expected to be NaN).
feature_nulls = df.drop(columns=[TARGET]).isnull().sum().sum()
print(f'Step 2 done: imputation. Nulls remaining: {feature_nulls}')

In [None]:
# ============================================================
# STEP 3: Spending features (EDA-driven)
# ============================================================

# Total and log transforms
total_spend = df[spend_cols].sum(axis=1)
df['TotalSpend'] = total_spend
df['TotalSpend_log'] = np.log1p(total_spend)
df['NoSpend'] = total_spend.eq(0).astype(int)
df['NumServicesUsed'] = df[spend_cols].gt(0).sum(axis=1)

for col in spend_cols:
    df[f'{col}_log'] = np.log1p(df[col])

# NEW: Spending PROFILE (% per category) - EDA showed this discriminates
# The +1 keeps the ratio defined for passengers who spent nothing.
spend_denominator = total_spend + 1
for col in spend_cols:
    df[f'{col}_pct'] = df[col] / spend_denominator

# NEW: Luxury vs Basic (EDA: not-transported = luxury spenders)
luxury_spend = df['RoomService'] + df['Spa'] + df['VRDeck']
basic_spend = df['FoodCourt'] + df['ShoppingMall']
df['LuxurySpend'] = luxury_spend
df['BasicSpend'] = basic_spend
df['LuxurySpend_log'] = np.log1p(luxury_spend)
df['BasicSpend_log'] = np.log1p(basic_spend)
df['LuxuryRatio'] = luxury_spend / spend_denominator

# NEW: Binary spending flags
df['HasLuxurySpend'] = luxury_spend.gt(0).astype(int)
df['HasBasicSpend'] = basic_spend.gt(0).astype(int)

# NEW: Non-CryoSleep with no spending (EDA: 62% transported)
awake = df['CryoSleep'].eq(0)
df['Awake_NoSpend'] = (awake & total_spend.eq(0)).astype(int)

print('Step 3 done: spending features')

In [None]:
# ============================================================
# STEP 4: Age features (EDA-driven fine buckets)
# ============================================================

# Fine age buckets based on EDA (the percentages are the author's EDA notes)
df['IsBaby'] = (df['Age'] <= 4).astype(float)      # 81% transported
df['IsChild'] = ((df['Age'] > 4) & (df['Age'] < 13)).astype(float)  # ~65% transported
df['IsTeen'] = ((df['Age'] >= 13) & (df['Age'] < 18)).astype(float)  # ~55%
df['IsYoungAdult'] = ((df['Age'] >= 18) & (df['Age'] < 30)).astype(float)

# Ordinal bucket id. pd.cut uses right-closed intervals, so without
# include_lowest=True the first bin is (0, 4] and Age == 0 would fall outside
# every bin and become NaN, even though IsBaby counts those passengers.
df['AgeGroup'] = pd.cut(df['Age'], bins=[0, 4, 12, 17, 30, 50, 80],
                        labels=[0, 1, 2, 3, 4, 5],
                        include_lowest=True).astype(float)

# NEW: Children under 13 cannot spend on certain things
df['IsMinor'] = (df['Age'] < 13).astype(float)  # baby + child

print('Step 4 done: age features')

In [None]:
# ============================================================
# STEP 5: Cabin features (EDA-driven interactions)
# ============================================================

# Broad cabin region (not too granular): cabin numbers bucketed in steps of 100.
df['CabinRegion'] = df['CabinNum'].floordiv(100).astype(float)

# NEW: categorical interactions, built as "<left>_<right>" strings.
#   DeckSide   - Deck x Side (EDA showed some combos very predictive)
#   PlanetDeck - HomePlanet x Deck (EDA: HP determines Deck)
#   PlanetDest - HomePlanet x Destination
interaction_parts = {
    'DeckSide': ('Deck', 'Side'),
    'PlanetDeck': ('HomePlanet', 'Deck'),
    'PlanetDest': ('HomePlanet', 'Destination'),
}
for new_col, (left_col, right_col) in interaction_parts.items():
    df[new_col] = df[left_col] + '_' + df[right_col]

print('Step 5 done: cabin interactions')

In [None]:
# ============================================================
# STEP 6: n_missing and data quality features
# ============================================================

# `df` has already been imputed, so the raw files are reloaded briefly to
# count how many values were originally missing per passenger.
train_orig = pd.read_csv('../data/train.csv')
test_orig = pd.read_csv('../data/test.csv')
# Same concat order and ignore_index as the main frame, so rows line up by index.
df_orig = pd.concat([train_orig, test_orig], axis=0, ignore_index=True)
assert len(df_orig) == len(df), 'raw reload does not match the combined frame'

# Count gaps over the raw FEATURE columns only. The target is absent for
# every test row, so including it would add +1 to n_missing and force
# has_missing = 1 for the entire test set (a pure train/test artefact).
raw_feature_cols = [c for c in df_orig.columns if c != TARGET]
df['n_missing'] = df_orig[raw_feature_cols].isnull().sum(axis=1)
df['has_missing'] = (df['n_missing'] > 0).astype(int)

del train_orig, test_orig, df_orig
print('Step 6 done: missing features')

In [None]:
# ============================================================
# STEP 7: Group-level features (safe, no target leakage)
# ============================================================

# All statistics below are aggregated per travel group over the combined
# train+test frame and never touch the target column.
by_group = df.groupby('_Group')

# Group spending; std is NaN for single-member groups, so it is set to 0.
group_spend = by_group['TotalSpend']
df['GroupSpend_mean_log'] = np.log1p(group_spend.transform('mean'))
df['GroupSpend_std'] = group_spend.transform('std').fillna(0)

# Group age stats
group_age = by_group['Age']
df['GroupAge_mean'] = group_age.transform('mean')
df['GroupAge_std'] = group_age.transform('std').fillna(0)

# Group CryoSleep rate: share of the group's members in cryo sleep.
df['GroupCryo_rate'] = by_group['CryoSleep'].transform('mean')

print('Step 7 done: group features')

In [None]:
# ============================================================
# STEP 8: Encode all categoricals
# ============================================================

# Low-cardinality base categoricals.
label_cols = ['HomePlanet', 'Destination', 'Deck', 'Side']

# Label encode: integer code per category (fitted on train+test together).
for col in label_cols:
    df[col + '_le'] = LabelEncoder().fit_transform(df[col].astype(str))

# Frequency encode: each category is replaced by its share of all rows.
for col in label_cols:
    df[col + '_freq'] = df[col].map(df[col].value_counts(normalize=True))

# NEW: the interaction features get both encodings, column by column.
for col in ['DeckSide', 'PlanetDeck', 'PlanetDest']:
    as_text = df[col].astype(str)
    df[col + '_le'] = LabelEncoder().fit_transform(as_text)
    category_share = df[col].value_counts(normalize=True)
    df[col + '_freq'] = df[col].map(category_share)

print('Step 8 done: encoding')

In [None]:
# ============================================================
# STEP 9: Target encoding (CV-safe) for key categoricals
# ============================================================

train_mask = df['is_train'] == 1
train_idx = df[train_mask].index
test_idx = df[~train_mask].index

te_cols = ['DeckSide', 'PlanetDeck', 'PlanetDest', 'Deck']
kf_te = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
y_train_te = df.loc[train_idx, TARGET].astype(int).values

# Fallback for categories not present in the fitting rows: overall train rate.
global_mean = float(y_train_te.mean())

for col in te_cols:
    te_name = col + '_te'
    df[te_name] = 0.5  # initialize with prior

    # Train rows: out-of-fold encoding, so a row's own label never feeds
    # the value it receives.
    for fit_pos, apply_pos in kf_te.split(train_idx, y_train_te):
        fit_rows = train_idx[fit_pos]
        apply_rows = train_idx[apply_pos]

        fold_rates = df.loc[fit_rows].groupby(col)[TARGET].mean()
        encoded = df.loc[apply_rows, col].map(fold_rates).fillna(global_mean)
        df.loc[apply_rows, te_name] = encoded.values

    # Test rows: category rates computed from the full training set.
    full_rates = df.loc[train_idx].groupby(col)[TARGET].mean()
    encoded = df.loc[test_idx, col].map(full_rates).fillna(global_mean)
    df.loc[test_idx, te_name] = encoded.values

print('Step 9 done: target encoding')

In [None]:
# ============================================================
# DEFINE FINAL FEATURES
# ============================================================

# Columns that must not reach the models.
drop_cols = [
    # identifiers, raw text, bookkeeping and the label itself
    'PassengerId', 'Name', 'Cabin', 'Surname', 'is_train', TARGET,
    # raw categoricals (their encoded versions are kept)
    'HomePlanet', 'Destination', 'Deck', 'Side',
    '_Group', 'CabinNum',
    # raw / intermediate spending totals (log and ratio versions are kept)
    'TotalSpend', 'LuxurySpend', 'BasicSpend', 'GroupSpend_std',
    # string interaction columns (their encoded versions are kept)
    'DeckSide', 'PlanetDeck', 'PlanetDest',
    *spend_cols,
]

# Everything else, in the frame's column order, is a model feature.
excluded = set(drop_cols)
features = [c for c in df.columns if c not in excluded]
print(f'V3 features: {len(features)}')
print()
for i, f in enumerate(sorted(features)):
    print(f'  {i+1:2d}. {f}')

In [None]:
# Split the combined frame back into its labelled and unlabelled parts.
origin = df['is_train']
train_df = df[origin.eq(1)].copy()
test_df = df[origin.eq(0)].copy()

# Plain NumPy matrices for the gradient-boosting libraries.
X = train_df[features].values
X_test = test_df[features].values
y = train_df[TARGET].values.astype(int)

print(f'X: {X.shape}, y: {y.shape}, X_test: {X_test.shape}')
print(f'Target: {np.mean(y):.4f}')

## LightGBM V3

In [None]:
# LightGBM hyper-parameters. n_estimators is only an upper bound: the
# early-stopping callback in the loop below decides the actual tree count.
lgb_params = {
    'objective': 'binary',
    'metric': 'binary_logloss',
    'boosting_type': 'gbdt',
    'num_leaves': 24,
    'learning_rate': 0.03,
    'feature_fraction': 0.7,
    'bagging_fraction': 0.7,
    'bagging_freq': 5,
    'min_child_samples': 25,
    'reg_alpha': 0.3,
    'reg_lambda': 1.5,
    'max_depth': 5,
    'n_estimators': 5000,
    'verbose': -1,
    'n_jobs': -1,
    'random_state': SEED,
}

# Stratified splitter, defined here at module level with a fixed seed.
kf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=SEED)

# Accumulators: out-of-fold probabilities, fold-averaged test probabilities
# and fold-averaged feature importances.
oof_lgb = np.zeros(len(X))
test_lgb = np.zeros(len(X_test))
fi_lgb = np.zeros(len(features))

for fold, (train_idx, val_idx) in enumerate(kf.split(X, y)):
    X_tr, y_tr = X[train_idx], y[train_idx]
    X_val, y_val = X[val_idx], y[val_idx]

    # Early stopping monitors the validation fold of this split.
    stop_and_log = [lgb.early_stopping(200), lgb.log_evaluation(500)]
    model = lgb.LGBMClassifier(**lgb_params)
    model.fit(X_tr, y_tr, eval_set=[(X_val, y_val)], callbacks=stop_and_log)

    val_proba = model.predict_proba(X_val)[:, 1]
    oof_lgb[val_idx] = val_proba
    test_lgb += model.predict_proba(X_test)[:, 1] / N_FOLDS
    fi_lgb += model.feature_importances_ / N_FOLDS

    fold_acc = accuracy_score(y_val, (val_proba > 0.5).astype(int))
    print(f'Fold {fold+1}/{N_FOLDS} - Accuracy: {fold_acc:.5f}')

# Overall CV accuracy from the stitched-together out-of-fold predictions.
lgb_acc = accuracy_score(y, (oof_lgb > 0.5).astype(int))
print(f'\nLightGBM V3 CV: {lgb_acc:.5f}')

In [None]:
# Feature importance: the 25 largest fold-averaged LightGBM importances,
# sorted ascending so the most important bar ends up at the top of the chart.
fi_df = pd.DataFrame({'feature': features, 'importance': fi_lgb})
fi_df = fi_df.sort_values('importance', ascending=True).tail(25)

fig, ax = plt.subplots(figsize=(10, 8))
ax.barh(fi_df['feature'], fi_df['importance'], color='steelblue')
ax.set_title(f'LightGBM V3 Feature Importance (Top 25) - CV: {lgb_acc:.5f}')
ax.set_xlabel('Importance')
fig.tight_layout()
plt.show()

## XGBoost V3

In [None]:
# XGBoost hyper-parameters. n_estimators is an upper bound; training stops
# after early_stopping_rounds rounds without improvement on the eval set.
xgb_params = {
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'max_depth': 5,
    'learning_rate': 0.03,
    'subsample': 0.7,
    'colsample_bytree': 0.7,
    'min_child_weight': 8,
    'reg_alpha': 0.3,
    'reg_lambda': 1.5,
    'gamma': 0.05,
    'n_estimators': 5000,
    'early_stopping_rounds': 200,
    'tree_method': 'hist',
    'random_state': SEED,
    'verbosity': 0,
}

# Out-of-fold and fold-averaged test probabilities.
oof_xgb = np.zeros(len(X))
test_xgb = np.zeros(len(X_test))

# Reuses the `kf` splitter created for the LightGBM run.
for fold, (train_idx, val_idx) in enumerate(kf.split(X, y)):
    X_tr, y_tr = X[train_idx], y[train_idx]
    X_val, y_val = X[val_idx], y[val_idx]

    model = xgb.XGBClassifier(**xgb_params)
    model.fit(X_tr, y_tr, eval_set=[(X_val, y_val)], verbose=500)

    val_proba = model.predict_proba(X_val)[:, 1]
    oof_xgb[val_idx] = val_proba
    test_xgb += model.predict_proba(X_test)[:, 1] / N_FOLDS

    fold_acc = accuracy_score(y_val, (val_proba > 0.5).astype(int))
    print(f'Fold {fold+1}/{N_FOLDS} - Accuracy: {fold_acc:.5f}')

# Overall CV accuracy from the out-of-fold predictions.
xgb_acc = accuracy_score(y, (oof_xgb > 0.5).astype(int))
print(f'\nXGBoost V3 CV: {xgb_acc:.5f}')

## CatBoost V3

In [None]:
# CatBoost settings, collected once so every fold builds an identical model.
# `iterations` is an upper bound; early_stopping_rounds ends training sooner
# when the eval set stops improving.
cb_params = dict(
    iterations=5000,
    learning_rate=0.03,
    depth=5,
    l2_leaf_reg=4.0,
    subsample=0.7,
    colsample_bylevel=0.7,
    min_data_in_leaf=25,
    random_seed=SEED,
    verbose=500,
    early_stopping_rounds=200,
    task_type='CPU',
)

# Out-of-fold and fold-averaged test probabilities.
oof_cb = np.zeros(len(X))
test_cb = np.zeros(len(X_test))

# Reuses the `kf` splitter created for the LightGBM run.
for fold, (train_idx, val_idx) in enumerate(kf.split(X, y)):
    X_tr, y_tr = X[train_idx], y[train_idx]
    X_val, y_val = X[val_idx], y[val_idx]

    model = CatBoostClassifier(**cb_params)
    model.fit(X_tr, y_tr, eval_set=(X_val, y_val))

    val_proba = model.predict_proba(X_val)[:, 1]
    oof_cb[val_idx] = val_proba
    test_cb += model.predict_proba(X_test)[:, 1] / N_FOLDS

    fold_acc = accuracy_score(y_val, (val_proba > 0.5).astype(int))
    print(f'Fold {fold+1}/{N_FOLDS} - Accuracy: {fold_acc:.5f}')

# Overall CV accuracy from the out-of-fold predictions.
cb_acc = accuracy_score(y, (oof_cb > 0.5).astype(int))
print(f'\nCatBoost V3 CV: {cb_acc:.5f}')

## Ensemble & Submission

In [None]:
# Per-model CV accuracies side by side.
print('=== V3 Results ===')
print(f'LightGBM: {lgb_acc:.5f}')
print(f'XGBoost:  {xgb_acc:.5f}')
print(f'CatBoost: {cb_acc:.5f}')

# Simple average (safest): mean of the three out-of-fold probabilities.
oof_avg = (oof_lgb + oof_xgb + oof_cb) / 3
avg_acc = accuracy_score(y, (oof_avg > 0.5).astype(int))
print(f'\nSimple Average: {avg_acc:.5f}')

# Majority voting: each model casts a 0/1 vote at the 0.5 threshold and a
# passenger is predicted positive when at least two of the three agree.
lgb_vote, xgb_vote, cb_vote = (
    (proba > 0.5).astype(int) for proba in (oof_lgb, oof_xgb, oof_cb)
)
votes = lgb_vote + xgb_vote + cb_vote
vote_acc = accuracy_score(y, (votes >= 2).astype(int))
print(f'Majority Voting: {vote_acc:.5f}')

# Use simple average for submission
final_proba = (test_lgb + test_xgb + test_cb) / 3
final_preds = final_proba > 0.5

# Class balance of the test predictions.
print(f'\nTest: {final_preds.sum()} True / {len(final_preds) - final_preds.sum()} False')
print(f'Ratio: {final_preds.mean():.4f}')

# V1/V2 figures are hard-coded results of earlier notebook versions.
print(f'\n=== VERSION COMPARISON ===')
print(f'V1: CV 0.82653 | LB 0.80196 | Gap 0.0246')
print(f'V2: CV 0.81410 | LB 0.80710 | Gap 0.0070')
print(f'V3: CV {avg_acc:.5f} | LB TBD')

In [None]:
# Assemble the submission frame: one boolean prediction per test passenger.
submission = pd.DataFrame({
    'PassengerId': test_df['PassengerId'].values,
    'Transported': final_preds
})
submission['Transported'] = submission['Transported'].astype(bool)

# Validate BEFORE writing, so a malformed submission never reaches disk:
# same row count and column layout as the sample file, boolean predictions.
assert submission.shape[0] == sample_sub.shape[0]
assert list(submission.columns) == list(sample_sub.columns)
assert submission['Transported'].dtype == bool

# Create the output folder if needed; to_csv does not create directories.
os.makedirs('../submissions', exist_ok=True)
submission.to_csv('../submissions/submission_v3.csv', index=False)

print('V3 Submission saved: submissions/submission_v3.csv')
print(submission['Transported'].value_counts(normalize=True))
submission.head()