# Titanic - Ensemble & Submission

Multiple ensemble strategies:
1. Simple Average
2. Weighted Average (optimized weights)
3. Rank Average
4. Majority Voting
5. Stacking (Logistic Regression meta-learner)

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from scipy.optimize import minimize
from scipy.stats import rankdata
import warnings
warnings.filterwarnings('ignore')

# Seed passed as `random_state` to the stacking CV splitter and meta-learner.
SEED = 42
# Cut-off applied to blended scores: values strictly above it are predicted as class 1.
THRESHOLD = 0.5

In [None]:
# Read the prediction tables exported by the modeling notebook, plus the
# sample submission that is used later as a format reference.
oof_df = pd.read_csv('../data/oof_predictions.csv')
test_df = pd.read_csv('../data/test_predictions.csv')
sample_sub = pd.read_csv('../data/gender_submission.csv')

# Each `prob_<model>` column is treated as one base model's score
# (presumably a predicted survival probability - confirm against the exporter).
model_cols = [col for col in oof_df.columns if col.startswith('prob_')]
model_names = [col.replace('prob_', '') for col in model_cols]

y = oof_df['Survived'].values.astype(int)
oof_probs = oof_df[model_cols].values
test_probs = test_df[model_cols].values
test_ids = test_df['PassengerId'].values

print(f'Models: {model_names}')
print(f'OOF shape: {oof_probs.shape}, Test shape: {test_probs.shape}')

# Accuracy of every base model on its own, cutting its OOF column at THRESHOLD.
print('\n=== Individual Model CV Scores ===')
for model_name, oof_column in zip(model_names, oof_probs.T):
    model_acc = accuracy_score(y, (oof_column > THRESHOLD).astype(int))
    print(f'{model_name}: {model_acc:.5f}')

## 1. Simple Average Ensemble

In [None]:
# Unweighted blend: the row-wise mean gives every model the same influence.
oof_avg = np.mean(oof_probs, axis=1)
test_avg = np.mean(test_probs, axis=1)

avg_preds = (oof_avg > THRESHOLD).astype(int)
avg_acc = accuracy_score(y, avg_preds)
print(f'Simple Average CV Accuracy: {avg_acc:.5f}')

## 2. Optimized Weighted Average

In [None]:
# Find optimal weights using scipy minimize
def neg_accuracy(weights):
    """Objective for the weight search: negated OOF accuracy of a weighted blend.

    `weights` may be any real vector; it is mapped to non-negative weights
    that sum to 1 (absolute value, then division by the total) before the
    OOF columns are blended. The accuracy is negated because the caller
    minimizes this function.
    """
    magnitudes = np.abs(weights)
    normalized = magnitudes / np.sum(magnitudes)
    blended = np.dot(oof_probs, normalized)
    hard_preds = (blended > THRESHOLD).astype(int)
    return -accuracy_score(y, hard_preds)

n_models = len(model_names)
initial_weights = np.ones(n_models) / n_models

# Seeded generator: the restarts below (and therefore the selected weights and
# the weighted-average submission) are reproducible from run to run.
rng = np.random.default_rng(SEED)

# Multi-start Nelder-Mead. Accuracy is piecewise constant in the weights, so a
# single start easily stalls; many starts are compared instead. The first
# start is the uniform vector, so the simple average is always among the
# candidates; the remaining starts are Dirichlet draws on the simplex.
N_RESTARTS = 100
start_points = [initial_weights]
start_points.extend(
    rng.dirichlet(np.ones(n_models)) for _ in range(N_RESTARTS - 1)
)

best_result = None
best_score = -1

for w0 in start_points:
    result = minimize(neg_accuracy, w0, method='Nelder-Mead',
                      options={'maxiter': 10000})
    # `result.fun` is the negated accuracy, so flip the sign before comparing.
    if -result.fun > best_score:
        best_score = -result.fun
        best_result = result

# Apply the same abs + normalise mapping that `neg_accuracy` uses internally,
# so the reported weights are the ones that were actually scored.
opt_weights = np.abs(best_result.x) / np.sum(np.abs(best_result.x))

oof_weighted = np.dot(oof_probs, opt_weights)
test_weighted = np.dot(test_probs, opt_weights)

# NOTE: the weights are tuned and evaluated on the same OOF rows, so this
# score is an optimistic estimate.
weighted_acc = accuracy_score(y, (oof_weighted > THRESHOLD).astype(int))
print(f'Optimized Weighted Average CV: {weighted_acc:.5f}')
print(f'\nOptimal weights:')
for name, w in zip(model_names, opt_weights):
    print(f'  {name}: {w:.4f}')

## 3. Rank Average

In [None]:
# Rank average: each model's scores are replaced by their ranks, which makes
# the blend insensitive to models that output on different scales.
def _scaled_ranks(prob_matrix, n_rows):
    """Return `prob_matrix` with each column replaced by its ranks / `n_rows`."""
    return np.column_stack([rankdata(column) / n_rows for column in prob_matrix.T])


# OOF and test rows are ranked separately, each within its own set.
oof_ranks = _scaled_ranks(oof_probs, len(y))
test_ranks = _scaled_ranks(test_probs, len(test_ids))

oof_rank_avg = oof_ranks.mean(axis=1)
test_rank_avg = test_ranks.mean(axis=1)

# NOTE(review): scaled ranks are spread over (0, 1] regardless of the class
# balance, so cutting their average at THRESHOLD is not equivalent to a 0.5
# probability cut-off - confirm this is the intended decision rule.
rank_preds = (oof_rank_avg > THRESHOLD).astype(int)
rank_acc = accuracy_score(y, rank_preds)
print(f'Rank Average CV Accuracy: {rank_acc:.5f}')

## 4. Majority Voting

In [None]:
# Hard voting: every model casts a 0/1 vote by cutting its own score at THRESHOLD.
oof_votes = (oof_probs > THRESHOLD).astype(int)
test_votes = (test_probs > THRESHOLD).astype(int)

# Share of models voting 1 per row. A strict majority is required, so an
# exact tie (share == 0.5) resolves to class 0.
oof_vote_share = oof_votes.mean(axis=1)
test_vote_share = test_votes.mean(axis=1)
oof_majority = (oof_vote_share > 0.5).astype(int)
test_majority = (test_vote_share > 0.5).astype(int)

vote_acc = accuracy_score(y, oof_majority)
print(f'Majority Voting CV Accuracy: {vote_acc:.5f}')

## 5. Stacking (Meta-learner)

In [None]:
# Stacking with Logistic Regression meta-learner.
# The base-model OOF scores are the meta-features. The meta-learner is itself
# cross-validated so that `oof_stack` only contains predictions for rows it
# was not fitted on.
N_SPLITS = 5  # single source of truth for the splitter AND the test averaging
kf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)
oof_stack = np.zeros(len(y))
test_stack = np.zeros(len(test_ids))

for fold, (train_idx, val_idx) in enumerate(kf.split(oof_probs, y)):
    X_tr = oof_probs[train_idx]
    y_tr = y[train_idx]
    X_val = oof_probs[val_idx]

    meta = LogisticRegression(C=1.0, random_state=SEED, max_iter=1000)
    meta.fit(X_tr, y_tr)

    # Column 1 of predict_proba is the probability of the positive class.
    oof_stack[val_idx] = meta.predict_proba(X_val)[:, 1]
    # Test prediction = mean over the per-fold meta-learners.
    test_stack += meta.predict_proba(test_probs)[:, 1] / N_SPLITS

stack_acc = accuracy_score(y, (oof_stack > THRESHOLD).astype(int))
print(f'Stacking CV Accuracy: {stack_acc:.5f}')

## 6. Threshold Optimization

In [None]:
# For every score-producing ensemble, search the decision threshold that
# maximizes OOF accuracy, then report which ensemble scores best.
# NOTE: thresholds are tuned and scored on the same OOF rows, so these
# accuracies are optimistic compared with the fixed-THRESHOLD scores above.
ensembles = {
    'Simple Average': (oof_avg, test_avg),
    'Weighted Average': (oof_weighted, test_weighted),
    'Rank Average': (oof_rank_avg, test_rank_avg),
    'Stacking': (oof_stack, test_stack),
}

# Candidate thresholds 0.30, 0.31, ..., 0.69. linspace fixes the number of
# points explicitly, whereas a float-step arange can gain or lose the last
# element through rounding.
threshold_grid = np.linspace(0.30, 0.69, 40)

print('=== Ensemble Comparison ===')
best_ensemble_name = None
best_ensemble_acc = 0
# Ensemble name -> tuned threshold, kept so the values stay available after
# the loop instead of only being printed.
best_thresholds = {}

for name, (oof, _) in ensembles.items():
    # Find optimal threshold (the first grid value wins on ties)
    best_thr = 0.5
    best_acc = 0
    for thr in threshold_grid:
        acc = accuracy_score(y, (oof > thr).astype(int))
        if acc > best_acc:
            best_acc = acc
            best_thr = thr

    best_thresholds[name] = best_thr
    print(f'{name}: {best_acc:.5f} (threshold={best_thr:.2f})')

    if best_acc > best_ensemble_acc:
        best_ensemble_acc = best_acc
        best_ensemble_name = name

print(f'\nBest ensemble: {best_ensemble_name} ({best_ensemble_acc:.5f})')

## 7. Generate Submissions

In [None]:
# Write one two-column (PassengerId, Survived) CSV per ensemble strategy
import os

os.makedirs('../submissions', exist_ok=True)


def _to_labels(scores):
    """Turn ensemble scores into 0/1 labels by cutting at THRESHOLD."""
    return (scores > THRESHOLD).astype(int)


# `test_majority` is already a 0/1 vector; every other entry is a score
# vector that still has to be thresholded.
submissions = {
    'weighted_avg': _to_labels(test_weighted),
    'rank_avg': _to_labels(test_rank_avg),
    'majority_vote': test_majority,
    'stacking': _to_labels(test_stack),
    'simple_avg': _to_labels(test_avg),
}

passenger_ids = test_ids.astype(int)
for name, preds in submissions.items():
    filepath = f'../submissions/submission_{name}.csv'
    sub = pd.DataFrame({
        'PassengerId': passenger_ids,
        'Survived': preds.astype(int),
    })
    sub.to_csv(filepath, index=False)
    survival_rate = sub['Survived'].mean()
    print(f'{name}: {survival_rate:.3f} survival rate, saved to {filepath}')

print(f'\n{len(submissions)} submission files created.')

In [None]:
# Sanity check the submission that belongs to the best-scoring ensemble.
# The file is chosen from `best_ensemble_name` (set in the ensemble comparison
# cell) rather than being hard-coded. Note that the comparison ranks the
# ensembles at their tuned thresholds, while the CSV files are written at
# THRESHOLD.
_SUBMISSION_KEYS = {
    'Simple Average': 'simple_avg',
    'Weighted Average': 'weighted_avg',
    'Rank Average': 'rank_avg',
    'Stacking': 'stacking',
}
# Fall back to the weighted average if the name is unknown or unset.
best_key = _SUBMISSION_KEYS.get(best_ensemble_name, 'weighted_avg')

best_sub = pd.read_csv(f'../submissions/submission_{best_key}.csv')
print('=== Best Submission Sanity Check ===')
print(f'Checked file: submission_{best_key}.csv')
print(f'Shape: {best_sub.shape}')
print(f'Expected shape: {sample_sub.shape}')
print(f'Columns match: {list(best_sub.columns) == list(sample_sub.columns)}')
print(f'PassengerId range: {best_sub.PassengerId.min()} - {best_sub.PassengerId.max()}')
print(f'Survived values: {best_sub.Survived.unique()}')
print(f'Survival rate: {best_sub.Survived.mean():.3f}')
print(f'\nFirst 10 rows:')
print(best_sub.head(10))

## Summary

### Pipeline Overview
1. **EDA** -> Key insights: Sex, Pclass, Title, Family Size
2. **Feature Engineering** -> 24 features from 11 original columns
3. **Modeling** -> 6 models with Optuna-tuned GBDT
4. **Ensemble** -> 5 ensemble strategies

### Next Steps for Higher Score
- Add name-based group survival features (women-children-first rule)
- Try neural network (small MLP)
- External data (deck plans, passenger lists)
- Pseudo-labeling with confident test predictions
- Manual rule adjustments for edge cases