# 🗳️ Titanic Survival Prediction - Ensemble Methods

## 📊 Mục tiêu
- Implement ensemble methods để cải thiện performance
- Sử dụng Voting Classifier và Stacking
- So sánh performance của ensemble vs individual models
- Tạo final model cho submission

## 📋 Nội dung
1. **Data Preparation**
2. **Voting Classifier**
3. **Stacking Classifier**
4. **Bagging Methods**
5. **Ensemble Performance Comparison**
6. **Save Best Ensemble Model**

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import VotingClassifier, StackingClassifier, BaggingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import xgboost as xgb
import lightgbm as lgb
import joblib
import os
# Suppress every warning category for the remainder of the session.
warnings.filterwarnings('ignore')

# Plot appearance: the 'seaborn-v0_8' matplotlib style sheet combined with
# seaborn's "husl" colour palette.
plt.style.use('seaborn-v0_8')
sns.set_palette('husl')

# Confirm that the setup cell ran to completion.
print(
    "📚 Libraries imported successfully!",
    "🎨 Visualization style set!",
    sep="\n",
)


## 1. 📥 Data Preparation


In [None]:
# Make the project's ../src directory importable, then pull in the shared
# helpers from the data_preprocessing module.
import sys

sys.path.append('../src')
from data_preprocessing import (
    load_data,
    preprocess_data,
    prepare_features,
    get_feature_columns,
)

# Load the raw train/test CSV files and run them through preprocess_data,
# which also hands back the label encoders it used.
raw_train_path = '../data/raw/train.csv'
raw_test_path = '../data/raw/test.csv'
train_df, test_df = load_data(raw_train_path, raw_test_path)
processed_train_df, processed_test_df, label_encoders = preprocess_data(train_df, test_df)

# Feature matrix built from the configured feature columns; the target is the
# 'Survived' column of the processed training frame.
feature_columns = get_feature_columns()
X = prepare_features(processed_train_df, feature_columns)
y = processed_train_df['Survived']

# Hold out 20% of the rows for evaluation, stratified on the target, with a
# fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Report the shape of each split and the features in use.
for split_label, split_frame in (("Training", X_train), ("Test", X_test)):
    print(f"📊 {split_label} set shape: {split_frame.shape}")
print(f"📊 Feature columns: {feature_columns}")

# Summarise the columns actually present in X and the class balance of y.
print("\n🔍 Feature Information:")
feature_names = list(X.columns)
target_counts = y.value_counts().to_dict()
print(f"Features: {feature_names}")
print(f"Target distribution: {target_counts}")


## 2. 🗳️ Voting Classifier


In [None]:
# Voting ensembles: one hard-voting and one soft-voting classifier built from
# the same five base estimators, each scored on the hold-out split.
print("🗳️ VOTING CLASSIFIER")
print("=" * 50)

# Base estimators, keyed by the short names used throughout the notebook.
random_forest = RandomForestClassifier(n_estimators=100, random_state=42)
xgboost_model = xgb.XGBClassifier(random_state=42, eval_metric='logloss')
lightgbm_model = lgb.LGBMClassifier(random_state=42, verbose=-1)
logistic_model = LogisticRegression(random_state=42, max_iter=1000)
# probability=True so the SVM exposes predict_proba, which soft voting needs.
svm_model = SVC(random_state=42, probability=True)

base_models = {
    'rf': random_forest,
    'xgb': xgboost_model,
    'lgb': lightgbm_model,
    'lr': logistic_model,
    'svm': svm_model,
}

# Hard voting variant.
print("🔧 Creating Hard Voting Classifier...")
hard_voting = VotingClassifier(estimators=[*base_models.items()], voting='hard')

# Soft voting variant.
print("🔧 Creating Soft Voting Classifier...")
soft_voting = VotingClassifier(estimators=[*base_models.items()], voting='soft')

# Fit the hard-voting ensemble and measure hold-out accuracy.
print("\n📊 Training Hard Voting Classifier...")
hard_pred = hard_voting.fit(X_train, y_train).predict(X_test)
hard_accuracy = accuracy_score(y_test, hard_pred)

print(f"✅ Hard Voting Accuracy: {hard_accuracy:.4f}")

# Fit the soft-voting ensemble and measure hold-out accuracy.
print("\n📊 Training Soft Voting Classifier...")
soft_pred = soft_voting.fit(X_train, y_train).predict(X_test)
soft_accuracy = accuracy_score(y_test, soft_pred)

print(f"✅ Soft Voting Accuracy: {soft_accuracy:.4f}")

# Keep model, accuracy and predictions together, keyed by display name.
hard_entry = {'model': hard_voting, 'accuracy': hard_accuracy, 'predictions': hard_pred}
soft_entry = {'model': soft_voting, 'accuracy': soft_accuracy, 'predictions': soft_pred}
voting_results = {
    'Hard Voting': hard_entry,
    'Soft Voting': soft_entry,
}


## 3. 📚 Stacking Classifier


In [None]:
# Stacking ensembles: the shared base estimators feed a meta-learner, once
# with Logistic Regression and once with a Random Forest on top.
print("📚 STACKING CLASSIFIER")
print("=" * 50)

# Settings common to both stacks: cv=5 and the base models' predict_proba
# output as input to the final estimator.
stacking_options = {'cv': 5, 'stack_method': 'predict_proba'}

# Stack with a Logistic Regression meta-learner.
print("🔧 Creating Stacking Classifier...")
lr_meta_learner = LogisticRegression(random_state=42)
stacking_classifier = StackingClassifier(
    estimators=[*base_models.items()],
    final_estimator=lr_meta_learner,
    **stacking_options,
)

# Fit the LR-meta stack and score it on the hold-out split.
print("\n📊 Training Stacking Classifier...")
stacking_pred = stacking_classifier.fit(X_train, y_train).predict(X_test)
stacking_accuracy = accuracy_score(y_test, stacking_pred)

print(f"✅ Stacking Accuracy: {stacking_accuracy:.4f}")

# Stack with a 50-tree Random Forest meta-learner.
print("\n🔧 Creating Stacking with Random Forest Meta-learner...")
rf_meta_learner = RandomForestClassifier(n_estimators=50, random_state=42)
stacking_rf = StackingClassifier(
    estimators=[*base_models.items()],
    final_estimator=rf_meta_learner,
    **stacking_options,
)

# Fit the RF-meta stack and score it on the hold-out split.
print("\n📊 Training Stacking with RF Meta-learner...")
stacking_rf_pred = stacking_rf.fit(X_train, y_train).predict(X_test)
stacking_rf_accuracy = accuracy_score(y_test, stacking_rf_pred)

print(f"✅ Stacking (RF Meta) Accuracy: {stacking_rf_accuracy:.4f}")

# Keep model, accuracy and predictions together, keyed by display name.
lr_meta_entry = {
    'model': stacking_classifier,
    'accuracy': stacking_accuracy,
    'predictions': stacking_pred,
}
rf_meta_entry = {
    'model': stacking_rf,
    'accuracy': stacking_rf_accuracy,
    'predictions': stacking_rf_pred,
}
stacking_results = {
    'Stacking (LR Meta)': lr_meta_entry,
    'Stacking (RF Meta)': rf_meta_entry,
}


## 4. 🎒 Bagging Methods


In [None]:
# Bagging Methods Implementation
# Builds two bagging ensembles of 50 members each, fits them on the training
# split and records their accuracy on the hold-out split.
print("🎒 BAGGING METHODS")
print("=" * 50)

# Create bagging classifiers
print("🔧 Creating Bagging Classifiers...")

# The wrapped estimator is passed positionally on purpose: scikit-learn 1.2
# renamed the keyword from `base_estimator` to `estimator` and 1.4 removed the
# old spelling, so `base_estimator=...` raises TypeError on current releases.
# The first positional parameter is the wrapped estimator in every version.

# Bagging labelled "Decision Tree" / "DT".
# NOTE(review): despite the label, the wrapped estimator is a 10-tree
# RandomForestClassifier, not a single decision tree. The model is kept as-is
# so results stay comparable; confirm whether DecisionTreeClassifier was meant.
bagging_dt = BaggingClassifier(
    RandomForestClassifier(n_estimators=10, random_state=42),
    n_estimators=50,
    random_state=42,
)

# Bagging with Logistic Regression
bagging_lr = BaggingClassifier(
    LogisticRegression(random_state=42, max_iter=1000),
    n_estimators=50,
    random_state=42,
)

# Train and evaluate the "DT" bagging ensemble
print("\n📊 Training Bagging with Decision Tree...")
bagging_dt.fit(X_train, y_train)
bagging_dt_pred = bagging_dt.predict(X_test)
bagging_dt_accuracy = accuracy_score(y_test, bagging_dt_pred)

print(f"✅ Bagging (DT) Accuracy: {bagging_dt_accuracy:.4f}")

# Train and evaluate bagging with Logistic Regression
print("\n📊 Training Bagging with Logistic Regression...")
bagging_lr.fit(X_train, y_train)
bagging_lr_pred = bagging_lr.predict(X_test)
bagging_lr_accuracy = accuracy_score(y_test, bagging_lr_pred)

print(f"✅ Bagging (LR) Accuracy: {bagging_lr_accuracy:.4f}")

# Store results: model, accuracy and predictions, keyed by display name
bagging_results = {
    'Bagging (DT)': {
        'model': bagging_dt,
        'accuracy': bagging_dt_accuracy,
        'predictions': bagging_dt_pred,
    },
    'Bagging (LR)': {
        'model': bagging_lr,
        'accuracy': bagging_lr_accuracy,
        'predictions': bagging_lr_pred,
    },
}


## 5. 📊 Ensemble Performance Comparison


In [None]:
# Side-by-side comparison of every ensemble against the individual base models.
print("📊 ENSEMBLE PERFORMANCE COMPARISON")
print("=" * 60)

# Fit each base estimator on its own and score it on the hold-out split.
print("🔧 Training individual base models...")
individual_results = {}
for short_name, estimator in base_models.items():
    display_name = short_name.upper()
    holdout_pred = estimator.fit(X_train, y_train).predict(X_test)
    holdout_acc = accuracy_score(y_test, holdout_pred)
    individual_results[display_name] = holdout_acc
    print(f"✅ {display_name} Accuracy: {holdout_acc:.4f}")

# Flatten everything into one {display name: accuracy} mapping: individual
# models first, then voting, stacking and bagging ensembles.
all_results = dict(individual_results)
for result_group in (voting_results, stacking_results, bagging_results):
    for label, entry in result_group.items():
        all_results[label] = entry['accuracy']

# Tabulate the scores, best accuracy first.
comparison_df = pd.DataFrame(
    list(all_results.items()),
    columns=['Model', 'Accuracy'],
).sort_values('Accuracy', ascending=False)

print("\n🏆 ENSEMBLE COMPARISON RESULTS:")
print("=" * 60)
print(comparison_df.to_string(index=False))

# A display name counts as an ensemble when it contains one of these words.
ensemble_keywords = ('Voting', 'Stacking', 'Bagging')

# Bar chart: ensembles in light coral, individual models in sky blue.
plt.figure(figsize=(15, 8))
colors = [
    'lightcoral' if any(keyword in label for keyword in ensemble_keywords) else 'skyblue'
    for label in comparison_df['Model']
]

bar_positions = range(len(comparison_df))
bars = plt.bar(bar_positions, comparison_df['Accuracy'], color=colors, alpha=0.7)
plt.xlabel('Models')
plt.ylabel('Accuracy')
plt.title('Ensemble Methods vs Individual Models Performance', fontsize=14, fontweight='bold')
plt.xticks(bar_positions, comparison_df['Model'], rotation=45, ha='right')
plt.grid(True, alpha=0.3)

# Print each accuracy just above its bar.
for bar, bar_accuracy in zip(bars, comparison_df['Accuracy']):
    label_x = bar.get_x() + bar.get_width()/2
    label_y = bar.get_height() + 0.005
    plt.text(label_x, label_y, f'{bar_accuracy:.3f}',
             ha='center', va='bottom', fontweight='bold')

plt.tight_layout()
plt.show()

# Highest-scoring ensemble (the first one wins on ties).
ensemble_models = [
    label for label in all_results
    if any(keyword in label for keyword in ensemble_keywords)
]
best_ensemble = max(ensemble_models, key=all_results.get)
best_ensemble_accuracy = all_results[best_ensemble]

print(f"\n🥇 BEST ENSEMBLE METHOD: {best_ensemble}")
print(f"🎯 Best Ensemble Accuracy: {best_ensemble_accuracy:.4f}")

# Accuracy gap between the best ensemble and the best individual model.
best_individual = max(individual_results, key=individual_results.get)
best_individual_accuracy = individual_results[best_individual]

improvement = best_ensemble_accuracy - best_individual_accuracy
print(f"📈 Improvement over best individual ({best_individual}): {improvement:.4f}")

# Bundle every result set together with the winner for later use.
all_ensemble_results = {
    'individual': individual_results,
    'voting': voting_results,
    'stacking': stacking_results,
    'bagging': bagging_results,
    'best_ensemble': best_ensemble,
    'best_ensemble_accuracy': best_ensemble_accuracy,
}


## 6. 💾 Save Best Ensemble Model


In [None]:
# Persist the winning ensemble, every other ensemble, and the score summary.
import json
from datetime import datetime

print("💾 SAVING BEST ENSEMBLE MODEL")
print("=" * 50)

# Output locations (created on demand).
models_dir = '../models/ensemble_models'
reports_dir = '../reports/results'
os.makedirs(models_dir, exist_ok=True)
os.makedirs(reports_dir, exist_ok=True)

# Ensemble result sets keyed by the category prefix used in file names.
results_by_category = {
    'voting': voting_results,
    'stacking': stacking_results,
    'bagging': bagging_results,
}

# Find the fitted model behind the best ensemble's display name; the first
# result set containing that name wins.
for result_group in results_by_category.values():
    if best_ensemble in result_group:
        best_model = result_group[best_ensemble]['model']
        break

# Dump the best model under a file name derived from its display name
# (lower-cased, spaces replaced by underscores).
best_slug = best_ensemble.lower().replace(' ', '_')
best_model_path = f"{models_dir}/{best_slug}_ensemble.pkl"
joblib.dump(best_model, best_model_path)
print(f"✅ Saved best ensemble model to {best_model_path}")

# Dump every ensemble, prefixed with its category.
print("\n🔧 Saving all ensemble models...")
for category, results in results_by_category.items():
    for name, result in results.items():
        name_slug = name.lower().replace(' ', '_')
        model_path = f"{models_dir}/{category}_{name_slug}.pkl"
        joblib.dump(result['model'], model_path)
        print(f"✅ Saved {name} to {model_path}")

# JSON summary; accuracies are cast to plain floats before serialisation.
ensemble_summary = {
    'timestamp': datetime.now().isoformat(),
    'best_ensemble': best_ensemble,
    'best_ensemble_accuracy': float(best_ensemble_accuracy),
    'improvement_over_individual': float(improvement),
    'all_results': {label: float(score) for label, score in all_results.items()},
    'comparison_summary': comparison_df.to_dict('records'),
}

results_path = f"{reports_dir}/ensemble_methods_results.json"
with open(results_path, 'w') as summary_file:
    json.dump(ensemble_summary, summary_file, indent=2)
print(f"✅ Saved ensemble results to {results_path}")

# The same comparison table as CSV.
comparison_path = f"{reports_dir}/ensemble_comparison.csv"
comparison_df.to_csv(comparison_path, index=False)
print(f"✅ Saved ensemble comparison to {comparison_path}")

# Closing recap.
print("\n🎯 ENSEMBLE METHODS COMPLETED!")
print(f"🏆 Best Ensemble: {best_ensemble}")
print(f"📊 Best Accuracy: {best_ensemble_accuracy:.4f}")
print(f"📈 Improvement: {improvement:.4f}")
print(f"💾 All models saved to {models_dir}/")
