# 📄 Titanic Survival Prediction - Submission Preparation

## 📊 Mục tiêu
- Chuẩn bị final predictions cho Kaggle submission
- Sử dụng best model đã được tuned
- Tạo submission file theo format Kaggle
- Validate predictions trước khi submit

## 📋 Nội dung
1. **Load Best Model**
2. **Prepare Test Data**
3. **Generate Predictions**
4. **Create Submission File**
5. **Validation & Final Checks**


In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import joblib
import os
import json
from datetime import datetime
# Keep the notebook output free of library warning noise.
warnings.filterwarnings('ignore')

# Plot styling: the palette is applied after the base style on purpose, since
# applying the style afterwards would reset the colour cycle.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Confirm that the setup cell ran to completion.
for status_line in (
    "📚 Libraries imported successfully!",
    "🎨 Visualization style set!",
):
    print(status_line)


## 1. 📂 Load Best Model


In [None]:
# Load the best available model.
#
# Preference order: best ensemble model -> best tuned model -> a fresh,
# unfitted Random Forest. Each stage falls through to the next one instead of
# crashing when its results file, expected key, or model directory is missing.
print("📂 LOADING BEST MODEL")
print("=" * 50)

# Result files written by earlier notebooks; they name the winning model.
ensemble_results_path = '../reports/results/ensemble_methods_results.json'
tuning_results_path = '../reports/results/hyperparameter_tuning_results.json'

# File extensions accepted when scanning a model directory for a fallback match.
_MODEL_FILE_EXTENSIONS = ('.pkl', '.joblib')


def _lookup(data, *keys):
    """Walk nested dict *keys* in *data*; return None if any level is missing."""
    for key in keys:
        if not isinstance(data, dict) or key not in data:
            return None
        data = data[key]
    return data


def _load_named_model(model_name, models_dir, suffix, label):
    """Load the saved model called *model_name* from *models_dir*.

    The conventional file ``<slug>_<suffix>.pkl`` is tried first; otherwise the
    directory is scanned for a model file whose name contains the slug.

    Args:
        model_name: Human-readable model name taken from the results JSON.
        models_dir: Directory that holds the serialized models.
        suffix: File-name suffix used by the conventional path.
        label: Word used in the progress messages ("ensemble" or "tuned").

    Returns:
        The deserialized model, or None when no matching file exists.
    """
    slug = model_name.lower().replace(' ', '_')
    expected_path = os.path.join(models_dir, f"{slug}_{suffix}.pkl")

    # NOTE: joblib.load unpickles arbitrary objects - only load model files
    # produced by this project, never files from an untrusted source.
    if os.path.exists(expected_path):
        model = joblib.load(expected_path)
        print(f"✅ Loaded best {label} model: {model_name}")
        return model

    print(f"⚠️ {label.capitalize()} model file not found, trying alternative paths...")
    if not os.path.isdir(models_dir):
        # os.listdir would raise FileNotFoundError here; fall through instead.
        print(f"⚠️ Model directory does not exist: {models_dir}")
        return None

    # Sorted so the choice is deterministic when several files match.
    for file_name in sorted(os.listdir(models_dir)):
        lowered = file_name.lower()
        if slug in lowered and lowered.endswith(_MODEL_FILE_EXTENSIONS):
            model = joblib.load(os.path.join(models_dir, file_name))
            print(f"✅ Loaded {label} model from: {file_name}")
            return model
    return None


best_model = None
best_model_name = None
model_source = None

# Stage 1: best ensemble model.
if os.path.exists(ensemble_results_path):
    print("🔍 Found ensemble results, loading best ensemble model...")
    with open(ensemble_results_path, 'r', encoding='utf-8') as f:
        ensemble_data = json.load(f)

    candidate_name = _lookup(ensemble_data, 'best_ensemble')
    if isinstance(candidate_name, str):
        best_model = _load_named_model(
            candidate_name, '../models/ensemble_models', 'ensemble', 'ensemble'
        )
        if best_model is not None:
            best_model_name = candidate_name
            model_source = "ensemble"
    else:
        print("⚠️ 'best_ensemble' entry missing from ensemble results")

# Stage 2: best tuned model, only when no ensemble model could be loaded.
if best_model is None and os.path.exists(tuning_results_path):
    print("🔍 Loading best tuned model...")
    with open(tuning_results_path, 'r', encoding='utf-8') as f:
        tuning_data = json.load(f)

    candidate_name = _lookup(tuning_data, 'comparison', 'best_model')
    if isinstance(candidate_name, str):
        best_model = _load_named_model(
            candidate_name, '../models/tuned_models', 'tuned', 'tuned'
        )
        if best_model is not None:
            best_model_name = candidate_name
            model_source = "tuned"
    else:
        print("⚠️ 'comparison.best_model' entry missing from tuning results")

# Stage 3: nothing on disk - create an unfitted default model. The "default"
# source marker tells the prediction step that it still has to be trained.
if best_model is None:
    print("⚠️ No saved models found, creating default Random Forest model...")
    from sklearn.ensemble import RandomForestClassifier
    best_model = RandomForestClassifier(n_estimators=100, random_state=42)
    best_model_name = "Random Forest (Default)"
    model_source = "default"

print(f"\n📊 Model Information:")
print(f"🏆 Best Model: {best_model_name}")
print(f"📂 Source: {model_source}")
print(f"🔧 Model Type: {type(best_model).__name__}")


## 2. 📊 Prepare Test Data


In [None]:
# Build the feature matrix for the Kaggle test set and collect passenger IDs.
print("📊 PREPARING TEST DATA")
print("=" * 50)

# Make the project's preprocessing module importable. Guarded so re-running
# the cell does not keep appending the same entry to sys.path.
import sys
if '../src' not in sys.path:
    sys.path.append('../src')
from data_preprocessing import load_data, preprocess_data, prepare_features, get_feature_columns

# Load the raw CSVs and run the shared preprocessing pipeline on both sets.
train_df, test_df = load_data('../data/raw/train.csv', '../data/raw/test.csv')
processed_train_df, processed_test_df, label_encoders = preprocess_data(train_df, test_df)

# Select the model's feature columns. Copy so the imputation below writes to
# an independent frame rather than to a possible view of processed_test_df.
feature_columns = get_feature_columns()
X_test_submission = prepare_features(processed_test_df, feature_columns).copy()

print(f"📊 Test set shape: {X_test_submission.shape}")
print(f"📊 Feature columns: {feature_columns}")
print(f"📊 Test set features: {list(X_test_submission.columns)}")

# Check for missing values left over after preprocessing.
missing_values = X_test_submission.isnull().sum()
if missing_values.sum() > 0:
    print(f"\n⚠️ Missing values found:")
    print(missing_values[missing_values > 0])
    # Fill numeric gaps with the column median. The result is assigned back:
    # `df[col].fillna(..., inplace=True)` acts on a temporary Series and is a
    # no-op under pandas Copy-on-Write.
    # NOTE(review): the median comes from the test set itself; confirm whether
    # the training-set median should be used instead for consistency.
    for col in X_test_submission.select_dtypes(include=[np.number]).columns:
        if X_test_submission[col].isnull().any():
            median_val = X_test_submission[col].median()
            X_test_submission[col] = X_test_submission[col].fillna(median_val)
            print(f"✅ Filled missing values in {col} with median: {median_val}")

    # Non-numeric columns (or all-NaN columns) are not imputed above; report
    # them so a later predict() failure is not a surprise.
    remaining = X_test_submission.isnull().sum()
    if remaining.sum() > 0:
        print(f"⚠️ Missing values still present after imputation:")
        print(remaining[remaining > 0])
else:
    print("✅ No missing values found in test set")

# Passenger IDs come from the raw test file and become the first CSV column.
passenger_ids = test_df['PassengerId'].values
print(f"\n📋 Passenger IDs: {len(passenger_ids)} passengers")
print(f"📋 ID range: {passenger_ids.min()} - {passenger_ids.max()}")


## 3. 🎯 Generate Predictions


In [None]:
# Generate class predictions (and probabilities when available) for the test set.
print("🎯 GENERATING PREDICTIONS")
print("=" * 50)

# The fallback model created during loading is unfitted, so it has to be
# trained on the full training set before it can predict.
if not hasattr(best_model, 'predict') or model_source == "default":
    print("🔧 Training model on full training data...")
    # Copy so imputation never writes into a view of processed_train_df.
    X_train_full = prepare_features(processed_train_df, feature_columns).copy()
    y_train_full = processed_train_df['Survived']

    # Median-impute numeric gaps. Assign the result back instead of using
    # `fillna(inplace=True)` on a column selection, which is a no-op under
    # pandas Copy-on-Write.
    for col in X_train_full.select_dtypes(include=[np.number]).columns:
        if X_train_full[col].isnull().any():
            median_val = X_train_full[col].median()
            X_train_full[col] = X_train_full[col].fillna(median_val)

    best_model.fit(X_train_full, y_train_full)
    print("✅ Model trained successfully!")

# Hard predictions are always produced; probabilities only when the model
# exposes predict_proba (column 1 is the positive class).
print("\n🔮 Generating predictions...")
predictions = best_model.predict(X_test_submission)
if hasattr(best_model, 'predict_proba'):
    prediction_proba = best_model.predict_proba(X_test_submission)[:, 1]
else:
    prediction_proba = None

print(f"✅ Generated {len(predictions)} predictions")
print(f"📊 Prediction distribution:")
unique, counts = np.unique(predictions, return_counts=True)
for val, count in zip(unique, counts):
    print(f"   Survived = {val}: {count} passengers ({count/len(predictions)*100:.1f}%)")

if prediction_proba is not None:
    print(f"\n📊 Probability statistics:")
    print(f"   Mean probability: {prediction_proba.mean():.4f}")
    print(f"   Min probability: {prediction_proba.min():.4f}")
    print(f"   Max probability: {prediction_proba.max():.4f}")
    print(f"   Std probability: {prediction_proba.std():.4f}")

# Print up to ten randomly chosen predictions as a quick sanity check.
print(f"\n🔍 Sample predictions:")
sample_indices = np.random.choice(len(predictions), min(10, len(predictions)), replace=False)
for idx in sample_indices:
    prob_str = f" (prob: {prediction_proba[idx]:.3f})" if prediction_proba is not None else ""
    print(f"   Passenger {passenger_ids[idx]}: Survived = {predictions[idx]}{prob_str}")


## 4. 📄 Create Submission File


In [None]:
# Write the predictions to disk as a two-column CSV (PassengerId, Survived).
print("📄 CREATING SUBMISSION FILE")
print("=" * 50)

# Create submissions directory
os.makedirs('../submissions', exist_ok=True)

# Store the labels as integers. A model trained on float labels predicts
# 0.0/1.0, which would otherwise be written to the CSV as "0.0"/"1.0".
raw_predictions = np.asarray(predictions)
survived = raw_predictions.astype(int)
if not np.array_equal(survived, raw_predictions):
    # Refuse to silently truncate non-integral values into labels.
    raise ValueError("Predictions contain non-integer values; cannot build submission")

submission_df = pd.DataFrame({
    'PassengerId': passenger_ids,
    'Survived': survived
})

# A timestamped copy keeps a history; the "latest" copy has a stable name.
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
submission_filename = f"../submissions/titanic_submission_{timestamp}.csv"
latest_submission_filename = "../submissions/titanic_submission_latest.csv"

submission_df.to_csv(submission_filename, index=False)
print(f"✅ Submission file created: {submission_filename}")

submission_df.to_csv(latest_submission_filename, index=False)
print(f"✅ Latest submission file created: {latest_submission_filename}")

# Display submission file info
print(f"\n📊 Submission File Information:")
print(f"📁 File: {submission_filename}")
print(f"📏 Shape: {submission_df.shape}")
print(f"📋 Columns: {list(submission_df.columns)}")

# Show first few rows
print(f"\n🔍 First 10 rows of submission:")
print(submission_df.head(10).to_string(index=False))

# Show last few rows
print(f"\n🔍 Last 10 rows of submission:")
print(submission_df.tail(10).to_string(index=False))

# Validate what was actually written (the DataFrame), not the raw model output.
print(f"\n✅ Submission Validation:")
print(f"   ✓ Correct number of predictions: {len(submission_df) == len(passenger_ids)}")
print(f"   ✓ All predictions are 0 or 1: {bool(submission_df['Survived'].isin([0, 1]).all())}")
print(f"   ✓ No missing values: {not submission_df.isnull().any().any()}")
print(f"   ✓ PassengerId range: {passenger_ids.min()} - {passenger_ids.max()}")
print(f"   ✓ File size: {os.path.getsize(submission_filename)} bytes")


## 5. ✅ Validation & Final Checks


In [None]:
# Final validation: compare against Kaggle's sample file, save a JSON summary,
# and print a checklist whose marks reflect the actual state of the run.
print("✅ VALIDATION & FINAL CHECKS")
print("=" * 50)

# Compare shape and column names with the sample submission, when present.
sample_submission_path = '../data/raw/gender_submission.csv'
if os.path.exists(sample_submission_path):
    print("🔍 Comparing with sample submission...")
    sample_submission = pd.read_csv(sample_submission_path)

    print(f"📊 Sample submission shape: {sample_submission.shape}")
    print(f"📊 Our submission shape: {submission_df.shape}")

    if sample_submission.shape == submission_df.shape:
        print("✅ Submission shapes match!")
    else:
        print("⚠️ Submission shapes don't match!")

    if list(sample_submission.columns) == list(submission_df.columns):
        print("✅ Column names match!")
    else:
        print("⚠️ Column names don't match!")
        print(f"   Sample: {list(sample_submission.columns)}")
        print(f"   Ours: {list(submission_df.columns)}")

# Create summary report
print(f"\n📋 SUBMISSION SUMMARY REPORT")
print("=" * 50)

submission_summary = {
    'timestamp': datetime.now().isoformat(),
    'model_info': {
        'name': best_model_name,
        'type': type(best_model).__name__,
        'source': model_source
    },
    'data_info': {
        'test_set_size': len(predictions),
        'features_used': len(feature_columns),
        # list() so a non-list sequence of column names still serializes.
        'feature_list': list(feature_columns)
    },
    'predictions': {
        'total_predictions': len(predictions),
        # Cast NumPy scalars to builtin types; json cannot encode them.
        'survived_0': int(np.sum(predictions == 0)),
        'survived_1': int(np.sum(predictions == 1)),
        'survival_rate': float(np.mean(predictions))
    },
    'files': {
        'submission_file': submission_filename,
        'latest_file': latest_submission_filename
    }
}

# Save summary report
summary_path = '../reports/results/submission_summary.json'
os.makedirs('../reports/results', exist_ok=True)
with open(summary_path, 'w', encoding='utf-8') as f:
    json.dump(submission_summary, f, indent=2)
print(f"✅ Summary report saved: {summary_path}")

# Display summary
print(f"\n📊 FINAL SUBMISSION SUMMARY:")
print(f"🏆 Model: {best_model_name}")
print(f"📂 Source: {model_source}")
print(f"📊 Test Set: {len(predictions)} passengers")
print(f"🔧 Features: {len(feature_columns)} features")
print(f"📈 Survival Rate: {np.mean(predictions):.1%}")
print(f"📄 Files Created:")
print(f"   • {submission_filename}")
print(f"   • {latest_submission_filename}")
print(f"   • {summary_path}")

# Evaluate every checklist item instead of printing a hard-coded tick, so a
# failed check is visible and the "ready" banner is only shown when earned.
validation_checks = [
    ("Model loaded successfully", best_model is not None),
    ("Test data prepared correctly", len(X_test_submission) == len(passenger_ids)),
    ("Predictions generated", len(predictions) > 0),
    ("Submission file created",
     os.path.exists(submission_filename) and os.path.exists(latest_submission_filename)),
    ("File format validated", list(submission_df.columns) == ['PassengerId', 'Survived']),
    ("No missing values", not submission_df.isnull().any().any()),
    ("All predictions are 0 or 1", bool(np.isin(predictions, [0, 1]).all())),
    ("Correct number of predictions", len(predictions) == len(passenger_ids)),
]

print(f"\n✅ FINAL VALIDATION CHECKLIST:")
for description, passed in validation_checks:
    mark = "✓" if passed else "✗"
    print(f"   {mark} {description}")

if all(passed for _, passed in validation_checks):
    print(f"\n🎯 SUBMISSION READY FOR KAGGLE!")
    print(f"📤 Upload file: {latest_submission_filename}")
    print(f"🏆 Good luck with your submission!")
else:
    print(f"\n⚠️ SUBMISSION NOT READY - fix the failed checks above before uploading.")
