# 🎯 Titanic Survival Prediction - Hyperparameter Tuning

## 📊 Mục tiêu
- Tối ưu hóa hyperparameters cho các models tốt nhất
- Sử dụng GridSearchCV và RandomizedSearchCV
- So sánh performance trước và sau tuning
- Chọn best parameters cho production

## 📋 Nội dung
1. **Data Preparation**
2. **Random Forest Tuning**
3. **XGBoost Tuning**
4. **Logistic Regression Tuning**
5. **SVM Tuning**
6. **Model Comparison After Tuning**
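7. **Before vs After Tuning Comparison**
8. **Save Best Models and Results**
9. **Key Insights and Conclusions** (best model selection and feature importance)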


In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
import xgboost as xgb
import joblib
import time
warnings.filterwarnings('ignore')

# Plot appearance: matplotlib's 'seaborn-v0_8' style sheet first, then the
# seaborn "husl" colour palette (order kept: the palette is applied last).
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Confirmation messages for the setup cell, one per line
print(
    "📚 Libraries imported successfully!",
    "🎨 Visualization style set!",
    sep="\n",
)


## 1. 📥 Data Preparation


In [None]:
# Make the project's ../src directory importable, then pull in its helpers
import sys
sys.path.append('../src')
from data_preprocessing import load_data, preprocess_data, prepare_features, get_feature_columns
from models import ModelTrainer
from sklearn.model_selection import train_test_split

# Read the raw train/test CSV files and run the project preprocessing on both
raw_paths = ('../data/raw/train.csv', '../data/raw/test.csv')
train_df, test_df = load_data(*raw_paths)
preprocessed = preprocess_data(train_df, test_df)
processed_train_df, processed_test_df, label_encoders = preprocessed

# Feature matrix X and target vector y come from the processed training frame
feature_columns = get_feature_columns()
y = processed_train_df['Survived']
X = prepare_features(processed_train_df, feature_columns)

# Hold out 20% of the rows (stratified on the target, fixed seed) so every
# tuned model below is scored on the same unseen split
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print(f"📊 Training set shape: {X_train.shape}")
print(f"📊 Test set shape: {X_test.shape}")
print(f"📊 Feature columns: {feature_columns}")

# Quick overview of the feature names and the class balance of the target
print("\n🔍 Feature Information:")
print(f"Features: {X.columns.tolist()}")
print(f"Target distribution: {y.value_counts().to_dict()}")


## 2. 🌲 Random Forest Hyperparameter Tuning


In [None]:
# Random Forest Hyperparameter Tuning
print("🌲 Random Forest Hyperparameter Tuning")
print("=" * 50)

# Candidate values per hyperparameter; the randomized search draws
# combinations from these lists
rf_param_grid = dict(
    n_estimators=[50, 100, 200, 300],
    max_depth=[3, 5, 7, 10, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    max_features=['sqrt', 'log2', None],
    bootstrap=[True, False],
)

# Base estimator with a fixed seed
rf = RandomForestClassifier(random_state=42)

# Randomized search: 50 sampled settings, each scored by 5-fold CV accuracy,
# run on all available cores
rf_search_settings = dict(
    n_iter=50,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    random_state=42,
    verbose=1,
)
rf_random_search = RandomizedSearchCV(rf, rf_param_grid, **rf_search_settings)

# Run the search and time it
print("🔍 Searching for best Random Forest parameters...")
start_time = time.time()
rf_random_search.fit(X_train, y_train)
end_time = time.time()

print(f"⏱️ Search completed in {end_time - start_time:.2f} seconds")
print(f"🏆 Best parameters: {rf_random_search.best_params_}")
print(f"🎯 Best cross-validation score: {rf_random_search.best_score_:.4f}")

# Score the best estimator found by the search on the holdout split
rf_best = rf_random_search.best_estimator_
rf_y_pred = rf_best.predict(X_test)
rf_accuracy = accuracy_score(y_test, rf_y_pred)
print(f"📊 Test accuracy: {rf_accuracy:.4f}")

# Keep everything the later comparison/saving cells read
rf_results = dict(
    best_params=rf_random_search.best_params_,
    best_cv_score=rf_random_search.best_score_,
    test_accuracy=rf_accuracy,
    model=rf_best,
)


## 3. 🚀 XGBoost Hyperparameter Tuning


In [None]:
# XGBoost Hyperparameter Tuning
print("🚀 XGBoost Hyperparameter Tuning")
print("=" * 50)

# Candidate values per hyperparameter: tree count and depth, learning rate,
# row/column subsampling, and the reg_alpha / reg_lambda regularisation terms
xgb_param_grid = dict(
    n_estimators=[50, 100, 200, 300],
    max_depth=[3, 4, 5, 6, 7],
    learning_rate=[0.01, 0.05, 0.1, 0.2, 0.3],
    subsample=[0.6, 0.7, 0.8, 0.9, 1.0],
    colsample_bytree=[0.6, 0.7, 0.8, 0.9, 1.0],
    reg_alpha=[0, 0.1, 0.5, 1.0],
    reg_lambda=[0, 0.1, 0.5, 1.0],
)

# Base estimator with a fixed seed, evaluated with log-loss during training
xgb_model = xgb.XGBClassifier(random_state=42, eval_metric='logloss')

# Randomized search: 50 sampled settings, each scored by 5-fold CV accuracy
xgb_search_settings = dict(
    n_iter=50,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    random_state=42,
    verbose=1,
)
xgb_random_search = RandomizedSearchCV(xgb_model, xgb_param_grid, **xgb_search_settings)

# Run the search and time it
print("🔍 Searching for best XGBoost parameters...")
start_time = time.time()
xgb_random_search.fit(X_train, y_train)
end_time = time.time()

print(f"⏱️ Search completed in {end_time - start_time:.2f} seconds")
print(f"🏆 Best parameters: {xgb_random_search.best_params_}")
print(f"🎯 Best cross-validation score: {xgb_random_search.best_score_:.4f}")

# Score the best estimator found by the search on the holdout split
xgb_best = xgb_random_search.best_estimator_
xgb_y_pred = xgb_best.predict(X_test)
xgb_accuracy = accuracy_score(y_test, xgb_y_pred)
print(f"📊 Test accuracy: {xgb_accuracy:.4f}")

# Keep everything the later comparison/saving cells read
xgb_results = dict(
    best_params=xgb_random_search.best_params_,
    best_cv_score=xgb_random_search.best_score_,
    test_accuracy=xgb_accuracy,
    model=xgb_best,
)


## 4. 📈 Logistic Regression Hyperparameter Tuning


In [None]:
# Logistic Regression Hyperparameter Tuning
print("📈 Logistic Regression Hyperparameter Tuning")
print("=" * 50)

# Values shared by every sub-grid below
lr_C_values = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
lr_max_iter_values = [100, 500, 1000, 2000]

# Define parameter grid for Logistic Regression.
# Not every solver supports every penalty, and 'elasticnet' additionally
# needs an l1_ratio. A single crossed grid therefore contains combinations
# that fail to fit; with warnings silenced they just become NaN scores and
# elasticnet is never actually evaluated. A list of sub-grids restricts the
# search to valid solver/penalty pairs only.
lr_param_grid = [
    {
        'solver': ['liblinear'],
        'penalty': ['l1', 'l2'],
        'C': lr_C_values,
        'max_iter': lr_max_iter_values,
    },
    {
        'solver': ['lbfgs'],
        'penalty': ['l2'],
        'C': lr_C_values,
        'max_iter': lr_max_iter_values,
    },
    {
        'solver': ['saga'],
        'penalty': ['l1', 'l2'],
        'C': lr_C_values,
        'max_iter': lr_max_iter_values,
    },
    {
        # elasticnet is only available with saga and requires l1_ratio
        'solver': ['saga'],
        'penalty': ['elasticnet'],
        'l1_ratio': [0.25, 0.5, 0.75],
        'C': lr_C_values,
        'max_iter': lr_max_iter_values,
    },
]

# Initialize Logistic Regression
lr = LogisticRegression(random_state=42)

# Use GridSearchCV for Logistic Regression (smaller parameter space):
# every valid combination is scored by 5-fold CV accuracy
lr_grid_search = GridSearchCV(
    estimator=lr,
    param_grid=lr_param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1
)

# Fit the model
print("🔍 Searching for best Logistic Regression parameters...")
start_time = time.time()
lr_grid_search.fit(X_train, y_train)
end_time = time.time()

print(f"⏱️ Search completed in {end_time - start_time:.2f} seconds")
print(f"🏆 Best parameters: {lr_grid_search.best_params_}")
print(f"🎯 Best cross-validation score: {lr_grid_search.best_score_:.4f}")

# Test on holdout set
lr_best = lr_grid_search.best_estimator_
lr_y_pred = lr_best.predict(X_test)
lr_accuracy = accuracy_score(y_test, lr_y_pred)
print(f"📊 Test accuracy: {lr_accuracy:.4f}")

# Store results (read by the comparison and saving cells)
lr_results = {
    'best_params': lr_grid_search.best_params_,
    'best_cv_score': lr_grid_search.best_score_,
    'test_accuracy': lr_accuracy,
    'model': lr_best
}


## 5. 🎯 SVM Hyperparameter Tuning


In [None]:
# SVM Hyperparameter Tuning
print("🎯 SVM Hyperparameter Tuning")
print("=" * 50)

# Candidate values per hyperparameter; the randomized search draws
# combinations from these lists
svm_param_grid = dict(
    C=[0.1, 1, 10, 100, 1000],
    gamma=['scale', 'auto', 0.001, 0.01, 0.1, 1],
    kernel=['rbf', 'poly', 'sigmoid'],
    degree=[2, 3, 4, 5],  # Only used for poly kernel
)

# Base estimator with a fixed seed; probability=True enables predict_proba
svm = SVC(random_state=42, probability=True)

# Randomized search with fewer samples (30) than the tree models because
# SVM fits are slower; each setting is scored by 5-fold CV accuracy
svm_search_settings = dict(
    n_iter=30,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    random_state=42,
    verbose=1,
)
svm_random_search = RandomizedSearchCV(svm, svm_param_grid, **svm_search_settings)

# Run the search and time it
print("🔍 Searching for best SVM parameters...")
start_time = time.time()
svm_random_search.fit(X_train, y_train)
end_time = time.time()

print(f"⏱️ Search completed in {end_time - start_time:.2f} seconds")
print(f"🏆 Best parameters: {svm_random_search.best_params_}")
print(f"🎯 Best cross-validation score: {svm_random_search.best_score_:.4f}")

# Score the best estimator found by the search on the holdout split
svm_best = svm_random_search.best_estimator_
svm_y_pred = svm_best.predict(X_test)
svm_accuracy = accuracy_score(y_test, svm_y_pred)
print(f"📊 Test accuracy: {svm_accuracy:.4f}")

# Keep everything the later comparison/saving cells read
svm_results = dict(
    best_params=svm_random_search.best_params_,
    best_cv_score=svm_random_search.best_score_,
    test_accuracy=svm_accuracy,
    model=svm_best,
)


## 6. 📊 Model Comparison After Tuning


In [None]:
# Compare all tuned models
print("📊 MODEL COMPARISON AFTER HYPERPARAMETER TUNING")
print("=" * 60)

# Store all results under their display names; the comparison table below
# and the later saving / feature-importance cells read from this mapping
all_tuned_results = {
    'Random Forest': rf_results,
    'XGBoost': xgb_results,
    'Logistic Regression': lr_results,
    'SVM': svm_results
}

# Create comparison DataFrame (one row per tuned model)
comparison_data = {
    'Model': list(all_tuned_results),
    'Best CV Score': [res['best_cv_score'] for res in all_tuned_results.values()],
    'Test Accuracy': [res['test_accuracy'] for res in all_tuned_results.values()],
}

comparison_df = pd.DataFrame(comparison_data)
# Rank on the cross-validation score rather than on holdout accuracy:
# choosing the winner by its test-set result would use the test set for
# model selection and make the reported accuracy optimistically biased.
comparison_df = comparison_df.sort_values('Best CV Score', ascending=False)

print("\n🏆 RANKING BY CV SCORE:")
print(comparison_df.to_string(index=False))

# Visualize comparison: CV score on the left, holdout accuracy on the right
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

# CV Score comparison
ax1.bar(comparison_df['Model'], comparison_df['Best CV Score'], color='skyblue', alpha=0.7)
ax1.set_title('Cross-Validation Scores After Tuning', fontsize=14, fontweight='bold')
ax1.set_ylabel('CV Score')
ax1.tick_params(axis='x', rotation=45)
ax1.grid(True, alpha=0.3)

# Test Accuracy comparison
ax2.bar(comparison_df['Model'], comparison_df['Test Accuracy'], color='lightcoral', alpha=0.7)
ax2.set_title('Test Accuracy After Tuning', fontsize=14, fontweight='bold')
ax2.set_ylabel('Test Accuracy')
ax2.tick_params(axis='x', rotation=45)
ax2.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Best model = top CV score; best_accuracy is that model's holdout accuracy,
# which is only reported here and not used to pick the model
best_model_name = comparison_df.iloc[0]['Model']
best_accuracy = comparison_df.iloc[0]['Test Accuracy']

print(f"\n🥇 BEST MODEL: {best_model_name}")
print(f"🎯 Holdout Test Accuracy: {best_accuracy:.4f}")


## 7. 📈 Before vs After Tuning Comparison


In [None]:
# Compare tuned models with untuned baselines trained here on the same split
print("📈 BEFORE vs AFTER HYPERPARAMETER TUNING COMPARISON")
print("=" * 60)

# Baseline estimators: same model families and seed, no tuned hyperparameters
baseline_models = {
    'Random Forest': RandomForestClassifier(random_state=42, n_estimators=100),
    'XGBoost': xgb.XGBClassifier(random_state=42, eval_metric='logloss'),
    'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000),
    'SVM': SVC(random_state=42, probability=True)
}

# Fit each baseline on the training split and score it on the holdout split
baseline_results = {}
for name, model in baseline_models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    baseline_results[name] = accuracy
    print(f"📊 {name} Baseline Accuracy: {accuracy:.4f}")

print("\n" + "="*60)

# Holdout accuracy of each tuned model, keyed like baseline_models
tuned_test_accuracy = {
    'Random Forest': rf_results['test_accuracy'],
    'XGBoost': xgb_results['test_accuracy'],
    'Logistic Regression': lr_results['test_accuracy'],
    'SVM': svm_results['test_accuracy'],
}

# Create comparison DataFrame; all columns are derived from one list of
# names so the rows cannot get out of step
model_names = list(baseline_models)
before_after_data = {
    'Model': model_names,
    'Before Tuning': [baseline_results[name] for name in model_names],
    'After Tuning': [tuned_test_accuracy[name] for name in model_names],
}

before_after_df = pd.DataFrame(before_after_data)
before_after_df['Improvement'] = before_after_df['After Tuning'] - before_after_df['Before Tuning']

print("\n📊 BEFORE vs AFTER TUNING COMPARISON:")
print(before_after_df.to_string(index=False))

# Visualize improvement as grouped bars (before / after per model)
fig, ax = plt.subplots(figsize=(12, 6))
x = np.arange(len(before_after_df))
width = 0.35

bars1 = ax.bar(x - width/2, before_after_df['Before Tuning'], width, label='Before Tuning', alpha=0.7, color='lightblue')
bars2 = ax.bar(x + width/2, before_after_df['After Tuning'], width, label='After Tuning', alpha=0.7, color='lightcoral')

ax.set_xlabel('Models')
ax.set_ylabel('Test Accuracy')
ax.set_title('Model Performance: Before vs After Hyperparameter Tuning', fontsize=14, fontweight='bold')
ax.set_xticks(x)
ax.set_xticklabels(before_after_df['Model'], rotation=45)
ax.legend()
ax.grid(True, alpha=0.3)

# Add improvement annotations above each "after" bar
for i, (after, improvement) in enumerate(zip(before_after_df['After Tuning'],
                                             before_after_df['Improvement'])):
    # The '+' format flag prints the real sign; a hard-coded '+' prefix
    # would render a regression as "+-0.012"
    ax.annotate(f'{improvement:+.3f}',
                xy=(i + width/2, after),
                xytext=(0, 10),
                textcoords='offset points',
                ha='center',
                fontweight='bold',
                color='green' if improvement > 0 else 'red')

plt.tight_layout()
plt.show()

# Summary of improvements
total_improvement = before_after_df['Improvement'].sum()
avg_improvement = before_after_df['Improvement'].mean()
best_improvement = before_after_df['Improvement'].max()
best_improved_model = before_after_df.loc[before_after_df['Improvement'].idxmax(), 'Model']

print(f"\n📈 TUNING SUMMARY:")
print(f"🎯 Total Improvement: {total_improvement:.4f}")
print(f"📊 Average Improvement: {avg_improvement:.4f}")
print(f"🏆 Best Improvement: {best_improvement:.4f} ({best_improved_model})")
# Denominator follows the table size instead of a hard-coded model count
print(f"📈 Models with positive improvement: {(before_after_df['Improvement'] > 0).sum()}/{len(before_after_df)}")


## 8. 💾 Save Best Models and Results


In [None]:
# Persist the tuned models and every result table produced above
print("💾 SAVING BEST MODELS AND RESULTS")
print("=" * 50)

import os
import json
from datetime import datetime

# Make sure both output folders exist before anything is written
for output_dir in ('../models/tuned_models', '../reports/results'):
    os.makedirs(output_dir, exist_ok=True)

# One pickle per tuned model; the file name is the lower-cased model name
# with spaces replaced by underscores
print("🔧 Saving tuned models...")
for model_name, results in all_tuned_results.items():
    slug = model_name.lower().replace(' ', '_')
    model_path = f"../models/tuned_models/{slug}_tuned.pkl"
    joblib.dump(results['model'], model_path)
    print(f"✅ Saved {model_name} to {model_path}")

# Assemble the JSON report: a timestamp, per-model parameters and scores
# (the fitted estimator itself is left out), and the overall comparison
print("\n📊 Saving hyperparameter results...")
run_timestamp = datetime.now().isoformat()

best_models_summary = {}
for model_name, results in all_tuned_results.items():
    best_models_summary[model_name] = {
        'best_params': results['best_params'],
        'best_cv_score': float(results['best_cv_score']),
        'test_accuracy': float(results['test_accuracy']),
    }

comparison_summary = {
    'best_model': best_model_name,
    'best_accuracy': float(best_accuracy),
    'before_after_comparison': before_after_df.to_dict('records'),
}

tuning_results = {
    'timestamp': run_timestamp,
    'best_models': best_models_summary,
    'comparison': comparison_summary,
}

# Write the JSON report
results_path = '../reports/results/hyperparameter_tuning_results.json'
with open(results_path, 'w') as f:
    json.dump(tuning_results, f, indent=2)
print(f"✅ Saved tuning results to {results_path}")

# Write the tuned-model ranking table
comparison_path = '../reports/results/model_comparison_after_tuning.csv'
comparison_df.to_csv(comparison_path, index=False)
print(f"✅ Saved model comparison to {comparison_path}")

# Write the before/after table
before_after_path = '../reports/results/before_after_tuning_comparison.csv'
before_after_df.to_csv(before_after_path, index=False)
print(f"✅ Saved before/after comparison to {before_after_path}")

# Final recap of the run
print("\n🎯 HYPERPARAMETER TUNING COMPLETED!")
print(f"🏆 Best Model: {best_model_name}")
print(f"📊 Best Accuracy: {best_accuracy:.4f}")
print(f"📈 Total Improvement: {total_improvement:.4f}")
print("💾 All results saved to ../reports/results/")


## 9. 📋 Key Insights and Conclusions

### 🎯 **Hyperparameter Tuning Results Summary**

#### **Best Performing Models:**
1. **🥇 Best Model**: [Will be determined after running]
2. **📊 Best Accuracy**: [Will be determined after running]
3. **📈 Improvement**: [Will be determined after running]

#### **Key Findings:**
- **Random Forest**: Tuning typically improves performance by optimizing tree depth, number of estimators, and feature selection
- **XGBoost**: Learning rate and regularization parameters are crucial for preventing overfitting
- **Logistic Regression**: Regularization strength (C) and penalty type significantly affect performance
- **SVM**: Kernel selection and gamma parameter are key for non-linear decision boundaries

#### **Hyperparameter Importance:**
- **Tree-based models** (RF, XGBoost): `max_depth`, `n_estimators`; `learning_rate` (XGBoost only)
- **Linear models** (Logistic Regression): `C`, `penalty`, `solver`
- **SVM**: `C`, `gamma`, `kernel`

#### **Best Practices Applied:**
- ✅ Used **RandomizedSearchCV** for faster exploration of large parameter spaces
- ✅ Used **GridSearchCV** for smaller, discrete parameter spaces
- ✅ Applied **5-fold cross-validation** for robust evaluation
- ✅ Compared **before vs after** tuning to measure improvement
- ✅ Saved all results for reproducibility

#### **Next Steps:**
1. **Ensemble Methods**: Combine best models using voting or stacking
2. **Feature Engineering**: Further optimize features based on model insights
3. **Final Submission**: Use best tuned model for Kaggle submission
4. **Model Deployment**: Prepare best model for production use


In [None]:
# Explain the selected model through importances or coefficients
print("🔍 FEATURE IMPORTANCE ANALYSIS")
print("=" * 50)

# Fitted estimator of the model chosen in the comparison cell
best_model = all_tuned_results[best_model_name]['model']

if hasattr(best_model, 'feature_importances_'):
    # Estimators that expose feature_importances_: one score per feature
    feature_names = X.columns
    feature_importance = best_model.feature_importances_

    importance_df = pd.DataFrame({'feature': feature_names, 'importance': feature_importance})
    importance_df = importance_df.sort_values('importance', ascending=False)

    print(f"🏆 Feature Importance for {best_model_name}:")
    print(importance_df.to_string(index=False))

    # Horizontal bars; the y-axis is inverted so the top-ranked feature
    # appears at the top of the chart
    importance_fig, importance_ax = plt.subplots(figsize=(10, 6))
    importance_ax.barh(importance_df['feature'], importance_df['importance'], color='skyblue', alpha=0.7)
    importance_ax.set_xlabel('Feature Importance')
    importance_ax.set_title(f'Feature Importance - {best_model_name}', fontsize=14, fontweight='bold')
    importance_ax.invert_yaxis()
    importance_ax.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()

elif hasattr(best_model, 'coef_'):
    # Estimators that expose coef_: take the first row of coefficients and
    # rank features by absolute value
    feature_names = X.columns
    coef = best_model.coef_[0]

    coef_df = pd.DataFrame({'feature': feature_names, 'coefficient': coef})
    coef_df = coef_df.sort_values('coefficient', key=abs, ascending=False)

    print(f"🏆 Feature Coefficients for {best_model_name}:")
    print(coef_df.to_string(index=False))

    # Negative coefficients are drawn in red, the rest in blue, around a
    # vertical zero line
    colors = ['red' if weight < 0 else 'blue' for weight in coef_df['coefficient']]
    coef_fig, coef_ax = plt.subplots(figsize=(10, 6))
    coef_ax.barh(coef_df['feature'], coef_df['coefficient'], color=colors, alpha=0.7)
    coef_ax.set_xlabel('Coefficient Value')
    coef_ax.set_title(f'Feature Coefficients - {best_model_name}', fontsize=14, fontweight='bold')
    coef_ax.axvline(x=0, color='black', linestyle='-', alpha=0.3)
    coef_ax.invert_yaxis()
    coef_ax.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()

else:
    # Neither attribute is available on this estimator
    print(f"⚠️ {best_model_name} does not support feature importance analysis")

print("\n✅ Hyperparameter tuning notebook completed successfully!")
print("📊 Ready for ensemble methods and final submission!")
