# 🤖 Titanic Survival Prediction - Model Training

## 📊 Mục tiêu
- Huấn luyện các mô hình baseline
- So sánh hiệu năng (performance) của các thuật toán khác nhau
- Triển khai cross-validation
- Chuẩn bị cho bước tinh chỉnh siêu tham số (hyperparameter tuning)

## 📋 Nội dung
1. **Data Preparation**
2. **Baseline Models Training**
3. **Model Performance Comparison**
4. **Cross-Validation Analysis**
5. **Feature Importance Analysis**
6. **Model Selection**


In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
import xgboost as xgb
# Silence every warning category for the rest of the session so the notebook
# output stays readable. NOTE: this is global and also hides deprecation
# notices from the libraries imported above.
warnings.filterwarnings('ignore')

# Set style
# Matplotlib 3.6 renamed its bundled seaborn style sheets to "seaborn-v0_8*";
# older releases only ship the un-prefixed "seaborn" name. Pick whichever one
# this installation provides instead of failing with OSError on older versions.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')
# Keep this after plt.style.use(): a style sheet may define its own colour cycle.
sns.set_palette("husl")

print("📚 Libraries imported successfully!")
print("🎨 Visualization style set!")


## 1. 📥 Data Preparation


In [None]:
# Import preprocessing utilities
import sys
sys.path.append('../src')
from data_preprocessing import load_data, preprocess_data, prepare_features, get_feature_columns
from models import ModelTrainer, evaluate_model
from evaluation import evaluate_classification_model, plot_confusion_matrix, plot_roc_curve, compare_models

# Read the raw train/test CSV files, then run the project's preprocessing step
# on both frames; it also hands back the label encoders it produced.
train_df, test_df = load_data(
    '../data/raw/train.csv',
    '../data/raw/test.csv',
)
processed_train_df, processed_test_df, label_encoders = preprocess_data(
    train_df,
    test_df,
)

# Build the feature matrix from the project-defined column list and take the
# 'Survived' column as the target.
feature_columns = get_feature_columns()
X = prepare_features(processed_train_df, feature_columns)
y = processed_train_df['Survived']

# Hold out 20% of the rows for evaluation. Stratifying on y keeps the class
# ratio the same in both splits; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Report the split sizes and the columns that were requested.
print(
    f"📊 Training set shape: {X_train.shape}",
    f"📊 Test set shape: {X_test.shape}",
    f"📊 Feature columns: {feature_columns}",
    sep="\n",
)

# Report the columns actually present in X and the class balance of y.
print(
    "\n🔍 Feature Information:",
    f"Features: {list(X.columns)}",
    f"Target distribution: {y.value_counts().to_dict()}",
    sep="\n",
)


## 2. 🤖 Baseline Models Training


In [None]:
# Initialize model trainer
trainer = ModelTrainer(random_state=42)
trainer.initialize_models()

# Train all models. `results` is iterated below as a mapping of
# model name -> result record exposing at least 'accuracy' and 'auc'.
# NOTE(review): presumably models are fit on the train split and scored on the
# test split -- confirm against ModelTrainer.train_models.
results = trainer.train_models(X_train, y_train, X_test, y_test)

# Display results summary
print("\n📊 MODEL PERFORMANCE SUMMARY:")
print("=" * 50)
for name, result in results.items():
    # The conditional must be evaluated outside the replacement field: inside
    # an f-string everything after ':' is taken as the format spec, so the
    # previous inline "x:.4f if x else 'N/A'" raised
    # "ValueError: Invalid format specifier" at runtime.
    # `is not None` (rather than truthiness) keeps a legitimate AUC of 0.0
    # from being reported as N/A.
    auc_value = result['auc']
    auc_text = f"{auc_value:.4f}" if auc_value is not None else 'N/A'
    print(f"{name:20}: Accuracy = {result['accuracy']:.4f}, AUC = {auc_text}")


## 3. 📊 Model Performance Comparison


In [None]:
# Evaluate every trained model on the held-out split, draw its diagnostic
# plots, and collect the per-model evaluation output for the final comparison.
evaluation_results = {}
separator = '=' * 60

for model_name, outcome in results.items():
    # Section banner for this model
    print(f"\n{separator}")
    print(f"📊 Evaluating {model_name}")
    print(separator)

    predictions = outcome['predictions']
    probabilities = outcome['probabilities']

    # Metrics from the project's evaluation helper
    model_metrics = evaluate_classification_model(
        y_test, predictions, probabilities, model_name
    )

    # The confusion matrix only needs the predicted labels
    plot_confusion_matrix(y_test, predictions, model_name)

    # The ROC curve is drawn only when the model supplied probabilities
    if probabilities is not None:
        plot_roc_curve(y_test, probabilities, model_name)

    # Recorded last, so a model whose plotting failed is not stored
    evaluation_results[model_name] = model_metrics

# Compare all models on accuracy
print("\n🏆 MODEL COMPARISON:")
comparison_df = compare_models(evaluation_results, metric='accuracy')
