# 🤖 Titanic Survival Prediction - Model Training

## 📊 Mục tiêu
- Train các baseline models
- So sánh performance của different algorithms
- Implement cross-validation
- Chuẩn bị cho hyperparameter tuning

## 📋 Nội dung
1. **Data Preparation**
2. **Baseline Models Training**
3. **Model Performance Comparison**
4. **Cross-Validation Analysis**
5. **Feature Importance Analysis**
6. **Model Selection**


In [6]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Import XGBoost và LightGBM (optional - sẽ skip nếu không có)
try:
    import xgboost as xgb
    XGBOOST_AVAILABLE = True
    print("✅ XGBoost version:", xgb.__version__)
except ImportError:
    XGBOOST_AVAILABLE = False
    print("⚠️ XGBoost không có sẵn - sẽ bỏ qua XGBoost models")

try:
    import lightgbm as lgb
    LIGHTGBM_AVAILABLE = True
    print("✅ LightGBM version:", lgb.__version__)
except ImportError:
    LIGHTGBM_AVAILABLE = False
    print("⚠️ LightGBM không có sẵn - sẽ bỏ qua LightGBM models")

warnings.filterwarnings('ignore')

# Set style
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

print("📚 Libraries imported successfully!")
print("🎨 Visualization style set!")


⚠️ XGBoost không có sẵn - sẽ bỏ qua XGBoost models
⚠️ LightGBM không có sẵn - sẽ bỏ qua LightGBM models
📚 Libraries imported successfully!
🎨 Visualization style set!


## 1. 📥 Data Preparation


In [8]:
# Import preprocessing utilities
import sys
sys.path.append('../src')

# Import với error handling để tránh lỗi module không tìm thấy
try:
    from data_preprocessing import load_data, preprocess_data, prepare_features, get_feature_columns
    print("✅ Data preprocessing modules imported successfully!")
except ImportError as e:
    print(f"❌ Error importing data preprocessing: {e}")
    raise

try:
    from models import ModelTrainer, evaluate_model
    print("✅ Model modules imported successfully!")
except ImportError as e:
    print(f"⚠️ Error importing models: {e}")
    print("💡 Có thể do thiếu XGBoost hoặc LightGBM. Sẽ tạo ModelTrainer đơn giản...")
    
    # Tạo ModelTrainer đơn giản nếu không import được
    class SimpleModelTrainer:
        def __init__(self, random_state=42):
            self.random_state = random_state
            self.models = {}
            
        def initialize_models(self):
            from sklearn.linear_model import LogisticRegression
            from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
            from sklearn.svm import SVC
            from sklearn.neighbors import KNeighborsClassifier
            from sklearn.naive_bayes import GaussianNB
            from sklearn.tree import DecisionTreeClassifier
            
            self.models = {
                'Logistic Regression': LogisticRegression(random_state=self.random_state, max_iter=1000),
                'Random Forest': RandomForestClassifier(random_state=self.random_state, n_estimators=100),
                'Gradient Boosting': GradientBoostingClassifier(random_state=self.random_state),
                'SVM': SVC(random_state=self.random_state, probability=True),
                'KNN': KNeighborsClassifier(),
                'Naive Bayes': GaussianNB(),
                'Decision Tree': DecisionTreeClassifier(random_state=self.random_state)
            }
            
        def train_models(self, X_train, y_train, X_test, y_test):
            results = {}
            for name, model in self.models.items():
                print(f"🔄 Training {name}...")
                model.fit(X_train, y_train)
                predictions = model.predict(X_test)
                probabilities = model.predict_proba(X_test)[:, 1] if hasattr(model, 'predict_proba') else None
                accuracy = accuracy_score(y_test, predictions)
                auc = roc_auc_score(y_test, probabilities) if probabilities is not None else None
                
                results[name] = {
                    'model': model,
                    'predictions': predictions,
                    'probabilities': probabilities,
                    'accuracy': accuracy,
                    'auc': auc
                }
            return results
    
    ModelTrainer = SimpleModelTrainer

try:
    from evaluation import evaluate_classification_model, plot_confusion_matrix, plot_roc_curve, compare_models
    print("✅ Evaluation modules imported successfully!")
except ImportError as e:
    print(f"❌ Error importing evaluation: {e}")
    raise

# Load and preprocess data
print("\n📥 Loading and preprocessing data...")
train_df, test_df = load_data('../data/raw/train.csv', '../data/raw/test.csv')
processed_train_df, processed_test_df, label_encoders = preprocess_data(train_df, test_df)

# Prepare features
feature_columns = get_feature_columns()
X = prepare_features(processed_train_df, feature_columns)
y = processed_train_df['Survived']

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

print(f"📊 Training set shape: {X_train.shape}")
print(f"📊 Test set shape: {X_test.shape}")
print(f"📊 Feature columns: {feature_columns}")

# Display feature info
print(f"\n🔍 Feature Information:")
print(f"Features: {list(X.columns)}")
print(f"Target distribution: {y.value_counts().to_dict()}")


✅ Data preprocessing modules imported successfully!
⚠️ Error importing models: No module named 'xgboost'
💡 Có thể do thiếu XGBoost hoặc LightGBM. Sẽ tạo ModelTrainer đơn giản...
✅ Evaluation modules imported successfully!

📥 Loading and preprocessing data...
📊 Training set shape: (891, 12)
📊 Test set shape: (418, 11)
✅ Data preprocessing completed!
📊 Processed training set shape: (891, 22)
📊 Processed test set shape: (418, 22)
📊 Training set shape: (712, 15)
📊 Test set shape: (179, 15)
📊 Feature columns: ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked', 'TitleGroup', 'FamilySize', 'IsAlone', 'FamilySizeGroup', 'CabinDeck', 'HasCabin', 'AgeGroup', 'FareGroup']

🔍 Feature Information:
Features: ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked', 'TitleGroup', 'FamilySize', 'IsAlone', 'FamilySizeGroup', 'CabinDeck', 'HasCabin', 'AgeGroup', 'FareGroup']
Target distribution: {0.0: 549, 1.0: 342}


## 2. 🤖 Baseline Models Training


In [9]:
# Initialize model trainer
trainer = ModelTrainer(random_state=42)
trainer.initialize_models()

# Train all models
results = trainer.train_models(X_train, y_train, X_test, y_test)

# Display results summary
print("\n📊 MODEL PERFORMANCE SUMMARY:")
print("=" * 50)
for name, result in results.items():
    print(f"{name:20}: Accuracy = {result['accuracy']:.4f}, AUC = {result['auc']:.4f if result['auc'] else 'N/A'}")


🔄 Training Logistic Regression...
🔄 Training Random Forest...
🔄 Training Gradient Boosting...
🔄 Training SVM...
🔄 Training KNN...
🔄 Training Naive Bayes...
🔄 Training Decision Tree...

📊 MODEL PERFORMANCE SUMMARY:


ValueError: Invalid format specifier '.4f if result['auc'] else 'N/A'' for object of type 'float'

## 3. 📊 Model Performance Comparison


In [None]:
# Comprehensive evaluation for each model
evaluation_results = {}

for name, result in results.items():
    print(f"\n{'='*60}")
    print(f"📊 Evaluating {name}")
    print(f"{'='*60}")
    
    # Evaluate model
    eval_result = evaluate_classification_model(
        y_test, result['predictions'], result['probabilities'], name
    )
    
    # Plot confusion matrix
    plot_confusion_matrix(y_test, result['predictions'], name)
    
    # Plot ROC curve if probabilities available
    if result['probabilities'] is not None:
        plot_roc_curve(y_test, result['probabilities'], name)
    
    # Store results
    evaluation_results[name] = eval_result

# Compare all models
print("\n🏆 MODEL COMPARISON:")
comparison_df = compare_models(evaluation_results, metric='accuracy')
