# USG Failure Prediction - Model Training & Optimization

**Objective:** Train and optimize an XGBoost model with ensemble methods

**Techniques:**
- Hyperparameter tuning with Optuna (50+ trials)
- SMOTE for class imbalance
- Ensemble with Random Forest & LightGBM
- Probability calibration
- Cross-validation

In [None]:
# Import libraries
import sys
sys.path.append('../src')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import joblib
import json

# Import custom modules
from model import USGFailurePredictionModel, XGBoostOptimizer
from evaluation import ModelEvaluator
from preprocessing import USGPreprocessingPipeline

# ML libraries
from sklearn.model_selection import train_test_split

# Configure notebook-wide settings before any data is loaded.
# Seed NumPy's global random generator so NumPy-driven randomness repeats across runs.
np.random.seed(42)
# Apply the 'seaborn-v0_8-darkgrid' matplotlib style sheet to subsequent figures.
plt.style.use('seaborn-v0_8-darkgrid')
# IPython magic (not plain Python): render figures inline in the notebook output.
# NOTE(review): kept after style.use on purpose; the relative order may matter
# for which rcParams win -- confirm before reordering.
%matplotlib inline

# Record when this training run began.
print(f"Model Training started: {datetime.now()}")

## 1. Load Processed Data

In [None]:
# Load the feature matrix X, the target y and the fitted preprocessor.
# Option 1 reuses cached artifacts; Option 2 rebuilds them from the raw CSV.
from pathlib import Path  # local import: only needed to create output folders

# Option 1: Load pre-processed data if available
try:
    X = pd.read_csv('../data/processed/X_processed.csv')
    # squeeze("columns") turns the single-column frame into a Series but, unlike
    # a bare squeeze(), never collapses a one-row file into a scalar.
    y = pd.read_csv('../data/processed/y_target.csv').squeeze("columns")
    print(f"✓ Loaded processed data: X={X.shape}, y={y.shape}")
    preprocessor = joblib.load('../models/preprocessor.pkl')
    print("✓ Loaded preprocessor")

except FileNotFoundError:
    # Option 2: Load raw data and preprocess.
    # Reached when any of the three cached files is missing, so X, y and the
    # preprocessor are always rebuilt together and stay consistent.
    print("Processed data not found. Loading and preprocessing raw data...")

    df = pd.read_csv('../data/raw/USG_Data_cleared.csv')

    # Fail loudly here: without the target column X and y would stay undefined
    # and the notebook would only crash later with an unrelated NameError.
    if 'Warranty_Claim' not in df.columns:
        raise ValueError(
            "Column 'Warranty_Claim' not found in ../data/raw/USG_Data_cleared.csv; "
            f"available columns: {list(df.columns)}"
        ) from None

    X_raw = df.drop('Warranty_Claim', axis=1)
    y = df['Warranty_Claim']

    # Preprocess
    preprocessor = USGPreprocessingPipeline(seed=42)
    X = preprocessor.fit_transform(X_raw, y)

    # Save (create the target folders first so a fresh checkout does not fail)
    Path('../data/processed').mkdir(parents=True, exist_ok=True)
    Path('../models').mkdir(parents=True, exist_ok=True)
    X.to_csv('../data/processed/X_processed.csv', index=False)
    y.to_csv('../data/processed/y_target.csv', index=False)
    joblib.dump(preprocessor, '../models/preprocessor.pkl')

    print(f"✓ Data processed and saved: X={X.shape}, y={y.shape}")

# Display target distribution
print("\nTarget Distribution:")
print(y.value_counts())
# NOTE(review): assumes the positive class is labelled 'Yes'; if the target is
# stored as 0/1 this line reports 0.00% -- confirm against the data.
print(f"Failure rate: {(y == 'Yes').mean() * 100:.2f}%")

## 2. Train-Test Split

In [None]:
# Hold out 20% of the rows as a test set. Passing stratify=y asks
# train_test_split to keep the class proportions of y in both partitions.
split_parts = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
X_train, X_test, y_train, y_test = split_parts

# Report partition sizes, then the label counts of each partition.
print(f"Training set: {X_train.shape}")
print(f"Test set: {X_test.shape}")
for heading, target in (
    ("\nTrain target distribution:", y_train),
    ("\nTest target distribution:", y_test),
):
    print(heading)
    print(target.value_counts())

## 3. Hyperparameter Optimization with Optuna

In [None]:
# Set up the hyperparameter search. n_trials=50 is a starting point;
# raise it (100+) for better results at the cost of a longer run.
optimizer = XGBoostOptimizer(n_trials=50, cv_folds=5, seed=42, use_smote=True)

print("Starting hyperparameter optimization...")
print("This may take 3-5 minutes...\n")

# Search on the training split only; the test split stays untouched.
best_params = optimizer.optimize(X_train, y_train)

# Print the winning configuration as an aligned table framed by rule lines.
rule = "=" * 60
print("\n" + rule)
print("OPTIMAL HYPERPARAMETERS")
print(rule)
for name, setting in best_params.items():
    print(f"{name:20s}: {setting}")
print(rule)
print(f"Best CV F1 Score: {optimizer.study.best_value:.4f}")
print(rule)

In [None]:
# Visualize optimization history: plot the finished study, export it as a
# standalone HTML file and show it in the notebook.
from pathlib import Path  # local import: only needed to create the output folder

import optuna.visualization as vis

fig = vis.plot_optimization_history(optimizer.study)

# Make sure the target folder exists so the export cannot fail on a fresh checkout.
Path('../reports/visualizations').mkdir(parents=True, exist_ok=True)
fig.write_html('../reports/visualizations/optuna_optimization_history.html')
fig.show()

print("✓ Optimization history saved")

## 4. Train Full Model with Ensemble

In [None]:
# Build the final model. Its own hyperparameter search is switched off
# because the tuned values from the search above are injected right after.
model = USGFailurePredictionModel(
    seed=42,
    use_ensemble=True,
    use_calibration=True,
    optimize_hyperparams=False,
)
model.best_params = best_params

# Fit on the training split and report the time the model recorded.
print("Training ensemble model...")
model.fit(X_train, y_train)

print(f"\n✓ Model training complete in {model.training_time:.2f} seconds")

## 5. Model Evaluation

In [None]:
# Evaluate the fitted model. Both splits are handed to the evaluator, and the
# stress-test and fairness-check options are switched on.
evaluator = ModelEvaluator(seed=42)

print("Running comprehensive evaluation...\n")

evaluation_kwargs = dict(
    model=model,
    X_test=X_test,
    y_test=y_test,
    X_train=X_train,
    y_train=y_train,
    run_stress_tests=True,
    run_fairness_check=True,
)
evaluation_results = evaluator.evaluate(**evaluation_kwargs)

In [None]:
# Summarize the held-out test metrics stored under 'test_metrics'.
test_metrics = evaluation_results['test_metrics']

rule = "=" * 60
print("\n" + rule)
print("TEST SET PERFORMANCE")
print(rule)
# Each label carries its own padding so the values line up in one column.
for label, metric_key in (
    ("F1 Score:      ", 'f1_score'),
    ("Precision:     ", 'precision'),
    ("Recall:        ", 'recall'),
    ("ROC-AUC:       ", 'roc_auc'),
    ("PR-AUC:        ", 'pr_auc'),
):
    print(f"{label}{test_metrics[metric_key]:.4f}")
print(rule)

# Confusion-matrix counts, followed by the business cost figure.
print("\nConfusion Matrix:")
print(f"  TP: {test_metrics['true_positives']:4d}  |  FP: {test_metrics['false_positives']:4d}")
print(f"  FN: {test_metrics['false_negatives']:4d}  |  TN: {test_metrics['true_negatives']:4d}")
print(f"\nBusiness Cost: ${test_metrics['business_cost']:,.2f}")

In [None]:
# Print mean and standard deviation per metric, but only when the evaluator
# actually returned cross-validation results.
if 'cv_results' in evaluation_results:
    cv = evaluation_results['cv_results']
    rule = "=" * 60

    print("\n" + rule)
    print("CROSS-VALIDATION RESULTS (5-Fold)")
    print(rule)
    # (padded label, key of the mean, key of the standard deviation)
    for label, mean_key, std_key in (
        ("F1:        ", 'f1_mean', 'f1_std'),
        ("Precision: ", 'precision_mean', 'precision_std'),
        ("Recall:    ", 'recall_mean', 'recall_std'),
        ("ROC-AUC:   ", 'roc_auc_mean', 'roc_auc_std'),
    ):
        print(f"{label}{cv[mean_key]:.4f} (+/- {cv[std_key]:.4f})")
    print(rule)

## 6. Threshold Analysis

In [None]:
# Report the optimal operating point from the threshold analysis (text only;
# nothing is plotted here). Skipped when the evaluator returned no such entry.
if 'threshold_analysis' in evaluation_results:
    threshold_results = evaluation_results['threshold_analysis']

    print("\nOptimal Operating Point:")
    # Labels carry their own padding so the values line up in one column.
    for label, result_key in (
        ("  Threshold:  ", 'optimal_threshold'),
        ("  F1 Score:   ", 'optimal_f1'),
        ("  Precision:  ", 'optimal_precision'),
        ("  Recall:     ", 'optimal_recall'),
    ):
        print(f"{label}{threshold_results[result_key]:.4f}")

## 7. Save Model and Artifacts

In [None]:
# Persist the trained model, the feature names, the evaluation results and the
# tuned hyperparameters.
from pathlib import Path  # local import: only needed to create output folders

# Create the output folders up front so no save step fails on a fresh checkout.
for out_dir in ('../models', '../reports/metrics'):
    Path(out_dir).mkdir(parents=True, exist_ok=True)

# Save model
model.save('../models/model.pkl')
print("✓ Model saved to models/model.pkl")

# Save feature names (column order of the processed feature matrix)
feature_names = X.columns.tolist()
with open('../models/feature_names.json', 'w', encoding='utf-8') as f:
    json.dump(feature_names, f, indent=2)
print("✓ Feature names saved to models/feature_names.json")

# Save evaluation results
evaluator.save_results('../reports/metrics/evaluation_results.json')
print("✓ Evaluation results saved to reports/metrics/evaluation_results.json")

# Save best hyperparameters
with open('../reports/metrics/best_hyperparameters.json', 'w', encoding='utf-8') as f:
    json.dump(best_params, f, indent=2)
print("✓ Best hyperparameters saved")

## 8. Model Summary

In [None]:
# Final recap of the run. The model-type, SMOTE and calibration lines are
# fixed text, not values read back from the model or optimizer objects.
banner = "=" * 80

print("\n" + banner)
print("MODEL TRAINING SUMMARY")
print(banner)
print("\nModel Type: XGBoost Ensemble (XGB + RF + LightGBM)")
print(f"Features: {X.shape[1]}")
print(f"Training Samples: {X_train.shape[0]}")
print(f"Test Samples: {X_test.shape[0]}")
print(f"Training Time: {model.training_time:.2f} seconds")

# How the hyperparameters were obtained.
print("\nOptimization:")
print(f"  - Optuna trials: {optimizer.n_trials}")
print(f"  - Best CV F1: {optimizer.study.best_value:.4f}")
print("  - SMOTE: Enabled")
print("  - Calibration: Platt Scaling")

# Headline numbers on the held-out test split.
print("\nTest Performance:")
print(f"  - F1 Score: {test_metrics['f1_score']:.4f}")
print(f"  - ROC-AUC: {test_metrics['roc_auc']:.4f}")
print(f"  - PR-AUC: {test_metrics['pr_auc']:.4f}")
print(f"  - Business Cost: ${test_metrics['business_cost']:,.2f}")

print("\nNext Steps:")
for next_step in (
    "  → SHAP interpretability analysis (Notebook 04)",
    "  → Deploy via FastAPI (src/api.py)",
    "  → Generate business report",
):
    print(next_step)

print("\n" + banner)
print(f"Training completed: {datetime.now()}")
print(banner)