In [None]:
"""
Final model training and evaluation module
Trains the XGBoost model with best parameters and generates predictions
"""

import pandas as pd
import xgboost as xgb
from sklearn.metrics import (
    accuracy_score, 
    precision_recall_fscore_support,
    classification_report, 
    confusion_matrix
)
import joblib
import json
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

from config import *


def load_split_data():
    """Read the persisted train/validation/test splits and report their sizes.

    The splits are loaded with joblib from ``split_data.pkl`` inside
    ``OUTPUT_DIR``. The stored object is indexed with the keys ``X_train``,
    ``y_train``, ``X_val``, ``y_val``, ``X_test`` and ``y_test``.

    Returns:
        Tuple ``(X_train, y_train, X_val, y_val, X_test, y_test)``.
    """
    source = OUTPUT_DIR / 'split_data.pkl'
    print(f"Loading split data from: {source}")
    # NOTE: joblib.load unpickles the file, so it must only be pointed at
    # artifacts produced by a trusted pipeline step.
    bundle = joblib.load(source)

    # Pull the six objects out in the same order callers unpack them.
    X_train, y_train, X_val, y_val, X_test, y_test = (
        bundle[f'{part}_{split}']
        for split in ('train', 'val', 'test')
        for part in ('X', 'y')
    )

    # Labels are padded by hand so the three counts line up in the console.
    for label, features in (
        ('Train:', X_train),
        ('Val:  ', X_val),
        ('Test: ', X_test),
    ):
        print(f"{label} {len(features):,} samples")

    return X_train, y_train, X_val, y_val, X_test, y_test


def load_best_params():
    """Read the tuned hyperparameters from ``best_params.json`` in ``OUTPUT_DIR``.

    Every entry is echoed to stdout except ``objective``, ``num_class``,
    ``eval_metric`` and ``early_stopping_rounds``; those four are only hidden
    from the printout and are still part of the returned mapping.

    Returns:
        The parsed JSON content, unmodified.
    """
    params_file = OUTPUT_DIR / 'best_params.json'
    print(f"\nLoading best parameters from: {params_file}")

    with open(params_file, 'r') as handle:
        best = json.load(handle)

    # Keys that are kept in the mapping but not shown in the summary.
    hidden = ('objective', 'num_class', 'eval_metric', 'early_stopping_rounds')

    print("Best parameters:")
    for name, setting in best.items():
        if name in hidden:
            continue
        print(f"  {name}: {setting}")

    return best


def train_final_model(X_train, y_train, X_val, y_val, X_test, y_test, params):
    """
    Train final XGBoost model with best parameters and evaluate it.

    The model is fitted on the training split with the validation split
    supplied as ``eval_set``. Hard predictions and class probabilities are
    then produced for all three splits, macro-averaged metrics are computed
    per split, and a summary table, a per-class report and a confusion matrix
    for the test split are printed to stdout.

    Args:
        X_train, y_train: Training data
        X_val, y_val: Validation data (passed to ``fit`` as ``eval_set``)
        X_test, y_test: Test data
        params: Best hyperparameters, forwarded as keyword arguments to
            ``xgb.XGBClassifier``

    Returns:
        Tuple of (model, results, y_train_proba, y_val_proba, y_test_proba)
        where ``results`` maps ``'train'``/``'val'``/``'test'`` to a dict with
        float entries ``accuracy``, ``precision``, ``recall`` and ``f1``.
    """
    # Fixed label order for the test-set report and confusion matrix. Passing
    # it explicitly keeps both outputs three-class even when a class is absent
    # from y_test and the test predictions; without it, classification_report
    # raises on the target_names mismatch and the confusion matrix shrinks, so
    # the printed "[Down, Neutral, Up]" legend would no longer match.
    class_labels = [0, 1, 2]
    class_names = ['Down (0)', 'Neutral (1)', 'Up (2)']

    print("\n" + "="*60)
    print("TRAINING FINAL MODEL")
    print("="*60)

    # Train model
    model = xgb.XGBClassifier(**params)
    model.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        verbose=False
    )

    print("✓ Model training complete")

    # Generate predictions
    print("\nGenerating predictions...")
    y_train_pred = model.predict(X_train)
    y_val_pred = model.predict(X_val)
    y_test_pred = model.predict(X_test)

    y_train_proba = model.predict_proba(X_train)
    y_val_proba = model.predict_proba(X_val)
    y_test_proba = model.predict_proba(X_test)

    # Calculate metrics
    results = {}

    for name, y_true, y_pred in [
        ('train', y_train, y_train_pred),
        ('val', y_val, y_val_pred),
        ('test', y_test, y_test_pred)
    ]:
        acc = accuracy_score(y_true, y_pred)
        precision, recall, f1, _ = precision_recall_fscore_support(
            y_true, y_pred, average='macro', zero_division=0
        )

        # Cast to built-in floats so the dict can be dumped to JSON later.
        results[name] = {
            'accuracy': float(acc),
            'precision': float(precision),
            'recall': float(recall),
            'f1': float(f1)
        }

    # Print performance summary
    print("\n" + "="*60)
    print("MODEL PERFORMANCE")
    print("="*60)
    print(f"{'Set':<10} {'Accuracy':<12} {'Precision':<12} {'Recall':<12} {'F1':<12}")
    print("-"*60)
    for name in ['train', 'val', 'test']:
        r = results[name]
        print(f"{name.capitalize():<10} {r['accuracy']:.4f}       "
              f"{r['precision']:.4f}       {r['recall']:.4f}       {r['f1']:.4f}")

    # Detailed test set report
    print("\n" + "="*60)
    print("DETAILED TEST SET CLASSIFICATION REPORT")
    print("="*60)
    print(classification_report(
        y_test, y_test_pred,
        labels=class_labels,
        target_names=class_names,
        # Same undefined-metric policy as the macro metrics above.
        zero_division=0
    ))

    # Confusion matrix
    print("Confusion Matrix (Test Set):")
    cm = confusion_matrix(y_test, y_test_pred, labels=class_labels)
    print(cm)
    print("\nRows = Actual, Columns = Predicted")
    print("Order: [Down, Neutral, Up]")

    return model, results, y_train_proba, y_val_proba, y_test_proba


def save_model_and_predictions(model, results, 
                               X_train, X_val, X_test,
                               y_train, y_val, y_test,
                               y_train_proba, y_val_proba, y_test_proba,
                               selected_features):
    """
    Persist the trained model together with its history, predictions and
    feature ranking.

    Everything is written under ``OUTPUT_DIR`` using the file names found in
    ``OUTPUT_FILES``: the model itself, a JSON history that merges
    ``tuning_history.json`` with the final metrics, one predictions CSV per
    split, and a CSV of feature importances sorted in descending order.

    Args:
        model: Trained XGBoost model
        results: Performance metrics dict, stored under ``final_results``
        X_train, X_val, X_test: Feature matrices (accepted but not read here)
        y_train, y_val, y_test: True labels; read through ``.values``
        y_train_proba, y_val_proba, y_test_proba: Prediction probabilities,
            indexed as columns 0/1/2 for down/neutral/up
        selected_features: List of feature names
    """
    rule = "=" * 60

    print("\n" + rule)
    print("SAVING OUTPUTS")
    print(rule)
    print(f"Output directory: {OUTPUT_DIR}")

    # --- Model ---------------------------------------------------------------
    model.save_model(str(OUTPUT_DIR / OUTPUT_FILES['model']))
    print(f"\n✓ {OUTPUT_FILES['model']}")

    # --- Training history ----------------------------------------------------
    # The tuning log is embedded next to the final metrics.
    with open(OUTPUT_DIR / 'tuning_history.json', 'r') as src:
        tuning_log = json.load(src)

    summary = {
        'hyperparameter_tuning': tuning_log,
        'final_results': results,
        'num_features': len(selected_features),
        'timestamp': datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    }

    with open(OUTPUT_DIR / OUTPUT_FILES['training_history'], 'w') as dst:
        json.dump(summary, dst, indent=2)
    print(f"✓ {OUTPUT_FILES['training_history']}")

    # --- Per-split predictions -----------------------------------------------
    # (OUTPUT_FILES key, true labels, probabilities, suffix for the log line)
    exports = (
        ('train_predictions', y_train, y_train_proba, ""),
        ('val_predictions', y_val, y_val_proba, ""),
        ('test_predictions', y_test, y_test_proba, " (Day 5 - ready for agent!)"),
    )
    for file_key, truth, proba in ((k, t, p) for k, t, p, _ in exports):
        frame = pd.DataFrame({
            'actual': truth.values,
            # Predicted class = column with the highest probability.
            'predicted': proba.argmax(axis=1),
            'prob_down': proba[:, 0],
            'prob_neutral': proba[:, 1],
            'prob_up': proba[:, 2]
        })
        frame.to_csv(OUTPUT_DIR / OUTPUT_FILES[file_key], index=False)
        note = next(n for k, _, _, n in exports if k == file_key)
        print(f"✓ {OUTPUT_FILES[file_key]}{note}")

    # --- Feature importance --------------------------------------------------
    print("\n" + rule)
    print("FEATURE IMPORTANCE ANALYSIS")
    print(rule)

    ranking = pd.DataFrame({
        'feature': selected_features,
        'importance': model.feature_importances_
    })
    ranking = ranking.sort_values('importance', ascending=False)

    print("\nTop 10 Most Important Features:")
    for _, entry in ranking.head(10).iterrows():
        print(f"{entry['feature']:30s} {entry['importance']:.6f}")

    ranking.to_csv(OUTPUT_DIR / OUTPUT_FILES['feature_importance'], index=False)
    print(f"\n✓ {OUTPUT_FILES['feature_importance']}")


def main():
    """Run the full final-training workflow from loading to saving.

    Steps, in order: load the stored splits, load the tuned hyperparameters,
    train and evaluate the model, load the selected feature names with joblib,
    write all artifacts, then print a short test-set summary.
    """
    rule = "=" * 60

    print(rule)
    print("FINAL MODEL TRAINING PIPELINE")
    print(rule)

    # Inputs produced by earlier pipeline stages.
    splits = load_split_data()
    X_train, y_train, X_val, y_val, X_test, y_test = splits
    best_params = load_best_params()

    # Fit and evaluate.
    model, results, y_train_proba, y_val_proba, y_test_proba = train_final_model(
        *splits, best_params
    )

    # Feature names are needed for the importance table.
    selected_features = joblib.load(OUTPUT_DIR / OUTPUT_FILES['selected_features'])

    # Persist model, history, predictions and feature ranking.
    save_model_and_predictions(
        model, results,
        X_train, X_val, X_test,
        y_train, y_val, y_test,
        y_train_proba, y_val_proba, y_test_proba,
        selected_features
    )

    test_metrics = results['test']
    print("\n" + rule)
    print("FINAL MODEL TRAINING COMPLETE!")
    print(rule)
    print("\nModel Performance Summary:")
    print(f"  Test Accuracy: {test_metrics['accuracy']:.4f}")
    print(f"  Test F1 Score: {test_metrics['f1']:.4f}")
    print(f"  Features Used: {len(selected_features)}")
    print(f"\nAll files saved to: {OUTPUT_DIR}")
    print(f"\nKey output: {OUTPUT_FILES['test_predictions']} (ready for agent!)")


# Run the pipeline only when this file is executed as a script, so that
# importing it does not trigger training.
if __name__ == "__main__":
    main()