# In [None]:
"""
Hyperparameter tuning module
Two-stage grid search for optimal XGBoost parameters
"""

import itertools
import json
import warnings

import joblib
import xgboost as xgb
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

warnings.filterwarnings('ignore')

from config import *


def load_split_data():
    """
    Read the persisted train/validation/test split and report its sizes.

    The split is read with ``joblib.load`` from ``OUTPUT_DIR / 'split_data.pkl'``
    and must be a mapping containing the keys ``X_train``, ``y_train``,
    ``X_val``, ``y_val``, ``X_test`` and ``y_test``.

    Returns:
        Tuple ``(X_train, y_train, X_val, y_val, X_test, y_test)``.

    Raises:
        KeyError: If any of the six expected keys is missing from the file.
    """
    split_path = OUTPUT_DIR / 'split_data.pkl'
    print(f"Loading split data from: {split_path}")
    # NOTE: joblib.load unpickles the file, so only point this at trusted data.
    split_data = joblib.load(split_path)

    # Pull the six arrays out in the same order they are returned.
    ordered_keys = ('X_train', 'y_train', 'X_val', 'y_val', 'X_test', 'y_test')
    splits = tuple(split_data[key] for key in ordered_keys)
    train_features, val_features, test_features = splits[0], splits[2], splits[4]

    print(f"Train: {len(train_features):,} samples")
    print(f"Val:   {len(val_features):,} samples")
    print(f"Test:  {len(test_features):,} samples")

    return splits


def tune_stage1(X_train, y_train, X_val, y_val):
    """
    Stage 1: Coarse grid search.

    Trains one ``xgb.XGBClassifier`` for every combination in the Cartesian
    product of the seven hyperparameter lists in ``STAGE1_GRID`` (layered on
    top of ``XGBOOST_BASE_PARAMS``), scores each on the validation set, and
    keeps the parameters with the highest validation accuracy. On ties the
    earliest combination wins.

    Args:
        X_train, y_train: Training data
        X_val, y_val: Validation data

    Returns:
        Tuple of (best_params, history). ``best_params`` is the full parameter
        dict (base params plus tuned values) of the winning model; ``history``
        is a list with one dict per combination holding the tuned values plus
        ``val_accuracy`` and macro-averaged ``val_f1``.

    Raises:
        ValueError: If ``STAGE1_GRID`` yields no combinations (one of its
            lists is empty), so there is nothing to select.
    """
    print("\n" + "="*60)
    print("STAGE 1: COARSE GRID SEARCH")
    print("="*60)

    # Order matters: itertools.product varies the right-most name fastest,
    # which reproduces the iteration order of the equivalent nested loops.
    param_names = (
        'max_depth',
        'learning_rate',
        'n_estimators',
        'subsample',
        'colsample_bytree',
        'min_child_weight',
        'gamma',
    )
    combinations = list(
        itertools.product(*(STAGE1_GRID[name] for name in param_names))
    )
    total = len(combinations)
    if total == 0:
        raise ValueError("STAGE1_GRID produced no parameter combinations")

    print(f"Testing {total} combinations...\n")

    # Start below any attainable accuracy so the first candidate is always
    # accepted, even if its accuracy is exactly 0.0.
    best_acc = -1.0
    best_params = None
    history = []

    for current, values in enumerate(combinations, start=1):
        candidate = dict(zip(param_names, values))

        # Build parameters: tuned values override the shared base settings.
        params = {**XGBOOST_BASE_PARAMS, **candidate}

        # Train model
        model = xgb.XGBClassifier(**params)
        model.fit(
            X_train, y_train,
            eval_set=[(X_val, y_val)],
            verbose=False
        )

        # Evaluate
        y_val_pred = model.predict(X_val)
        val_acc = accuracy_score(y_val, y_val_pred)
        _, _, f1, _ = precision_recall_fscore_support(
            y_val, y_val_pred,
            average='macro',
            zero_division=0
        )

        print(f"[{current:2d}/{total}] "
              f"d={candidate['max_depth']}, "
              f"lr={candidate['learning_rate']:.2f}, "
              f"n={candidate['n_estimators']:3d}, "
              f"mcw={candidate['min_child_weight']}, "
              f"g={candidate['gamma']:.1f} | "
              f"Acc={val_acc:.4f}, F1={f1:.4f}")

        # Save to history
        history.append({
            **candidate,
            'val_accuracy': float(val_acc),
            'val_f1': float(f1)
        })

        # Update best (strict '>' keeps the earliest combination on ties)
        if val_acc > best_acc:
            best_acc = val_acc
            best_params = params.copy()

    print(f"\n{'='*60}")
    print(f"STAGE 1 BEST: Acc={best_acc:.4f}")
    print(f"  max_depth: {best_params['max_depth']}")
    print(f"  learning_rate: {best_params['learning_rate']}")
    print(f"  n_estimators: {best_params['n_estimators']}")
    print(f"  min_child_weight: {best_params['min_child_weight']}")
    print(f"  gamma: {best_params['gamma']}")

    return best_params, history


def tune_stage2(X_train, y_train, X_val, y_val, stage1_best):
    """
    Stage 2: Fine-tune around best parameters from Stage 1.

    Builds a refined grid centred on the Stage 1 winner using the settings in
    ``STAGE2_REFINEMENT``:

    * ``max_depth``: best -/+ ``max_depth_range``, clamped to at least 3 below
      and at most 10 above.
    * ``learning_rate``: best multiplied by each ``learning_rate_multiplier``,
      floored at 0.01.
    * ``n_estimators``: best -/+ ``n_estimators_step``, lower value floored
      at 50.
    * ``subsample`` / ``colsample_bytree``: the configured option lists.
    * ``min_child_weight`` / ``gamma``: held fixed at the Stage 1 values.

    Clamping can make neighbouring grid values coincide (e.g. a best depth of
    3 yields ``[3, 3, ...]``), so the derived lists are de-duplicated to avoid
    training the same model more than once.

    Args:
        X_train, y_train: Training data
        X_val, y_val: Validation data
        stage1_best: Best parameters from Stage 1

    Returns:
        Tuple of (best_params, history). ``best_params`` is the full parameter
        dict of the model with the highest validation accuracy (earliest wins
        on ties); ``history`` has one dict per combination with the parameter
        values, ``val_accuracy`` and macro-averaged ``val_f1``.

    Raises:
        ValueError: If the refined grid yields no combinations (an option
            list in ``STAGE2_REFINEMENT`` is empty).
    """
    print("\n" + "="*60)
    print("STAGE 2: FINE-TUNING")
    print("="*60)

    def _dedupe(values):
        """Drop repeated values while keeping first-seen order."""
        return list(dict.fromkeys(values))

    # Build refined grid around Stage 1 best
    best_depth = stage1_best['max_depth']
    best_lr = stage1_best['learning_rate']
    best_n = stage1_best['n_estimators']
    depth_range = STAGE2_REFINEMENT['max_depth_range']
    n_step = STAGE2_REFINEMENT['n_estimators_step']

    stage2_grid = {
        'max_depth': _dedupe([
            max(3, best_depth - depth_range),
            best_depth,
            min(10, best_depth + depth_range)
        ]),
        'learning_rate': _dedupe([
            max(0.01, best_lr * mult)
            for mult in STAGE2_REFINEMENT['learning_rate_multiplier']
        ]),
        'n_estimators': _dedupe([
            max(50, best_n - n_step),
            best_n,
            best_n + n_step
        ]),
        'subsample': STAGE2_REFINEMENT['subsample_options'],
        'colsample_bytree': STAGE2_REFINEMENT['colsample_bytree_options'],
    }
    # These two are not searched in Stage 2; every model reuses the Stage 1 value.
    fixed_params = {
        'min_child_weight': stage1_best['min_child_weight'],
        'gamma': stage1_best['gamma'],
    }

    # itertools.product varies the right-most name fastest, matching the
    # iteration order of the equivalent nested loops.
    tuned_names = (
        'max_depth',
        'learning_rate',
        'n_estimators',
        'subsample',
        'colsample_bytree',
    )
    combinations = list(
        itertools.product(*(stage2_grid[name] for name in tuned_names))
    )
    total = len(combinations)
    if total == 0:
        raise ValueError("Stage 2 grid produced no parameter combinations")

    print(f"Testing {total} refined combinations...\n")

    # Start below any attainable accuracy so the first candidate is always
    # accepted, even if its accuracy is exactly 0.0.
    best_acc = -1.0
    best_params = None
    history = []

    for current, values in enumerate(combinations, start=1):
        candidate = {**dict(zip(tuned_names, values)), **fixed_params}

        # Build parameters: tuned values override the shared base settings.
        params = {**XGBOOST_BASE_PARAMS, **candidate}

        # Train model
        model = xgb.XGBClassifier(**params)
        model.fit(
            X_train, y_train,
            eval_set=[(X_val, y_val)],
            verbose=False
        )

        # Evaluate
        y_val_pred = model.predict(X_val)
        val_acc = accuracy_score(y_val, y_val_pred)
        _, _, f1, _ = precision_recall_fscore_support(
            y_val, y_val_pred,
            average='macro',
            zero_division=0
        )

        # Keep the log short: print every 10th run and every new best.
        if current % 10 == 0 or val_acc > best_acc:
            print(f"[{current:2d}/{total}] "
                  f"d={candidate['max_depth']}, "
                  f"lr={candidate['learning_rate']:.3f}, "
                  f"n={candidate['n_estimators']:3d}, "
                  f"ss={candidate['subsample']:.2f}, "
                  f"cs={candidate['colsample_bytree']:.2f} | "
                  f"Acc={val_acc:.4f}, F1={f1:.4f}")

        # Save to history (includes the fixed values, mirroring Stage 1 rows)
        history.append({
            **candidate,
            'val_accuracy': float(val_acc),
            'val_f1': float(f1)
        })

        # Update best (strict '>' keeps the earliest combination on ties)
        if val_acc > best_acc:
            best_acc = val_acc
            best_params = params.copy()

    print(f"\n{'='*60}")
    print(f"FINAL BEST PARAMETERS (Val Accuracy: {best_acc:.4f})")
    print(f"{'='*60}")
    print(f"  max_depth: {best_params['max_depth']}")
    print(f"  learning_rate: {best_params['learning_rate']:.4f}")
    print(f"  n_estimators: {best_params['n_estimators']}")
    print(f"  subsample: {best_params['subsample']:.2f}")
    print(f"  colsample_bytree: {best_params['colsample_bytree']:.2f}")
    print(f"  min_child_weight: {best_params['min_child_weight']}")
    print(f"  gamma: {best_params['gamma']}")

    return best_params, history


def save_tuning_results(stage1_best, stage2_best, stage1_history, stage2_history):
    """
    Persist the tuning outcome as two JSON files under ``OUTPUT_DIR``.

    ``tuning_history.json`` receives both stage histories together with both
    stage winners; ``best_params.json`` receives only the Stage 2 winner.
    Both files are written with a 2-space indent.

    Args:
        stage1_best: Best parameter dict from Stage 1.
        stage2_best: Best parameter dict from Stage 2.
        stage1_history: Per-combination results from Stage 1.
        stage2_history: Per-combination results from Stage 2.
    """
    def _write_json(payload, destination):
        # Small local helper so both files are written the same way.
        with open(destination, 'w') as handle:
            json.dump(payload, handle, indent=2)

    history_path = OUTPUT_DIR / 'tuning_history.json'
    _write_json(
        {
            'stage1': stage1_history,
            'stage2': stage2_history,
            'stage1_best': stage1_best,
            'stage2_best': stage2_best
        },
        history_path,
    )
    print(f"\n✓ Saved tuning history to: {history_path}")

    # The winning parameters also get a file of their own.
    best_params_path = OUTPUT_DIR / 'best_params.json'
    _write_json(stage2_best, best_params_path)
    print(f"✓ Saved best parameters to: {best_params_path}")


def main():
    """
    Run the full tuning pipeline end to end.

    Loads the saved data split, runs the coarse Stage 1 search, refines
    around its winner in Stage 2, writes the results to disk and prints a
    short summary of how many combinations each stage evaluated.
    """
    rule = "="*60

    print(rule)
    print("HYPERPARAMETER TUNING PIPELINE")
    print("Two-Stage Grid Search")
    print(rule)

    # The test split is loaded along with the rest but is not used for tuning.
    X_train, y_train, X_val, y_val, _X_test, _y_test = load_split_data()

    # Coarse search first, then a finer search around its winner.
    stage1_best, stage1_history = tune_stage1(X_train, y_train, X_val, y_val)
    stage2_best, stage2_history = tune_stage2(
        X_train, y_train, X_val, y_val, stage1_best
    )

    save_tuning_results(stage1_best, stage2_best, stage1_history, stage2_history)

    stage1_count = len(stage1_history)
    stage2_count = len(stage2_history)

    print("\n" + rule)
    print("HYPERPARAMETER TUNING COMPLETE")
    print(rule)
    print(f"Stage 1: Tested {stage1_count} combinations")
    print(f"Stage 2: Tested {stage2_count} combinations")
    print(f"Total combinations: {stage1_count + stage2_count}")
    print(f"\nBest parameters saved to: {OUTPUT_DIR / 'best_params.json'}")


# Run the tuning pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()