# Modular Training Pipeline

This notebook trains every combination of the configured ML models and imbalance-handling methods, running the training jobs in parallel.

**Features:**
- 11 ML models supported
- 6 imbalance handling methods
- Parallel training (n_workers configurable)
- Binary and multiclass classification
- Automatic result tracking

## 1. Import Libraries and Modules

In [None]:
import yaml
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from joblib import Parallel, delayed
import json
import os
from datetime import datetime
import warnings
# Install a blanket "ignore" filter: every warning category is suppressed in
# this interpreter from here on, so library warnings will not appear in the
# cell outputs below.
warnings.filterwarnings('ignore')

# Import custom modules
# NOTE(review): these wildcard imports presumably supply the names used later
# in the notebook (ImbalanceHandler, ModelFactory, Trainer, Evaluator,
# Visualizer and the *Dataset classes) — confirm against the two packages.
from datasets import *
from modules import *

# Cell-completion marker, printed only if every import above succeeded.
print("‚úÖ All modules imported successfully")

## 2. Helper Functions for Result Management

In [None]:
def create_run_folder():
    """Create the output directory tree for one run and return its path.

    The folder is named ``results/run_<YYYYMMDD_HHMMSS>`` (relative to the
    current working directory) from the current local time, and is created
    together with its ``plots`` subfolder. Directories that already exist
    are left untouched.

    Returns:
        The run folder path as a string.
    """
    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    run_folder = "results/run_" + stamp
    # Run folder first, then its plots subfolder.
    for path in (run_folder, run_folder + "/plots"):
        os.makedirs(path, exist_ok=True)
    return run_folder

def _json_default(value):
    """Convert NumPy scalars and arrays to plain Python so ``json`` can encode them."""
    if isinstance(value, np.generic):
        return value.item()
    if isinstance(value, np.ndarray):
        return value.tolist()
    raise TypeError(f"Object of type {type(value).__name__} is not JSON serializable")


def save_summary(run_folder, config, metrics_df, dataset_info):
    """Write ``summary.json`` with a quick overview of the run.

    The summary records the experiment section of the config, the dataset
    info, the active models and imbalance methods, a timestamp, the full
    config, and the best result (the row of ``metrics_df`` with the highest
    ``auc``; the first such row wins on ties).

    Args:
        run_folder: Directory that receives ``summary.json``.
        config: Configuration mapping; must contain ``experiment``,
            ``models.active`` and ``imbalance.methods``.
        metrics_df: DataFrame with at least the columns ``model``,
            ``imbalance_method``, ``auc`` and ``accuracy``.
        dataset_info: JSON-compatible description of the dataset. NumPy
            scalars and arrays are accepted and converted to plain Python.

    Raises:
        TypeError: If the summary contains a value that cannot be encoded as
            JSON. Nothing is written to disk in that case.
    """
    # Work on a positional index so duplicate index labels in the caller's
    # frame cannot turn the scalar lookups below into Series.
    ranked = metrics_df.reset_index(drop=True)
    best = ranked.loc[ranked['auc'].idxmax()]
    best_model = best['model']
    best_method = best['imbalance_method']

    summary = {
        "experiment": config['experiment'],
        "dataset": dataset_info,
        "models": config['models']['active'],
        "imbalance_methods": config['imbalance']['methods'],
        "timestamp": datetime.now().isoformat(),
        "best_result": {
            "job": f"{best_model}_{best_method}",
            "model": best_model,
            "imbalance_method": best_method,
            "auc": float(best['auc']),
            "accuracy": float(best['accuracy'])
        },
        "config": config
    }

    # Encode fully before opening the file: a serialization error then leaves
    # no truncated summary.json behind.
    payload = json.dumps(summary, indent=2, default=_json_default)
    with open(f"{run_folder}/summary.json", 'w', encoding='utf-8') as f:
        f.write(payload)

def save_metrics(run_folder, metrics_df):
    """Write the full metrics table to ``metrics.csv`` inside the run folder.

    The DataFrame index is not written to the file.
    """
    csv_path = f"{run_folder}/metrics.csv"
    metrics_df.to_csv(csv_path, index=False)

def save_training_log(run_folder, log_messages):
    """Write the collected log messages to ``training.log``, one per line.

    Args:
        run_folder: Directory that receives ``training.log``.
        log_messages: Iterable of log entries. Entries are converted with
            ``str()`` so a stray non-string entry cannot abort the write.
    """
    text = '\n'.join(str(message) for message in log_messages)
    # Explicit UTF-8 keeps non-ASCII log lines from failing under a narrow
    # platform default encoding.
    with open(f"{run_folder}/training.log", 'w', encoding='utf-8') as f:
        f.write(text)

# Cell-completion marker: confirms in the notebook output that the helper
# functions above were defined.
print("‚úÖ Helper functions defined")

## 3. Load Configuration

In [None]:
# Load configuration
# Decode explicitly as UTF-8 so the file is read the same way on every
# platform, regardless of the locale's default encoding.
with open('config.yaml', 'r', encoding='utf-8') as f:
    config = yaml.safe_load(f)

# yaml.safe_load returns None for an empty document; fail here with a clear
# message instead of an opaque TypeError on the first lookup below.
if not isinstance(config, dict):
    raise ValueError("config.yaml must contain a top-level mapping of settings")

# Echo the key settings so the notebook output records what was run.
print("üìã Configuration loaded:")
print(f"  Experiment: {config['experiment']['name']}")
print(f"  Dataset: {config['dataset']['name']}")
print(f"  Models: {len(config['models']['active'])}")
print(f"  Imbalance methods: {len(config['imbalance']['methods'])}")
print(f"  Parallel workers: {config['training']['n_workers']}")

## 4. Create Run Folder and Load Dataset

In [None]:
# Create run folder
run_folder = create_run_folder()
log = []  # run-level log lines, written to training.log at the end

# Optional: create models folder if saving models
if config['output']['save_models']:
    os.makedirs(f"{run_folder}/models", exist_ok=True)

print(f"üìÅ Results will be saved to: {run_folder}")

# Load dataset
dataset_name = config['dataset']['name']
# Map the config name to a class name, e.g. "credit_card" -> "CreditCardDataset".
dataset_class_name = f"{dataset_name.title().replace('_', '')}Dataset"
# Resolve the class by namespace lookup rather than eval(), so a value from
# config.yaml can never be executed as code.
dataset_class = globals().get(dataset_class_name)
if dataset_class is None:
    raise ValueError(
        f"Unknown dataset '{dataset_name}': no class named "
        f"{dataset_class_name} is available in this notebook"
    )

dataset = dataset_class(
    train_path=config['dataset'].get('train_path'),
    test_path=config['dataset'].get('test_path')
)

X_train, y_train, X_test, y_test = dataset.load()

# Distinct training labels, computed once for both the log and the printout.
classes = np.unique(y_train)

log.append(f"Loaded dataset: {dataset.name}")
log.append(f"Task type: {dataset.task_type}")
log.append(f"Training samples: {len(X_train)}")
log.append(f"Test samples: {len(X_test)}")
log.append(f"Features: {X_train.shape[1]}")
log.append(f"Classes: {classes}")

print(f"\nüìä Dataset: {dataset.name}")
print(f"  Task: {dataset.task_type}")
print(f"  Train: {X_train.shape}")
print(f"  Test: {X_test.shape}")
print(f"  Classes: {classes}")

## 5. Define Training Job Function

In [None]:
def train_single_job(model_name, imbalance_method, X_train, y_train, X_test, y_test, config, dataset):
    """Train and evaluate one (model, imbalance method) combination.

    Args:
        model_name: Name passed to ``ModelFactory`` to build the model.
        imbalance_method: Name of the imbalance handling method to apply.
        X_train, y_train: Training features and labels; copies are handed to
            the imbalance handler, so the inputs are not passed on directly.
        X_test, y_test: Held-out features and labels used for evaluation.
        config: Configuration mapping. ``evaluation.thresholds`` is required;
            ``imbalance.params``, ``models.params`` and
            ``evaluation.multiclass`` are optional and may be absent or null.
        dataset: Dataset object; only its ``task_type`` attribute is read.

    Returns:
        A dict with the keys ``job_name``, ``model_name``,
        ``imbalance_method``, ``metrics``, ``y_pred_proba``, ``trainer`` and
        ``log`` (a list of human-readable messages for this job).
    """
    job_log = []
    job_name = f"{model_name}_{imbalance_method}"

    # Apply imbalance handling
    # A missing or null params section means "no extra handler arguments",
    # matching how the model params are looked up below.
    imbalance_params = config['imbalance'].get('params') or {}
    imbalance_handler = ImbalanceHandler(
        method=imbalance_method,
        **imbalance_params
    )
    X_train_balanced, y_train_balanced = imbalance_handler.apply(
        X_train.copy(), y_train.copy(), task_type=dataset.task_type
    )
    job_log.append(f"Samples: {len(X_train)} -> {len(X_train_balanced)}")

    # Create model
    params = (config['models'].get('params') or {}).get(model_name) or {}
    model = ModelFactory.create_model(model_name, params)

    # Add scaler if needed
    scaler = StandardScaler() if ModelFactory.requires_scaling(model_name) else None

    # Train
    trainer = Trainer(model, model_name, scaler)
    trainer.train(X_train_balanced, y_train_balanced)
    job_log.append(f"Training time: {trainer.train_time:.2f}s")

    # Predict
    if dataset.task_type == "binary":
        y_pred_proba = trainer.predict_proba(X_test)
    else:
        # For multiclass, get the full probability matrix straight from the
        # model. NOTE(review): this bypasses the Trainer and assumes it fitted
        # `scaler` and `model` in place during train() — confirm.
        if scaler is not None:
            X_test_scaled = scaler.transform(X_test)
            y_pred_proba = model.predict_proba(X_test_scaled)
        else:
            y_pred_proba = model.predict_proba(X_test)

    # Evaluate
    eval_config = config['evaluation']
    multiclass_config = eval_config.get('multiclass') or {}
    evaluator = Evaluator(
        thresholds=eval_config['thresholds'],
        task_type=dataset.task_type,
        average=multiclass_config.get('average', 'macro')
    )
    metrics = evaluator.evaluate_model(y_test, y_pred_proba, model_name, imbalance_method, trainer.train_time)

    return {
        'job_name': job_name,
        'model_name': model_name,
        'imbalance_method': imbalance_method,
        'metrics': metrics,
        'y_pred_proba': y_pred_proba,
        'trainer': trainer,
        'log': job_log
    }

# Cell-completion marker: confirms in the notebook output that
# train_single_job was defined.
print("‚úÖ Training job function defined")

## 6. Generate Training Jobs and Train in Parallel

In [None]:
# Build the job list: one (model, imbalance method) pair per combination,
# models in the outer position so jobs are grouped by model.
training_jobs = [
    (model_name, imbalance_method)
    for model_name in config['models']['active']
    for imbalance_method in config['imbalance']['methods']
]

n_workers = config['training']['n_workers']
banner = '=' * 60

print(f"\nüöÄ Starting parallel training...")
print(f"  Total jobs: {len(training_jobs)}")
print(f"  Workers: {n_workers}")
print(f"\n{banner}")

log.append(f"\nTotal training jobs: {len(training_jobs)}")
log.append(f"Parallel workers: {n_workers}")

# Fan the jobs out over the worker pool; results come back in job order.
run_job = delayed(train_single_job)
results = Parallel(n_jobs=n_workers, verbose=10)(
    run_job(
        model_name, imbalance_method,
        X_train, y_train, X_test, y_test,
        config, dataset
    )
    for model_name, imbalance_method in training_jobs
)

print(f"{banner}")
print(f"\n‚úÖ All training jobs completed!")

## 7. Collect Results and Save Models

In [None]:
# Gather per-job outputs into the structures the later cells consume.
results_dict = {}  # For visualization: {job_name: y_pred_proba}
all_metrics = []
save_models = config['output']['save_models']

for result in results:
    job_name = result['job_name']
    results_dict[job_name] = result['y_pred_proba']
    all_metrics.extend(result['metrics'])

    # Copy the job's own log lines into the run log, indented under its name.
    log.append(f"\n{job_name}:")
    log.extend(f"  {msg}" for msg in result['log'])

    # Persist the trained model only when the config asks for it.
    if save_models:
        model_path = f"{run_folder}/models/{job_name}.pkl"
        result['trainer'].save_model(model_path)
        log.append(f"  Model saved")

# One row per metrics record across all jobs.
metrics_df = pd.DataFrame(all_metrics)

print(f"üìä Results collected:")
print(f"  Total models trained: {len(results)}")
print(f"  Metrics computed: {len(all_metrics)}")

## 8. Display Results

In [None]:
# Show the metrics table, best AUC first.
print("\nüìã Performance Metrics (Threshold = 0.5):\n")

# When metrics are reported per threshold, keep only the 0.5 rows; otherwise
# show every row.
display_df = (
    metrics_df[metrics_df['threshold'] == 0.5].copy()
    if 'threshold' in metrics_df.columns
    else metrics_df.copy()
)

display_df = display_df.sort_values('auc', ascending=False)
shown_columns = ['model', 'imbalance_method', 'accuracy', 'auc', 'f1', 'train_time']
display(display_df[shown_columns])

## 9. Visualize Results

In [None]:
visualizer = Visualizer()

# Plots are produced only when the config enables them.
if config['output']['save_plots']:
    plots_dir = f"{run_folder}/plots"

    if dataset.task_type != "binary":
        # Multi-class plots
        visualizer.plot_multiclass_roc(results_dict, y_test, f"{plots_dir}/roc_curves_multiclass.png")
    else:
        # Binary classification plots
        visualizer.plot_roc_curves(results_dict, y_test, f"{plots_dir}/roc_curves.png")
        visualizer.plot_confusion_matrices(results_dict, y_test, 0.5, f"{plots_dir}/confusion_matrices.png")

    # Metrics comparison is drawn for both binary and multiclass runs.
    visualizer.plot_metrics_comparison(metrics_df, f"{plots_dir}/metrics_comparison.png")

    print(f"\nüìä Plots saved to {run_folder}/plots/")

## 10. Save Results and Summary

In [None]:
# Persist the summary, the metrics table and the run log.
save_summary(run_folder, config, metrics_df, dataset.get_info())
save_metrics(run_folder, metrics_df)
save_training_log(run_folder, log)

# Look up the row with the highest AUC for the closing report.
best_idx = metrics_df['auc'].idxmax()
best_model, best_method, best_auc, best_acc = (
    metrics_df.loc[best_idx, column]
    for column in ('model', 'imbalance_method', 'auc', 'accuracy')
)

rule = '=' * 60

print(f"\n{rule}")
print(f"‚úÖ Run Complete!")
print(f"{rule}")
print(f"\nüìÅ Results saved to: {run_folder}")
print(f"\nüèÜ Best Model:")
print(f"  - Model: {best_model}")
print(f"  - Imbalance method: {best_method}")
print(f"  - AUC: {best_auc:.4f}")
print(f"  - Accuracy: {best_acc:.4f}")
print(f"\nüìÑ Files created:")
print(f"  - summary.json (quick overview)")
print(f"  - metrics.csv (detailed metrics)")
print(f"  - training.log (execution logs)")
# The optional outputs are listed only when the config enabled them.
output_options = config['output']
if output_options['save_plots']:
    print(f"  - plots/ (visualizations)")
if output_options['save_models']:
    print(f"  - models/ (trained models)")
print(f"\n{rule}")