## 🚀 Google Colab Setup

**Run these commands in Colab before running the notebook:**

In [None]:
# Clone repository
!git clone https://github.com/YOUR_USERNAME/YOUR_REPO_NAME.git
%cd YOUR_REPO_NAME

# Install package in development mode
!pip install -e .

# Active Learning Training Pipeline

This notebook implements **pseudo-labeling with active learning** for lung cancer survival prediction.

## Two Approaches:
1. **Combined**: Randomly select labeled ratio from both train and test sets combined
2. **Train Only**: Apply active learning on training set, evaluate on separate test set

## Key Features:
- Progressive confidence threshold (starts low, increases over epochs)
- Entropy-based confidence scoring
- Imbalance handling applied only to initial labeled data
- Parallel training across all models × imbalance methods
- Comprehensive metrics tracking per epoch

## 1. Import Libraries

In [None]:
import yaml
import numpy as np
import pandas as pd
from datetime import datetime
import os
import sys
from pathlib import Path
import time
from joblib import Parallel, delayed
import matplotlib.pyplot as plt
import seaborn as sns

# Import custom modules
from datasets.lung_cancer import LungCancerDataset
from modules.imbalance_handler import ImbalanceHandler
from modules.models import ModelFactory
from modules.trainer import Trainer
from modules.evaluator import Evaluator
from modules.visualizer import Visualizer
from modules.active_learning import (
    active_learning_cycle,
    split_data_combined,
    split_data_train_only
)

# Confirmation message (check-mark glyph restored from its mis-encoded form)
print("✓ All libraries imported successfully")

## 2. Load Configuration

In [None]:
# Parse the experiment settings from config.yaml in the working directory
with open('config.yaml', 'r') as config_file:
    config = yaml.safe_load(config_file)

# Seed NumPy's global random generator with the configured seed
RANDOM_SEED = config['random_seed']
np.random.seed(RANDOM_SEED)

# Settings taken from the 'active_learning' section
al_config = config['active_learning']
APPROACH = al_config['approach']  # "combined" or "train_only"
LABELED_RATIO = al_config['labeled_ratio']
VALIDATION_RATIO = al_config['validation_ratio']
MAX_EPOCHS = al_config['max_epochs']
APPLY_IMBALANCE_TO_INITIAL = al_config['apply_imbalance_to_initial']

# Confidence threshold: a static value plus the bounds of the dynamic schedule
CONFIDENCE_THRESHOLD = al_config['confidence_threshold']
USE_DYNAMIC_THRESHOLD = al_config['use_dynamic_threshold']
TAU_MIN = al_config['tau_min']
TAU_MAX = al_config['tau_max']

# Which models / imbalance methods to combine, and how many parallel workers
ACTIVE_MODELS = config['models']['active']
IMBALANCE_METHODS = config['imbalance']['methods']
N_WORKERS = config['training']['n_workers']

# Echo the effective configuration, one line per setting
config_report = [
    "Configuration loaded:",
    f"  Approach: {APPROACH}",
    f"  Labeled Ratio: {LABELED_RATIO:.1%}",
    f"  Validation Ratio: {VALIDATION_RATIO:.1%}",
    f"  Confidence Threshold: {CONFIDENCE_THRESHOLD} (Dynamic: {USE_DYNAMIC_THRESHOLD})",
    f"  Max Epochs: {MAX_EPOCHS}",
    f"  Models: {len(ACTIVE_MODELS)}",
    f"  Imbalance Methods: {len(IMBALANCE_METHODS)}",
    f"  Parallel Workers: {N_WORKERS}",
]
print("\n".join(config_report))

## 3. Load Dataset and Split Data

In [None]:
# Instantiate the dataset, read its metadata and load the original train/test split
dataset = LungCancerDataset()
dataset_info = dataset.get_info()
X_train_orig, y_train_orig, X_test_orig, y_test_orig = dataset.load()

print(f"Dataset: {dataset_info['name']}")
print(f"  Description: {dataset_info['description']}")
print(f"  Task Type: {dataset_info['task_type']}")
print(f"  Original Train: {X_train_orig.shape}")
print(f"  Original Test: {X_test_orig.shape}")
print()

# Replace each loaded object with its underlying array (`.values`)
X_train_orig, y_train_orig = X_train_orig.values, y_train_orig.values
X_test_orig, y_test_orig = X_test_orig.values, y_test_orig.values

# Reject unsupported approaches before doing any splitting
if APPROACH not in ("combined", "train_only"):
    raise ValueError(f"Unknown approach: {APPROACH}")

# Arguments shared by both split helpers
original_arrays = (X_train_orig, y_train_orig, X_test_orig, y_test_orig)
split_options = {
    'labeled_ratio': LABELED_RATIO,
    'validation_ratio': VALIDATION_RATIO,
    'random_state': RANDOM_SEED,
}

if APPROACH == "combined":
    print("=== Approach 1: Combined Dataset Split ===")
    (X_labeled, y_labeled,
     X_val, y_val,
     X_unlabeled, y_unlabeled) = split_data_combined(*original_arrays, **split_options)
    # This approach has no held-out test set
    X_test_final = None
    y_test_final = None
else:
    print("=== Approach 2: Train-Only Split ===")
    (X_labeled, y_labeled,
     X_val, y_val,
     X_unlabeled, y_unlabeled,
     X_test_final, y_test_final) = split_data_train_only(*original_arrays, **split_options)

# Report the size of every partition
print(f"  Initial Labeled: {X_labeled.shape[0]} samples")
print(f"  Validation: {X_val.shape[0]} samples")
print(f"  Unlabeled Pool: {X_unlabeled.shape[0]} samples")
if APPROACH == "combined":
    print("  (No separate test set - unlabeled pool serves as test data)")
else:
    print(f"  Final Test Set: {X_test_final.shape[0]} samples")

print(f"\nClass distribution in initial labeled data: {np.bincount(y_labeled.astype(int))}")

## 4. Define Training Job Function

In [None]:
def train_active_learning_job(model_name, imbalance_method,
                               X_labeled_init, y_labeled_init,
                               X_val, y_val,
                               X_unlabeled, y_unlabeled,
                               X_test_final, y_test_final,
                               config):
    """
    Train one (model, imbalance method) combination with pseudo-labeling.

    Steps: copy the input arrays, optionally resample the initial labeled
    set, fit an initial model, then call ``active_learning_cycle`` for up to
    ``MAX_EPOCHS`` epochs (stopping early once the unlabeled pool is empty),
    and finally evaluate the resulting model.

    Besides its arguments, the function reads these notebook-level names:
    ``RANDOM_SEED``, ``APPLY_IMBALANCE_TO_INITIAL``, ``MAX_EPOCHS``,
    ``USE_DYNAMIC_THRESHOLD``, ``CONFIDENCE_THRESHOLD``, ``TAU_MIN``,
    ``TAU_MAX`` and ``dataset_info``.

    Args:
        model_name: Name passed to ``ModelFactory.create_model``.
        imbalance_method: Name passed to ``ImbalanceHandler``; the string
            ``"none"`` skips resampling.
        X_labeled_init, y_labeled_init: Initial labeled features / targets.
        X_val, y_val: Validation features / targets used for per-epoch metrics.
        X_unlabeled, y_unlabeled: Unlabeled pool and its targets, both passed
            through to ``active_learning_cycle``.
        X_test_final, y_test_final: Held-out test set, or ``None`` when the
            approach has no separate test set.
        config: Full configuration dictionary, passed to ``ModelFactory``.

    Returns:
        Dictionary with keys ``model_name``, ``imbalance_method``,
        ``metrics_history`` (DataFrame, one row per epoch including epoch 0),
        ``final_metrics``, ``eval_set`` (``"test"`` or ``"validation"``),
        ``training_time`` (seconds), ``final_labeled_size`` and
        ``total_epochs`` (number of active-learning epochs actually run).
    """
    job_start = time.time()
    task_type = dataset_info['task_type']

    # Copy data to avoid modifying the arrays shared between jobs
    X_labeled = X_labeled_init.copy()
    y_labeled = y_labeled_init.copy()
    X_unlab = X_unlabeled.copy()
    y_unlab = y_unlabeled.copy()

    # Apply imbalance handling to the initial labeled data only
    if imbalance_method != "none" and APPLY_IMBALANCE_TO_INITIAL:
        imbalance_handler = ImbalanceHandler(imbalance_method, random_state=RANDOM_SEED)
        X_labeled, y_labeled = imbalance_handler.apply(
            X_labeled, y_labeled,
            task_type=task_type
        )

    # Create and train the initial model
    model_factory = ModelFactory(config)
    model = model_factory.create_model(model_name)
    trainer = Trainer(model, model_name, random_state=RANDOM_SEED)
    trainer.train(X_labeled, y_labeled)

    evaluator = Evaluator()

    def _evaluate(X, y):
        # Evaluate whatever model the trainer currently holds
        return evaluator.evaluate_model(trainer.model, X, y, task_type=task_type)

    # Epoch 0: metrics of the initial model, before any pseudo-labeling.
    # 'labeled_size' is the size of the set the model was actually trained on
    # (i.e. after resampling), which is also what later epochs record; using the
    # pre-resampling size here would show resampling as pseudo-label growth.
    initial_val_metrics = _evaluate(X_val, y_val)
    metrics_history = [{
        'epoch': 0,
        'threshold': 0.0,
        'pseudo_labeled': 0,
        'labeled_size': len(X_labeled),
        'unlabeled_remaining': len(X_unlab),
        'val_acc': initial_val_metrics['accuracy'],
        'val_auc': initial_val_metrics.get('auc', 0.0)
    }]

    # Active learning cycles
    for epoch in range(1, MAX_EPOCHS + 1):
        model_updated, X_labeled, y_labeled, X_unlab, y_unlab, epoch_metrics = active_learning_cycle(
            model=trainer.model,
            X_labeled=X_labeled,
            y_labeled=y_labeled,
            X_unlabeled=X_unlab,
            y_unlabeled=y_unlab,
            X_val=X_val,
            y_val=y_val,
            epoch=epoch,
            total_epochs=MAX_EPOCHS,
            use_dynamic_threshold=USE_DYNAMIC_THRESHOLD,
            confidence_threshold=CONFIDENCE_THRESHOLD,
            tau_min=TAU_MIN,
            tau_max=TAU_MAX,
            verbose=False  # Suppress per-job output
        )

        # Continue from the model returned by the cycle
        # NOTE(review): presumably the cycle retrains on the enlarged labeled set - confirm
        trainer.model = model_updated

        # Complete the metrics returned by the cycle with size and validation AUC
        epoch_metrics['labeled_size'] = len(X_labeled)
        epoch_metrics['val_auc'] = _evaluate(X_val, y_val).get('auc', 0.0)
        metrics_history.append(epoch_metrics)

        # Stop if no more unlabeled data
        if len(X_unlab) == 0:
            break

    # Final evaluation
    if X_test_final is not None:
        # Approach 2: evaluate on the separate test set
        final_metrics = _evaluate(X_test_final, y_test_final)
        eval_set = "test"
    else:
        # Approach 1: no separate test set, so the validation set is reused
        # NOTE(review): the split cell says the unlabeled pool serves as test
        # data, yet the validation set is evaluated here - confirm the intent
        final_metrics = _evaluate(X_val, y_val)
        eval_set = "validation"

    job_time = time.time() - job_start

    return {
        'model_name': model_name,
        'imbalance_method': imbalance_method,
        'metrics_history': pd.DataFrame(metrics_history),
        'final_metrics': final_metrics,
        'eval_set': eval_set,
        'training_time': job_time,
        'final_labeled_size': len(X_labeled),
        'total_epochs': len(metrics_history) - 1  # epoch 0 is not a training epoch
    }

print("✓ Training job function defined")

## 5. Run Parallel Active Learning Training

In [None]:
# Create job list: one job per (model, imbalance method) pair
jobs = [
    (model_name, imbalance_method)
    for model_name in ACTIVE_MODELS
    for imbalance_method in IMBALANCE_METHODS
]

print(f"Starting {len(jobs)} active learning training jobs with {N_WORKERS} workers...")
print(f"Models: {ACTIVE_MODELS}")
print(f"Imbalance Methods: {IMBALANCE_METHODS}")
print()

# Run parallel training; `results` keeps the same order as `jobs`,
# which later cells rely on when indexing `results` by position
training_start = time.time()

results = Parallel(n_jobs=N_WORKERS, verbose=10)(
    delayed(train_active_learning_job)(
        model_name, imbalance_method,
        X_labeled, y_labeled,
        X_val, y_val,
        X_unlabeled, y_unlabeled,
        X_test_final, y_test_final,
        config
    )
    for model_name, imbalance_method in jobs
)

total_training_time = time.time() - training_start

# Completion banner (check-mark glyph restored from its mis-encoded form)
print(f"\n{'='*60}")
print(f"✓ All {len(results)} jobs completed in {total_training_time:.2f}s")
print(f"{'='*60}")

## 6. Organize Results

In [None]:
# Create summary DataFrame: one row per job, in the same order as `results`
summary_data = [
    {
        'model': result['model_name'],
        'imbalance': result['imbalance_method'],
        'final_accuracy': result['final_metrics']['accuracy'],
        'final_auc': result['final_metrics'].get('auc', 0.0),
        'final_f1': result['final_metrics'].get('f1', 0.0),
        'final_sensitivity': result['final_metrics'].get('sensitivity', 0.0),
        'final_specificity': result['final_metrics'].get('specificity', 0.0),
        'training_time': result['training_time'],
        'total_epochs': result['total_epochs'],
        'final_labeled_size': result['final_labeled_size'],
        'eval_set': result['eval_set']
    }
    for result in results
]

summary_df = pd.DataFrame(summary_data)
# The index is deliberately NOT reset: each index label is the position of the
# job in `results`, which is how later cells look up the full result.
# A stable sort keeps tied models in job order so the "best" model is reproducible.
summary_df = summary_df.sort_values('final_auc', ascending=False, kind='stable')

print("Top 10 Models by AUC:")
print(summary_df.head(10).to_string(index=False))
print()

# Best model = first row after sorting by AUC
best_result = results[summary_df.index[0]]
best_final_metrics = best_result['final_metrics']
print(f"Best Model: {best_result['model_name']} + {best_result['imbalance_method']}")
# Fall back to 0.0 when AUC was not reported, matching the summary table above
print(f"  Final AUC: {best_final_metrics.get('auc', 0.0):.4f}")
print(f"  Final Accuracy: {best_final_metrics['accuracy']:.4f}")
print(f"  Training Time: {best_result['training_time']:.2f}s")
print(f"  Total Epochs: {best_result['total_epochs']}")
print(f"  Final Labeled Size: {best_result['final_labeled_size']}")

## 7. Visualize Learning Curves

In [None]:
# Plot learning curves for top 5 models (index labels are positions in `results`)
top_5_indices = summary_df.head(5).index

# 2x3 grid: panels 0-4 hold the learning curves, panel 5 the data-growth plot
fig, axes = plt.subplots(2, 3, figsize=(18, 10))
axes = axes.flatten()

for idx, result_idx in enumerate(top_5_indices):
    result = results[result_idx]
    history = result['metrics_history']

    ax = axes[idx]

    # Plot validation accuracy and AUC
    ax.plot(history['epoch'], history['val_acc'], marker='o', label='Val Accuracy', linewidth=2)
    ax.plot(history['epoch'], history['val_auc'], marker='s', label='Val AUC', linewidth=2)

    ax.set_xlabel('Epoch', fontsize=10)
    ax.set_ylabel('Score', fontsize=10)
    ax.set_title(f"{result['model_name']}\n({result['imbalance_method']})", fontsize=11, fontweight='bold')
    ax.legend(fontsize=9)
    ax.grid(True, alpha=0.3)
    ax.set_ylim([0, 1.05])

# With fewer than five results, hide the learning-curve panels left unused
for spare_ax in axes[len(top_5_indices):5]:
    spare_ax.set_visible(False)

# Plot pseudo-label growth for best model
best_result = results[summary_df.index[0]]
best_history = best_result['metrics_history']

ax = axes[5]
ax2 = ax.twinx()  # second y-axis sharing the epoch axis

# Labeled size
labeled_line, = ax.plot(best_history['epoch'], best_history['labeled_size'],
                        marker='o', color='green', linewidth=2, label='Labeled Size')
ax.set_xlabel('Epoch', fontsize=10)
ax.set_ylabel('Labeled Size', fontsize=10, color='green')
ax.tick_params(axis='y', labelcolor='green')

# Unlabeled remaining
unlabeled_line, = ax2.plot(best_history['epoch'], best_history['unlabeled_remaining'],
                           marker='s', color='red', linewidth=2, label='Unlabeled Remaining')
ax2.set_ylabel('Unlabeled Remaining', fontsize=10, color='red')
ax2.tick_params(axis='y', labelcolor='red')

# One legend for both y-axes; drawn on ax2 because it is rendered on top of ax
ax2.legend(handles=[labeled_line, unlabeled_line], fontsize=9, loc='center right')

ax.set_title(f"Best Model: {best_result['model_name']}\nData Growth",
             fontsize=11, fontweight='bold')
ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print("✓ Learning curves plotted")

## 8. Compare Final Performance

In [None]:
# Compare models by imbalance method
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

metrics_to_plot = ['final_auc', 'final_accuracy', 'final_f1']
titles = ['AUC', 'Accuracy', 'F1-Score']

# Give every model a fixed x position. summary_df is sorted by AUC, so using the
# row position inside each imbalance subset would place different models at the
# same x for different imbalance methods.
model_positions = {name: pos for pos, name in enumerate(ACTIVE_MODELS)}

for idx, (metric, title) in enumerate(zip(metrics_to_plot, titles)):
    ax = axes[idx]

    # One scatter series per imbalance method
    for imbalance in IMBALANCE_METHODS:
        subset = summary_df[summary_df['imbalance'] == imbalance]
        ax.scatter(subset['model'].map(model_positions), subset[metric],
                   label=imbalance, alpha=0.7, s=100)

    ax.set_xticks(range(len(ACTIVE_MODELS)))
    ax.set_xticklabels(ACTIVE_MODELS, rotation=45, ha='right', fontsize=9)
    ax.set_xlabel('Model', fontsize=11)
    ax.set_ylabel(title, fontsize=11)
    ax.set_title(f'Final {title} Comparison', fontsize=12, fontweight='bold')
    ax.legend(fontsize=9)
    ax.grid(True, alpha=0.3)
    ax.set_ylim([0, 1.05])

plt.tight_layout()
plt.show()

# Bar plot of top 10 models
fig, ax = plt.subplots(figsize=(12, 6))

top_10 = summary_df.head(10).copy()  # copy so the label column is not added to summary_df
top_10['label'] = top_10['model'] + '\n(' + top_10['imbalance'] + ')'

x_pos = np.arange(len(top_10))
ax.bar(x_pos, top_10['final_auc'], alpha=0.7, color='steelblue')
ax.set_xticks(x_pos)
ax.set_xticklabels(top_10['label'], rotation=45, ha='right', fontsize=9)
ax.set_ylabel('AUC', fontsize=11)
ax.set_title('Top 10 Models by AUC', fontsize=12, fontweight='bold')
ax.grid(True, alpha=0.3, axis='y')
ax.set_ylim([0, 1.05])

plt.tight_layout()
plt.show()

print("✓ Performance comparison plotted")

## 9. Save Results

In [None]:
# Create a timestamped results directory so repeated runs never overwrite each other
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
results_dir = f"results/active_learning_{APPROACH}_{timestamp}"
os.makedirs(results_dir, exist_ok=True)

# Save summary (status glyphs below restored from their mis-encoded form)
summary_df.to_csv(f"{results_dir}/summary.csv", index=False)
print(f"✓ Summary saved to {results_dir}/summary.csv")

# Save detailed metrics history for each model, one CSV per job
for result in results:
    model_name = result['model_name']
    imbalance = result['imbalance_method']
    filename = f"{results_dir}/history_{model_name}_{imbalance}.csv"
    result['metrics_history'].to_csv(filename, index=False)

print(f"✓ {len(results)} detailed history files saved")

# Save experiment configuration together with the realised partition sizes
config_summary = {
    'approach': APPROACH,
    'labeled_ratio': LABELED_RATIO,
    'validation_ratio': VALIDATION_RATIO,
    'confidence_threshold': CONFIDENCE_THRESHOLD,
    'use_dynamic_threshold': USE_DYNAMIC_THRESHOLD,
    'tau_min': TAU_MIN,
    'tau_max': TAU_MAX,
    'max_epochs': MAX_EPOCHS,
    'initial_labeled': len(X_labeled),
    'validation_size': len(X_val),
    'unlabeled_pool': len(X_unlabeled),
    # 0 when the approach has no separate test set
    'test_size': len(X_test_final) if X_test_final is not None else 0,
    'total_training_time': total_training_time,
    'n_models': len(ACTIVE_MODELS),
    'n_imbalance_methods': len(IMBALANCE_METHODS),
    'n_workers': N_WORKERS
}

with open(f"{results_dir}/config_summary.yaml", 'w') as f:
    yaml.dump(config_summary, f, default_flow_style=False)

print(f"✓ Configuration saved to {results_dir}/config_summary.yaml")
print()
print(f"All results saved to: {results_dir}/")

## 10. Summary Report

In [None]:
# Plain-text report of the experiment set-up and the best results
print("="*70)
print(" ACTIVE LEARNING TRAINING SUMMARY")
print("="*70)
print()
print(f"Approach: {APPROACH}")
print(f"  - {'Combined train+test split' if APPROACH == 'combined' else 'Train-only split with separate test set'}")
print()
print("Dataset Configuration:")
print(f"  Initial Labeled: {len(X_labeled)} samples ({LABELED_RATIO:.1%})")
print(f"  Validation: {len(X_val)} samples ({VALIDATION_RATIO:.1%} of labeled)")
print(f"  Unlabeled Pool: {len(X_unlabeled)} samples")
if X_test_final is not None:
    print(f"  Final Test Set: {len(X_test_final)} samples")
print()
print("Active Learning Configuration:")
print(f"  Max Epochs: {MAX_EPOCHS}")
print(f"  Dynamic Threshold: {USE_DYNAMIC_THRESHOLD}")
if USE_DYNAMIC_THRESHOLD:
    # Arrow glyph restored from its mis-encoded form
    print(f"    - Range: {TAU_MIN:.2f} → {TAU_MAX:.2f}")
else:
    print(f"    - Static: {CONFIDENCE_THRESHOLD:.2f}")
print(f"  Imbalance on Initial: {APPLY_IMBALANCE_TO_INITIAL}")
print()
print("Training Configuration:")
print(f"  Models Tested: {len(ACTIVE_MODELS)}")
print(f"  Imbalance Methods: {len(IMBALANCE_METHODS)}")
print(f"  Total Jobs: {len(results)}")
print(f"  Parallel Workers: {N_WORKERS}")
print(f"  Total Training Time: {total_training_time:.2f}s ({total_training_time/60:.1f}m)")
print()
print("Top 5 Models:")
# Every value printed here comes from the summary row itself
for i, (_, row) in enumerate(summary_df.head(5).iterrows(), 1):
    print(f"  {i}. {row['model']:20s} + {row['imbalance']:15s} | "
          f"AUC: {row['final_auc']:.4f} | Acc: {row['final_accuracy']:.4f} | "
          f"F1: {row['final_f1']:.4f} | Time: {row['training_time']:.1f}s")
print()
print("Best Model Details:")
# summary_df keeps the original job positions as index labels
best_result = results[summary_df.index[0]]
print(f"  Model: {best_result['model_name']}")
print(f"  Imbalance Method: {best_result['imbalance_method']}")
print(f"  Final Metrics (on {best_result['eval_set']} set):")
for metric, value in best_result['final_metrics'].items():
    # Print scalar metrics only; NumPy scalars are accepted as well, since types
    # such as np.float32 / np.int64 are not instances of the builtin float / int
    if isinstance(value, (int, float, np.integer, np.floating)):
        print(f"    - {metric}: {value:.4f}")
print("  Training Statistics:")
print(f"    - Total Epochs: {best_result['total_epochs']}")
print(f"    - Final Labeled Size: {best_result['final_labeled_size']}")
print(f"    - Training Time: {best_result['training_time']:.2f}s")
print()
print(f"Results saved to: {results_dir}/")
print("="*70)