# 6.1 Hyperparameter Tuning with Ray Tune

## Learning Objectives
- Understand hyperparameter search strategies
- Use Ray Tune for automated tuning
- Apply schedulers and search algorithms
- Implement Population Based Training (PBT)

In [None]:
import ray
from ray import tune
from ray.tune.schedulers import ASHAScheduler, PopulationBasedTraining
from ray.tune.search.optuna import OptunaSearch
from ray.tune.search.hyperopt import HyperOptSearch
from ray.rllib.algorithms.ppo import PPOConfig
import numpy as np
import matplotlib.pyplot as plt

# Start (or attach to) Ray for this notebook. ignore_reinit_error=True keeps
# a re-run of this cell from raising when Ray is already initialized.
ray.init(ignore_reinit_error=True)

## Why Hyperparameter Tuning?

RL algorithms are sensitive to hyperparameters:

| Hyperparameter | Too Low | Too High |
|----------------|---------|----------|
| Learning rate | Slow learning | Unstable training |
| Batch size | High variance | Slow updates |
| Discount factor (γ) | Myopic behavior | Slow credit assignment |
| Entropy coefficient | Premature convergence | Random behavior |
| Clip parameter (PPO) | Conservative updates | Large policy changes |

## Basic Hyperparameter Search

In [None]:
# Search space: one Ray Tune sampler per PPO hyperparameter
search_space = dict(
    lr=tune.loguniform(1e-5, 1e-3),                    # log scale: spans orders of magnitude
    gamma=tune.uniform(0.9, 0.999),                    # linear scale over a bounded range
    clip_param=tune.choice([0.1, 0.2, 0.3]),           # discrete candidates
    entropy_coeff=tune.loguniform(1e-4, 1e-2),         # log scale, like the learning rate
    train_batch_size=tune.choice([2000, 4000, 8000]),  # discrete candidates
)

print("Search space defined")
print(f"Learning rate range: {1e-5} to {1e-3}")

In [None]:
# PPO config whose training hyperparameters are Ray Tune samplers
def create_tunable_config():
    """Return a PPO config for CartPole-v1 with tunable training settings.

    The environment, framework ("torch") and number of env runners (2) are
    fixed. Every training argument except ``sgd_minibatch_size`` is a Ray
    Tune sampler, so the returned config is meant to be passed to a
    ``tune.Tuner`` as its ``param_space``.
    """
    base = (
        PPOConfig()
        .environment("CartPole-v1")
        .framework("torch")
        .env_runners(num_env_runners=2)
    )

    training_kwargs = dict(
        lr=tune.loguniform(1e-5, 1e-3),
        gamma=tune.uniform(0.9, 0.999),
        clip_param=tune.choice([0.1, 0.2, 0.3]),
        entropy_coeff=tune.loguniform(1e-4, 1e-2),
        train_batch_size=tune.choice([2000, 4000]),
        sgd_minibatch_size=128,  # the only fixed training setting
        num_sgd_iter=tune.choice([5, 10, 20]),
    )
    return base.training(**training_kwargs)


config = create_tunable_config()
print("Tunable config created")

## ASHA Scheduler (Aggressive Early Stopping)

ASHA (Asynchronous Successive Halving Algorithm) stops poorly performing trials early so that resources go to the promising ones.

In [None]:
# ASHA: early-stops weak trials, ranking them by mean episode reward.
# NOTE(review): this metric key depends on the RLlib API stack in use; the
# new API stack reports "env_runners/episode_return_mean" instead -- confirm
# against the installed Ray version.
asha_scheduler = ASHAScheduler(
    metric="env_runners/episode_reward_mean",
    mode="max",
    max_t=50,            # upper bound on iterations per trial
    grace_period=10,     # iterations every trial gets before it can be stopped
    reduction_factor=2,  # halving factor
)

# Search budget: 8 trials in total, at most 4 running at the same time
asha_tune_config = tune.TuneConfig(
    scheduler=asha_scheduler,
    num_samples=8,
    max_concurrent_trials=4,
)

# Hard stop at 50 iterations; save one checkpoint when a trial finishes
asha_run_config = tune.RunConfig(
    stop={"training_iteration": 50},
    checkpoint_config=tune.CheckpointConfig(checkpoint_at_end=True),
)

tuner = tune.Tuner(
    "PPO",
    param_space=config,
    tune_config=asha_tune_config,
    run_config=asha_run_config,
)

# results = tuner.fit()  # Uncomment to run
print("ASHA tuner configured (uncomment to run)")

## Bayesian Optimization with Optuna

In [None]:
# Optuna proposes each new trial's hyperparameters, maximizing mean reward
optuna_search = OptunaSearch(metric="env_runners/episode_reward_mean", mode="max")

# Optuna picks what to try; ASHA (defined above) decides what to stop early
optuna_tune_config = tune.TuneConfig(
    search_alg=optuna_search,
    scheduler=asha_scheduler,
    num_samples=16,  # total number of trials
)
optuna_run_config = tune.RunConfig(stop={"training_iteration": 50})

tuner_optuna = tune.Tuner(
    "PPO",
    param_space=config,
    tune_config=optuna_tune_config,
    run_config=optuna_run_config,
)

print("Optuna + ASHA tuner configured")

## Population Based Training (PBT)

PBT combines hyperparameter tuning with training:
- Trains a population of agents in parallel
- Periodically, low performers copy the weights and hyperparameters of top performers
- Mutates hyperparameters to explore

In [None]:
# Hyperparameters PBT is allowed to change while training is running
pbt_mutations = {
    "lr": tune.loguniform(1e-5, 1e-3),
    "entropy_coeff": tune.loguniform(1e-4, 1e-2),
    "clip_param": [0.1, 0.2, 0.3],
}

# PBT scheduler: compares trials every 5 training iterations
pbt_scheduler = PopulationBasedTraining(
    time_attr="training_iteration",
    metric="env_runners/episode_reward_mean",
    mode="max",
    perturbation_interval=5,
    hyperparam_mutations=pbt_mutations,
    quantile_fraction=0.25,     # bottom 25% exploit the top 25%
    resample_probability=0.25,  # 25% chance to resample instead of perturb
)

# Every population member starts from the same fixed PPO values here;
# the mutations above are what make them diverge during training.
pbt_training_kwargs = dict(
    lr=3e-4,
    gamma=0.99,
    clip_param=0.2,
    entropy_coeff=0.01,
    train_batch_size=4000,
)
pbt_config = (
    PPOConfig()
    .environment("CartPole-v1")
    .framework("torch")
    .env_runners(num_env_runners=2)
    .training(**pbt_training_kwargs)
)

# Checkpoint every 5 iterations, the same value as perturbation_interval
pbt_run_config = tune.RunConfig(
    stop={"training_iteration": 50},
    checkpoint_config=tune.CheckpointConfig(checkpoint_frequency=5),
)

tuner_pbt = tune.Tuner(
    "PPO",
    param_space=pbt_config,
    tune_config=tune.TuneConfig(
        scheduler=pbt_scheduler,
        num_samples=8,  # population size
    ),
    run_config=pbt_run_config,
)

print("PBT tuner configured")

## Analyzing Tuning Results

In [None]:
def analyze_results(results):
    """Print the best trial and plot each hyperparameter against reward.

    Args:
        results: The object returned by ``Tuner.fit()``. It must provide
            ``get_best_result(metric=..., mode=...)`` and ``get_dataframe()``.

    Returns:
        The best result according to ``env_runners/episode_reward_mean``
        (higher is better).

    Side effects:
        Prints the best trial's reward and config, and shows a 2x2 grid of
        scatter plots (lr, gamma, clip_param, entropy_coeff vs. reward).
        A panel is hidden when its data is not present in the dataframe.
    """
    metric = "env_runners/episode_reward_mean"

    # Get best result
    best_result = results.get_best_result(metric=metric, mode="max")

    print("Best Trial:")
    print(f"  Reward: {best_result.metrics['env_runners']['episode_reward_mean']:.2f}")
    print(f"  Config: {best_result.config}")

    # One row per trial; nested result/config dicts are flattened with "/"
    df = results.get_dataframe()

    # Scatter each hyperparameter against the final reward
    _fig, axes = plt.subplots(2, 2, figsize=(12, 10))

    params = ['lr', 'gamma', 'clip_param', 'entropy_coeff']
    for ax, param in zip(axes.flat, params):
        # Trial config values live under "config/<name>" in the dataframe.
        # The bare name is kept as a fallback for frames built another way.
        column = next(
            (name for name in (f"config/{param}", param) if name in df.columns),
            None,
        )
        if column is None or metric not in df.columns:
            # Nothing to plot: hide the panel instead of leaving empty axes.
            ax.set_visible(False)
            continue

        ax.scatter(df[column], df[metric])
        ax.set_xlabel(param)
        ax.set_ylabel('Reward')
        ax.set_title(f'{param} vs Reward')
        if param == 'lr':
            ax.set_xscale('log')

    plt.tight_layout()
    plt.show()

    return best_result

# Example usage (with actual results):
# best = analyze_results(results)
print("Analysis function defined")

## Running a Complete Tuning Example

In [None]:
# Quick demo: a 3-point learning-rate grid, 10 iterations per trial
quick_training_kwargs = dict(
    lr=tune.grid_search([1e-4, 3e-4, 1e-3]),  # every value gets its own trial
    train_batch_size=2000,
)
quick_config = (
    PPOConfig()
    .environment("CartPole-v1")
    .framework("torch")
    .env_runners(num_env_runners=1)
    .training(**quick_training_kwargs)
)

# Build the tuner for the grid search
quick_run_config = tune.RunConfig(stop={"training_iteration": 10}, verbose=1)
tuner = tune.Tuner("PPO", param_space=quick_config, run_config=quick_run_config)

print("Running quick grid search...")
results = tuner.fit()

# Report the trial with the highest mean episode reward
best = results.get_best_result(metric="env_runners/episode_reward_mean", mode="max")
print(f"\nBest learning rate: {best.config['lr']}")
print(f"Best reward: {best.metrics['env_runners']['episode_reward_mean']:.2f}")

## Best Practices for RL Hyperparameter Tuning

### 1. Start with known good defaults
```python
# PPO defaults that often work
lr = 3e-4
gamma = 0.99
clip_param = 0.2
entropy_coeff = 0.01
```

### 2. Tune learning rate first (most impactful)

### 3. Use log-uniform for rates/coefficients

### 4. PBT is best for RL (adapts during training)

### 5. Always set reasonable stopping criteria

In [None]:
# Production-ready tuning template
def production_tune(env_name, num_samples=32, max_iters=100):
    """Build (but do not run) a PBT-scheduled PPO tuner for ``env_name``.

    Args:
        env_name: Environment identifier passed to ``PPOConfig.environment``.
        num_samples: Number of trials passed to ``tune.TuneConfig``.
        max_iters: Value of the ``training_iteration`` stop criterion.

    Returns:
        A configured ``tune.Tuner``; the caller is expected to call ``fit()``.
    """
    reward_metric = "env_runners/episode_reward_mean"
    # Used for both the PBT perturbation interval and the checkpoint frequency
    interval = 10

    # Initial values for every trial are drawn from these samplers
    sampled_training = dict(
        lr=tune.loguniform(1e-5, 1e-3),
        gamma=tune.uniform(0.95, 0.999),
        clip_param=tune.uniform(0.1, 0.3),
        entropy_coeff=tune.loguniform(1e-4, 1e-1),
        vf_loss_coeff=tune.uniform(0.5, 1.0),
        train_batch_size=tune.choice([4000, 8000, 16000]),
        sgd_minibatch_size=tune.choice([128, 256, 512]),
        num_sgd_iter=tune.choice([5, 10, 20, 30]),
        lambda_=tune.uniform(0.9, 1.0),
    )
    config = (
        PPOConfig()
        .environment(env_name)
        .framework("torch")
        .env_runners(num_env_runners=4)
        .training(**sampled_training)
    )

    # Only lr and entropy_coeff are mutated by PBT during training
    scheduler = PopulationBasedTraining(
        time_attr="training_iteration",
        metric=reward_metric,
        mode="max",
        perturbation_interval=interval,
        hyperparam_mutations={
            "lr": tune.loguniform(1e-5, 1e-3),
            "entropy_coeff": tune.loguniform(1e-4, 1e-1),
        },
    )

    tune_config = tune.TuneConfig(scheduler=scheduler, num_samples=num_samples)
    run_config = tune.RunConfig(
        stop={"training_iteration": max_iters},
        checkpoint_config=tune.CheckpointConfig(
            checkpoint_frequency=interval,
            checkpoint_at_end=True,
        ),
    )

    return tune.Tuner(
        "PPO",
        param_space=config,
        tune_config=tune_config,
        run_config=run_config,
    )

print("Production tuning template defined")

## Key Takeaways

1. **ASHA** is efficient for quick exploration with early stopping

2. **Optuna/HyperOpt** provide intelligent Bayesian search

3. **PBT** is ideal for RL (adapts hyperparameters during training)

4. **Learning rate** is usually the most important hyperparameter

## Next Steps

In the next section, we'll cover production deployment strategies.

In [None]:
# Shut down the Ray session that ray.init() started at the top of this notebook.
ray.shutdown()