# 4.1 End-to-End Training Workflows

**Prerequisites**: Complete modules 00-03

## Learning Objectives
- Structure a complete RL training pipeline
- Implement proper experiment tracking
- Handle checkpointing and resumption
- Evaluate and compare trained policies

```
┌─────────────────────────────────────────────────────────────────────────────┐
│                        COMPLETE TRAINING WORKFLOW                           │
├─────────────────────────────────────────────────────────────────────────────┤
│                                                                             │
│   1. SETUP          2. TRAIN           3. EVALUATE       4. DEPLOY         │
│   ┌─────────┐       ┌─────────┐       ┌─────────┐       ┌─────────┐        │
│   │ Config  │       │ Loop    │       │ Test    │       │ Export  │        │
│   │ Env     │  ──>  │ Log     │  ──>  │ Compare │  ──>  │ Serve   │        │
│   │ Algo    │       │ Save    │       │Visualize│       │ Monitor │        │
│   └─────────┘       └─────────┘       └─────────┘       └─────────┘        │
│                                                                             │
└─────────────────────────────────────────────────────────────────────────────┘
```

In [None]:
import ray
from ray.rllib.algorithms.ppo import PPOConfig
from ray.rllib.algorithms.algorithm import Algorithm
from ray import tune
import gymnasium as gym
import numpy as np
import json
import os
from pathlib import Path
from datetime import datetime

# Start a small local Ray runtime sized for a laptop (e.g. an Apple M1):
# four CPUs and a 1 GiB object store. ignore_reinit_error=True is passed so
# this cell can be re-run in a session where Ray is already initialized.
ray.init(
    ignore_reinit_error=True,
    object_store_memory=2 ** 30,  # 1 GiB, in bytes
    num_cpus=4,
)

# Report the resources Ray actually registered.
print(f"Ray initialized: {ray.cluster_resources()}")

## 1. Project Structure

A well-organized RL project:

```
my_rl_project/
├── configs/                 # Algorithm configurations
│   ├── ppo_cartpole.py
│   └── sac_pendulum.py
├── envs/                    # Custom environments
│   └── my_env.py
├── scripts/
│   ├── train.py            # Training script
│   ├── evaluate.py         # Evaluation script
│   └── serve.py            # Serving script
├── checkpoints/            # Saved models
├── results/                # Logs and metrics
└── requirements.txt
```

## 2. Configuration Management

Keep configs separate and version-controlled:

In [None]:
# Single source of truth for this experiment. It is a plain nested dict so it
# is easy to save, load, and modify without touching any RLlib objects.
EXPERIMENT_CONFIG = dict(
    name="cartpole_ppo_v1",
    env="CartPole-v1",
    algorithm="PPO",
    # Training settings
    training=dict(
        lr=3e-4,
        gamma=0.99,
        train_batch_size=4000,
        sgd_minibatch_size=128,
        num_sgd_iter=10,
    ),
    # Network architecture
    model=dict(
        fcnet_hiddens=[64, 64],
        fcnet_activation="tanh",
    ),
    # Workers
    num_workers=2,
    # Stopping criteria
    stop=dict(
        episode_reward_mean=475,
        training_iteration=100,
    ),
)

def config_to_rllib(exp_config):
    """Build an RLlib ``PPOConfig`` from an experiment-config dictionary.

    Reads ``env``, ``num_workers``, ``model`` and the ``training``
    sub-dictionary (``lr``, ``gamma``, ``train_batch_size``,
    ``sgd_minibatch_size``, ``num_sgd_iter``) from ``exp_config``.

    Returns the configured ``PPOConfig``; the caller is expected to call
    ``.build()`` on it.
    """
    # Environment, framework and rollout workers.
    ppo = PPOConfig().environment(exp_config["env"])
    ppo = ppo.framework("torch")
    ppo = ppo.env_runners(num_env_runners=exp_config["num_workers"])

    # Optimisation hyperparameters plus the network architecture.
    # NOTE(review): these keyword names must match the installed Ray
    # release's PPOConfig.training() signature - confirm when upgrading Ray.
    hyperparams = exp_config["training"]
    training_kwargs = {
        "lr": hyperparams["lr"],
        "gamma": hyperparams["gamma"],
        "train_batch_size": hyperparams["train_batch_size"],
        "sgd_minibatch_size": hyperparams["sgd_minibatch_size"],
        "num_sgd_iter": hyperparams["num_sgd_iter"],
        "model": exp_config["model"],
    }
    return ppo.training(**training_kwargs)

# Show which experiment and environment the config describes.
print(
    f"Experiment: {EXPERIMENT_CONFIG['name']}",
    f"Environment: {EXPERIMENT_CONFIG['env']}",
    sep="\n",
)

## 3. Training with Proper Logging

```
┌─────────────────────────────────────────────────────────────────────────────┐
│                            TRAINING LOOP                                    │
├─────────────────────────────────────────────────────────────────────────────┤
│                                                                             │
│   for iteration in range(max_iters):                                        │
│       │                                                                     │
│       ├──> result = algo.train()                                            │
│       │                                                                     │
│       ├──> Log metrics (reward, loss, entropy)                              │
│       │                                                                     │
│       ├──> Check stopping criteria                                          │
│       │       └──> if solved: break                                         │
│       │                                                                     │
│       └──> Checkpoint (every N iterations)                                  │
│               └──> algo.save(checkpoint_dir)                                │
│                                                                             │
└─────────────────────────────────────────────────────────────────────────────┘
```

In [None]:
class TrainingTracker:
    """Track and log training metrics for one experiment.

    Per-iteration metrics are collected in ``self.metrics`` and written to
    ``<log_dir>/<experiment_name>/metrics.json`` by :meth:`save_metrics`.
    The tracker also remembers the best reward seen so far and the
    checkpoint recorded for it.

    Args:
        experiment_name: Name of the experiment; also the name of the
            sub-directory created under ``log_dir``.
        log_dir: Parent directory for the experiment's outputs.
    """

    def __init__(self, experiment_name, log_dir="./results"):
        self.experiment_name = experiment_name
        self.log_dir = Path(log_dir) / experiment_name
        # Create the output directory up front so later writes do not fail
        # on a missing folder.
        self.log_dir.mkdir(parents=True, exist_ok=True)

        self.metrics = []
        self.best_reward = -float('inf')
        self.best_checkpoint = None

    @staticmethod
    def _to_builtin(value):
        """Return ``value`` as a built-in Python scalar if it is a NumPy one.

        ``json.dump`` cannot serialise NumPy scalars such as ``np.float32``
        or ``np.int64``, so they are converted before being stored.
        Anything else is returned unchanged.
        """
        return value.item() if isinstance(value, np.generic) else value

    def log(self, iteration, result):
        """Record the metrics of one training iteration.

        Args:
            iteration: Index of the iteration being logged.
            result: Training result dictionary. Must contain an
                ``"env_runners"`` dictionary with ``episode_reward_mean``,
                ``episode_reward_max``, ``episode_reward_min`` and
                ``num_episodes``; ``timesteps_total`` is optional and
                defaults to 0.

        Returns:
            The metrics dictionary that was appended to ``self.metrics``.

        Raises:
            KeyError: If a required key is missing from ``result``.
        """
        to_builtin = self._to_builtin
        runner_stats = result["env_runners"]
        metrics = {
            "iteration": iteration,
            "timestamp": datetime.now().isoformat(),
            "episode_reward_mean": to_builtin(runner_stats["episode_reward_mean"]),
            "episode_reward_max": to_builtin(runner_stats["episode_reward_max"]),
            "episode_reward_min": to_builtin(runner_stats["episode_reward_min"]),
            "episodes_total": to_builtin(runner_stats["num_episodes"]),
            "timesteps_total": to_builtin(result.get("timesteps_total", 0)),
        }
        self.metrics.append(metrics)
        return metrics

    def is_best(self, reward):
        """Return True and update ``best_reward`` if ``reward`` beats it.

        The comparison is strict, so an equal reward (or NaN) is not
        treated as a new best.
        """
        if reward > self.best_reward:
            self.best_reward = reward
            return True
        return False

    def save_checkpoint(self, algo, checkpoint_path):
        """Record ``checkpoint_path`` as the best checkpoint.

        This method only stores the path; it does not write anything to
        disk. ``algo`` is accepted as part of the signature but is not used.
        """
        self.best_checkpoint = checkpoint_path

    def save_metrics(self):
        """Write all logged metrics to ``metrics.json`` in the log directory."""
        metrics_file = self.log_dir / "metrics.json"
        with open(metrics_file, 'w', encoding="utf-8") as f:
            json.dump(self.metrics, f, indent=2)
        print(f"Metrics saved to {metrics_file}")

    def summary(self):
        """Print a short summary of the tracked training run."""
        print("\n" + "="*60)
        print("TRAINING SUMMARY")
        print("="*60)
        print(f"Experiment: {self.experiment_name}")
        print(f"Total iterations: {len(self.metrics)}")
        print(f"Best reward: {self.best_reward:.2f}")
        print(f"Best checkpoint: {self.best_checkpoint}")
        print("="*60)

In [None]:
def _checkpoint_path(save_result):
    """Reduce the return value of ``algo.save()`` to a checkpoint location.

    Accepts a plain path, a checkpoint object exposing ``.path``, or a
    result object exposing ``.checkpoint.path``, and returns the path in
    each case.
    """
    checkpoint = getattr(save_result, "checkpoint", save_result)
    return getattr(checkpoint, "path", checkpoint)


def train(config, max_iterations=50, checkpoint_freq=10):
    """Run the complete training workflow for one experiment config.

    Builds the algorithm from ``config``, trains it for up to
    ``max_iterations`` iterations and logs metrics every iteration. A
    "best" checkpoint is saved whenever the mean episode reward improves,
    and a periodic checkpoint is saved every ``checkpoint_freq`` iterations
    into a separate directory so it never overwrites the best one.

    Args:
        config: Experiment dictionary. Must contain ``name`` and ``stop``
            (with ``episode_reward_mean`` and ``training_iteration``) plus
            everything ``config_to_rllib`` reads.
        max_iterations: Upper bound on training iterations for this call.
        checkpoint_freq: Save a periodic checkpoint every this many
            iterations; ``0`` or ``None`` disables periodic checkpoints.

    Returns:
        Tuple ``(algo, tracker)``. The algorithm is returned still running;
        the caller is responsible for calling ``algo.stop()``.
    """

    # Setup
    rllib_config = config_to_rllib(config)
    algo = rllib_config.build()
    tracker = TrainingTracker(config["name"])

    # Plain absolute strings: independent of the working directory when the
    # checkpoint is restored later.
    best_dir = str((tracker.log_dir / "checkpoints").resolve())
    periodic_dir = str((tracker.log_dir / "checkpoints_periodic").resolve())

    print(f"Starting training: {config['name']}")
    print(f"Stop criteria: {config['stop']}")
    print("-" * 60)

    try:
        # Training loop
        for i in range(max_iterations):
            # Train
            result = algo.train()

            # Log
            metrics = tracker.log(i, result)
            reward = metrics["episode_reward_mean"]

            # Save a checkpoint whenever the reward improves.
            marker = ""
            if tracker.is_best(reward):
                marker = " ** NEW BEST **"
                save_result = algo.save(best_dir)
                tracker.save_checkpoint(algo, _checkpoint_path(save_result))

            # Periodic checkpoint, so a crash loses at most
            # ``checkpoint_freq`` iterations of work.
            if checkpoint_freq and (i + 1) % checkpoint_freq == 0:
                algo.save(periodic_dir)

            # Print progress
            if (i + 1) % 5 == 0 or marker:
                print(f"Iter {i+1:3d} | Reward: {reward:7.2f} | Episodes: {metrics['episodes_total']:5d}{marker}")

            # Check stopping criteria
            if reward >= config["stop"]["episode_reward_mean"]:
                print(f"\nSolved at iteration {i+1}!")
                break

            if i + 1 >= config["stop"]["training_iteration"]:
                print("\nReached max iterations.")
                break
    except BaseException:
        # The algorithm is not handed back to the caller on failure, so
        # release its resources here before re-raising.
        algo.stop()
        raise
    finally:
        # Persist whatever was logged, even if training was interrupted.
        tracker.save_metrics()

    tracker.summary()

    return algo, tracker

# Run training
algo, tracker = train(EXPERIMENT_CONFIG, max_iterations=30)

## 4. Evaluation

Proper evaluation of trained policies:

In [None]:
def evaluate_policy(algo, env_name, num_episodes=10, render=False, explore=None):
    """Evaluate a trained policy over several full episodes.

    Args:
        algo: Trained algorithm exposing ``compute_single_action``.
        env_name: Gymnasium environment id passed to ``gym.make``.
        num_episodes: Number of episodes to run; must be at least 1.
        render: If True, the environment is created with
            ``render_mode="human"``.
        explore: Forwarded to ``algo.compute_single_action``. ``None``
            (the default) leaves the choice to the algorithm's own
            configuration; pass ``False`` to request non-exploratory
            actions during evaluation.

    Returns:
        Dictionary with ``mean_reward``, ``std_reward``, ``min_reward``,
        ``max_reward``, ``mean_length`` and ``episodes`` (the list of
        per-episode total rewards).

    Raises:
        ValueError: If ``num_episodes`` is less than 1.
    """
    # Fail early with a clear message instead of deep inside NumPy's
    # reductions over an empty list.
    if num_episodes < 1:
        raise ValueError("num_episodes must be at least 1")

    env = gym.make(env_name, render_mode="human" if render else None)

    episode_rewards = []
    episode_lengths = []

    try:
        for _ in range(num_episodes):
            obs, _info = env.reset()
            total_reward = 0
            steps = 0
            done = False

            while not done:
                action = algo.compute_single_action(obs, explore=explore)
                obs, reward, terminated, truncated, _info = env.step(action)
                total_reward += reward
                steps += 1
                # An episode ends on a terminal state or on truncation.
                done = terminated or truncated

            episode_rewards.append(total_reward)
            episode_lengths.append(steps)
    finally:
        # Release the environment (and any render window) even if an
        # episode raised.
        env.close()

    results = {
        "mean_reward": np.mean(episode_rewards),
        "std_reward": np.std(episode_rewards),
        "min_reward": np.min(episode_rewards),
        "max_reward": np.max(episode_rewards),
        "mean_length": np.mean(episode_lengths),
        "episodes": episode_rewards,
    }

    return results

# Run 20 evaluation episodes with the trained algorithm.
eval_results = evaluate_policy(algo, EXPERIMENT_CONFIG["env"], num_episodes=20)

# Print the aggregate statistics as a small report.
print(
    "\nEVALUATION RESULTS",
    "=" * 40,
    f"Mean Reward:  {eval_results['mean_reward']:.2f} +/- {eval_results['std_reward']:.2f}",
    f"Min Reward:   {eval_results['min_reward']:.2f}",
    f"Max Reward:   {eval_results['max_reward']:.2f}",
    f"Mean Length:  {eval_results['mean_length']:.1f} steps",
    sep="\n",
)

## 5. Checkpointing and Resumption

```
┌─────────────────────────────────────────────────────────────────────────────┐
│                          CHECKPOINT WORKFLOW                                │
├─────────────────────────────────────────────────────────────────────────────┤
│                                                                             │
│   Training crashes at iteration 50                                          │
│              │                                                              │
│              v                                                              │
│   checkpoint_iter_45/  <-- Last saved checkpoint                            │
│              │                                                              │
│              v                                                              │
│   algo = Algorithm.from_checkpoint(path)  <-- Resume!                       │
│              │                                                              │
│              v                                                              │
│   Continue from iteration 46                                                │
│                                                                             │
└─────────────────────────────────────────────────────────────────────────────┘
```

In [None]:
# Save checkpoint. An absolute directory keeps the location valid no matter
# what the working directory is when the checkpoint is restored.
save_result = algo.save(os.path.abspath("./checkpoints/my_model"))

# Recent Ray releases return a result object wrapping the checkpoint rather
# than a plain path. Reduce either form to the path, which is what
# Algorithm.from_checkpoint() expects.
_saved_checkpoint = getattr(save_result, "checkpoint", save_result)
checkpoint_path = getattr(_saved_checkpoint, "path", _saved_checkpoint)
print(f"Saved to: {checkpoint_path}")

# Stop the algorithm
algo.stop()

# Later: restore from checkpoint
restored_algo = Algorithm.from_checkpoint(checkpoint_path)
print("Restored successfully!")

# Verify it works: the restored policy must produce an action for a fresh
# observation. The environment is closed even if that call fails.
env = gym.make("CartPole-v1")
try:
    obs, _ = env.reset()
    action = restored_algo.compute_single_action(obs)
    print(f"Action from restored policy: {action}")
finally:
    env.close()

## 6. Comparing Multiple Runs

Compare different configurations:

In [None]:
import matplotlib.pyplot as plt

def compare_configs(configs, max_iters=20):
    """Train each configuration and collect its learning curve.

    Args:
        configs: Mapping from a display name to an experiment-config
            dictionary accepted by ``config_to_rllib``.
        max_iters: Number of training iterations to run per configuration.

    Returns:
        Dictionary mapping each name to the list of mean episode rewards,
        one entry per training iteration.
    """

    results = {}

    for name, config in configs.items():
        print(f"\nTraining: {name}")
        print("-" * 40)

        algo = config_to_rllib(config).build()

        rewards = []
        try:
            for i in range(max_iters):
                result = algo.train()
                reward = result["env_runners"]["episode_reward_mean"]
                rewards.append(reward)

                if (i + 1) % 5 == 0:
                    print(f"  Iter {i+1}: {reward:.2f}")
        finally:
            # Always release this algorithm's workers, even if training
            # raised, so a failed run does not leak resources.
            algo.stop()

        results[name] = rewards

    return results

# Two variants of the base experiment that differ only in the width of the
# two hidden layers; every other setting is inherited from EXPERIMENT_CONFIG.
configs_to_compare = {
    variant: {
        **EXPERIMENT_CONFIG,
        "name": variant,
        "model": {"fcnet_hiddens": [width, width], "fcnet_activation": "tanh"},
    }
    for variant, width in (("small_network", 32), ("large_network", 128))
}

# comparison_results = compare_configs(configs_to_compare, max_iters=15)
print("Comparison example (uncomment to run)")

In [None]:
def plot_comparison(results):
    """Draw one learning curve per configuration on a shared figure.

    ``results`` maps a configuration name to its sequence of per-iteration
    mean rewards; each entry becomes one labelled line.
    """
    plt.figure(figsize=(10, 6))

    # One line per configuration, labelled with its name for the legend.
    for label, curve in results.items():
        plt.plot(curve, linewidth=2, label=label)

    plt.title('Configuration Comparison')
    plt.xlabel('Training Iteration')
    plt.ylabel('Mean Episode Reward')
    plt.grid(True, alpha=0.3)
    plt.legend()
    plt.show()

# plot_comparison(comparison_results)
print("Plotting example (uncomment after running comparison)")

## 7. Using Ray Tune for Experiments

For more complex experiments, use Ray Tune:

In [None]:
from ray import tune

# Algorithm configuration that Tune will run: PPO on CartPole with two
# env runners.
tune_config = (
    PPOConfig()
    .framework("torch")
    .environment("CartPole-v1")
    .training(
        train_batch_size=4000,
        lr=3e-4,
    )
    .env_runners(num_env_runners=2)
)

# Tuner definition: stopping criteria and a checkpoint every 5 iterations
# plus one at the end of the run.
# NOTE(review): the stop key "env_runners/episode_reward_mean" must match a
# metric that the installed Ray release actually reports - confirm.
tuner = tune.Tuner(
    "PPO",
    run_config=tune.RunConfig(
        name="ppo_cartpole_experiment",
        checkpoint_config=tune.CheckpointConfig(
            checkpoint_at_end=True,
            checkpoint_frequency=5,
        ),
        stop={"env_runners/episode_reward_mean": 450, "training_iteration": 30},
    ),
    param_space=tune_config,
)

# results = tuner.fit()
# best = results.get_best_result(metric="env_runners/episode_reward_mean", mode="max")
# print(f"Best reward: {best.metrics['env_runners']['episode_reward_mean']}")

print("Tune experiment example (uncomment to run)")

## Key Takeaways

1. **Structure your project** - Separate configs, envs, scripts, and results

2. **Track everything** - Log metrics, save checkpoints, record configs

3. **Evaluate properly** - Multiple episodes, track variance

4. **Use checkpoints** - Save frequently, enable resumption

5. **Compare systematically** - Same seeds, same evaluation

## What's Next

```
┌──────────────────┐     ┌──────────────────┐     ┌──────────────────┐
│  05 Distributed  │     │     06 Tune      │     │  09 Robotics!    │
│     Training     │ --> │   Hyperparams    │ --> │                  │
│                  │     │                  │     │  Train an Ant    │
│    Scale up!     │     │   Find optimal   │     │    to walk!      │
└──────────────────┘     └──────────────────┘     └──────────────────┘
```

In [None]:
# Cleanup: stop the restored algorithm, and shut Ray down even if stopping
# it fails, so no Ray processes are left running.
try:
    restored_algo.stop()
finally:
    ray.shutdown()
print("Done!")