# Notebook 09: Train Constrained Policy (Lagrangian CMDP)

This notebook implements the **Constrained Policy Optimization** loop.
For this V0.1 benchmark, we use a simplified **Heuristic Parameter Optimization** driven by the Lagrange multiplier.

**Objective**:
Maximize Velocity (Approximated Reward) subject to $SVR \le 0.05$ (5%).

**Method**:
1. Run a batch of episodes with the current policy.
2. Compute the average cost (SVR) over the batch.
3. Update the Lagrange multiplier $\lambda$ based on the constraint violation.
4. Update the policy parameters ($k_{safe}$) based on $\lambda$.
5. Repeat.

In [None]:
import sys
import os
import json
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Add src to path
sys.path.append(os.path.abspath('../src'))

from world_gen.hospital_generator import HospitalGenerator
from simulation.episode_runner import EpisodeRunner
from metrics.safety_evaluator import SafetyEvaluator
from policy.constrained_policy import ConstrainedPolicy, LagrangianOptimizer

## 1. Setup Environment and Agents

In [None]:
# Generate a 20x20 training world with two wards and export it as a dict.
gen = HospitalGenerator(width=20, height=20)
gen.initialize_map()
gen.generate_layout(num_wards=2)
world_config = gen.to_dict()

# Policy under training and the multiplier update rule (target: 5% SVR).
policy = ConstrainedPolicy()
optimizer = LagrangianOptimizer(target_cost=0.05, lr=2.0)

# Per-epoch training curves; each tracked quantity gets its own fresh list.
history = {series: [] for series in ("cost", "lambda", "k_safe")}

## 2. Training Loop

In [None]:
# Imports needed by the manual episode loop. They are executed once here
# instead of on every episode, and stay in this cell so it runs on its own.
import math

from simulation.episode_runner import RobotState

NUM_EPOCHS = 20
EPISODES_PER_BATCH = 5

# Episode settings are loop-invariant, so they are defined once up front.
MAX_STEPS = 300        # 300 steps * dt = 30 s limit per episode
GOAL_TOLERANCE = 0.2   # an episode ends once the robot is this close to goal
dt = 0.1               # integration time step
start = (2.0, 10.0, 0.0)   # (x, y, theta)
goal = (18.0, 10.0, 0.0)

for epoch in range(NUM_EPOCHS):
    total_svr = 0.0

    # Run one batch of episodes with the current policy parameters.
    for _episode in range(EPISODES_PER_BATCH):
        # EpisodeRunner has no hook for injecting a policy (its control law
        # is hardcoded), so the step loop is re-implemented below and calls
        # policy.get_action directly.
        # NOTE(review): `runner` is never used afterwards. It is kept only in
        # case the constructor has side effects -- confirm and remove.
        runner = EpisodeRunner(world_config)

        # Initial robot state.
        x, y, theta = start
        t = 0.0

        ep_log = []

        for _step in range(MAX_STEPS):
            # 1. Distance to the nearest object of each tracked type.
            #    99.9 is the value reported when no such object exists.
            d_nearest = {"bed": 99.9, "person": 99.9}
            for obj in world_config['objects']:
                obj_type = obj['type']
                if obj_type not in d_nearest:
                    continue  # untracked type: skip the distance computation
                d = math.hypot(x - obj['pose']['x'], y - obj['pose']['y'])
                d_nearest[obj_type] = min(d_nearest[obj_type], d)

            # 2. Query the policy for linear (v) and angular (w) commands.
            #    The last two RobotState fields are passed as 0 -- presumably
            #    the current velocities; verify against RobotState.
            r_state = RobotState(t, x, y, theta, 0, 0)
            v, w = policy.get_action(r_state, goal, d_nearest)

            # 3. Integrate the pose one time step forward.
            x += v * math.cos(theta) * dt
            y += v * math.sin(theta) * dt
            theta += w * dt
            t += dt

            ep_log.append({'t': t, 'x': x, 'y': y, 'v_lin': v, 'v_ang': w})

            # 4. Stop early once the goal position is reached.
            if math.hypot(x - goal[0], y - goal[1]) < GOAL_TOLERANCE:
                break

        # Score the finished trajectory and accumulate its SVR.
        evaluator = SafetyEvaluator(world_config['objects'])
        metrics, _ = evaluator.evaluate_episode(pd.DataFrame(ep_log))
        total_svr += metrics['SVR']

    avg_svr = total_svr / EPISODES_PER_BATCH

    # Update the multiplier from the batch cost, then the policy from it.
    lambda_val = optimizer.update(avg_svr)
    policy.update_params(lambda_val)

    print(f"Epoch {epoch}: Cost(SVR)={avg_svr:.3f}, Lambda={lambda_val:.3f}, K_safe={policy.params.k_safe_person:.3f}")

    history['cost'].append(avg_svr)
    history['lambda'].append(lambda_val)
    history['k_safe'].append(policy.params.k_safe_person)

## 3. Visualize Convergence

In [None]:
# Plot the per-epoch SVR cost (left axis) together with lambda and k_safe
# (right axis) to show how the constraint is being enforced over training.
fig, ax1 = plt.subplots()

# Left axis: constraint cost and its target level.
ax1.set_xlabel('Epoch')
ax1.set_ylabel('SVR Cost', color='red')
ax1.plot(history['cost'], color='red', label='SVR')
ax1.tick_params(axis='y', labelcolor='red')
ax1.axhline(y=0.05, color='gray', linestyle='--', label='Target (0.05)')

# Right axis: multiplier and policy parameter share the same x axis.
ax2 = ax1.twinx()
ax2.set_ylabel('Lambda / K_safe', color='blue')
ax2.plot(history['lambda'], color='blue', linestyle='--', label='Lambda')
ax2.plot(history['k_safe'], color='green', label='K_safe')
ax2.tick_params(axis='y', labelcolor='blue')

# Every artist carries a label, so draw a single legend. With twin axes the
# handles of both axes must be merged manually; the legend is attached to
# ax2 because it is drawn on top of ax1.
handles1, labels1 = ax1.get_legend_handles_labels()
handles2, labels2 = ax2.get_legend_handles_labels()
ax2.legend(handles1 + handles2, labels1 + labels2, loc='best')

plt.title("Lagrangian Constraint Learning")
fig.tight_layout()
plt.show()