# **World Models - Controller (C)**

## **Introduction**

This notebook implements the Controller (C) component of the World Models architecture. In the World Model framework, the agent consists of three components:
* **Vision (V)**: A Variational Autoencoder (VAE) that compresses visual input into a latent representation
* **Memory (M)**: A recurrent network (LSTM-MDN) that predicts future states
* **Controller (C)**: A neural network that maps the latent state (from V) and the hidden state (from M) to actions

The Controller is trained using an evolutionary strategy (CMA-ES) to optimize performance in the CarRacing environment.

<img src="imgs/world model framework.png" width="800" alt="World Models Architecture Diagram">

## **Setup and Imports**
First, we'll import the necessary libraries and load our pretrained Vision and Memory models.

In [1]:
import sys
import os
from pathlib import Path


# Resolve the directory one level above the current working directory so the
# `src.*` packages imported below can be found.
parent_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))

# Register that directory on the import path only if it is not there already.
if not any(entry == parent_dir for entry in sys.path):
    sys.path.append(parent_dir)

# Parent of the working directory as a plain string; used as a prefix when
# building file paths to saved models and metrics.
path = str(Path.cwd().parent)

In [2]:
import os
import time
import torch
import numpy as np
from tqdm import tqdm
import gymnasium as gym
from gymnasium.vector import AsyncVectorEnv

# Project imports
from src.models.dds_vae import Vision
from src.models.controller import Controller
from src.models.mdn_rnn import MDNRNN

# Use the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

### **Loading Pre-trained Vision (VAE) and Memory (MDN-LSTM) Models**

We'll load our previously trained models that form the Vision and Memory components of our World Model.

In [3]:
# Dimensions shared by the Vision, Memory and Controller components
LATENT_DIM = 32     # size of the latent vector z
HIDDEN_DIM = 256    # size of the LSTM hidden state h
ACTION_DIM = 3      # number of continuous action components

# Vision (VAE): build the network, move it to the target device, then restore
# the trained weights and switch to inference mode.
vision = Vision(
    in_ch=3,
    out_ch=3,
    base_ch=16,
    latent_dim=LATENT_DIM,
    n_features_to_select=0.03,
    alpha=1.0,
    delta=0.1,
)
vision = vision.to(device)
vision.load_state_dict(
    torch.load(
        path+'/src/trained_models/vision_03_miniVAE.pth',
        map_location=device,
        weights_only=True,
    )
)
vision.eval()

# Memory (MDN-RNN): same procedure as for the Vision model.
memory = MDNRNN(
    num_gaussians=5,
    hidden_dim=HIDDEN_DIM,
    action_dim=ACTION_DIM,
    latent_dim=LATENT_DIM,
)
memory = memory.to(device)
memory.load_state_dict(
    torch.load(
        path+'/src/trained_models/memory.pth',
        map_location=device,
        weights_only=True,
    )
)
memory.eval()

MDNRNN(
  (rnn): RNN(
    (lstm): LSTM(35, 256, batch_first=True)
  )
  (mdn): MDN(
    (fc_pi): Linear(in_features=256, out_features=5, bias=True)
    (fc_mu): Linear(in_features=256, out_features=160, bias=True)
    (fc_sigma): Linear(in_features=256, out_features=160, bias=True)
  )
)

In [4]:
from torch.nn import functional as F
from torch.nn.utils import parameters_to_vector
import logging

# -------------------- Encoding & Decoding ----------------------
def encode_obs_batch(vision: Vision, obs_batch: np.ndarray, size: tuple = (96, 96)) -> torch.Tensor:
    """
    Convert a batch of raw frames into latent vectors.

    The frames are moved to ``device`` as float32, rearranged from
    channels-last to channels-first, divided by 255, stripped of their
    bottom 12 pixel rows and bilinearly resized to ``size``. The result is
    passed to ``vision.encode`` with gradient tracking disabled.

    :param vision: Pretrained Vision model.
    :param obs_batch: Batch of observations, indexed as (batch, height, width, channels).
    :param size: Target (height, width) for resizing.
    :return: Latent representation (z), the third value returned by ``vision.encode``.
    """
    frames = torch.as_tensor(obs_batch, dtype=torch.float32, device=device)
    frames = frames.permute(0, 3, 1, 2)     # channels-last -> channels-first
    frames = frames / 255.0
    # Drop the bottom 12 rows (presumably the dashboard strip; confirm against the VAE training pipeline).
    frames = frames[:, :, :-12, :]
    frames = F.interpolate(frames, size=size, mode='bilinear')

    with torch.no_grad():
        encoded = vision.encode(frames)

    _, _, z = encoded
    return z

def decode_obs(z: torch.Tensor, vision: Vision) -> np.ndarray:
    """
    Reconstruct an image from a latent vector.

    :param z: Latent representation.
    :param vision: Pretrained Vision model.
    :return: First value returned by ``vision.decode``, with dimension 0
             squeezed, axes reordered from (C, H, W) to (H, W, C), and
             converted to a NumPy array on the CPU.
    """
    with torch.no_grad():
        decoded = vision.decode(z)

    x_hat, _, _ = decoded
    image = x_hat.squeeze(0).permute(1, 2, 0)   # channels-first -> channels-last
    return image.cpu().detach().numpy()

def make_env(name='CarRacing-v3'):
    """
    Return a zero-argument factory that builds one environment.

    The environment itself is only created when the returned callable is
    invoked, so the factory can be handed to a vectorized-env constructor.

    :param name: Gymnasium environment id.
    :return: Callable that creates the environment via ``gym.make``.
    """
    def _factory():
        options = {
            'render_mode': 'rgb_array',
            'lap_complete_percent': 1.0,
            'domain_randomize': False,
            'continuous': True,
        }
        return gym.make(name, **options)

    return _factory

def create_vector_envs(num_envs):
    """
    Build an ``AsyncVectorEnv`` made of ``num_envs`` environments.

    :param num_envs: Number of environment factories to create.
    :return: ``AsyncVectorEnv`` constructed with ``shared_memory=True``.
    """
    env_factories = [make_env() for _ in range(num_envs)]
    vector_env = AsyncVectorEnv(env_factories, shared_memory=True)
    return vector_env

def reset(envs, num_steps=50, action_dim=3):
    """
    Reset all environments and advance them with zero-valued (no-op) actions.

    :param envs: Vectorized environment exposing ``num_envs``, ``reset()``
                 and ``step()``.
    :param num_steps: Number of no-op steps taken after the reset.
    :param action_dim: Length of each per-environment action vector.
    :return: Tuple ``(envs, obs)``: the same ``envs`` object and the most
             recent batch of observations.
    """
    obs, _ = envs.reset()

    # The no-op action is identical on every step, so build it once.
    noop_actions = np.zeros((envs.num_envs, action_dim))

    # ``reset_done`` is not part of Gymnasium's vector-env API (finished
    # sub-environments are reset automatically there). Use it only when the
    # given object actually provides it, instead of raising AttributeError.
    reset_done = getattr(envs, 'reset_done', None)

    for step in range(num_steps):
        obs, _, dones, truncs, _ = envs.step(noop_actions)

        finished = np.logical_or(dones, truncs)
        if not np.any(finished):
            continue

        if callable(reset_done):
            obs, _ = reset_done()
        logging.debug("Step %d: Reset %d environments.", step, np.sum(finished))

    return envs, obs

# ----------------------------------------------------------------
# Controllers and Policy Evaluation
# ----------------------------------------------------------------
def process_actions(controllers, x):
    """
    Compute one action per controller from its matching row of ``x``.

    :param controllers: Sequence of objects exposing ``get_action``.
    :param x: Batched controller input; row ``i`` belongs to controller ``i``.
    :return: The individual actions stacked along a new leading dimension.
    """
    per_controller = []
    for row, controller in enumerate(controllers):
        state = x[row:row + 1]      # slice (not index) to keep a batch axis of size 1
        per_controller.append(controller.get_action(state))
    return torch.stack(per_controller, dim=0)

def load_weights(controller_class, solutions):
    """
    Build one controller per CMA-ES solution vector.

    :param controller_class: Class instantiated with ``state_dim`` and
                             ``action_dim`` keyword arguments.
    :param solutions: Iterable of flat parameter vectors.
    :return: List of controllers on ``device``, one per solution, with the
             corresponding vector written into their parameters.
    """
    state_dim = LATENT_DIM + HIDDEN_DIM

    def _build(params):
        # Instantiate a fresh controller and overwrite its parameters with the flat vector.
        ctrl = controller_class(state_dim=state_dim, action_dim=ACTION_DIM).to(device)
        flat = torch.tensor(params, dtype=torch.float32).to(device)
        torch.nn.utils.vector_to_parameters(flat, ctrl.parameters())
        return ctrl

    with torch.no_grad():
        return [_build(params) for params in solutions]

def evaluate_policies(solutions, controller_class, max_steps, memory, vision):
    """
    Evaluate multiple policies in parallel via AsyncVectorEnv.

    One environment is created per solution. Each environment is driven by
    the controller built from the matching solution vector, using the Vision
    model to encode observations and the Memory model to carry a recurrent
    hidden state.

    :param solutions: Sequence of flat controller parameter vectors.
    :param controller_class: Controller class passed to ``load_weights``.
    :param max_steps: Maximum number of environment steps per policy.
    :param memory: Pretrained memory model exposing ``rnn`` and
                   ``rnn.init_hidden``.
    :param vision: Pretrained Vision model used by ``encode_obs_batch``.
    :return: List with the cumulative reward of the first episode of each
             policy, in the order of ``solutions``.
    """
    num_policies = len(solutions)
    if num_policies == 0:
        # Nothing to evaluate; avoid creating an empty vector environment.
        return []

    controllers = load_weights(controller_class, solutions)
    envs = create_vector_envs(num_envs=num_policies)

    cumulative_rewards = np.zeros(num_policies)
    finished = np.zeros(num_policies, dtype=bool)

    # try/finally guarantees the worker processes are shut down even when a
    # step or a model call raises.
    try:
        envs, obs = reset(envs)

        # Allocate the hidden state on the device actually in use instead of
        # a hard-coded 'cuda', so CPU-only runs work as well.
        hidden = memory.rnn.init_hidden(num_policies, str(device))

        with torch.no_grad():
            for _ in range(max_steps):
                if finished.all():
                    break

                z_batch = encode_obs_batch(vision, obs)
                h = hidden[0].squeeze(0)
                x = torch.cat([z_batch, h], dim=-1)
                actions = process_actions(controllers, x)
                obs, rewards, terminated, truncated, _ = envs.step(
                    actions.detach().cpu().numpy()
                )

                # Add a sequence axis of length 1 before feeding the RNN.
                _, hidden = memory.rnn(
                    z_batch.unsqueeze(1), actions.unsqueeze(1), hidden
                )

                # Mask with the state *before* this step so the reward of the
                # step that ends an episode is still counted.
                cumulative_rewards += rewards * ~finished

                # Truncation ends an episode just like termination; without
                # it, rewards of the automatically restarted episode would be
                # added to the score.
                finished |= np.logical_or(terminated, truncated)
    finally:
        envs.close()

    return cumulative_rewards.tolist()




## **Controller Architecture**
---

The Controller processes two inputs:
1. Latent vector `z` (32-dim) from the Vision model (VAE)
2. Hidden state `h` (256-dim) from the Memory model (LSTM-MDN)

It outputs three continuous actions for the CarRacing environment:
- **Steering**: Range [-1, 1] (using tanh activation)
- **Acceleration**: Range [0, 1] (using sigmoid activation)
- **Brake**: Nominal range [0, 1] (using sigmoid activation); the controller limits its output to a maximum of 0.8

## **Parallel Training Strategy**
---

<img src='imgs/batch_training_conrtoller.png' width=950>

Our implementation uses parallel environments to efficiently evaluate multiple controllers:

1. **CPU Parallelization**:
   - Multiple gym environments run in parallel on CPU cores
   - Each environment evaluates a different controller instance

2. **Batch Processing Pipeline**:
   - Stack observations from all environments
   - Transfer batch to GPU for Vision and Memory processing
   - Run each controller on its own row of the batch (one controller per environment)
   - Return predicted actions back to CPU for environment steps

This approach significantly speeds up training by:
- Utilizing multiple CPU cores for environment simulation
- Maximizing GPU utilization through batch processing
- Reducing CPU-GPU transfers

## **CMA-ES Training**
---

CMA-ES (Covariance Matrix Adaptation Evolution Strategy) is particularly well-suited for training our controller because:

1. **No Gradient Requirements**: Works well for non-differentiable fitness functions like game scores
2. **Exploration Efficiency**: Adapts its search distribution to the fitness landscape
3. **Parallel Evaluation**: Naturally supports parallel fitness evaluation of solutions

### Key Parameters:
- Population size: 16 (matches CPU cores)
- Evaluations per controller: 7 (balance between reliability and speed)
- Initial sigma: 0.1 (exploration factor)
- Generations: 200 (training iterations)

In [6]:
from src.utils.plots import plot_cma_es_results
import pandas as pd
# Read the saved CMA-ES metrics table and hand it to the plotting helper.
metrics = pd.read_csv(filepath_or_buffer=path+'/src/cma_es_metrics.csv')
plot_cma_es_results(metrics)

## **Visualizing Trained Controller**
---

Let's visualize how our trained controller performs in the environment:

In [6]:

def render_policy(env_name, vision, controller, mdnrnn, encode_obs_batch, max_steps=1000):
    """
    Render one episode of a policy in the environment and print its result.

    :param env_name: Gymnasium environment id.
    :param vision: Pretrained Vision model passed to ``encode_obs_batch``.
    :param controller: Controller exposing ``get_action``.
    :param mdnrnn: Memory model whose ``rnn`` updates the hidden state.
    :param encode_obs_batch: Callable ``(vision, obs_batch) -> z`` used to
                             encode a batch of observations.
    :param max_steps: Maximum number of environment steps before the episode
                      is cut off.
    :return: None. Prints the cumulative reward and the number of steps taken.
    """
    env = gym.make(env_name, render_mode='human', lap_complete_percent=1.0)

    cumulative_reward = 0.0
    step_count = 0  # number of env.step calls made so far

    # try/finally closes the render window even if a step or model call fails.
    try:
        obs, _ = env.reset()

        # Initialize hidden state
        h = (torch.zeros(1, HIDDEN_DIM, device=device),
             torch.zeros(1, HIDDEN_DIM, device=device))

        # Pure inference: no autograd graph is needed.
        with torch.no_grad():
            while step_count < max_steps:
                # Encode observation to latent space (add a batch axis first)
                z = encode_obs_batch(vision, obs[np.newaxis, ...])

                # Combine latent and hidden state
                x = torch.cat([z, h[0]], dim=-1)

                # Get action from controller
                a = controller.get_action(x)

                # Step environment
                obs, reward, terminated, truncated, _ = env.step(a.detach().cpu().numpy())
                env.render()

                # Update LSTM hidden state
                _, h = mdnrnn.rnn(z, a.unsqueeze(0), h=h)

                cumulative_reward += reward
                step_count += 1

                # End episode on completion or on the environment's own time limit
                if terminated or truncated:
                    break
    finally:
        env.close()

    print(f'Reward: {cumulative_reward:.2f} | Steps: {step_count}')

# Build the controller, move it to the target device and restore the saved weights
controller = Controller(LATENT_DIM + HIDDEN_DIM, ACTION_DIM)
controller = controller.to(device)
controller.load_state_dict(
    torch.load(
        path+'/src/trained_models/controller.pth',
        map_location=device,
        weights_only=True,
    )
)
controller.eval()

# Watch the trained controller drive one episode
render_policy(
    env_name='CarRacing-v3',
    vision=vision,
    controller=controller,
    mdnrnn=memory,
    encode_obs_batch=encode_obs_batch,
)

Reward: 910.20 | Steps: 899


## **Final Evaluation**
---

Let's perform a comprehensive evaluation of our controller over 128 episodes (16 parallel environments × 8 rollouts):

In [None]:

def final_evaluation(controller_class, best_solution, memory, 
                     parallel_rollouts=7, max_steps=1000, popsize=16):
    """
    Evaluate a single solution repeatedly and report mean and standard deviation.

    The solution is replicated ``popsize`` times and passed to
    ``evaluate_policies`` once per rollout. The module-level ``vision``
    model is used for encoding.

    :param controller_class: Controller class passed to ``evaluate_policies``.
    :param best_solution: Flat parameter vector of the controller to evaluate.
    :param memory: Pretrained memory model.
    :param parallel_rollouts: Number of calls to ``evaluate_policies``.
    :param max_steps: Maximum number of steps per episode.
    :param popsize: Number of copies of the solution evaluated per call.
    :return: Flat NumPy array with the rewards of all episodes.
    """
    # Every slot of the "population" holds the same solution.
    replicated = [best_solution] * popsize

    # One list of per-environment rewards for each rollout
    final_rewards = [
        evaluate_policies(replicated, controller_class, max_steps, memory, vision)
        for _ in tqdm(range(parallel_rollouts))
    ]

    # Collapse to one dimension and print summary statistics
    all_rewards = np.array(final_rewards).flatten()
    print(f"Performance over {popsize*parallel_rollouts} episodes: "
          f"{np.mean(all_rewards):.2f} ± {np.std(all_rewards):.2f}")
    return all_rewards

# Evaluation settings: 16 environments per rollout, 8 rollouts
popsize = 16
parallel_rollouts = 8

# Flatten the trained controller's parameters into a single NumPy vector
best_solution = parameters_to_vector(controller.parameters())
best_solution = best_solution.cpu().detach().numpy()

# Run final evaluation
rewards = final_evaluation(
    controller_class=Controller,
    best_solution=best_solution,
    memory=memory,
    parallel_rollouts=parallel_rollouts,
    max_steps=1000,
    popsize=popsize,
)