Run the following command in a bash shell to launch TensorBoard on the `./results` directory:

`tensorboard --logdir=./results`

### Deploy Trained Model For Inference (Checkpoint)
Reference: [RLlib documentation — Deploy a trained model for inference](https://docs.ray.io/en/latest/rllib/getting-started.html#deploy-a-trained-model-for-inference)

In [None]:
import os
from pathlib import Path
from ray.rllib.core.rl_module import RLModule
import gymnasium as gym
import torch
import numpy as np



In [79]:
# Location of the saved checkpoint; the relative path is made absolute before use.
checkpoint_path = os.path.abspath(
    "./results/LundarLander/PPO_2025-09-27_16-23-24/PPO_LunarLanderContinuous-v3_96592_00000_0_2025-09-27_16-23-26/checkpoint_000298"
)

# Sub-directory inside the checkpoint from which the module is restored.
rl_module_dir = Path(checkpoint_path).joinpath(
    "learner_group", "learner", "rl_module", "default_policy"
)

# Rebuild the RLModule from its saved state on disk.
rl_module = RLModule.from_checkpoint(rl_module_dir)


### Statistics

In [80]:
# Evaluate the restored policy greedily over many episodes and report the mean
# episode return, printing the running mean every 10 episodes.

# Create the RL environment to test against (intended to match the training setup).
# No render_mode is passed here, so the episodes run without a display window.
env = gym.make("LunarLander-v3", gravity=-9.81, wind_power=15.0, enable_wind=False, continuous=True)

ave_returns = 0.0  # running SUM of episode returns; divided by the episode count when reported
trails = 300  # number of evaluation episodes

# Loop-invariant action-space properties, looked up once instead of every step.
action_low = env.action_space.low
action_high = env.action_space.high
action_dim = env.action_space.shape[0]

try:
    for i in range(trails):
        episode_return = 0.0
        done = False

        # Reset the env to get the initial observation.
        obs, info = env.reset()

        while not done:
            # Compute the next action from a batch (B=1) of observations.
            obs_batch = torch.from_numpy(obs).unsqueeze(0)  # add batch B=1 dimension
            model_outputs = rl_module.forward_inference({"obs": obs_batch})

            # Extract the action distribution parameters and drop the batch dim.
            action_dist_params = model_outputs["action_dist_inputs"][0].numpy()

            # Continuous actions -> take the mean (max likelihood). The leading
            # `action_dim` entries are the means, the rest are log(stddev);
            # clip the means to the valid action range in one vectorised call.
            greedy_action = np.clip(
                action_dist_params[:action_dim], action_low, action_high
            )
            # # For discrete actions, you should take the argmax over the logits:
            # greedy_action = np.argmax(action_dist_params)

            # Send the action to the environment for the next step.
            obs, reward, terminated, truncated, info = env.step(greedy_action)

            # Perform env-loop bookkeeping.
            episode_return += reward
            done = terminated or truncated

        ave_returns += episode_return

        # Progress report: running mean after every 10th episode.
        if (i + 1) % 10 == 0:
            print(f"{i + 1}/{trails}: {ave_returns / (i + 1):3.2f}")

    print(f"Average episode return of {ave_returns/trails:3.2f}.")
finally:
    # Always release the environment, even if evaluation fails or is interrupted.
    env.close()

10/300: 267.64
20/300: 260.72
30/300: 258.06
40/300: 261.35
50/300: 261.16
60/300: 260.48
70/300: 260.64
80/300: 262.80
90/300: 262.38
100/300: 261.95
110/300: 260.67
120/300: 261.67
130/300: 261.08
140/300: 260.97
150/300: 261.44
160/300: 261.59
170/300: 261.42
180/300: 261.04
190/300: 260.80
200/300: 260.80
210/300: 260.33
220/300: 260.90
230/300: 260.92
240/300: 260.25
250/300: 260.24
260/300: 260.19
270/300: 260.05
280/300: 260.64
290/300: 260.89
300/300: 260.64
Average episode return of 260.64.


### Visualize the Run

In [81]:
# Play a single greedy episode in a visible window and print its return.

# Create the RL environment to test against (intended to match the training setup).
# With render_mode="human", Gymnasium draws each frame during step(), so no
# explicit env.render() call is needed.
env = gym.make("LunarLander-v3", render_mode="human", gravity=-9.81, wind_power=15.0, enable_wind=False, continuous=True)

episode_return = 0.0
done = False

# Loop-invariant action-space properties, looked up once instead of every step.
action_low = env.action_space.low
action_high = env.action_space.high
action_dim = env.action_space.shape[0]

try:
    # Reset the env to get the initial observation.
    obs, info = env.reset()

    while not done:
        # Compute the next action from a batch (B=1) of observations.
        obs_batch = torch.from_numpy(obs).unsqueeze(0)  # add batch B=1 dimension
        model_outputs = rl_module.forward_inference({"obs": obs_batch})

        # Extract the action distribution parameters and drop the batch dim.
        action_dist_params = model_outputs["action_dist_inputs"][0].numpy()

        # Continuous actions -> take the mean (max likelihood). The leading
        # `action_dim` entries are the means, the rest are log(stddev);
        # clip the means to the valid action range in one vectorised call.
        greedy_action = np.clip(
            action_dist_params[:action_dim], action_low, action_high
        )
        # # For discrete actions, you should take the argmax over the logits:
        # greedy_action = np.argmax(action_dist_params)

        # Send the action to the environment for the next step.
        obs, reward, terminated, truncated, info = env.step(greedy_action)

        # Perform env-loop bookkeeping.
        episode_return += reward
        done = terminated or truncated

    print(f"Reached episode return of {episode_return}.")
finally:
    # Always close the render window, even if the episode fails or is interrupted.
    env.close()

Reached episode return of 309.9099581697705.
