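To monitor training progress in TensorBoard, run the following command in a terminal (bash), from the same directory as this notebook: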

`tensorboard --logdir=./results`

In [25]:
from ray import train, tune
from ray.rllib.algorithms.ppo import PPOConfig
import os

# PPO on CartPole-v1 with a fixed `num_epochs` of 10. No Tune search space
# (e.g. `tune.grid_search`) is defined here, so this is not a hyperparameter sweep.
config = PPOConfig().environment("CartPole-v1").training(num_epochs=10)

# Create a Tuner instance to manage the trials.
# Both RunConfig and CheckpointConfig are taken from `ray.tune`: a Tuner expects
# the Tune flavour of RunConfig, and mixing in `ray.train.RunConfig` is deprecated
# in recent Ray releases.
tuner = tune.Tuner(
    config.algo_class,
    param_space=config,
    run_config=tune.RunConfig(
        # Absolute path, resolved from the notebook's current working directory.
        storage_path=os.path.abspath("./results/tutorial"),
        # Stop the trial once the mean episode return reaches 500.
        stop={"env_runners/episode_return_mean": 500.0},
        # Save a checkpoint every 10 training iterations and once more at the end.
        checkpoint_config=tune.CheckpointConfig(
            checkpoint_frequency=10,
            checkpoint_at_end=True,
        ),
    ),
)
# Run the Tuner and capture the results.
results = tuner.fit()

0,1
Current time:,2025-09-27 16:08:22
Running for:,00:02:00.94
Memory:,22.2/125.7 GiB

Trial name,status,loc,iter,total time (s),num_training_step_ca lls_per_iteration,num_env_steps_sample d_lifetime
PPO_CartPole-v1_33793_00000,TERMINATED,10.0.0.38:263830,32,112.904,1,128000


2025-09-27 16:08:22,606	INFO tune.py:1009 -- Wrote the latest version of all result files and experiment state to '/media/ash/Storage/Projects/RL/results/tutorial/PPO_2025-09-27_16-06-21' in 0.0161s.
2025-09-27 16:08:23,050	INFO tune.py:1041 -- Total run time: 121.39 seconds (120.93 seconds for the tuning loop).


### Loading a Trained Model (Checkpoint)
This section follows the RLlib getting-started guide on deploying a trained model for inference: https://docs.ray.io/en/latest/rllib/getting-started.html#deploy-a-trained-model-for-inference

In [32]:
import os
from pathlib import Path
from ray.rllib.core.rl_module import RLModule
import gymnasium as gym
import torch
import numpy as np

# Take the checkpoint from the `results` returned by `tuner.fit()` instead of
# hard-coding a run-specific directory (timestamp, trial id, checkpoint index),
# which goes stale as soon as the experiment is re-run.
best_result = results.get_best_result(
    metric="env_runners/episode_return_mean",
    mode="max",
)
if best_result.checkpoint is None:
    raise RuntimeError(
        "The best trial has no checkpoint; check the CheckpointConfig passed to the Tuner."
    )

# To load a checkpoint from an earlier session without retraining, assign the
# path of its `checkpoint_XXXXXX` directory to `checkpoint_path` here instead.
checkpoint_path = best_result.checkpoint.path

# Build only the policy network (RLModule) from the algorithm checkpoint; the
# sub-path below is where the checkpoint stores the `default_policy` module.
rl_module = RLModule.from_checkpoint(
    Path(checkpoint_path)
    / "learner_group"
    / "learner"
    / "rl_module"
    / "default_policy"
)

In [36]:
# Roll out one greedy episode of the trained policy.
# With render_mode="human" the environment draws itself during reset()/step(),
# so no explicit env.render() call is needed.
env = gym.make("CartPole-v1", render_mode="human")

episode_return = 0.0
done = False

try:
    # Reset the env to get the initial observation.
    obs, info = env.reset()

    while not done:
        # Compute the next action from a batch (B=1) of observations.
        obs_batch = torch.from_numpy(obs).unsqueeze(0)  # add batch B=1 dimension
        model_outputs = rl_module.forward_inference({"obs": obs_batch})

        # Extract the action distribution parameters from the output and dissolve batch dim.
        action_dist_params = model_outputs["action_dist_inputs"][0].numpy()

        # CartPole has discrete actions, so the greedy action is the argmax over
        # the logits. (For a continuous action space one would instead take the
        # distribution mean and clip it to the action bounds.)
        greedy_action = np.argmax(action_dist_params)

        # Send the action to the environment for the next step.
        obs, reward, terminated, truncated, info = env.step(greedy_action)

        # Perform env-loop bookkeeping.
        episode_return += reward
        done = terminated or truncated
finally:
    # Always release the render window, even if a step or the forward pass fails.
    env.close()

print(f"Reached episode return of {episode_return}.")

error: display Surface quit