To monitor training progress in TensorBoard, run the following command in a bash shell from the notebook's directory:

`tensorboard --logdir=./results`

In [1]:
from ray import train, tune
from ray.rllib.algorithms.ppo import PPOConfig
import os
from pathlib import Path
import gymnasium as gym
import numpy as np
import torch
from ray.rllib.core.rl_module import RLModule
from ray.tune.registry import register_env
from gymnasium.envs.box2d import LunarLander

In [2]:
class CustomLunarLander(LunarLander):
    """LunarLander with configurable gravity and wind, continuous by default.

    The requested physics parameters are forwarded to the base-class
    constructor so the environment is built with them from the start, and they
    are re-applied on every ``reset`` so that later changes to the
    ``custom_*`` attributes take effect on the next episode.

    Args:
        gravity: Vertical gravity used for the Box2D world. The base
            constructor applies its own validation to this value.
        wind_power: Wind strength used when wind is enabled.
        enable_wind: Whether wind is applied to the lander.
        continuous: Whether to use the continuous action space.
        **kwargs: Forwarded unchanged to ``LunarLander.__init__``.
    """

    def __init__(self, gravity=-9.81, wind_power=15.0, enable_wind=False, continuous=True, **kwargs):
        self.custom_gravity = gravity
        self.custom_wind_power = wind_power
        self.custom_enable_wind = enable_wind

        # Hand the physics parameters to the base class instead of relying
        # solely on patching attributes afterwards; otherwise the base class
        # keeps its own defaults internally.
        super().__init__(
            continuous=continuous,
            gravity=gravity,
            enable_wind=enable_wind,
            wind_power=wind_power,
            **kwargs,
        )

    def reset(self, **kwargs):
        """Re-apply the custom physics parameters, then reset the base env.

        Args:
            **kwargs: Forwarded unchanged to ``LunarLander.reset``.

        Returns:
            Whatever ``LunarLander.reset`` returns.
        """
        # NOTE(review): newer Gymnasium releases appear to rebuild the Box2D
        # world inside reset() from ``self.gravity``, which would discard a
        # value written only to ``self.world.gravity`` — confirm against the
        # installed version. Setting both covers either implementation.
        self.gravity = self.custom_gravity
        self.world.gravity = (0, self.custom_gravity)
        self.wind_power = self.custom_wind_power
        self.enable_wind = self.custom_enable_wind

        return super().reset(**kwargs)

# RLlib calls the registered creator with a single ``env_config`` mapping.
# Registering the class itself would bind that whole mapping to the first
# positional parameter (``gravity``), so unpack it into keyword arguments.
register_env(
    "ContinuousLunarLander",
    lambda env_config: CustomLunarLander(**(env_config or {})),
)

In [None]:

# PPO optimisation hyperparameters. These are fixed values; no Tune search
# space is defined in this cell.
ppo_training_kwargs = dict(
    lr=0.0005,
    lambda_=0.99,
    grad_clip=0.2,
    train_batch_size=10000,
    minibatch_size=512,
    num_epochs=10,
)

# Resources for the remote Learner actors.
learner_kwargs = dict(
    num_learners=15,         # number of remote Learner actors
    num_cpus_per_learner=1,
    num_gpus_per_learner=0,  # 1 when training on GPUs, 0 for CPU-only training
)

# Build the algorithm config step by step on the stock continuous env.
config = PPOConfig()
config = config.environment("LunarLanderContinuous-v3")
config = config.training(**ppo_training_kwargs)
config = config.learners(**learner_kwargs)


In [4]:

# Create a Tuner instance to manage the trials.
# Both RunConfig and CheckpointConfig are taken from ``ray.tune``: the Tuner
# expects Tune's own config classes, and mixing in ``ray.train.RunConfig`` is
# deprecated for Tune runs in recent Ray releases.
tuner = tune.Tuner(
    config.algo_class,
    param_space=config,
    run_config=tune.RunConfig(
        # Directory spelling is kept as-is so that existing results and
        # checkpoints written by earlier runs remain discoverable.
        storage_path=os.path.abspath("./results/LundarLander"),
        # The trial stops as soon as either criterion is reached.
        stop={
            "env_runners/episode_return_mean": 250.0,
            "training_iteration": 5000,
        },
        checkpoint_config=tune.CheckpointConfig(
            checkpoint_frequency=10,  # checkpoint every 10 training iterations
            checkpoint_at_end=True,
        ),
    ),
)
# Run the Tuner and capture the results.
results = tuner.fit()

0,1
Current time:,2025-09-27 21:24:03
Running for:,05:00:36.68
Memory:,24.4/125.7 GiB

Trial name,status,loc,iter,total time (s),num_training_step_ca lls_per_iteration,num_env_steps_sample d_lifetime
PPO_LunarLanderContinuous-v3_96592_00000,RUNNING,10.0.0.38:298123,4074,17784,1,40699300.0


[36m(PPO pid=298123)[0m [2025-09-27 16:23:29,801 E 298123 298123] core_worker.cc:2246: Actor with class name: 'SingleAgentEnvRunner' and ID: '633dcf8c75875230f287465101000000' has constructor arguments in the object store and max_restarts > 0. If the arguments in the object store go out of scope or are lost, the actor restart will fail. See https://github.com/ray-project/ray/issues/53727 for more details.
[36m(_WrappedExecutable pid=298541)[0m Setting up process group for: env:// [rank=0, world_size=15]


[36m(_WrappedExecutable pid=298547)[0m [Gloo] Rank 2 is connected to 14 peer ranks. Expected number of connected peer ranks is : 14


[36m(PPO(env=LunarLanderContinuous-v3; env-runners=2; learners=15; multi-agent=False) pid=298123)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/media/ash/Storage/Projects/RL/results/LundarLander/PPO_2025-09-27_16-23-24/PPO_LunarLanderContinuous-v3_96592_00000_0_2025-09-27_16-23-26/checkpoint_000000)
[36m(PPO(env=LunarLanderContinuous-v3; env-runners=2; learners=15; multi-agent=False) pid=298123)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/media/ash/Storage/Projects/RL/results/LundarLander/PPO_2025-09-27_16-23-24/PPO_LunarLanderContinuous-v3_96592_00000_0_2025-09-27_16-23-26/checkpoint_000001)
[36m(PPO(env=LunarLanderContinuous-v3; env-runners=2; learners=15; multi-agent=False) pid=298123)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/media/ash/Storage/Projects/RL/results/LundarLander/PPO_2025-09-27_16-23-24/PPO_LunarLanderContinuous-v3_96592_00000_0_2025-09-27_16-23-26/checkpoint_000002)
[36m(P