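Goal & rewards: the walker must move right across rough terrain. Forward progress earns reward, energy use is slightly penalized, and falling costs −100. The normal variant counts as “solved” at an episode return of about 300 within the time limit. (Source: Gymnasium documentation, plus one other reference.)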

Variants: BipedalWalker-v3 (normal terrain) and BipedalWalkerHardcore-v3 (terrain with obstacles). Start with the normal variant; Hardcore is much tougher. (Source: Gymnasium documentation.)

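Gotchas: if importing Box2D fails, re-check that swig is installed and then reinstall gymnasium[box2d] (on Windows you may also need build tools installed). (Source: Gymnasium documentation, plus one other reference.)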

In [1]:
import gymnasium as gym
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import SubprocVecEnv
from stable_baselines3.common.monitor import Monitor

import time

In [2]:
# Load the environment and do a trial run with random actions.

env = gym.make("BipedalWalker-v3", render_mode="human")
try:
    obs, info = env.reset()
    terminated = truncated = False

    # Run a single episode; it ends as soon as the environment reports
    # either `terminated` or `truncated`.
    while not (terminated or truncated):
        action = env.action_space.sample()     # <- random, untrained
        obs, reward, terminated, truncated, info = env.step(action)
        time.sleep(0.01)                       # slow down a bit to watch
finally:
    # Close the environment even if the cell is interrupted or a step
    # raises, so the render window is not left open.
    env.close()




In [None]:
# Load the environment and run one episode with an untrained PPO model.

env = gym.make("BipedalWalker-v3", render_mode="human")
try:
    model = PPO("MlpPolicy", env, verbose=0)   # untrained policy

    obs, info = env.reset()
    terminated = truncated = False

    # Run a single episode; it ends as soon as the environment reports
    # either `terminated` or `truncated`.
    while not (terminated or truncated):
        # deterministic=False samples from the (randomly initialised) policy
        action, _ = model.predict(obs, deterministic=False)
        obs, reward, terminated, truncated, info = env.step(action)
finally:
    # Close the environment even if the cell is interrupted or a step
    # raises, so the render window is not left open.
    env.close()


In [None]:
# Train a PPO policy on the walker and save it to disk.

ENV_ID = "BipedalWalker-v3"            # try "BipedalWalkerHardcore-v3" later


def make_env():
    """Create one training environment for ``ENV_ID`` wrapped in ``Monitor``."""
    # No render_mode is passed here, for training speed; pass "human" or
    # "rgb_array" to gym.make to visualize.
    return Monitor(gym.make(ENV_ID))


# parallel envs speed training
venv = SubprocVecEnv([make_env for _ in range(8)])

try:
    model = PPO(
        "MlpPolicy",
        venv,
        n_steps=2048,            # rollout length per env
        batch_size=64,           # minibatch size
        n_epochs=10,             # epochs per update
        gamma=0.99,
        gae_lambda=0.95,
        clip_range=0.2,
        ent_coef=0.0,
        vf_coef=0.5,
        learning_rate=3e-4,
        verbose=0,
    )

    model.learn(total_timesteps=2_000_000, progress_bar=True)
    model.save("ppo_bipedalwalker_v3")
finally:
    # Shut the vectorized env down even when training fails or is
    # interrupted, so the worker subprocesses are not left running.
    venv.close()


Output()



In [None]:
# Evaluate the trained policy on one rendered episode and report its return.

env = gym.make("BipedalWalker-v3", render_mode="human")
try:
    model = PPO.load("ppo_bipedalwalker_v3")

    obs, info = env.reset()
    episode_return = 0.0   # sum of per-step rewards over the episode
    done = False
    while not done:
        action, _ = model.predict(obs, deterministic=True)
        obs, reward, terminated, truncated, info = env.step(action)
        episode_return += float(reward)
        done = terminated or truncated

    # Print the score so the run can be compared against a target return.
    print(f"Episode return: {episode_return:.1f}")
finally:
    # Close the environment even if the cell is interrupted or a step
    # raises, so the render window is not left open.
    env.close()
