# Bipedal walker
https://gymnasium.farama.org/environments/box2d/bipedal_walker/

## Setup

In [2]:
!pip install gymnasium[mujoco]

Collecting mujoco>=2.1.5 (from gymnasium[mujoco])
  Downloading mujoco-3.2.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (44 kB)
Collecting imageio>=2.14.1 (from gymnasium[mujoco])
  Downloading imageio-2.36.0-py3-none-any.whl.metadata (5.2 kB)
Collecting etils[epath] (from mujoco>=2.1.5->gymnasium[mujoco])
  Downloading etils-1.10.0-py3-none-any.whl.metadata (6.5 kB)
Collecting glfw (from mujoco>=2.1.5->gymnasium[mujoco])
  Downloading glfw-2.8.0-py2.py27.py3.py30.py31.py32.py33.py34.py35.py36.py37.py38.p39.p310.p311.p312.p313-none-manylinux_2_28_x86_64.whl.metadata (5.4 kB)
Collecting pyopengl (from mujoco>=2.1.5->gymnasium[mujoco])
  Downloading PyOpenGL-3.1.7-py3-none-any.whl.metadata (3.2 kB)
Collecting importlib_resources (from etils[epath]->mujoco>=2.1.5->gymnasium[mujoco])
  Downloading importlib_resources-6.4.5-py3-none-any.whl.metadata (4.0 kB)
Collecting zipp (from etils[epath]->mujoco>=2.1.5->gymnasium[mujoco])
  Downloading zipp-3.21.0-py3-none-any.

In [4]:
import gymnasium as gym
from stable_baselines3 import PPO
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback
import numpy as np

name = 'ppo_ant_v1'
env_id = "Ant-v5"
n_envs = 4

policy = 'MlpPolicy'
tensorboard_log = f"./{name}/t_logs/"
path = f"./{name}/model/"
device = 'cpu'

class RewardShapedBipedalWalker(gym.Wrapper):
    def __init__(self, env):
        super(RewardShapedBipedalWalker, self).__init__(env)
        self.previous_x = None

    def reset(self, **kwargs):
        self.previous_x = None
        return self.env.reset(**kwargs)

    def step(self, action):
        obs, original_reward, done, trunc, info = self.env.step(action)
        
        # Forward progress reward
        # current_x = self.env.hull.position[0]  # x-position of the agent
        # forward_reward = (current_x - self.previous_x) if self.previous_x is not None else 0
        # self.previous_x = current_x

        # # Energy penalty
        # energy_penalty = np.sum(np.square(action)) * 0.001

        # # Stability penalty (penalize large torso angles)
        # angle_penalty = abs(self.env.hull.angle) * 0.5
        vertical_speed_reward = obs[0]

        # Combine the rewards
        shaped_reward = original_reward #+ vertical_speed_reward

        return obs, shaped_reward, done, trunc, info

def make_env(render_mode:str='rgb_array'):
    e = gym.make(env_id, render_mode=render_mode)
    e = RewardShapedBipedalWalker(e)
    return e

env = make_vec_env(make_env, n_envs)


class SaveOnStep(BaseCallback):
    def __init__(self, steps: int, path: str, verbose: int = 0):
        super().__init__(verbose)
        self.steps = steps
        self.save_path = path

    def _on_step(self) -> bool:
        # Check if the current step matches the saving frequency
        if self.n_calls % self.steps == 0:
            # Save model with the current timestep in the filename
            if self.verbose > 0:
                print(f"Saving model at step {self.n_calls} to {self.save_path}")
            self.model.save(self.save_path)
        return True
    
callbacks = [SaveOnStep(2.5e4, path)]


## Create Model

In [None]:
# /!\ #
model = PPO(
    policy,
    env,
    
    verbose=0,
    tensorboard_log=tensorboard_log,
    device=device
)
#model.save(path)



## Load model

In [5]:
model = PPO.load(path,env)

## Learn

In [None]:
total_timesteps = 1e6

model.learn(
    total_timesteps, 
    reset_num_timesteps=False, 
    progress_bar=True, 
    callback=callbacks)
model.save(path)

Output()

## Display

In [9]:
display_env = make_env(render_mode='human')

for e in range(5):
    obs,_ = display_env.reset()
    while True:
        action, _ = model.predict(obs)
        obs, reward, terminated, truncated, info = display_env.step(action)
        print(reward)

        if terminated or truncated:
            break

    

-1.0446032312037878
0.041497266007435085
-1.8223283255747518
-1.7709734556313304
-1.720281778214896
-1.5044669696287236
-2.285267035060897
-2.738058232487252
-2.2291638586222255
-2.3576439050285023
-2.618765691980308
-2.8712039513252305
-3.7106097968923994
-2.1653847202939653
0.5958279123570089
-0.47164299736040194
-1.4354436524605325
-2.8567749608890836
-1.9767114461783812
-1.8158563088740336
-1.5400008377149974
-2.211301189453308
-3.4821284438493567
-3.0211218440276832
-2.593023278586461
-4.164874978911184
-3.336766314624363
-2.43713463398894
-2.3800050235701065
-2.5528086128880974
-2.9075526468830732
-2.0773716871854075
-2.5487586207103172
-2.5730921353277463
-2.9280697174377845
-0.6347824692288868
-0.7829873482670853
-1.10282648077772
-0.16185891911461758
-0.1702833265416419
0.40975086202390565
0.38625200209198773
-0.04741262069872354
-0.5054310401844997
-0.3652153824967932
-0.7173576838309141
-2.3478954191203836
-0.6085460537398772
0.02777926751753279
-1.8483947329636758
-1.190002