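# Bipedal walker
https://gymnasium.farama.org/environments/box2d/bipedal_walker/

**Note:** the code in this notebook trains on the MuJoCo `Ant-v5` environment (see `env_id` in the setup cell), not on BipedalWalker: https://gymnasium.farama.org/environments/mujoco/ant/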

## Setup

In [2]:
!pip install gymnasium[mujoco]

Collecting mujoco>=2.1.5 (from gymnasium[mujoco])
  Downloading mujoco-3.2.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (44 kB)
Collecting imageio>=2.14.1 (from gymnasium[mujoco])
  Downloading imageio-2.36.0-py3-none-any.whl.metadata (5.2 kB)
Collecting etils[epath] (from mujoco>=2.1.5->gymnasium[mujoco])
  Downloading etils-1.10.0-py3-none-any.whl.metadata (6.5 kB)
Collecting glfw (from mujoco>=2.1.5->gymnasium[mujoco])
  Downloading glfw-2.8.0-py2.py27.py3.py30.py31.py32.py33.py34.py35.py36.py37.py38.p39.p310.p311.p312.p313-none-manylinux_2_28_x86_64.whl.metadata (5.4 kB)
Collecting pyopengl (from mujoco>=2.1.5->gymnasium[mujoco])
  Downloading PyOpenGL-3.1.7-py3-none-any.whl.metadata (3.2 kB)
Collecting importlib_resources (from etils[epath]->mujoco>=2.1.5->gymnasium[mujoco])
  Downloading importlib_resources-6.4.5-py3-none-any.whl.metadata (4.0 kB)
Collecting zipp (from etils[epath]->mujoco>=2.1.5->gymnasium[mujoco])
  Downloading zipp-3.21.0-py3-none-any.

In [2]:
import gymnasium as gym
from stable_baselines3 import PPO
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback
import numpy as np

# --- Experiment configuration ---
# Run identifier; both output locations below live under ./<name>/.
name = "ppo_ant_v1"

# Gymnasium environment id and the number of copies handed to make_vec_env.
env_id = 'Ant-v5'
n_envs = 4

# Policy identifier and device handed to the PPO constructor.
policy = "MlpPolicy"
device = "cpu"

# TensorBoard log directory and the location used for model.save / PPO.load.
tensorboard_log = "./{}/t_logs/".format(name)
path = "./{}/model/".format(name)

class RewardShapedBipedalWalker(gym.Wrapper):
    """Reward-shaping hook around a Gymnasium environment.

    No shaping term is active at the moment: ``step`` hands back the wrapped
    environment's reward unchanged. The class is kept as the single place
    where shaping terms can be added.

    Earlier experiments (forward progress, energy penalty, torso-angle
    penalty) read the wrapped environment's ``hull`` attribute and were
    disabled; they are not reproduced here.
    """

    def __init__(self, env):
        super().__init__(env)
        # Slot for a forward-progress term; cleared on reset, not read by step() yet.
        self.previous_x = None

    def reset(self, **kwargs):
        """Clear per-episode state and reset the wrapped environment.

        All keyword arguments are forwarded to the wrapped ``reset``; its
        return value is passed through unchanged.
        """
        self.previous_x = None
        return self.env.reset(**kwargs)

    def step(self, action):
        """Advance the wrapped environment by one step.

        Returns the wrapped environment's five-element step result
        ``(obs, reward, terminated, truncated, info)`` with the reward slot
        holding the (currently unmodified) shaped reward.
        """
        obs, original_reward, terminated, truncated, info = self.env.step(action)

        # Combine shaping terms here; with none enabled this is the raw reward.
        shaped_reward = original_reward

        return obs, shaped_reward, terminated, truncated, info

def make_env(render_mode:str='rgb_array'):
    """Create one ``env_id`` environment wrapped in RewardShapedBipedalWalker.

    Args:
        render_mode: forwarded as ``render_mode`` to ``gym.make``.

    Returns:
        The wrapped environment.
    """
    return RewardShapedBipedalWalker(gym.make(env_id, render_mode=render_mode))

# Vectorised training environment: make_env is used as the factory for n_envs copies.
env = make_vec_env(make_env, n_envs=n_envs)


class SaveOnStep(BaseCallback):
    """Periodically checkpoint the model to one fixed path.

    Every ``steps``-th call of ``_on_step`` the model is saved to ``path``.
    The same path is used each time, so the newest checkpoint replaces the
    previous one.

    Args:
        steps: positive whole number of callback calls between saves. An
            integral float such as ``2.5e4`` is accepted and stored as int.
        path: destination handed to ``model.save``.
        verbose: when greater than 0, a message is printed on each save.

    Raises:
        ValueError: if ``steps`` is not a positive whole number.
    """

    def __init__(self, steps: int, path: str, verbose: int = 0):
        super().__init__(verbose)
        # Reject 0 / negative / fractional values up front instead of failing
        # (or silently never matching) in the middle of training.
        if steps <= 0 or steps != int(steps):
            raise ValueError(f"steps must be a positive whole number, got {steps!r}")
        self.steps = int(steps)
        self.save_path = path

    def _on_step(self) -> bool:
        """Save the model when the call counter is a multiple of ``steps``.

        Always returns True.
        """
        # NOTE(review): n_calls counts callback invocations; with a vectorised
        # env one invocation may cover several environment timesteps - confirm
        # against the Stable-Baselines3 callback documentation.
        if self.n_calls % self.steps == 0:
            if self.verbose > 0:
                print(f"Saving model at step {self.n_calls} to {self.save_path}")
            # Fixed destination: overwrites the previous checkpoint.
            self.model.save(self.save_path)
        return True
    
# Checkpoint every 25_000 callback calls; an int literal matches SaveOnStep's `steps: int`.
callbacks = [SaveOnStep(25_000, path)]


## Create Model

In [None]:
# /!\ Creates a fresh PPO model and rebinds `model`.
# Run this cell only when starting a new run rather than resuming a saved one.
model = PPO(
    policy=policy,
    env=env,
    verbose=0,
    tensorboard_log=tensorboard_log,
    device=device,
)
# Optional: write the fresh model to `path` right away.
#model.save(path)



## Load model

In [3]:
# Resume from the model stored at `path`, attached to the vectorised env and
# placed on the configured device (same setting as the "Create Model" cell).
model = PPO.load(path, env=env, device=device)

## Learn

In [4]:
# Training budget for this session, as an integer timestep count.
total_timesteps = 1_000_000

model.learn(
    total_timesteps=total_timesteps,
    callback=callbacks,
    reset_num_timesteps=False,  # keep the model's existing timestep counter
    progress_bar=True,
)
# Final save to the same path the SaveOnStep callback writes to.
model.save(path)

Output()

## Display

In [None]:
# Roll out the current model for 5 episodes in a 'human' render-mode environment.
display_env = make_env(render_mode='human')

try:
    for _ in range(5):
        obs, _info = display_env.reset()
        episode_over = False
        while not episode_over:
            action, _state = model.predict(obs)
            obs, reward, terminated, truncated, info = display_env.step(action)
            print(reward)
            # An episode ends on either termination or truncation.
            episode_over = terminated or truncated
finally:
    # Release the environment (and its viewer) even if the loop is interrupted.
    display_env.close()

    

-1.7329154459046
-0.2355303654163421
0.13239908416479085
-0.0973789706503716
-0.6211933243348622
-0.7251862514017517
-0.9763728559047288
-0.9535525187084725
-0.5143309159681508
-0.9102168868996261
0.018268563449697295
0.6151968259296752
-0.19612244967238146
-0.17135691403218534
0.5651280012144285
1.1453071575847436
1.2082266723728563
1.6300279345325621
1.7561831985104748
1.3346608348211457
1.819679168181096
1.6253447432274255
1.9354857186288545
2.0208884285627136
1.5640355993108521
1.8573325621988062
2.006704248072495
2.140585370359085
2.381689919471783
1.82944765437521
1.6582334995892936
1.5321117356101541
1.2731956074016857
1.1302636904763943
-0.10140993304030443
1.5540325210531036
1.169342927035013
0.32940275696834265
0.28339343528270877
0.4122375417177233
-0.048575577863179564
0.415204109076051
1.7067357560994005
1.8612308726091173
1.3162874101404967
0.7496818928963105
0.9817734656966195
1.1346420542722928
1.1887746221513427
1.261235160862861
0.732614370634374
-0.19460078227890953


: 