# Bipedal walker
https://gymnasium.farama.org/environments/box2d/bipedal_walker/

## Setup

In [1]:
!pip install gymnasium[box2d]

Collecting box2d-py==2.3.5
  Using cached box2d-py-2.3.5.tar.gz (374 kB)
  Preparing metadata (setup.py) ... [?25ldone
Using legacy 'setup.py install' for box2d-py, since package 'wheel' is not installed.
Installing collected packages: box2d-py
  Running setup.py install for box2d-py ... [?25lerror
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mRunning setup.py install for box2d-py[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m [31m[43 lines of output][0m
  [31m   [0m Using setuptools (version 59.6.0).
  [31m   [0m running install
  [31m   [0m running build
  [31m   [0m running build_py
  [31m   [0m creating build
  [31m   [0m creating build/lib.linux-x86_64-3.10
  [31m   [0m creating build/lib.linux-x86_64-3.10/Box2D
  [31m   [0m copying library/Box2D/__init__.py -> build/lib.linux-x86_64-3.10/Box2D
  [31m   [0m copying library/Box2D/Box2D.py -> build/lib.linux-x86_64-3.10/Box2D
  [31m   [0m crea

In [13]:
import gymnasium as gym
from stable_baselines3 import PPO
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback
import numpy as np

name = 'ppo_walker_v3'
env_id = "BipedalWalker-v3"
n_envs = 4

policy = 'MlpPolicy'
tensorboard_log = f"./{name}/t_logs/"
path = f"./{name}/model/"
device = 'cuda'

class RewardShapedBipedalWalker(gym.Wrapper):
    def __init__(self, env):
        super(RewardShapedBipedalWalker, self).__init__(env)
        self.previous_x = None

    def reset(self, **kwargs):
        self.previous_x = None
        return self.env.reset(**kwargs)

    def step(self, action):
        obs, original_reward, done, trunc, info = self.env.step(action)
        
        # Forward progress reward
        # current_x = self.env.hull.position[0]  # x-position of the agent
        # forward_reward = (current_x - self.previous_x) if self.previous_x is not None else 0
        # self.previous_x = current_x

        # # Energy penalty
        # energy_penalty = np.sum(np.square(action)) * 0.001

        # # Stability penalty (penalize large torso angles)
        # angle_penalty = abs(self.env.hull.angle) * 0.5
        vertical_speed_reward = obs[0]

        # Combine the rewards
        shaped_reward = original_reward #+ vertical_speed_reward

        return obs, shaped_reward, done, trunc, info

def make_env(render_mode:str='rgb_array'):
    e = gym.make(env_id, hardcore=True, render_mode=render_mode)
    e = RewardShapedBipedalWalker(e)
    return e

env = make_vec_env(make_env, n_envs)


class SaveOnStep(BaseCallback):
    def __init__(self, steps: int, path: str, verbose: int = 0):
        super().__init__(verbose)
        self.steps = steps
        self.save_path = path

    def _on_step(self) -> bool:
        # Check if the current step matches the saving frequency
        if self.n_calls % self.steps == 0:
            # Save model with the current timestep in the filename
            if self.verbose > 0:
                print(f"Saving model at step {self.n_calls} to {self.save_path}")
            self.model.save(self.save_path)
        return True
    
callbacks = [SaveOnStep(2.5e4, path)]


## Create Model

In [None]:
# /!\ #
model = PPO(
    policy,
    env,
    
    verbose=0,
    tensorboard_log=tensorboard_log,
    device=device
)
#model.save(path)



## Load model

In [5]:
model = PPO.load(path,env)

## Learn

In [None]:
total_timesteps = 1e6

model.learn(
    total_timesteps, 
    reset_num_timesteps=False, 
    progress_bar=True, 
    callback=callbacks)
model.save(path)

Output()

: 

## Display

In [11]:
display_env = make_env(render_mode='human')

for e in range(1):
    obs,_ = display_env.reset()
    while True:
        action, _ = model.predict(obs)
        obs, reward, terminated, truncated, info = display_env.step(action)
        print(reward)

        if terminated or truncated:
            break

    

-0.11080867887288331
-0.08414771546858171
-0.03816112547616282
-0.11724234543988865
-0.15909324418008328
-0.16245478768150132
-0.11504991040627281
-0.10637466110785684
-0.08440582470099249
-0.05071041524410248
0.007727079570293428
-0.051655106325943084
-0.004227459132671356
-0.11978585837284843
-0.06291726708412171
-0.11323752441008769
-0.0743443939934186
-0.04194562792778015
-0.019062044729787914
-0.16614839766423026
-0.10193519902229309
-0.12555121892690657
-0.11758070437113208
-0.09089587525526802
-0.08357864584525308
0.024321189820766444
0.09212541230519493
0.11851502517859258
0.19700703126192093
0.1398808769186326
0.20763639522592348
0.19149699340263765
0.16021005270878594
0.12769908739129818
0.10009908878803253
0.11696047325928886
0.19470660881201543
0.1536701471606867
0.15602529990673064
0.1798679574926718
0.30736755760510637
0.3603620199710131
0.22548343976338583
0.16807599027951795
0.2547438325285912
0.24333038304249213
0.24682531764110166
0.27829667162895205
0.291278967122234