In [None]:
# Install the RL stack used by this notebook.
# - The extra is quoted so the shell cannot glob-expand the square brackets.
# - 'gymnasium[mujoco]' already pulls in gymnasium itself, so it is not installed separately.
!pip install 'gymnasium[mujoco]' mujoco
!pip install 'shimmy>=0.2.1'
# The option must be spelled '--upgrade' and the package 'stable-baselines3' with no
# embedded spaces; otherwise pip treats the fragments as separate package names.
!pip install --upgrade stable-baselines3

Collecting gymnasium
  Downloading gymnasium-0.29.1-py3-none-any.whl (953 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m953.9/953.9 kB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
Collecting farama-notifications>=0.0.1 (from gymnasium)
  Downloading Farama_Notifications-0.0.4-py3-none-any.whl (2.5 kB)
Collecting mujoco>=2.3.3 (from gymnasium)
  Downloading mujoco-3.1.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (5.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.5/5.5 MB[0m [31m54.3 MB/s[0m eta [36m0:00:00[0m
Collecting glfw (from mujoco>=2.3.3->gymnasium)
  Downloading glfw-2.7.0-py2.py27.py3.py30.py31.py32.py33.py34.py35.py36.py37.py38-none-manylinux2014_x86_64.whl (211 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.8/211.8 kB[0m [31m13.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: glfw, farama-notifications, gymnasium, mujoco
Successfully installed farama-notifications-0.0.4 

In [None]:
# Extra dependencies for the Box2D environments (BipedalWalker) and for tuning/logging.
!pip install stable-baselines3 gym tensorboard
# swig is needed to build the Box2D bindings, so it is installed before them;
# -y answers the confirmation prompt because a notebook shell is non-interactive.
!apt-get install -y swig
!pip install box2d
# Quote the extra so the shell cannot glob-expand the square brackets.
!pip install 'gym[box2d]'
!pip install optuna

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  swig4.0
Suggested packages:
  swig-doc swig-examples swig4.0-examples swig4.0-doc
The following NEW packages will be installed:
  swig swig4.0
0 upgraded, 2 newly installed, 0 to remove and 45 not upgraded.
Need to get 1,116 kB of archives.
After this operation, 5,542 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/universe amd64 swig4.0 amd64 4.0.2-1ubuntu1 [1,110 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy/universe amd64 swig all 4.0.2-1ubuntu1 [5,632 B]
Fetched 1,116 kB in 1s (1,371 kB/s)
Selecting previously unselected package swig4.0.
(Reading database ... 121925 files and directories currently installed.)
Preparing to unpack .../swig4.0_4.0.2-1ubuntu1_amd64.deb ...
Unpacking swig4.0 (4.0.2-1ubuntu1) ...
Selecting previously unselected package swig.
Preparing to unpack .../swig_4.0.2-1ubu

In [None]:
#Model 2: off-policy SAC model
import time
import numpy as np
import gym
from stable_baselines3 import SAC
from stable_baselines3.common.vec_env import VecNormalize, DummyVecEnv
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.utils import set_random_seed
from stable_baselines3.common.callbacks import EvalCallback, StopTrainingOnRewardThreshold

def create_bipedalwalker_env(seed=None, training=True, norm_reward=True):
    """Build a single-instance, normalised BipedalWalker-v3 vectorised environment.

    Args:
        seed: Seed forwarded to ``make_vec_env``; ``None`` leaves the
            environment unseeded.
        training: Forwarded to ``VecNormalize``. Keep ``True`` for the
            environment the agent learns on; pass ``False`` for
            evaluation/test environments so their running statistics are
            not updated while the policy is being scored.
        norm_reward: Forwarded to ``VecNormalize``. Keep ``True`` for
            training; pass ``False`` when the raw environment reward
            should be returned by ``step`` (e.g. when reporting scores).

    Returns:
        The ``VecNormalize`` wrapper around the one-environment vec env.
        Observations are always normalised (``norm_obs=True``).
    """
    env = make_vec_env('BipedalWalker-v3', n_envs=1, seed=seed)
    env = VecNormalize(env, training=training, norm_obs=True, norm_reward=norm_reward)
    return env

if __name__ == "__main__":
    # Train SAC on BipedalWalker-v3 with one seed, then score the trained
    # policy on several unseen seeds using the normalisation statistics
    # learned during training.
    seed = 0
    set_random_seed(seed)

    print(f"Training SAC model with seed: {seed}")

    # One place for the statistics file name so the save and the loads agree.
    vecnormalize_path = f'BipedalWalker-v3_vecnormalize_seed_{seed}.pkl'

    #Create the training environment
    env = create_bipedalwalker_env(seed)

    # Dedicated evaluation environment: evaluating on the training env would
    # reset it in the middle of a training rollout and keep updating its
    # running statistics. Statistics are frozen and rewards left raw here.
    # NOTE(review): EvalCallback is expected to sync the normalisation
    # statistics from the training env before each evaluation - confirm for
    # the installed stable-baselines3 version.
    eval_env = create_bipedalwalker_env(seed + 100)
    eval_env.training = False
    eval_env.norm_reward = False

    #Train the agent
    model = SAC("MlpPolicy", env, verbose=1, learning_rate=3e-4, gamma=0.99, buffer_size=1000000, batch_size=256,
                tau=0.005, train_freq=1, gradient_steps=1, tensorboard_log="./tb_logs/", seed=seed)

    #Stop early once the mean evaluation reward reaches 300
    callback_on_best = StopTrainingOnRewardThreshold(reward_threshold=300, verbose=1)
    eval_callback = EvalCallback(eval_env, callback_on_new_best=callback_on_best, eval_freq=10000,
                                 best_model_save_path='./logs/', log_path='./logs/', deterministic=True, render=False)

    #Time the model
    start_time = time.time()

    #Training
    total_timesteps = 2000000
    model.learn(total_timesteps=total_timesteps, callback=eval_callback, tb_log_name="sac_bipedalwalker")

    #End timing when done
    end_time = time.time()

    #Calculate time
    training_time = end_time - start_time

    #Save the model
    model.save(f'sac_bipedalwalker_seed_{seed}')

    #Save VecNormalize statistics (only meaningful after training has populated them)
    env.save(vecnormalize_path)

    #Print the results
    best_score = eval_callback.best_mean_reward
    if best_score < 300:
        print(f"Best score achieved: {best_score}")

    print(f"Completed training SAC model for seed: {seed}")
    print(f"Training session for SAC took {training_time:.2f} seconds")

    #Test model on multiple seeds
    test_seeds = [1, 2, 3, 4, 5]
    test_results = []

    for test_seed in test_seeds:
        set_random_seed(test_seed)
        # Reuse the statistics learned during training so the policy sees
        # observations on the same scale it was trained on; freeze them and
        # disable reward normalisation so the reported return is the raw one.
        test_env = VecNormalize.load(
            vecnormalize_path,
            make_vec_env('BipedalWalker-v3', n_envs=1, seed=test_seed),
        )
        test_env.training = False
        test_env.norm_reward = False
        try:
            obs = test_env.reset()
            total_reward = 0.0
            done = False
            while not done:
                action, _ = model.predict(obs, deterministic=True)
                # Vec envs return length-1 arrays here (n_envs=1); unwrap them
                # so the accumulated return is a plain float.
                obs, rewards, dones, _ = test_env.step(action)
                total_reward += float(rewards[0])
                done = bool(dones[0])
        finally:
            test_env.close()
        test_results.append(total_reward)
        print(f"Test seed: {test_seed}, Total reward: {total_reward}")

    print(f"Average reward across test seeds: {np.mean(test_results)}, Std Dev: {np.std(test_results)}")

    # Release simulator resources held by the long-lived environments.
    eval_env.close()
    env.close()

Training SAC model with seed: 0
Using cuda device
Logging to ./tb_logs/sac_bipedalwalker_1
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 462      |
|    ep_rew_mean     | -104     |
| time/              |          |
|    episodes        | 4        |
|    fps             | 81       |
|    time_elapsed    | 22       |
|    total_timesteps | 1850     |
| train/             |          |
|    actor_loss      | -16.9    |
|    critic_loss     | 0.104    |
|    ent_coef        | 0.592    |
|    ent_coef_loss   | -3.47    |
|    learning_rate   | 0.0003   |
|    n_updates       | 1749     |
---------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 458      |
|    ep_rew_mean     | -107     |
| time/              |          |
|    episodes        | 8        |
|    fps             | 78       |
|    time_elapsed    | 46       |
|    total_timesteps | 3664     |
| train/             |   