In [None]:
!pip install gymnasium gymnasium[mujoco]
!pip install 'shimmy>=0.2.1'
!pip install gymnasium
!pip install -- upgrade stable - baselines3
!pip install mujoco
!pip install stable-baselines3
!pip install stable-baselines3 gym tensorboard
!apt-get install swig
!pip install box2d
!pip install gym[box2d]
!pip install optuna

Collecting gymnasium
  Downloading gymnasium-0.29.1-py3-none-any.whl (953 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m953.9/953.9 kB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
Collecting farama-notifications>=0.0.1 (from gymnasium)
  Downloading Farama_Notifications-0.0.4-py3-none-any.whl (2.5 kB)
Collecting mujoco>=2.3.3 (from gymnasium)
  Downloading mujoco-3.1.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (5.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.5/5.5 MB[0m [31m21.2 MB/s[0m eta [36m0:00:00[0m
Collecting glfw (from mujoco>=2.3.3->gymnasium)
  Downloading glfw-2.7.0-py2.py27.py3.py30.py31.py32.py33.py34.py35.py36.py37.py38-none-manylinux2014_x86_64.whl (211 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.8/211.8 kB[0m [31m14.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: glfw, farama-notifications, gymnasium, mujoco
Successfully installed farama-notifications-0.0.4 g

In [None]:
# Model 1.2: on-policy PPO model (clip_range=0.4)
import gym
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import VecNormalize
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.utils import set_random_seed
from stable_baselines3.common.callbacks import EvalCallback, StopTrainingOnRewardThreshold
import time
import numpy as np

def create_bipedalwalker_env(seed=None):
    """Build a normalized, single-copy BipedalWalker-v3 vectorized environment.

    Args:
        seed: Seed forwarded to ``make_vec_env``; ``None`` leaves it unseeded.

    Returns:
        A ``VecNormalize`` wrapper (observation and reward normalization both
        enabled) around a one-environment vectorized BipedalWalker-v3.
    """
    return VecNormalize(
        make_vec_env('BipedalWalker-v3', n_envs=1, seed=seed),
        norm_obs=True,
        norm_reward=True,
    )

if __name__ == "__main__":
    # Train a PPO agent on BipedalWalker-v3 with a fixed seed, save the model
    # together with its observation-normalization statistics, then roll the
    # trained policy out on several unseen seeds and report raw returns.
    seed = 0
    set_random_seed(seed, using_cuda=True)

    print(f"Training PPO model with seed: {seed}")

    #Create the training environment
    env = create_bipedalwalker_env(seed)

    #Separate evaluation environment, so periodic evaluation neither resets the
    #training env mid-rollout nor feeds evaluation episodes into the running
    #normalization statistics. EvalCallback syncs the statistics from the
    #training env before each evaluation.
    eval_env = create_bipedalwalker_env(seed)
    eval_env.training = False
    eval_env.norm_reward = False

    #Train the agent
    model = PPO("MlpPolicy", env, verbose=1, learning_rate=3e-4, gamma=0.99, gae_lambda=0.95,
                ent_coef=0.01, n_steps=2048, batch_size=256, clip_range=0.4, tensorboard_log="./tb_logs/", seed=seed)

    #Evaluate if the model reaches 300
    callback_on_best = StopTrainingOnRewardThreshold(reward_threshold=300, verbose=1)
    eval_callback = EvalCallback(eval_env, callback_on_new_best=callback_on_best, eval_freq=10000, best_model_save_path='./logs/', log_path='./logs/', deterministic=True, render=False)

    #Time the model
    start_time = time.time()

    #Training
    total_timesteps = 2000000
    model.learn(total_timesteps=total_timesteps, callback=eval_callback, tb_log_name="ppo_bipedalwalker")

    #End timing when done
    end_time = time.time()

    #Calculate time
    training_time = end_time - start_time

    #Save the model
    model.save(f'ppo_bipedalwalker_seed_{seed}')

    #Save the normalization statistics AFTER training; saving before learn()
    #would only persist the initial (untrained) running mean/variance.
    vecnormalize_path = f'BipedalWalker-v3_vecnormalize_seed_{seed}.pkl'
    env.save(vecnormalize_path)

    #Print the results
    best_score = eval_callback.best_mean_reward
    if best_score < 300:
        print(f"Best score achieved: {best_score}")

    #Freeze the training env's statistics now that learning is finished
    env.training = False
    env.norm_reward = False

    print(f"Completed training PPO model for seed: {seed}")
    print(f"Total training time: {training_time:.2f} seconds")

    #Test model on multiple seeds
    test_seeds = [1, 2, 3, 4, 5]
    test_results = []

    for test_seed in test_seeds:
        set_random_seed(test_seed, using_cuda=True)
        #Reuse the trained normalization statistics: a freshly constructed
        #VecNormalize would scale observations differently from what the
        #policy saw during training.
        test_env = VecNormalize.load(
            vecnormalize_path,
            make_vec_env('BipedalWalker-v3', n_envs=1, seed=test_seed),
        )
        test_env.training = False      #do not update statistics at test time
        test_env.norm_reward = False   #report true environment rewards
        obs = test_env.reset()
        total_reward = 0.0
        done = False
        while not done:
            action, _ = model.predict(obs, deterministic=True)
            #The vectorized env returns length-1 arrays; unpack to scalars
            obs, rewards, dones, _ = test_env.step(action)
            total_reward += float(rewards[0])
            done = bool(dones[0])
        test_env.close()
        test_results.append(total_reward)
        print(f"Test seed: {test_seed}, Total reward: {total_reward}")

    print(f"Average reward across test seeds: {np.mean(test_results)}, Std Dev: {np.std(test_results)}")

    #Release simulator resources
    eval_env.close()
    env.close()

[1;30;43mStreaminguitvoer ingekort tot de laatste 5000 regels.[0m
|    loss                 | -0.109     |
|    n_updates            | 7670       |
|    policy_gradient_loss | -0.0334    |
|    std                  | 1.63       |
|    value_loss           | 0.0416     |
----------------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 1e+03       |
|    ep_rew_mean          | 189         |
| time/                   |             |
|    fps                  | 393         |
|    iterations           | 769         |
|    time_elapsed         | 4002        |
|    total_timesteps      | 1574912     |
| train/                  |             |
|    approx_kl            | 0.033292167 |
|    clip_fraction        | 0.0823      |
|    clip_range           | 0.4         |
|    entropy_loss         | -7.48       |
|    explained_variance   | 0.392       |
|    learning_rate        | 0.0003      |
|    loss     