<a href="https://colab.research.google.com/github/Chpppy/Colab/blob/main/notebooks/LunarLander.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install gymnasium
!pip install stable_baselines3
!pip install sb3-contrib
!pip install optuna

Collecting gymnasium
  Downloading gymnasium-1.0.0-py3-none-any.whl.metadata (9.5 kB)
Collecting farama-notifications>=0.0.1 (from gymnasium)
  Downloading Farama_Notifications-0.0.4-py3-none-any.whl.metadata (558 bytes)
Downloading gymnasium-1.0.0-py3-none-any.whl (958 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/958.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━[0m [32m481.3/958.1 kB[0m [31m14.4 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m958.1/958.1 kB[0m [31m16.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading Farama_Notifications-0.0.4-py3-none-any.whl (2.5 kB)
Installing collected packages: farama-notifications, gymnasium
Successfully installed farama-notifications-0.0.4 gymnasium-1.0.0
Collecting stable_baselines3
  Downloading stable_baselines3-2.5.0-py3-none-any.whl.metadata (4.8 kB)
Downloading stable_baselines3-2.5.0-py3-none-any.w

In [2]:
!pip install swig
!pip install gymnasium[box2d]

Collecting swig
  Downloading swig-4.3.0-py2.py3-none-manylinux_2_5_x86_64.manylinux1_x86_64.whl.metadata (3.5 kB)
Downloading swig-4.3.0-py2.py3-none-manylinux_2_5_x86_64.manylinux1_x86_64.whl (1.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m19.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: swig
Successfully installed swig-4.3.0
Collecting box2d-py==2.3.5 (from gymnasium[box2d])
  Downloading box2d-py-2.3.5.tar.gz (374 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m374.4/374.4 kB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: box2d-py
  Building wheel for box2d-py (setup.py) ... [?25l[?25hdone
  Created wheel for box2d-py: filename=box2d_py-2.3.5-cp311-cp311-linux_x86_64.whl size=2379449 sha256=6d7aec7897bbd5d3917c88bc468461e4d737468414eff24838f571bda1031e72
  Stored in directory: /root/.cache/pip/wheels/

In [3]:
import gymnasium as gym
import numpy as np
from stable_baselines3 import PPO, A2C
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.monitor import Monitor

In [37]:
# Configuration of the Optuna hyperparameter search
N_TRIALS = 100  # Maximum number of trials
N_JOBS = 1 # Number of jobs to run in parallel
N_STARTUP_TRIALS = 5  # Stop random sampling after N_STARTUP_TRIALS
N_EVALUATIONS = 2  # Number of evaluations during the training
N_TIMESTEPS = int(2e4)  # Training budget
# Evaluate every EVAL_FREQ callback calls (20000 / 2 = 10000 with the values above)
EVAL_FREQ = int(N_TIMESTEPS / N_EVALUATIONS)
N_EVAL_ENVS = 5  # Number of environments created for evaluation
N_EVAL_EPISODES = 10  # Number of episodes per evaluation
# Passed as `timeout` to `study.optimize`: 60 * 60 = 3600 seconds, i.e. 1 hour
TIMEOUT = int(60 * 60)  # 1 hour

# Environment used both for training and for evaluation
ENV_ID = "LunarLander-v3"

# Keyword arguments shared by every trial; the sampled hyperparameters
# are merged into a copy of this dict before the PPO model is built.
DEFAULT_HYPERPARAMS = {
    "policy": "MlpPolicy",
    "env": ENV_ID,
}

In [21]:
import optuna
from optuna.pruners import MedianPruner
from optuna.samplers import TPESampler
from optuna.visualization import plot_optimization_history, plot_param_importances

In [38]:
from typing import Any, Dict
import torch
import torch.nn as nn

def sample_ppo_params(trial: optuna.Trial) -> Dict[str, Any]:
    """
    Draw one PPO hyperparameter configuration from the search space.

    The values are requested from the trial in a fixed order: ``gamma``,
    ``max_grad_norm``, ``exponent_n_steps``, ``learning_rate``, ``net_arch``
    and ``activation_fn``.

    :param trial: Optuna trial object used to suggest every value
    :return: Keyword arguments holding ``n_steps``, ``gamma``,
        ``learning_rate``, ``max_grad_norm`` and ``policy_kwargs``.
    """
    # The "gamma" parameter stores (1 - gamma) on a log scale,
    # so the discount factor itself lies between 0.9 and 0.9999.
    one_minus_gamma = trial.suggest_float("gamma", 0.0001, 0.1, log=True)
    max_grad_norm = trial.suggest_float("max_grad_norm", 0.3, 5.0, log=True)
    # n_steps is a power of two: 2**9 = 512 or 2**10 = 1024
    steps_exponent = trial.suggest_int("exponent_n_steps", 9, 10)
    learning_rate = trial.suggest_float("learning_rate", 1e-5, 1, log=True)
    arch_name = trial.suggest_categorical("net_arch", ["tiny", "small"])
    activation_name = trial.suggest_categorical("activation_fn", ["tanh", "relu"])

    gamma = 1.0 - one_minus_gamma
    n_steps = 2 ** steps_exponent

    # Store the derived values so they show up next to the raw parameters
    trial.set_user_attr("gamma_", gamma)
    trial.set_user_attr("n_steps", n_steps)

    # "tiny" is one hidden layer of 64 units, anything else two such layers;
    # the policy ("pi") and value ("vf") heads each get their own list.
    hidden_layers = [64] if arch_name == "tiny" else [64, 64]
    net_arch = {"pi": list(hidden_layers), "vf": list(hidden_layers)}

    # Map the sampled name to the torch activation class
    activation_classes = {"tanh": nn.Tanh, "relu": nn.ReLU}
    activation_fn = activation_classes[activation_name]

    policy_kwargs = {
        "net_arch": net_arch,
        "activation_fn": activation_fn,
    }
    return {
        "n_steps": n_steps,
        "gamma": gamma,
        "learning_rate": learning_rate,
        "max_grad_norm": max_grad_norm,
        "policy_kwargs": policy_kwargs,
    }

In [39]:
from stable_baselines3.common.callbacks import EvalCallback

class TrialEvalCallback(EvalCallback):
    """
    Evaluation callback that reports every evaluation to an Optuna trial
    and stops training when the trial should be pruned.

    :param eval_env: Evaluation environment
    :param trial: Optuna trial object
    :param n_eval_episodes: Number of evaluation episodes
    :param eval_freq: Evaluate the agent every ``eval_freq`` call of the callback.
    :param deterministic: Whether the evaluation should
        use a stochastic or deterministic policy.
    :param verbose: Verbosity level, forwarded unchanged to ``EvalCallback``
    """

    def __init__(
        self,
        eval_env: gym.Env,
        trial: optuna.Trial,
        n_eval_episodes: int = 5,
        eval_freq: int = 10000,
        deterministic: bool = True,
        verbose: int = 0,
    ):
        parent_kwargs = dict(
            eval_env=eval_env,
            n_eval_episodes=n_eval_episodes,
            eval_freq=eval_freq,
            deterministic=deterministic,
            verbose=verbose,
        )
        super().__init__(**parent_kwargs)
        # Number of evaluations reported so far (used as the Optuna step)
        self.eval_idx = 0
        # Set to True once Optuna asked to prune this trial
        self.is_pruned = False
        self.trial = trial

    def _on_step(self) -> bool:
        """
        Run an evaluation when one is due and report it to Optuna.

        :return: ``False`` to stop training when the trial is pruned,
            ``True`` otherwise.
        """
        evaluation_due = self.eval_freq > 0 and self.n_calls % self.eval_freq == 0
        if not evaluation_due:
            return True

        # The parent class performs the actual policy evaluation
        super()._on_step()
        self.eval_idx += 1
        # Report the latest mean reward as an intermediate value
        self.trial.report(self.last_mean_reward, self.eval_idx)

        # Let the pruner decide whether this trial is worth continuing
        if self.trial.should_prune():
            self.is_pruned = True
            return False
        return True

In [40]:
def objective(trial: optuna.Trial) -> float:
    """
    Objective function used by Optuna to evaluate
    one configuration (i.e., one set of hyperparameters).

    Given a trial object, it samples hyperparameters, trains a PPO model
    with them and reports the result (mean episodic reward after training).

    :param trial: Optuna trial object
    :return: Mean episodic reward of the last evaluation, or NaN when
        training failed because of invalid (NaN) values
    :raises optuna.exceptions.TrialPruned: if the evaluation callback
        stopped training because the trial should be pruned
    """
    # 1. Sample hyperparameters and merge them into a copy of the defaults
    kwargs = DEFAULT_HYPERPARAMS.copy()
    kwargs.update(sample_ppo_params(trial))
    # Create the RL model
    model = PPO(**kwargs, device="cpu")

    # 2. Create the envs used for evaluation
    env = make_vec_env(env_id=ENV_ID, n_envs=N_EVAL_ENVS)
    # 3. Create the callback that evaluates the agent every `EVAL_FREQ`
    # calls over `N_EVAL_EPISODES` episodes and reports to the trial
    eval_callback = TrialEvalCallback(
        env,
        trial,
        n_eval_episodes=N_EVAL_EPISODES,
        eval_freq=EVAL_FREQ,
        deterministic=True,
        verbose=1,
    )

    nan_encountered = False
    try:
        # Train the model
        model.learn(N_TIMESTEPS, callback=eval_callback)
    except (AssertionError, ValueError) as e:
        # Sometimes, random hyperparams can generate NaN. PyTorch's
        # distribution argument validation reports NaN parameters as a
        # ValueError, so it is caught alongside AssertionError; otherwise a
        # single diverging trial would abort the whole study.
        print(e)
        nan_encountered = True
    finally:
        # Free memory
        model.env.close()
        env.close()

    # Tell the optimizer that the trial failed
    if nan_encountered:
        return float("nan")

    if eval_callback.is_pruned:
        raise optuna.exceptions.TrialPruned()

    return eval_callback.last_mean_reward

In [41]:
import torch as th

# Set pytorch num threads to 1 for faster training
th.set_num_threads(1)
# Select the sampler, can be random, TPESampler, CMAES, ...
sampler = TPESampler(n_startup_trials=N_STARTUP_TRIALS)
# Intended rule: do not prune before 1/3 of the max budget is used.
# NOTE: the integer division yields 0 warm-up steps when N_EVALUATIONS < 3.
pruner = MedianPruner(
    n_startup_trials=N_STARTUP_TRIALS, n_warmup_steps=N_EVALUATIONS // 3
)
# Create the study and start the hyperparameter optimization
study = optuna.create_study(sampler=sampler, pruner=pruner, direction="maximize")

try:
    study.optimize(objective, n_trials=N_TRIALS, n_jobs=N_JOBS, timeout=TIMEOUT)
except KeyboardInterrupt:
    # A manual interruption still goes on to report what was collected so far
    pass

print("Number of finished trials: ", len(study.trials))

# `study.best_trial` raises ValueError when no trial has completed (for
# example after an early interruption), so only report it when one exists.
completed_trials = study.get_trials(
    deepcopy=False, states=(optuna.trial.TrialState.COMPLETE,)
)
if completed_trials:
    print("Best trial:")
    trial = study.best_trial

    print(f"  Value: {trial.value}")

    print("  Params: ")
    for key, value in trial.params.items():
        print(f"    {key}: {value}")

    print("  User attrs:")
    for key, value in trial.user_attrs.items():
        print(f"    {key}: {value}")
else:
    print("No trial completed, skipping the best-trial report.")

# Write report
# NOTE: the file name still says "cartpole" although ENV_ID is LunarLander;
# it is kept unchanged so that existing paths keep working.
study.trials_dataframe().to_csv("study_results_ppo_cartpole.csv")

fig1 = plot_optimization_history(study)
fig2 = plot_param_importances(study)

fig1.show()
fig2.show()

[I 2025-02-24 08:54:09,772] A new study created in memory with name: no-name-5756be1a-f8bd-444b-a78f-3d70f9f244a2


Eval num_timesteps=10000, episode_reward=-233.81 +/- 114.16
Episode length: 136.40 +/- 23.50
New best mean reward!
Eval num_timesteps=20000, episode_reward=-164.40 +/- 53.72
Episode length: 687.80 +/- 173.60
New best mean reward!


[I 2025-02-24 08:55:11,449] Trial 0 finished with value: -164.3962309 and parameters: {'gamma': 0.000744730179195844, 'max_grad_norm': 2.6637881242995936, 'exponent_n_steps': 10, 'learning_rate': 0.001371402498309818, 'net_arch': 'small', 'activation_fn': 'tanh'}. Best is trial 0 with value: -164.3962309.


Eval num_timesteps=10000, episode_reward=-573.04 +/- 145.36
Episode length: 69.10 +/- 8.61
New best mean reward!
Eval num_timesteps=20000, episode_reward=-620.98 +/- 189.79
Episode length: 73.00 +/- 12.82


[I 2025-02-24 08:55:47,626] Trial 1 finished with value: -620.9750089 and parameters: {'gamma': 0.013998957413102604, 'max_grad_norm': 0.38078976749173177, 'exponent_n_steps': 9, 'learning_rate': 0.5689997431746266, 'net_arch': 'tiny', 'activation_fn': 'tanh'}. Best is trial 0 with value: -164.3962309.


Eval num_timesteps=10000, episode_reward=-717.64 +/- 290.08
Episode length: 220.00 +/- 31.12
New best mean reward!
Eval num_timesteps=20000, episode_reward=-719.41 +/- 99.40
Episode length: 419.50 +/- 58.95


[I 2025-02-24 08:56:31,724] Trial 2 finished with value: -719.4091194 and parameters: {'gamma': 0.09874241549431143, 'max_grad_norm': 2.3843290080941846, 'exponent_n_steps': 10, 'learning_rate': 0.0006494063071454294, 'net_arch': 'tiny', 'activation_fn': 'tanh'}. Best is trial 0 with value: -164.3962309.


Eval num_timesteps=10000, episode_reward=-1082.23 +/- 654.99
Episode length: 150.70 +/- 77.73
New best mean reward!
Eval num_timesteps=20000, episode_reward=-1079.85 +/- 945.32
Episode length: 143.90 +/- 87.79
New best mean reward!


[I 2025-02-24 08:57:14,043] Trial 3 finished with value: -1079.854274 and parameters: {'gamma': 0.06966806981339066, 'max_grad_norm': 0.3335261115039163, 'exponent_n_steps': 10, 'learning_rate': 0.00012391494181636037, 'net_arch': 'small', 'activation_fn': 'relu'}. Best is trial 0 with value: -164.3962309.


Eval num_timesteps=10000, episode_reward=-187.33 +/- 103.93
Episode length: 71.80 +/- 14.08
New best mean reward!
Eval num_timesteps=20000, episode_reward=-236.87 +/- 129.80
Episode length: 74.30 +/- 10.23


[I 2025-02-24 08:57:47,884] Trial 4 finished with value: -236.8713959 and parameters: {'gamma': 0.0002990749417515859, 'max_grad_norm': 3.741980229038252, 'exponent_n_steps': 10, 'learning_rate': 0.030720590843176118, 'net_arch': 'tiny', 'activation_fn': 'tanh'}. Best is trial 0 with value: -164.3962309.
[I 2025-02-24 08:58:07,903] Trial 5 pruned. 


Eval num_timesteps=10000, episode_reward=-996.50 +/- 704.72
Episode length: 139.00 +/- 59.24
New best mean reward!
Eval num_timesteps=10000, episode_reward=-199.73 +/- 67.25
Episode length: 206.20 +/- 83.89
New best mean reward!
Eval num_timesteps=20000, episode_reward=-510.56 +/- 111.23
Episode length: 97.20 +/- 34.84


[I 2025-02-24 08:58:52,959] Trial 6 finished with value: -510.5634592 and parameters: {'gamma': 0.0019651752094409385, 'max_grad_norm': 4.989548695724667, 'exponent_n_steps': 10, 'learning_rate': 0.01263724271973843, 'net_arch': 'small', 'activation_fn': 'tanh'}. Best is trial 0 with value: -164.3962309.


Eval num_timesteps=10000, episode_reward=-380.70 +/- 567.52
Episode length: 341.50 +/- 197.86
New best mean reward!
Eval num_timesteps=20000, episode_reward=68.75 +/- 119.86
Episode length: 641.10 +/- 126.07
New best mean reward!


[I 2025-02-24 09:00:01,877] Trial 7 finished with value: 68.74624159999999 and parameters: {'gamma': 0.0001015392328807807, 'max_grad_norm': 1.4345497460836967, 'exponent_n_steps': 9, 'learning_rate': 0.001632956204180654, 'net_arch': 'small', 'activation_fn': 'tanh'}. Best is trial 7 with value: 68.74624159999999.
[I 2025-02-24 09:00:21,243] Trial 8 pruned. 


Eval num_timesteps=10000, episode_reward=-597.27 +/- 158.84
Episode length: 65.30 +/- 8.12
New best mean reward!


[I 2025-02-24 09:00:42,001] Trial 9 pruned. 


Eval num_timesteps=10000, episode_reward=-1208.60 +/- 1440.03
Episode length: 163.40 +/- 103.95
New best mean reward!
Eval num_timesteps=10000, episode_reward=-137.71 +/- 18.04
Episode length: 1000.00 +/- 0.00
New best mean reward!
Eval num_timesteps=20000, episode_reward=5.21 +/- 23.79
Episode length: 1000.00 +/- 0.00
New best mean reward!


[I 2025-02-24 09:02:10,112] Trial 10 finished with value: 5.2075553 and parameters: {'gamma': 0.00012216001865418427, 'max_grad_norm': 0.6299827124232847, 'exponent_n_steps': 9, 'learning_rate': 0.01816290609152004, 'net_arch': 'tiny', 'activation_fn': 'relu'}. Best is trial 7 with value: 68.74624159999999.


Eval num_timesteps=10000, episode_reward=-72.76 +/- 132.46
Episode length: 641.00 +/- 288.94
New best mean reward!
Eval num_timesteps=20000, episode_reward=-78.73 +/- 120.27
Episode length: 576.60 +/- 197.10


[I 2025-02-24 09:03:18,004] Trial 11 finished with value: -78.72605089999999 and parameters: {'gamma': 0.00011075007939486335, 'max_grad_norm': 0.6209515040345722, 'exponent_n_steps': 9, 'learning_rate': 0.008144493787027416, 'net_arch': 'tiny', 'activation_fn': 'relu'}. Best is trial 7 with value: 68.74624159999999.


Eval num_timesteps=10000, episode_reward=-142.54 +/- 52.04
Episode length: 71.00 +/- 10.06
New best mean reward!
Eval num_timesteps=20000, episode_reward=-159.10 +/- 48.84
Episode length: 63.60 +/- 10.36


[I 2025-02-24 09:03:51,564] Trial 12 finished with value: -159.1001789 and parameters: {'gamma': 0.0011682460570618548, 'max_grad_norm': 1.4287207208997943, 'exponent_n_steps': 9, 'learning_rate': 0.04891700337416687, 'net_arch': 'tiny', 'activation_fn': 'relu'}. Best is trial 7 with value: 68.74624159999999.


Eval num_timesteps=10000, episode_reward=-183.03 +/- 68.22
Episode length: 291.40 +/- 97.41
New best mean reward!
Eval num_timesteps=20000, episode_reward=32.19 +/- 148.44
Episode length: 303.70 +/- 96.35
New best mean reward!


[I 2025-02-24 09:04:40,007] Trial 13 finished with value: 32.193333900000006 and parameters: {'gamma': 0.00024455908145303815, 'max_grad_norm': 0.5840230534191635, 'exponent_n_steps': 9, 'learning_rate': 0.0028286990985151323, 'net_arch': 'tiny', 'activation_fn': 'relu'}. Best is trial 7 with value: 68.74624159999999.
[I 2025-02-24 09:05:05,220] Trial 14 pruned. 


Eval num_timesteps=10000, episode_reward=-337.29 +/- 47.47
Episode length: 333.60 +/- 140.73
New best mean reward!


[I 2025-02-24 09:05:22,032] Trial 15 pruned. 


Eval num_timesteps=10000, episode_reward=-453.05 +/- 188.87
Episode length: 142.80 +/- 53.57
New best mean reward!
Eval num_timesteps=10000, episode_reward=-104.56 +/- 25.88
Episode length: 245.90 +/- 203.33
New best mean reward!
Eval num_timesteps=20000, episode_reward=-182.21 +/- 29.47
Episode length: 641.80 +/- 117.35


[I 2025-02-24 09:06:21,051] Trial 16 finished with value: -182.20978399999998 and parameters: {'gamma': 0.00053556227876498, 'max_grad_norm': 0.9032611029072047, 'exponent_n_steps': 9, 'learning_rate': 0.0038831163570780683, 'net_arch': 'tiny', 'activation_fn': 'tanh'}. Best is trial 7 with value: 68.74624159999999.
[I 2025-02-24 09:06:39,724] Trial 17 pruned. 


Eval num_timesteps=10000, episode_reward=-522.20 +/- 155.01
Episode length: 61.80 +/- 9.95
New best mean reward!


[I 2025-02-24 09:06:57,550] Trial 18 pruned. 


Eval num_timesteps=10000, episode_reward=-1175.68 +/- 880.31
Episode length: 162.60 +/- 72.17
New best mean reward!


[I 2025-02-24 09:07:18,132] Trial 19 pruned. 


Eval num_timesteps=10000, episode_reward=-681.96 +/- 350.79
Episode length: 120.30 +/- 45.43
New best mean reward!


[I 2025-02-24 09:07:38,507] Trial 20 pruned. 


Eval num_timesteps=10000, episode_reward=-2034.66 +/- 593.69
Episode length: 408.90 +/- 66.49
New best mean reward!


[I 2025-02-24 09:08:04,288] Trial 21 pruned. 


Eval num_timesteps=10000, episode_reward=-194.04 +/- 77.83
Episode length: 381.90 +/- 287.68
New best mean reward!


[I 2025-02-24 09:08:33,476] Trial 22 pruned. 


Eval num_timesteps=10000, episode_reward=-241.80 +/- 38.22
Episode length: 566.40 +/- 206.42
New best mean reward!
Eval num_timesteps=10000, episode_reward=-127.66 +/- 15.19
Episode length: 69.80 +/- 11.29
New best mean reward!
Eval num_timesteps=20000, episode_reward=-119.69 +/- 46.41
Episode length: 73.70 +/- 15.09
New best mean reward!


[I 2025-02-24 09:09:07,364] Trial 23 finished with value: -119.69151190000002 and parameters: {'gamma': 0.00010954228261965336, 'max_grad_norm': 1.212662591796925, 'exponent_n_steps': 9, 'learning_rate': 0.11190761327284113, 'net_arch': 'tiny', 'activation_fn': 'relu'}. Best is trial 7 with value: 68.74624159999999.


Eval num_timesteps=10000, episode_reward=-167.61 +/- 26.21
Episode length: 367.60 +/- 82.06
New best mean reward!
Eval num_timesteps=20000, episode_reward=51.01 +/- 146.69
Episode length: 339.10 +/- 52.57
New best mean reward!


[I 2025-02-24 09:10:06,075] Trial 24 finished with value: 51.0077633 and parameters: {'gamma': 0.00038542582586045826, 'max_grad_norm': 0.5095886019626862, 'exponent_n_steps': 9, 'learning_rate': 0.002481673734859857, 'net_arch': 'tiny', 'activation_fn': 'relu'}. Best is trial 7 with value: 68.74624159999999.
[I 2025-02-24 09:10:28,194] Trial 25 pruned. 


Eval num_timesteps=10000, episode_reward=-379.27 +/- 35.27
Episode length: 392.80 +/- 104.01
New best mean reward!


[I 2025-02-24 09:10:48,285] Trial 26 pruned. 


Eval num_timesteps=10000, episode_reward=-553.79 +/- 231.80
Episode length: 132.90 +/- 27.23
New best mean reward!
Eval num_timesteps=10000, episode_reward=-140.40 +/- 19.87
Episode length: 266.20 +/- 90.38
New best mean reward!
Eval num_timesteps=20000, episode_reward=-225.01 +/- 60.00
Episode length: 344.90 +/- 258.64


[I 2025-02-24 09:11:40,482] Trial 27 finished with value: -225.0056054 and parameters: {'gamma': 0.0002121778570645785, 'max_grad_norm': 1.1016878412880655, 'exponent_n_steps': 9, 'learning_rate': 0.004768458787904804, 'net_arch': 'tiny', 'activation_fn': 'relu'}. Best is trial 7 with value: 68.74624159999999.
[I 2025-02-24 09:11:59,377] Trial 28 pruned. 


Eval num_timesteps=10000, episode_reward=-416.66 +/- 146.26
Episode length: 111.10 +/- 23.84
New best mean reward!


[I 2025-02-24 09:12:22,423] Trial 29 pruned. 


Eval num_timesteps=10000, episode_reward=-860.43 +/- 121.95
Episode length: 365.20 +/- 61.77
New best mean reward!


[I 2025-02-24 09:12:41,241] Trial 30 pruned. 


Eval num_timesteps=10000, episode_reward=-941.53 +/- 280.63
Episode length: 229.00 +/- 108.03
New best mean reward!


[I 2025-02-24 09:13:01,259] Trial 31 pruned. 


Eval num_timesteps=10000, episode_reward=-188.46 +/- 48.61
Episode length: 254.50 +/- 160.62
New best mean reward!


[I 2025-02-24 09:13:27,674] Trial 32 pruned. 


Eval num_timesteps=10000, episode_reward=-186.15 +/- 70.49
Episode length: 431.00 +/- 245.98
New best mean reward!
Eval num_timesteps=10000, episode_reward=-26.88 +/- 120.95
Episode length: 375.60 +/- 134.07
New best mean reward!
Eval num_timesteps=20000, episode_reward=71.22 +/- 117.42
Episode length: 374.70 +/- 279.15
New best mean reward!


[I 2025-02-24 09:14:18,451] Trial 33 finished with value: 71.2168073 and parameters: {'gamma': 0.00015930074953660867, 'max_grad_norm': 0.7434341114612586, 'exponent_n_steps': 9, 'learning_rate': 0.015460089685740275, 'net_arch': 'tiny', 'activation_fn': 'relu'}. Best is trial 33 with value: 71.2168073.
[I 2025-02-24 09:14:34,205] Trial 34 pruned. 


Eval num_timesteps=10000, episode_reward=-504.44 +/- 110.28
Episode length: 64.10 +/- 11.61
New best mean reward!


[I 2025-02-24 09:14:51,977] Trial 35 pruned. 


Eval num_timesteps=10000, episode_reward=-633.91 +/- 292.68
Episode length: 117.70 +/- 30.58
New best mean reward!


[I 2025-02-24 09:15:09,785] Trial 36 pruned. 


Eval num_timesteps=10000, episode_reward=-1096.87 +/- 691.35
Episode length: 146.70 +/- 68.31
New best mean reward!


[I 2025-02-24 09:15:28,983] Trial 37 pruned. 


Eval num_timesteps=10000, episode_reward=-805.93 +/- 518.14
Episode length: 185.20 +/- 60.46
New best mean reward!


[I 2025-02-24 09:15:50,926] Trial 38 pruned. 


Eval num_timesteps=10000, episode_reward=-345.54 +/- 115.47
Episode length: 170.10 +/- 57.24
New best mean reward!
Eval num_timesteps=10000, episode_reward=-114.94 +/- 36.55
Episode length: 275.20 +/- 88.72
New best mean reward!
Eval num_timesteps=20000, episode_reward=-145.23 +/- 57.95
Episode length: 712.20 +/- 255.37


[I 2025-02-24 09:16:56,619] Trial 39 finished with value: -145.22774529999998 and parameters: {'gamma': 0.0001560555537176508, 'max_grad_norm': 1.430520934827267, 'exponent_n_steps': 9, 'learning_rate': 0.002361040519629351, 'net_arch': 'small', 'activation_fn': 'relu'}. Best is trial 33 with value: 71.2168073.
[I 2025-02-24 09:17:19,296] Trial 40 pruned. 


Eval num_timesteps=10000, episode_reward=-519.32 +/- 166.41
Episode length: 453.00 +/- 137.54
New best mean reward!


[I 2025-02-24 09:17:48,831] Trial 41 pruned. 


Eval num_timesteps=10000, episode_reward=-279.15 +/- 42.10
Episode length: 715.40 +/- 189.08
New best mean reward!
Eval num_timesteps=10000, episode_reward=-113.44 +/- 135.44
Episode length: 115.90 +/- 30.30
New best mean reward!
Eval num_timesteps=20000, episode_reward=-141.17 +/- 47.65
Episode length: 301.20 +/- 148.25


[I 2025-02-24 09:18:29,030] Trial 42 finished with value: -141.17487169999998 and parameters: {'gamma': 0.00013833484490335238, 'max_grad_norm': 0.5730127329162963, 'exponent_n_steps': 9, 'learning_rate': 0.023498435089824708, 'net_arch': 'tiny', 'activation_fn': 'relu'}. Best is trial 33 with value: 71.2168073.


Eval num_timesteps=10000, episode_reward=-31.92 +/- 71.15
Episode length: 386.20 +/- 269.63
New best mean reward!
Eval num_timesteps=20000, episode_reward=68.90 +/- 165.47
Episode length: 389.30 +/- 200.05
New best mean reward!


[I 2025-02-24 09:19:32,121] Trial 43 finished with value: 68.9018012 and parameters: {'gamma': 0.00027937000999767524, 'max_grad_norm': 2.8935374016817104, 'exponent_n_steps': 9, 'learning_rate': 0.00834030364062998, 'net_arch': 'tiny', 'activation_fn': 'relu'}. Best is trial 33 with value: 71.2168073.
[I 2025-02-24 09:20:00,226] Trial 44 pruned. 


Eval num_timesteps=10000, episode_reward=-243.54 +/- 63.58
Episode length: 415.80 +/- 204.71
New best mean reward!


[I 2025-02-24 09:20:21,049] Trial 45 pruned. 


Eval num_timesteps=10000, episode_reward=-292.64 +/- 56.10
Episode length: 278.90 +/- 109.37
New best mean reward!


[I 2025-02-24 09:20:41,216] Trial 46 pruned. 


Eval num_timesteps=10000, episode_reward=-1036.87 +/- 634.99
Episode length: 143.30 +/- 70.46
New best mean reward!


[I 2025-02-24 09:20:59,577] Trial 47 pruned. 


Eval num_timesteps=10000, episode_reward=-266.52 +/- 35.29
Episode length: 154.60 +/- 61.92
New best mean reward!


[I 2025-02-24 09:21:18,414] Trial 48 pruned. 


Eval num_timesteps=10000, episode_reward=-231.60 +/- 34.09
Episode length: 221.90 +/- 54.88
New best mean reward!


[I 2025-02-24 09:21:39,027] Trial 49 pruned. 


Eval num_timesteps=10000, episode_reward=-945.08 +/- 567.97
Episode length: 127.10 +/- 60.67
New best mean reward!


[I 2025-02-24 09:21:56,371] Trial 50 pruned. 


Eval num_timesteps=10000, episode_reward=-297.42 +/- 142.14
Episode length: 130.50 +/- 43.70
New best mean reward!


[I 2025-02-24 09:22:21,976] Trial 51 pruned. 


Eval num_timesteps=10000, episode_reward=-143.64 +/- 153.16
Episode length: 393.40 +/- 172.64
New best mean reward!
Eval num_timesteps=10000, episode_reward=-67.22 +/- 34.27
Episode length: 785.60 +/- 327.97
New best mean reward!
Eval num_timesteps=20000, episode_reward=-98.31 +/- 22.42
Episode length: 1000.00 +/- 0.00


[I 2025-02-24 09:23:43,180] Trial 52 finished with value: -98.30626409999999 and parameters: {'gamma': 0.00024157069977113148, 'max_grad_norm': 0.6000196170718943, 'exponent_n_steps': 9, 'learning_rate': 0.011400313473844204, 'net_arch': 'tiny', 'activation_fn': 'relu'}. Best is trial 33 with value: 71.2168073.
[I 2025-02-24 09:24:14,539] Trial 53 pruned. 


Eval num_timesteps=10000, episode_reward=-237.46 +/- 48.54
Episode length: 676.90 +/- 209.87
New best mean reward!


[I 2025-02-24 09:24:32,381] Trial 54 pruned. 


Eval num_timesteps=10000, episode_reward=-261.43 +/- 139.51
Episode length: 213.10 +/- 121.08
New best mean reward!


[I 2025-02-24 09:24:50,392] Trial 55 pruned. 


Eval num_timesteps=10000, episode_reward=-604.07 +/- 164.09
Episode length: 67.20 +/- 9.39
New best mean reward!


[I 2025-02-24 09:25:06,727] Trial 56 pruned. 


Eval num_timesteps=10000, episode_reward=-416.01 +/- 57.46
Episode length: 68.10 +/- 11.06
New best mean reward!


[I 2025-02-24 09:25:27,212] Trial 57 pruned. 


Eval num_timesteps=10000, episode_reward=-760.14 +/- 287.63
Episode length: 147.10 +/- 21.55
New best mean reward!


[I 2025-02-24 09:25:44,966] Trial 58 pruned. 


Eval num_timesteps=10000, episode_reward=-395.06 +/- 237.62
Episode length: 199.20 +/- 101.52
New best mean reward!
Eval num_timesteps=10000, episode_reward=-102.12 +/- 21.40
Episode length: 1000.00 +/- 0.00
New best mean reward!
Eval num_timesteps=20000, episode_reward=49.44 +/- 154.76
Episode length: 362.80 +/- 98.14
New best mean reward!


[I 2025-02-24 09:26:50,141] Trial 59 finished with value: 49.44472149999999 and parameters: {'gamma': 0.0002224615420247989, 'max_grad_norm': 0.30153638254046156, 'exponent_n_steps': 9, 'learning_rate': 0.005647088309158515, 'net_arch': 'small', 'activation_fn': 'relu'}. Best is trial 33 with value: 71.2168073.
[I 2025-02-24 09:27:35,578] Trial 60 pruned. 


Eval num_timesteps=10000, episode_reward=-158.19 +/- 28.49
Episode length: 1000.00 +/- 0.00
New best mean reward!
Eval num_timesteps=10000, episode_reward=-61.20 +/- 53.75
Episode length: 916.60 +/- 250.20
New best mean reward!
Eval num_timesteps=20000, episode_reward=123.36 +/- 54.66
Episode length: 872.00 +/- 215.96
New best mean reward!


[I 2025-02-24 09:28:59,587] Trial 61 finished with value: 123.3634229 and parameters: {'gamma': 0.00022750556994400318, 'max_grad_norm': 0.30132099146419916, 'exponent_n_steps': 9, 'learning_rate': 0.0017814207192177304, 'net_arch': 'small', 'activation_fn': 'relu'}. Best is trial 61 with value: 123.3634229.


Eval num_timesteps=10000, episode_reward=-77.22 +/- 26.79
Episode length: 1000.00 +/- 0.00
New best mean reward!
Eval num_timesteps=20000, episode_reward=-74.75 +/- 79.09
Episode length: 420.80 +/- 157.65
New best mean reward!


[I 2025-02-24 09:30:22,339] Trial 62 finished with value: -74.7539611 and parameters: {'gamma': 0.0002451893310930031, 'max_grad_norm': 0.3085971231112136, 'exponent_n_steps': 9, 'learning_rate': 0.0019459184585463043, 'net_arch': 'small', 'activation_fn': 'relu'}. Best is trial 61 with value: 123.3634229.


Eval num_timesteps=10000, episode_reward=-68.52 +/- 16.46
Episode length: 1000.00 +/- 0.00
New best mean reward!
Eval num_timesteps=20000, episode_reward=171.74 +/- 73.93
Episode length: 412.00 +/- 104.42
New best mean reward!


[I 2025-02-24 09:31:30,545] Trial 63 finished with value: 171.7401334 and parameters: {'gamma': 0.00017308880670360112, 'max_grad_norm': 0.35278706193345905, 'exponent_n_steps': 9, 'learning_rate': 0.0029920996171534323, 'net_arch': 'small', 'activation_fn': 'relu'}. Best is trial 63 with value: 171.7401334.


Eval num_timesteps=10000, episode_reward=-66.30 +/- 21.22
Episode length: 1000.00 +/- 0.00
New best mean reward!
Eval num_timesteps=20000, episode_reward=153.66 +/- 40.65
Episode length: 518.70 +/- 173.26
New best mean reward!


[I 2025-02-24 09:32:53,894] Trial 64 finished with value: 153.66170960000002 and parameters: {'gamma': 0.00018255936363708585, 'max_grad_norm': 0.3614459029416312, 'exponent_n_steps': 9, 'learning_rate': 0.0006867286581962038, 'net_arch': 'small', 'activation_fn': 'relu'}. Best is trial 63 with value: 171.7401334.


Eval num_timesteps=10000, episode_reward=-101.88 +/- 158.75
Episode length: 905.30 +/- 197.29
New best mean reward!


[I 2025-02-24 09:34:09,603] Trial 65 pruned. 


Eval num_timesteps=20000, episode_reward=-183.96 +/- 76.61
Episode length: 848.80 +/- 199.90


[I 2025-02-24 09:34:27,983] Trial 66 pruned. 


Eval num_timesteps=10000, episode_reward=-129.24 +/- 50.28
Episode length: 69.50 +/- 7.77
New best mean reward!


[I 2025-02-24 09:34:49,908] Trial 67 pruned. 


Eval num_timesteps=10000, episode_reward=-1523.68 +/- 424.14
Episode length: 276.30 +/- 100.04
New best mean reward!


[I 2025-02-24 09:35:10,018] Trial 68 pruned. 


Eval num_timesteps=10000, episode_reward=-882.75 +/- 470.97
Episode length: 212.40 +/- 88.23
New best mean reward!


[I 2025-02-24 09:35:33,010] Trial 69 pruned. 


Eval num_timesteps=10000, episode_reward=-199.86 +/- 35.40
Episode length: 364.90 +/- 148.17
New best mean reward!


[I 2025-02-24 09:35:52,404] Trial 70 pruned. 


Eval num_timesteps=10000, episode_reward=-142.18 +/- 26.96
Episode length: 70.70 +/- 12.60
New best mean reward!
Eval num_timesteps=10000, episode_reward=-127.02 +/- 25.46
Episode length: 70.70 +/- 12.63
New best mean reward!


[I 2025-02-24 09:36:33,498] Trial 71 pruned. 


Eval num_timesteps=20000, episode_reward=-292.22 +/- 60.71
Episode length: 121.60 +/- 43.26
Eval num_timesteps=10000, episode_reward=-9.82 +/- 102.34
Episode length: 777.60 +/- 164.82
New best mean reward!
Eval num_timesteps=20000, episode_reward=104.47 +/- 104.48
Episode length: 524.50 +/- 214.45
New best mean reward!


[I 2025-02-24 09:37:50,385] Trial 72 finished with value: 104.4664837 and parameters: {'gamma': 0.00023239237110234427, 'max_grad_norm': 0.38065782933836734, 'exponent_n_steps': 9, 'learning_rate': 0.0014493137471918926, 'net_arch': 'small', 'activation_fn': 'relu'}. Best is trial 63 with value: 171.7401334.
[I 2025-02-24 09:38:19,542] Trial 73 pruned. 


Eval num_timesteps=10000, episode_reward=-209.51 +/- 92.95
Episode length: 568.70 +/- 263.07
New best mean reward!


[I 2025-02-24 09:38:44,872] Trial 74 pruned. 


Eval num_timesteps=10000, episode_reward=-1195.28 +/- 606.80
Episode length: 334.00 +/- 171.52
New best mean reward!


[I 2025-02-24 09:39:05,298] Trial 75 pruned. 


Eval num_timesteps=10000, episode_reward=-1173.11 +/- 650.18
Episode length: 166.10 +/- 68.67
New best mean reward!
Eval num_timesteps=10000, episode_reward=-69.95 +/- 28.41
Episode length: 197.70 +/- 38.10
New best mean reward!
Eval num_timesteps=20000, episode_reward=11.87 +/- 105.89
Episode length: 338.40 +/- 234.38
New best mean reward!


[I 2025-02-24 09:39:54,405] Trial 76 finished with value: 11.8655344 and parameters: {'gamma': 0.00016286308713733594, 'max_grad_norm': 0.3321146036645639, 'exponent_n_steps': 9, 'learning_rate': 0.0032676804068712917, 'net_arch': 'small', 'activation_fn': 'relu'}. Best is trial 63 with value: 171.7401334.
[I 2025-02-24 09:40:18,949] Trial 77 pruned. 


Eval num_timesteps=10000, episode_reward=-1725.95 +/- 685.70
Episode length: 432.80 +/- 49.68
New best mean reward!


[I 2025-02-24 09:40:37,650] Trial 78 pruned. 


Eval num_timesteps=10000, episode_reward=-127.72 +/- 28.16
Episode length: 88.70 +/- 41.64
New best mean reward!
Eval num_timesteps=10000, episode_reward=-37.27 +/- 152.49
Episode length: 219.60 +/- 62.70
New best mean reward!
Eval num_timesteps=20000, episode_reward=34.62 +/- 112.40
Episode length: 230.60 +/- 117.46
New best mean reward!


[I 2025-02-24 09:41:27,916] Trial 79 finished with value: 34.6213547 and parameters: {'gamma': 0.00019846471598402272, 'max_grad_norm': 1.5780864529232235, 'exponent_n_steps': 9, 'learning_rate': 0.004087373983683397, 'net_arch': 'small', 'activation_fn': 'relu'}. Best is trial 63 with value: 171.7401334.
[I 2025-02-24 09:41:56,800] Trial 80 pruned. 


Eval num_timesteps=10000, episode_reward=-143.44 +/- 75.79
Episode length: 334.10 +/- 308.45
New best mean reward!


[I 2025-02-24 09:42:20,350] Trial 81 pruned. 


Eval num_timesteps=10000, episode_reward=-128.63 +/- 22.53
Episode length: 313.10 +/- 135.62
New best mean reward!


[I 2025-02-24 09:42:46,102] Trial 82 pruned. 


Eval num_timesteps=10000, episode_reward=-150.31 +/- 17.53
Episode length: 522.80 +/- 68.77
New best mean reward!


[I 2025-02-24 09:43:25,424] Trial 83 pruned. 


Eval num_timesteps=10000, episode_reward=-127.17 +/- 61.73
Episode length: 922.00 +/- 116.82
New best mean reward!


[I 2025-02-24 09:43:45,989] Trial 84 pruned. 


Eval num_timesteps=10000, episode_reward=-461.17 +/- 54.19
Episode length: 191.70 +/- 40.20
New best mean reward!
Eval num_timesteps=10000, episode_reward=-95.84 +/- 129.34
Episode length: 601.40 +/- 196.07
New best mean reward!


[I 2025-02-24 09:44:50,274] Trial 85 pruned. 


Eval num_timesteps=20000, episode_reward=-165.96 +/- 14.77
Episode length: 360.20 +/- 91.97


[I 2025-02-24 09:45:26,717] Trial 86 pruned. 


Eval num_timesteps=10000, episode_reward=-240.89 +/- 102.19
Episode length: 641.60 +/- 256.39
New best mean reward!
Eval num_timesteps=10000, episode_reward=-114.06 +/- 22.53
Episode length: 272.10 +/- 75.48
New best mean reward!


[I 2025-02-24 09:46:10,384] Trial 87 pruned. 


Eval num_timesteps=20000, episode_reward=-287.71 +/- 53.60
Episode length: 171.20 +/- 29.76


[I 2025-02-24 09:46:53,134] Trial 88 pruned. 


Eval num_timesteps=10000, episode_reward=-172.86 +/- 13.06
Episode length: 1000.00 +/- 0.00
New best mean reward!


[I 2025-02-24 09:47:18,211] Trial 89 pruned. 


Eval num_timesteps=10000, episode_reward=-709.32 +/- 283.10
Episode length: 362.80 +/- 196.62
New best mean reward!


[I 2025-02-24 09:47:38,405] Trial 90 pruned. 


Eval num_timesteps=10000, episode_reward=-470.72 +/- 106.23
Episode length: 128.80 +/- 44.89
New best mean reward!


[I 2025-02-24 09:47:59,500] Trial 91 pruned. 


Eval num_timesteps=10000, episode_reward=-121.15 +/- 48.86
Episode length: 201.00 +/- 70.72
New best mean reward!


[I 2025-02-24 09:48:21,419] Trial 92 pruned. 


Eval num_timesteps=10000, episode_reward=-148.04 +/- 25.37
Episode length: 283.00 +/- 73.02
New best mean reward!


[I 2025-02-24 09:48:46,253] Trial 93 pruned. 


Eval num_timesteps=10000, episode_reward=-193.97 +/- 104.07
Episode length: 358.60 +/- 85.46
New best mean reward!
Eval num_timesteps=10000, episode_reward=-107.62 +/- 47.54
Episode length: 538.90 +/- 244.38
New best mean reward!
Eval num_timesteps=20000, episode_reward=86.52 +/- 112.81
Episode length: 601.70 +/- 209.53
New best mean reward!


[I 2025-02-24 09:49:58,443] Trial 94 finished with value: 86.5234044 and parameters: {'gamma': 0.00024644610863116726, 'max_grad_norm': 1.813546334015925, 'exponent_n_steps': 9, 'learning_rate': 0.0013359659036871613, 'net_arch': 'small', 'activation_fn': 'relu'}. Best is trial 63 with value: 171.7401334.
[I 2025-02-24 09:50:20,436] Trial 95 pruned. 


Eval num_timesteps=10000, episode_reward=-135.32 +/- 19.48
Episode length: 258.20 +/- 107.65
New best mean reward!


[I 2025-02-24 09:50:47,510] Trial 96 pruned. 


Eval num_timesteps=10000, episode_reward=-466.48 +/- 68.56
Episode length: 645.40 +/- 132.48
New best mean reward!


[I 2025-02-24 09:51:29,035] Trial 97 pruned. 


Eval num_timesteps=10000, episode_reward=-231.32 +/- 69.13
Episode length: 1000.00 +/- 0.00
New best mean reward!


[I 2025-02-24 09:52:05,442] Trial 98 pruned. 


Eval num_timesteps=10000, episode_reward=-453.63 +/- 101.27
Episode length: 937.60 +/- 130.08
New best mean reward!
Eval num_timesteps=10000, episode_reward=-55.76 +/- 100.98
Episode length: 927.30 +/- 148.23
New best mean reward!
Eval num_timesteps=20000, episode_reward=-153.97 +/- 39.95
Episode length: 258.70 +/- 106.76


[I 2025-02-24 09:53:06,894] Trial 99 finished with value: -153.9720122 and parameters: {'gamma': 0.0003490546304377089, 'max_grad_norm': 1.223917011760367, 'exponent_n_steps': 10, 'learning_rate': 0.0006083387429401111, 'net_arch': 'small', 'activation_fn': 'tanh'}. Best is trial 63 with value: 171.7401334.


Number of finished trials:  100
Best trial:
  Value: 171.7401334
  Params: 
    gamma: 0.00017308880670360112
    max_grad_norm: 0.35278706193345905
    exponent_n_steps: 9
    learning_rate: 0.0029920996171534323
    net_arch: small
    activation_fn: relu
  User attrs:
    gamma_: 0.9998269111932964
    n_steps: 512


In [None]:

# Final training run: fit PPO on LunarLander-v3 with the hyperparameters of the
# best Optuna trial printed above (learning_rate, max_grad_norm, gamma_, n_steps).
model_name = "ppo-LunarLander-v3"

# Separate two-layer, 64-unit MLP heads for the policy (pi) and the value
# function (vf), with ReLU activations.
# NOTE(review): presumably this is what the search's "small" net_arch maps to —
# confirm against the objective function.
policy_kwargs = dict(
    net_arch=dict(pi=[64, 64], vf=[64, 64]),
    activation_fn=nn.ReLU,
)

# Five copies of the environment are stepped together during rollouts.
env = make_vec_env("LunarLander-v3", n_envs=5)

model = PPO(
    "MlpPolicy",
    env,
    learning_rate=0.0029920996171534323,
    n_steps=512,
    batch_size=64,
    n_epochs=4,
    gamma=0.9998269111932964,
    max_grad_norm=0.35278706193345905,
    policy_kwargs=policy_kwargs,
)
model.learn(total_timesteps=300000)

# Persist the trained agent under `model_name` in the working directory.
model.save(model_name)

In [None]:
import gymnasium as gym
import numpy as np
from stable_baselines3 import PPO

# Replay one episode of the trained agent in a rendered window.
#
# NOTE(review): the training cell saves the model as "ppo-LunarLander-v3" in the
# working directory, while this cell loads it from results/LunarLander/ —
# presumably the file is moved there by hand; confirm the path before running.
#
# Load the checkpoint first so that a missing file fails before any render
# window has been opened.
model = PPO.load("results/LunarLander/ppo-LunarLander-v3.zip")
env = gym.make("LunarLander-v3", render_mode="human")

try:
    obs, info = env.reset()
    while True:
        # Use the policy's deterministic action instead of sampling, so the
        # playback shows the learned behaviour without exploration noise.
        action, _states = model.predict(obs, deterministic=True)
        obs, reward, terminated, truncated, info = env.step(action)
        # Stop after a single episode, however it ended.
        if terminated or truncated:
            break
finally:
    # Always release the render window, even if stepping raises or the
    # cell is interrupted.
    env.close()