<a href="https://colab.research.google.com/github/Chpppy/Colab/blob/main/notebooks/PPO_optuna.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install gymnasium
!pip install stable_baselines3
!pip install sb3-contrib
!pip install optuna

Collecting stable_baselines3
  Downloading stable_baselines3-2.5.0-py3-none-any.whl.metadata (4.8 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch<3.0,>=2.3->stable_baselines3)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch<3.0,>=2.3->stable_baselines3)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch<3.0,>=2.3->stable_baselines3)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch<3.0,>=2.3->stable_baselines3)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch<3.0,>=2.3->stable_baselines3)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (

In [2]:
import gymnasium as gym
import numpy as np
from stable_baselines3 import PPO
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.monitor import Monitor

In [3]:
# Configuration for the hyperparameter search; these constants are read by
# the objective function and by the study cell further down.
N_TRIALS = 50  # Maximum number of trials
N_JOBS = 1 # Number of jobs to run in parallel
N_STARTUP_TRIALS = 5  # Stop random sampling after N_STARTUP_TRIALS (also passed to the pruner)
N_EVALUATIONS = 2  # Number of evaluations during the training
N_TIMESTEPS = int(2e4)  # Training budget
# Evaluation period handed to the callback: 20000 / 2 = 10000
EVAL_FREQ = int(N_TIMESTEPS / N_EVALUATIONS)
N_EVAL_ENVS = 5  # Number of environments created for evaluation
N_EVAL_EPISODES = 10  # Number of episodes per evaluation
# 60 * 60 = 3600, i.e. one hour when read as seconds (the previous comment
# said "15 minutes", which did not match the value).
# Passed as `timeout` to `study.optimize`.
TIMEOUT = int(60 * 60)

# Environment used both for training and for evaluation during the search
ENV_ID = "CartPole-v1"

# Base keyword arguments for PPO; copied and extended with the sampled
# hyperparameters inside `objective`.
DEFAULT_HYPERPARAMS = {
    "policy": "MlpPolicy",
    "env": ENV_ID,
}

In [4]:
import optuna
from optuna.pruners import MedianPruner
from optuna.samplers import TPESampler
from optuna.visualization import plot_optimization_history, plot_param_importances

In [5]:
from typing import Any, Dict
import torch
import torch.nn as nn

def sample_ppo_params(trial: optuna.Trial) -> Dict[str, Any]:
    """
    Draw one PPO hyperparameter configuration from the search space.

    The values are requested from ``trial`` in a fixed order (gamma,
    max_grad_norm, exponent_n_steps, learning_rate, net_arch,
    activation_fn). The derived discount factor and rollout length are also
    stored on the trial as the user attributes ``gamma_`` and ``n_steps``.

    :param trial: Optuna trial object used to sample every value
    :return: Keyword arguments (``n_steps``, ``gamma``, ``learning_rate``,
        ``max_grad_norm``, ``policy_kwargs``) for the given trial.
    """
    # The parameter registered as "gamma" is really 1 - gamma, sampled on a
    # log scale; the discount factor therefore lies between 0.9 and 0.9999.
    one_minus_gamma = trial.suggest_float("gamma", 0.0001, 0.1, log=True)
    discount = 1.0 - one_minus_gamma
    grad_clip = trial.suggest_float("max_grad_norm", 0.3, 5.0, log=True)
    # Rollout length is a power of two: 2**9 = 512 or 2**10 = 1024
    rollout_len = 2 ** trial.suggest_int("exponent_n_steps", 9, 10)

    # Learning rate in [1e-5, 1] on a log scale, then the two categorical
    # choices describing the network.
    step_size = trial.suggest_float("learning_rate", 1e-5, 1, log=True)
    arch_name = trial.suggest_categorical("net_arch", ["tiny", "small"])
    act_name = trial.suggest_categorical("activation_fn", ["tanh", "relu"])

    # Record the derived (human-readable) values next to the raw parameters
    trial.set_user_attr("gamma_", discount)
    trial.set_user_attr("n_steps", rollout_len)

    # "tiny" is one hidden layer of 64 units; anything else is two layers.
    # Policy ("pi") and value ("vf") heads each get their own list object.
    hidden_sizes = [64] if arch_name == "tiny" else [64, 64]
    layer_spec = {"pi": list(hidden_sizes), "vf": list(hidden_sizes)}

    # Map the sampled name to the torch module class (KeyError if unknown)
    act_cls = {"relu": nn.ReLU, "tanh": nn.Tanh}[act_name]

    return dict(
        n_steps=rollout_len,
        gamma=discount,
        learning_rate=step_size,
        max_grad_norm=grad_clip,
        policy_kwargs=dict(net_arch=layer_spec, activation_fn=act_cls),
    )

In [6]:
from stable_baselines3.common.callbacks import EvalCallback

class TrialEvalCallback(EvalCallback):
    """
    Evaluation callback that reports each evaluation result to an Optuna
    trial and stops training when the trial should be pruned.

    :param eval_env: Evaluation environment
    :param trial: Optuna trial object that receives the reports
    :param n_eval_episodes: Number of evaluation episodes
    :param eval_freq: Evaluate the agent every ``eval_freq`` call of the callback.
    :param deterministic: Whether the evaluation should
        use a stochastic or deterministic policy.
    :param verbose: Verbosity level forwarded to the parent callback
    """

    def __init__(
        self,
        eval_env: gym.Env,
        trial: optuna.Trial,
        n_eval_episodes: int = 5,
        eval_freq: int = 10000,
        deterministic: bool = True,
        verbose: int = 0,
    ):
        # The parent class owns the evaluation settings
        super().__init__(
            eval_env=eval_env,
            n_eval_episodes=n_eval_episodes,
            eval_freq=eval_freq,
            deterministic=deterministic,
            verbose=verbose,
        )
        # State added by this subclass
        self.trial = trial  # trial that receives the intermediate values
        self.eval_idx = 0  # number of evaluations reported so far
        self.is_pruned = False  # set once the trial asked to be pruned

    def _on_step(self) -> bool:
        """
        Evaluate, report to Optuna and decide whether training continues.

        :return: ``False`` when the trial should be pruned, ``True`` otherwise
            (including on every call where no evaluation is due).
        """
        evaluation_due = self.eval_freq > 0 and self.n_calls % self.eval_freq == 0
        if not evaluation_due:
            return True

        # The evaluation itself is done in the parent class
        super()._on_step()
        self.eval_idx += 1

        # Send the latest mean reward to Optuna, indexed by evaluation number
        self.trial.report(self.last_mean_reward, self.eval_idx)

        if not self.trial.should_prune():
            return True

        # Remember the decision so the caller can raise TrialPruned afterwards
        self.is_pruned = True
        return False

In [7]:
def objective(trial: optuna.Trial) -> float:
    """
    Objective function used by Optuna to evaluate
    one configuration (i.e., one set of hyperparameters).

    Given a trial object, it samples hyperparameters, trains a PPO model
    with them and reports the result (mean episodic reward of the last
    evaluation performed by the callback).

    :param trial: Optuna trial object
    :return: Mean episodic reward after training, or ``nan`` when training
        diverged (Optuna then records the trial as failed)
    :raises optuna.exceptions.TrialPruned: if the callback stopped training
        because the trial should be pruned
    """
    # Start from the shared defaults and add the sampled hyperparameters
    kwargs = DEFAULT_HYPERPARAMS.copy()
    kwargs.update(sample_ppo_params(trial))

    # Create the RL model
    model = PPO(**kwargs, device="cpu")

    # Separate environments used only for the periodic evaluation
    eval_env = make_vec_env(env_id=ENV_ID, n_envs=N_EVAL_ENVS)

    # Evaluate `N_EVAL_EPISODES` episodes every `EVAL_FREQ` callback calls
    # and report each result to the trial.
    eval_callback = TrialEvalCallback(
        eval_env,
        trial,
        n_eval_episodes=N_EVAL_EPISODES,
        eval_freq=EVAL_FREQ,
        deterministic=True,
        verbose=1,
    )

    nan_encountered = False
    try:
        # Train the model
        model.learn(N_TIMESTEPS, callback=eval_callback)
    except (AssertionError, ValueError) as e:
        # Random hyperparameters (e.g. a very large learning rate) can make
        # training diverge to NaN. This surfaces as an AssertionError or,
        # when PyTorch validates the action distribution parameters, as a
        # ValueError. Catching both fails only this trial instead of
        # aborting the whole study.
        print(e)
        nan_encountered = True
    finally:
        # Free memory
        model.env.close()
        eval_env.close()

    # Tell the optimizer that the trial failed
    if nan_encountered:
        return float("nan")

    if eval_callback.is_pruned:
        raise optuna.exceptions.TrialPruned()

    return eval_callback.last_mean_reward

In [9]:
# Driver cell: build the Optuna study, run the search, then print and plot
# the results.
import torch as th

# Set pytorch num threads to 1 for faster training
th.set_num_threads(1)
# Select the sampler, can be random, TPESampler, CMAES, ...
# NOTE(review): no `seed` is passed to the sampler, so two runs presumably
# sample different configurations - confirm if reproducibility matters.
sampler = TPESampler(n_startup_trials=N_STARTUP_TRIALS)
# Do not prune before 1/3 of the max budget is used.
# With the N_EVALUATIONS = 2 set in the config cell, `N_EVALUATIONS // 3`
# evaluates to 0 warm-up steps.
pruner = MedianPruner(
    n_startup_trials=N_STARTUP_TRIALS, n_warmup_steps=N_EVALUATIONS // 3
)
# Create the study and start the hyperparameter optimization
study = optuna.create_study(sampler=sampler, pruner=pruner, direction="maximize")

# A manual interrupt is swallowed on purpose so that the results gathered
# so far are still reported below.
try:
    study.optimize(objective, n_trials=N_TRIALS, n_jobs=N_JOBS, timeout=TIMEOUT)
except KeyboardInterrupt:
    pass

# `len(study.trials)` counts every trial held by the study; in the recorded
# run this total includes pruned trials and the interrupted one.
print("Number of finished trials: ", len(study.trials))

print("Best trial:")
# NOTE(review): presumably raises if no trial completed - confirm.
trial = study.best_trial

print(f"  Value: {trial.value}")

# Raw sampled parameters; here "gamma" is the sampled 1 - gamma value
print("  Params: ")
for key, value in trial.params.items():
    print(f"    {key}: {value}")

# Derived values stored by `sample_ppo_params` (true gamma and n_steps)
print("  User attrs:")
for key, value in trial.user_attrs.items():
    print(f"    {key}: {value}")

# Write report
study.trials_dataframe().to_csv("study_results_ppo_cartpole.csv")

# Interactive figures: optimization history and parameter importances
fig1 = plot_optimization_history(study)
fig2 = plot_param_importances(study)

fig1.show()
fig2.show()

[I 2025-02-27 10:44:02,651] A new study created in memory with name: no-name-c180cd17-d005-4225-889e-1fc42cf20197


Eval num_timesteps=10000, episode_reward=500.00 +/- 0.00
Episode length: 500.00 +/- 0.00
New best mean reward!
Eval num_timesteps=20000, episode_reward=500.00 +/- 0.00
Episode length: 500.00 +/- 0.00


[I 2025-02-27 10:44:38,485] Trial 0 finished with value: 500.0 and parameters: {'gamma': 0.09037775362786567, 'max_grad_norm': 1.1227816917226559, 'exponent_n_steps': 10, 'learning_rate': 0.001995156512507026, 'net_arch': 'small', 'activation_fn': 'relu'}. Best is trial 0 with value: 500.0.


Eval num_timesteps=10000, episode_reward=500.00 +/- 0.00
Episode length: 500.00 +/- 0.00
New best mean reward!
Eval num_timesteps=20000, episode_reward=500.00 +/- 0.00
Episode length: 500.00 +/- 0.00


[I 2025-02-27 10:45:13,621] Trial 1 finished with value: 500.0 and parameters: {'gamma': 0.00013543862665177275, 'max_grad_norm': 1.2732643016739815, 'exponent_n_steps': 10, 'learning_rate': 0.0007283282275177013, 'net_arch': 'small', 'activation_fn': 'tanh'}. Best is trial 0 with value: 500.0.


Eval num_timesteps=10000, episode_reward=483.20 +/- 29.60
Episode length: 483.20 +/- 29.60
New best mean reward!
Eval num_timesteps=20000, episode_reward=466.80 +/- 58.90
Episode length: 466.80 +/- 58.90


[I 2025-02-27 10:45:49,362] Trial 2 finished with value: 466.8 and parameters: {'gamma': 0.040038946965970816, 'max_grad_norm': 0.3966798689155365, 'exponent_n_steps': 10, 'learning_rate': 0.002637290592675298, 'net_arch': 'small', 'activation_fn': 'relu'}. Best is trial 0 with value: 500.0.


Eval num_timesteps=10000, episode_reward=306.80 +/- 109.73
Episode length: 306.80 +/- 109.73
New best mean reward!
Eval num_timesteps=20000, episode_reward=409.50 +/- 102.65
Episode length: 409.50 +/- 102.65
New best mean reward!


[I 2025-02-27 10:46:24,830] Trial 3 finished with value: 409.5 and parameters: {'gamma': 0.06983781570497487, 'max_grad_norm': 1.9870744922852361, 'exponent_n_steps': 10, 'learning_rate': 0.0001537596574700772, 'net_arch': 'small', 'activation_fn': 'relu'}. Best is trial 0 with value: 500.0.


Eval num_timesteps=10000, episode_reward=423.70 +/- 79.19
Episode length: 423.70 +/- 79.19
New best mean reward!
Eval num_timesteps=20000, episode_reward=500.00 +/- 0.00
Episode length: 500.00 +/- 0.00
New best mean reward!


[I 2025-02-27 10:46:54,336] Trial 4 finished with value: 500.0 and parameters: {'gamma': 0.03562365426672077, 'max_grad_norm': 0.8011380152136394, 'exponent_n_steps': 9, 'learning_rate': 0.0039896965646794205, 'net_arch': 'tiny', 'activation_fn': 'relu'}. Best is trial 0 with value: 500.0.
[I 2025-02-27 10:47:10,971] Trial 5 pruned. 


Eval num_timesteps=10000, episode_reward=9.10 +/- 0.70
Episode length: 9.10 +/- 0.70
New best mean reward!


[I 2025-02-27 10:47:26,633] Trial 6 pruned. 


Eval num_timesteps=10000, episode_reward=56.90 +/- 20.34
Episode length: 56.90 +/- 20.34
New best mean reward!


[I 2025-02-27 10:47:42,905] Trial 7 pruned. 


Eval num_timesteps=10000, episode_reward=78.60 +/- 18.88
Episode length: 78.60 +/- 18.88
New best mean reward!
Eval num_timesteps=10000, episode_reward=500.00 +/- 0.00
Episode length: 500.00 +/- 0.00
New best mean reward!
Eval num_timesteps=20000, episode_reward=500.00 +/- 0.00
Episode length: 500.00 +/- 0.00


[I 2025-02-27 10:48:13,130] Trial 8 finished with value: 500.0 and parameters: {'gamma': 0.0003473366310130125, 'max_grad_norm': 0.7266255249902291, 'exponent_n_steps': 10, 'learning_rate': 0.030692502437995876, 'net_arch': 'tiny', 'activation_fn': 'tanh'}. Best is trial 0 with value: 500.0.
[I 2025-02-27 10:48:29,125] Trial 9 pruned. 


Eval num_timesteps=10000, episode_reward=64.00 +/- 18.37
Episode length: 64.00 +/- 18.37
New best mean reward!


[I 2025-02-27 10:48:43,233] Trial 10 pruned. 


Eval num_timesteps=10000, episode_reward=356.60 +/- 146.10
Episode length: 356.60 +/- 146.10
New best mean reward!
Eval num_timesteps=10000, episode_reward=500.00 +/- 0.00
Episode length: 500.00 +/- 0.00
New best mean reward!
Eval num_timesteps=20000, episode_reward=500.00 +/- 0.00
Episode length: 500.00 +/- 0.00


[I 2025-02-27 10:49:18,702] Trial 11 finished with value: 500.0 and parameters: {'gamma': 0.00010483855970848772, 'max_grad_norm': 1.1526540894294548, 'exponent_n_steps': 10, 'learning_rate': 0.0011419173392466797, 'net_arch': 'small', 'activation_fn': 'tanh'}. Best is trial 0 with value: 500.0.
[I 2025-02-27 10:49:35,283] Trial 12 pruned. 


Eval num_timesteps=10000, episode_reward=258.80 +/- 90.68
Episode length: 258.80 +/- 90.68
New best mean reward!


[I 2025-02-27 10:49:52,317] Trial 13 pruned. 


Eval num_timesteps=10000, episode_reward=492.80 +/- 21.60
Episode length: 492.80 +/- 21.60
New best mean reward!


[I 2025-02-27 10:50:09,168] Trial 14 pruned. 


Eval num_timesteps=10000, episode_reward=399.10 +/- 89.80
Episode length: 399.10 +/- 89.80
New best mean reward!
Eval num_timesteps=10000, episode_reward=500.00 +/- 0.00
Episode length: 500.00 +/- 0.00
New best mean reward!
Eval num_timesteps=20000, episode_reward=500.00 +/- 0.00
Episode length: 500.00 +/- 0.00


[I 2025-02-27 10:50:44,645] Trial 15 finished with value: 500.0 and parameters: {'gamma': 0.00025935751707677576, 'max_grad_norm': 1.5806514736540742, 'exponent_n_steps': 10, 'learning_rate': 0.008421348194676914, 'net_arch': 'small', 'activation_fn': 'tanh'}. Best is trial 0 with value: 500.0.
[I 2025-02-27 10:51:00,644] Trial 16 pruned. 


Eval num_timesteps=10000, episode_reward=9.10 +/- 0.70
Episode length: 9.10 +/- 0.70
New best mean reward!


[I 2025-02-27 10:51:17,222] Trial 17 pruned. 


Eval num_timesteps=10000, episode_reward=180.40 +/- 126.84
Episode length: 180.40 +/- 126.84
New best mean reward!


[I 2025-02-27 10:51:31,603] Trial 18 pruned. 


Eval num_timesteps=10000, episode_reward=351.50 +/- 102.75
Episode length: 351.50 +/- 102.75
New best mean reward!


[I 2025-02-27 10:51:48,364] Trial 19 pruned. 


Eval num_timesteps=10000, episode_reward=289.00 +/- 93.64
Episode length: 289.00 +/- 93.64
New best mean reward!


[I 2025-02-27 10:52:04,915] Trial 20 pruned. 


Eval num_timesteps=10000, episode_reward=321.90 +/- 102.97
Episode length: 321.90 +/- 102.97
New best mean reward!


[I 2025-02-27 10:52:19,279] Trial 21 pruned. 


Eval num_timesteps=10000, episode_reward=357.00 +/- 109.27
Episode length: 357.00 +/- 109.27
New best mean reward!


[I 2025-02-27 10:52:33,633] Trial 22 pruned. 


Eval num_timesteps=10000, episode_reward=453.50 +/- 73.39
Episode length: 453.50 +/- 73.39
New best mean reward!
Eval num_timesteps=10000, episode_reward=500.00 +/- 0.00
Episode length: 500.00 +/- 0.00
New best mean reward!
Eval num_timesteps=20000, episode_reward=500.00 +/- 0.00
Episode length: 500.00 +/- 0.00


[I 2025-02-27 10:53:04,366] Trial 23 finished with value: 500.0 and parameters: {'gamma': 0.02377348202969297, 'max_grad_norm': 0.9161465832493959, 'exponent_n_steps': 9, 'learning_rate': 0.005044756854066808, 'net_arch': 'tiny', 'activation_fn': 'relu'}. Best is trial 0 with value: 500.0.
[I 2025-02-27 10:53:19,252] Trial 24 pruned. 


Eval num_timesteps=10000, episode_reward=173.60 +/- 15.67
Episode length: 173.60 +/- 15.67
New best mean reward!


[I 2025-02-27 10:53:35,498] Trial 25 pruned. 


Eval num_timesteps=10000, episode_reward=327.00 +/- 121.02
Episode length: 327.00 +/- 121.02
New best mean reward!


[I 2025-02-27 10:53:54,841] Trial 26 pruned. 


Eval num_timesteps=10000, episode_reward=205.20 +/- 100.17
Episode length: 205.20 +/- 100.17
New best mean reward!


[I 2025-02-27 10:54:13,783] Trial 27 pruned. 


Eval num_timesteps=10000, episode_reward=88.80 +/- 39.04
Episode length: 88.80 +/- 39.04
New best mean reward!
Eval num_timesteps=10000, episode_reward=500.00 +/- 0.00
Episode length: 500.00 +/- 0.00
New best mean reward!
Eval num_timesteps=20000, episode_reward=500.00 +/- 0.00
Episode length: 500.00 +/- 0.00


[I 2025-02-27 10:54:48,341] Trial 28 finished with value: 500.0 and parameters: {'gamma': 0.00018070959291255456, 'max_grad_norm': 1.9607882984530636, 'exponent_n_steps': 9, 'learning_rate': 0.003513418717540876, 'net_arch': 'tiny', 'activation_fn': 'tanh'}. Best is trial 0 with value: 500.0.
[I 2025-02-27 10:55:04,914] Trial 29 pruned. 


Eval num_timesteps=10000, episode_reward=243.10 +/- 52.63
Episode length: 243.10 +/- 52.63
New best mean reward!


[I 2025-02-27 10:55:22,377] Trial 30 pruned. 


Eval num_timesteps=10000, episode_reward=143.10 +/- 96.58
Episode length: 143.10 +/- 96.58
New best mean reward!


[I 2025-02-27 10:55:36,296] Trial 31 pruned. 


Eval num_timesteps=10000, episode_reward=116.70 +/- 10.77
Episode length: 116.70 +/- 10.77
New best mean reward!


[I 2025-02-27 10:55:50,529] Trial 32 pruned. 


Eval num_timesteps=10000, episode_reward=276.20 +/- 151.79
Episode length: 276.20 +/- 151.79
New best mean reward!


[I 2025-02-27 10:56:04,896] Trial 33 pruned. 


Eval num_timesteps=10000, episode_reward=274.10 +/- 23.30
Episode length: 274.10 +/- 23.30
New best mean reward!


[I 2025-02-27 10:56:19,283] Trial 34 pruned. 


Eval num_timesteps=10000, episode_reward=260.10 +/- 14.40
Episode length: 260.10 +/- 14.40
New best mean reward!


[I 2025-02-27 10:56:33,825] Trial 35 pruned. 


Eval num_timesteps=10000, episode_reward=385.50 +/- 88.12
Episode length: 385.50 +/- 88.12
New best mean reward!


[I 2025-02-27 10:56:50,838] Trial 36 pruned. 


Eval num_timesteps=10000, episode_reward=100.40 +/- 31.37
Episode length: 100.40 +/- 31.37
New best mean reward!


[I 2025-02-27 10:57:06,060] Trial 37 pruned. 


Eval num_timesteps=10000, episode_reward=324.90 +/- 130.33
Episode length: 324.90 +/- 130.33
New best mean reward!


[I 2025-02-27 10:57:26,699] Trial 38 pruned. 


Eval num_timesteps=10000, episode_reward=328.00 +/- 153.03
Episode length: 328.00 +/- 153.03
New best mean reward!


[I 2025-02-27 10:57:41,467] Trial 39 pruned. 


Eval num_timesteps=10000, episode_reward=444.00 +/- 85.90
Episode length: 444.00 +/- 85.90
New best mean reward!
Eval num_timesteps=10000, episode_reward=500.00 +/- 0.00
Episode length: 500.00 +/- 0.00
New best mean reward!
Eval num_timesteps=20000, episode_reward=500.00 +/- 0.00
Episode length: 500.00 +/- 0.00


[I 2025-02-27 10:58:18,664] Trial 40 finished with value: 500.0 and parameters: {'gamma': 0.000562774445116124, 'max_grad_norm': 1.150926670437787, 'exponent_n_steps': 10, 'learning_rate': 0.004958933595768048, 'net_arch': 'small', 'activation_fn': 'tanh'}. Best is trial 0 with value: 500.0.
[I 2025-02-27 10:58:35,680] Trial 41 pruned. 


Eval num_timesteps=10000, episode_reward=499.00 +/- 3.00
Episode length: 499.00 +/- 3.00
New best mean reward!


[I 2025-02-27 10:58:53,087] Trial 42 pruned. 


Eval num_timesteps=10000, episode_reward=474.10 +/- 46.64
Episode length: 474.10 +/- 46.64
New best mean reward!


[I 2025-02-27 10:59:13,057] Trial 43 pruned. 


Eval num_timesteps=10000, episode_reward=498.90 +/- 3.30
Episode length: 498.90 +/- 3.30
New best mean reward!


[I 2025-02-27 10:59:29,994] Trial 44 pruned. 


Eval num_timesteps=10000, episode_reward=358.60 +/- 102.40
Episode length: 358.60 +/- 102.40
New best mean reward!
Eval num_timesteps=10000, episode_reward=500.00 +/- 0.00
Episode length: 500.00 +/- 0.00
New best mean reward!
Eval num_timesteps=20000, episode_reward=459.60 +/- 83.28
Episode length: 459.60 +/- 83.28


[I 2025-02-27 11:00:07,369] Trial 45 finished with value: 459.6 and parameters: {'gamma': 0.000162580062339865, 'max_grad_norm': 1.0618780596956188, 'exponent_n_steps': 10, 'learning_rate': 0.0001475677690032432, 'net_arch': 'small', 'activation_fn': 'tanh'}. Best is trial 0 with value: 500.0.
[W 2025-02-27 11:00:19,527] Trial 46 failed with parameters: {'gamma': 0.00012718749226496248, 'max_grad_norm': 1.2408702842864952, 'exponent_n_steps': 10, 'learning_rate': 0.6932250915055316, 'net_arch': 'small', 'activation_fn': 'relu'} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/optuna/study/_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "<ipython-input-7-0bf39c0f6ab9>", line 38, in objective
    model.learn(N_TIMESTEPS, callback=eval_callback)
  File "/usr/local/lib/python3.11/dist-packages/stable_baselines3/ppo/ppo.py", line 311, in le

Number of finished trials:  47
Best trial:
  Value: 500.0
  Params: 
    gamma: 0.09037775362786567
    max_grad_norm: 1.1227816917226559
    exponent_n_steps: 10
    learning_rate: 0.001995156512507026
    net_arch: small
    activation_fn: relu
  User attrs:
    gamma_: 0.9096222463721343
    n_steps: 1024


In [36]:
import gymnasium as gym
import numpy as np
from stable_baselines3 import PPO, A2C, DQN
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.monitor import Monitor
import torch.nn as nn

# Train a final PPO model with hand-entered hyperparameters and save it.
# Alternative environment, currently disabled:
#env = gym.make("LunarLander-v3")
# Same architecture and activation as the "small" / "relu" options of the
# search space.
policy_kwargs={
            "net_arch": {"pi": [64, 64], "vf": [64, 64]},
            "activation_fn": nn.ReLU,
        }
# NOTE(review): the search created its environments from "CartPole-v1"
# without `sutton_barto_reward`; this option presumably changes the reward
# scheme, so rewards here are not comparable with the study values - confirm
# against the Gymnasium CartPole documentation.
env = gym.make("CartPole-v1", sutton_barto_reward=True)
# `max_grad_norm` and `learning_rate` are copied from the best trial printed
# above. `n_steps` and `gamma` differ from that trial (n_steps 1024,
# gamma_ about 0.9096), and `batch_size` / `n_epochs` were not part of the
# search.
model = PPO(
    policy="MlpPolicy",
    env=env,
    n_steps=4096,
    batch_size=128,
    n_epochs=40,
    gamma=0.99,
    max_grad_norm= 1.1227816917226559,
    policy_kwargs=policy_kwargs,
    learning_rate= 0.001995156512507026
)

# Same training budget as one trial of the search (N_TIMESTEPS = 20000)
model.learn(total_timesteps=20000)
# Save the model
model_name = "ppo-CartPole-v1"
model.save(model_name)

In [37]:
# Run 200 deterministic evaluation episodes of `model` on `env`; the returned
# tuple is shown as the cell output (presumably mean and standard deviation
# of the episode reward - confirm against the SB3 documentation).
evaluate_policy(model=model, env=env, deterministic=True, n_eval_episodes= 200)

(0.0, 0.0)

In [28]:
# Reload the saved model from disk. The imports below are disabled, so this
# cell relies on `PPO` having been imported by an earlier cell.
# import gymnasium as gym
# import numpy as np
# from stable_baselines3 import PPO

# env = gym.make("LunarLander-v3", render_mode="human")
# Presumably the archive written by `model.save("ppo-CartPole-v1")` - confirm
# that the ".zip" suffix is added on save.
model = PPO.load("ppo-CartPole-v1.zip")

# Disabled rollout loop: plays one episode with the loaded model and stops
# when the episode terminates or is truncated.
# obs, info = env.reset()
# while True:
#     action, _states = model.predict(obs)
#     obs, reward, terminated, truncated, info = env.step(action)
#     if terminated or truncated:
#         break
# env.close()