# Imports

In [1]:
import os
import gym
import wandb
import numpy as np
import time
from stable_baselines3 import PPO
from stable_baselines3.common.callbacks import EvalCallback
from stable_baselines3.common.vec_env import DummyVecEnv, SubprocVecEnv
from stable_baselines3.common.evaluation import evaluate_policy

# Set up Weights & Biases

In [2]:
wandb.login()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mcschmidl[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

# Training function for single environment

In [3]:
def train_agent_single_env(config):
    start_time = time.time()

    # Create a gym environment
    env_name = "CartPole-v1"
    env = gym.make(env_name)

    # Wrap the environment in a VecEnv
    env = DummyVecEnv([lambda: env])

    # Set up evaluation environment
    eval_env = gym.make(env_name)
    eval_env = DummyVecEnv([lambda: eval_env])

    # configure the PPo agent
    model = PPO(
        "MlpPolicy",
        env, 
        verbose=1,
        n_steps=config["n_steps"],
        gamma=config["gamma"],
        learning_rate=config["learning_rate"],
        ent_coef=config["ent_coef"],
        clip_range=config["clip_range"],
        n_epochs=config["n_epochs"],
        gae_lambda=config["gae_lambda"],
        max_grad_norm=config["max_grad_norm"]
    )

    # Set up an evaluation callback
    eval_callback = EvalCallback(
        eval_env,
        best_model_save_path="./models/",
        log_path="./logs/",
        eval_freq=500,
        deterministic=True,
        render=False
    )

    # Train the PPo agent
    model.learn(total_timesteps=config["total_timesteps"], callback=eval_callback)

    end_time = time.time()
    elapsed_time = end_time - start_time
    print(f"Single environment training took {elapsed_time:.2f} seconds.")

    # Evaluate the trained agent
    mean_reward, _ = evaluate_policy(model, eval_env, n_eval_episodes=10, deterministic=True)

    return mean_reward

# Training function for multi environment

In [4]:
def make_env(env_id, seed):
    def _init():
        env = gym.make(env_id)
        env.seed(seed)
        return env
    return _init

In [15]:
seeds = np.random.randint(0, 2**32 - 1, 4)
print(seeds)

for seed in seeds:
    print(type(seed))

[2848719473  263985200 2543817249 3150862783]
<class 'numpy.int64'>
<class 'numpy.int64'>
<class 'numpy.int64'>
<class 'numpy.int64'>


In [16]:
def train_agent_multi_env(config, n_envs=4):
    start_time = time.time()

    # Create a gym environment
    env_name = "CartPole-v1"
    n_envs = n_envs,
    #seeds = np.random.randint(0, int(1e6), n_envs)
    seeds = np.random.randint(0, 2**32 - 1, n_envs)

    envs = [make_env(env_name, int(seed)) for seed in seeds]
    env = SubprocVecEnv(envs)

    # Set up evaluation environment
    eval_env = gym.make(env_name)
    eval_env = DummyVecEnv([lambda: eval_env])

    # Configure the PPO agent
    model = PPO(
        "MlpPolicy",
        env,
        verbose=1,
        n_steps=config["n_steps"],
        gamma=config["gamma"],
        learning_rate=config["learning_rate"],
        ent_coef=config["ent_coef"],
        clip_range=config["clip_range"],
        n_epochs=config["n_epochs"],
        gae_lambda=config["gae_lambda"],
        max_grad_norm=config["max_grad_norm"],
    )

    # Set up an evaluation callback
    eval_callback = EvalCallback(
        eval_env, 
        best_model_save_path="./models/",
        log_path="./logs/",
        eval_freq=500,
        deterministic=True,
        render=False
    )

    # Train the PPO agent
    model.learn(total_timesteps=config["total_timesteps"], callback=eval_callback)

    end_time = time.time()
    elapsed_time = end_time - start_time
    print(f"Multi-environment training took {elapsed_time:.2f} seconds.")

    # Evaluate the trained agent
    mean_reward, _ = evaluate_policy(model, eval_env, n_eval_episodes=10, deterministic=True)



    return mean_reward

# Set up Weights & Biases sweep

In [17]:
sweep_config_single = {
    "name": "ppo_hyperparameter_tuning_single",
    "method": "random",
    "metric": {"goal": "maximize", "name": "eval/mean_reward"},
    "parameters": {
        "n_steps": {"min": 64, "max": 2048, "distribution": "int_uniform"},
        "gamma": {"min": 0.9, "max": 0.999, "distribution": "uniform"},
        "learning_rate": {"min": 1e-5, "max": 1e-2, "distribution": "uniform"},
        "ent_coef": {"min": 1e-6, "max": 1e-2, "distribution": "uniform"},
        "clip_range": {"min": 0.1, "max": 0.3, "distribution": "uniform"},
        "n_epochs": {"min": 1, "max": 10, "distribution": "int_uniform"},
        "gae_lambda": {"min": 0.9, "max": 1.0, "distribution": "uniform"},
        "max_grad_norm": {"min": 0.1, "max": 10, "distribution": "uniform"},
        "total_timesteps": {"min": 10_000, "max": 100_000, "distribution": "int_uniform"}
    },
}

sweep_config_multi = {
    "name": "ppo_hyperparameter_tuning_multi",
    "method": "random",
    "metric": {"goal": "maximize", "name": "eval/mean_reward"},
    "parameters": {
        "n_steps": {"min": 64, "max": 2048, "distribution": "int_uniform"},
        "gamma": {"min": 0.9, "max": 0.999, "distribution": "uniform"},
        "learning_rate": {"min": 1e-5, "max": 1e-2, "distribution": "uniform"},
        "ent_coef": {"min": 1e-6, "max": 1e-2, "distribution": "uniform"},
        "clip_range": {"min": 0.1, "max": 0.3, "distribution": "uniform"},
        "n_epochs": {"min": 1, "max": 10, "distribution": "int_uniform"},
        "gae_lambda": {"min": 0.9, "max": 1.0, "distribution": "uniform"},
        "max_grad_norm": {"min": 0.1, "max": 10, "distribution": "uniform"},
        "total_timesteps": {"min": 10_000, "max": 100_000, "distribution": "int_uniform"}
    },
}

single_sweep_id = wandb.sweep(sweep_config_single, project="ppo_hyperparamter_tuning")
multi_sweep_id = wandb.sweep(sweep_config_multi, project="ppo_hyperparamter_tuning")

Create sweep with ID: a390ceqc
Sweep URL: https://wandb.ai/cschmidl/ppo_hyperparamter_tuning/sweeps/a390ceqc
Create sweep with ID: b30md251
Sweep URL: https://wandb.ai/cschmidl/ppo_hyperparamter_tuning/sweeps/b30md251


# Define the sweep function

In [13]:
def sweep_agent_single():
    with wandb.init() as run:
        config = run.config
        mean_reward = train_agent_single_env(config)
        wandb.log({"eval/mean_reward": mean_reward})

def sweep_agent_multi():
    with wandb.init() as run:
        config = run.config
        mean_reward = train_agent_multi_env(config, n_envs=4)
        wandb.log({"eval/mean_reward": mean_reward})


# Run the sweep: Single env

In [8]:
# Single
wandb.agent(single_sweep_id, function=sweep_agent_single, count=20)


[34m[1mwandb[0m: Agent Starting Run: ec1zlvay with config:
[34m[1mwandb[0m: 	clip_range: 0.24801498317259676
[34m[1mwandb[0m: 	ent_coef: 0.005259804550102741
[34m[1mwandb[0m: 	gae_lambda: 0.9872416499332052
[34m[1mwandb[0m: 	gamma: 0.9187271553667176
[34m[1mwandb[0m: 	learning_rate: 0.004104321568970994
[34m[1mwandb[0m: 	max_grad_norm: 1.96005489016267
[34m[1mwandb[0m: 	n_epochs: 9
[34m[1mwandb[0m: 	n_steps: 535
[34m[1mwandb[0m: 	total_timesteps: 24494
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


Using cpu device


We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=535 and n_envs=1)


Eval num_timesteps=500, episode_reward=9.40 +/- 0.80
Episode length: 9.40 +/- 0.80
---------------------------------
| eval/              |          |
|    mean_ep_length  | 9.4      |
|    mean_reward     | 9.4      |
| time/              |          |
|    total_timesteps | 500      |
---------------------------------
New best mean reward!
-----------------------------
| time/              |      |
|    fps             | 1336 |
|    iterations      | 1    |
|    time_elapsed    | 0    |
|    total_timesteps | 535  |
-----------------------------




Eval num_timesteps=1000, episode_reward=158.00 +/- 171.13
Episode length: 158.00 +/- 171.13
-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 158         |
|    mean_reward          | 158         |
| time/                   |             |
|    total_timesteps      | 1000        |
| train/                  |             |
|    approx_kl            | 0.016432984 |
|    clip_fraction        | 0.213       |
|    clip_range           | 0.248       |
|    entropy_loss         | -0.675      |
|    explained_variance   | -0.0104     |
|    learning_rate        | 0.0041      |
|    loss                 | 1.09        |
|    n_updates            | 9           |
|    policy_gradient_loss | -0.0254     |
|    value_loss           | 14.5        |
-----------------------------------------
New best mean reward!
-----------------------------
| time/              |      |
|    fps             | 759  |
|    iterations      | 2    |
|    time_

0,1
eval/mean_reward,▁

0,1
eval/mean_reward,405.0


[34m[1mwandb[0m: Agent Starting Run: ltmd9ibv with config:
[34m[1mwandb[0m: 	clip_range: 0.13868505930966096
[34m[1mwandb[0m: 	ent_coef: 0.004785396253998942
[34m[1mwandb[0m: 	gae_lambda: 0.9424297839457756
[34m[1mwandb[0m: 	gamma: 0.9926226867607312
[34m[1mwandb[0m: 	learning_rate: 0.001385788292958073
[34m[1mwandb[0m: 	max_grad_norm: 6.710536429049587
[34m[1mwandb[0m: 	n_epochs: 5
[34m[1mwandb[0m: 	n_steps: 1218
[34m[1mwandb[0m: 	total_timesteps: 49397
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


Using cpu device


We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=1218 and n_envs=1)


Eval num_timesteps=500, episode_reward=93.20 +/- 15.32
Episode length: 93.20 +/- 15.32
---------------------------------
| eval/              |          |
|    mean_ep_length  | 93.2     |
|    mean_reward     | 93.2     |
| time/              |          |
|    total_timesteps | 500      |
---------------------------------




New best mean reward!
Eval num_timesteps=1000, episode_reward=96.00 +/- 9.90
Episode length: 96.00 +/- 9.90
---------------------------------
| eval/              |          |
|    mean_ep_length  | 96       |
|    mean_reward     | 96       |
| time/              |          |
|    total_timesteps | 1000     |
---------------------------------
New best mean reward!
-----------------------------
| time/              |      |
|    fps             | 970  |
|    iterations      | 1    |
|    time_elapsed    | 1    |
|    total_timesteps | 1218 |
-----------------------------
Eval num_timesteps=1500, episode_reward=164.60 +/- 117.23
Episode length: 164.60 +/- 117.23
------------------------------------------
| eval/                   |              |
|    mean_ep_length       | 165          |
|    mean_reward          | 165          |
| time/                   |              |
|    total_timesteps      | 1500         |
| train/                  |              |
|    approx_kl            | 0

0,1
eval/mean_reward,▁

0,1
eval/mean_reward,500.0


[34m[1mwandb[0m: Agent Starting Run: maf5t0pd with config:
[34m[1mwandb[0m: 	clip_range: 0.1762925903999336
[34m[1mwandb[0m: 	ent_coef: 0.009679510913082995
[34m[1mwandb[0m: 	gae_lambda: 0.9375607636666896
[34m[1mwandb[0m: 	gamma: 0.9701260016674802
[34m[1mwandb[0m: 	learning_rate: 0.004246935904103854
[34m[1mwandb[0m: 	max_grad_norm: 1.9158946434987156
[34m[1mwandb[0m: 	n_epochs: 3
[34m[1mwandb[0m: 	n_steps: 460
[34m[1mwandb[0m: 	total_timesteps: 44797
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


Using cpu device


We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=460 and n_envs=1)


-----------------------------
| time/              |      |
|    fps             | 1671 |
|    iterations      | 1    |
|    time_elapsed    | 0    |
|    total_timesteps | 460  |
-----------------------------
Eval num_timesteps=500, episode_reward=57.80 +/- 6.76
Episode length: 57.80 +/- 6.76
-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 57.8        |
|    mean_reward          | 57.8        |
| time/                   |             |
|    total_timesteps      | 500         |
| train/                  |             |
|    approx_kl            | 0.009606323 |
|    clip_fraction        | 0.153       |
|    clip_range           | 0.176       |
|    entropy_loss         | -0.686      |
|    explained_variance   | -0.0126     |
|    learning_rate        | 0.00425     |
|    loss                 | 3.1         |
|    n_updates            | 3           |
|    policy_gradient_loss | -0.0115     |
|    value_loss           | 33.5 



-----------------------------
| time/              |      |
|    fps             | 1309 |
|    iterations      | 2    |
|    time_elapsed    | 0    |
|    total_timesteps | 920  |
-----------------------------
Eval num_timesteps=1000, episode_reward=282.80 +/- 29.67
Episode length: 282.80 +/- 29.67
-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 283         |
|    mean_reward          | 283         |
| time/                   |             |
|    total_timesteps      | 1000        |
| train/                  |             |
|    approx_kl            | 0.014602743 |
|    clip_fraction        | 0.16        |
|    clip_range           | 0.176       |
|    entropy_loss         | -0.673      |
|    explained_variance   | 0.0609      |
|    learning_rate        | 0.00425     |
|    loss                 | 10.7        |
|    n_updates            | 6           |
|    policy_gradient_loss | 0.000191    |
|    value_loss           | 

0,1
eval/mean_reward,▁

0,1
eval/mean_reward,500.0


[34m[1mwandb[0m: Agent Starting Run: izrwlfhc with config:
[34m[1mwandb[0m: 	clip_range: 0.2892569697834375
[34m[1mwandb[0m: 	ent_coef: 0.006980804007713124
[34m[1mwandb[0m: 	gae_lambda: 0.9924020261966444
[34m[1mwandb[0m: 	gamma: 0.921687619532182
[34m[1mwandb[0m: 	learning_rate: 0.005588395986800764
[34m[1mwandb[0m: 	max_grad_norm: 0.37108214483521473
[34m[1mwandb[0m: 	n_epochs: 3
[34m[1mwandb[0m: 	n_steps: 997
[34m[1mwandb[0m: 	total_timesteps: 55757
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


Using cpu device


We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=997 and n_envs=1)


Eval num_timesteps=500, episode_reward=9.20 +/- 0.75
Episode length: 9.20 +/- 0.75
---------------------------------
| eval/              |          |
|    mean_ep_length  | 9.2      |
|    mean_reward     | 9.2      |
| time/              |          |
|    total_timesteps | 500      |
---------------------------------




New best mean reward!
-----------------------------
| time/              |      |
|    fps             | 1874 |
|    iterations      | 1    |
|    time_elapsed    | 0    |
|    total_timesteps | 997  |
-----------------------------
Eval num_timesteps=1000, episode_reward=255.80 +/- 149.41
Episode length: 255.80 +/- 149.41
-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 256         |
|    mean_reward          | 256         |
| time/                   |             |
|    total_timesteps      | 1000        |
| train/                  |             |
|    approx_kl            | 0.029687017 |
|    clip_fraction        | 0.157       |
|    clip_range           | 0.289       |
|    entropy_loss         | -0.672      |
|    explained_variance   | -0.0083     |
|    learning_rate        | 0.00559     |
|    loss                 | 2.82        |
|    n_updates            | 3           |
|    policy_gradient_loss | -0.0257     |
|   

0,1
eval/mean_reward,▁

0,1
eval/mean_reward,417.3


[34m[1mwandb[0m: Agent Starting Run: 384yu8dq with config:
[34m[1mwandb[0m: 	clip_range: 0.2697110407148269
[34m[1mwandb[0m: 	ent_coef: 0.008182937435302151
[34m[1mwandb[0m: 	gae_lambda: 0.9884067073825544
[34m[1mwandb[0m: 	gamma: 0.9851558093610108
[34m[1mwandb[0m: 	learning_rate: 0.005911299597872144
[34m[1mwandb[0m: 	max_grad_norm: 9.629033075280216
[34m[1mwandb[0m: 	n_epochs: 7
[34m[1mwandb[0m: 	n_steps: 684
[34m[1mwandb[0m: 	total_timesteps: 16671
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


Using cpu device


We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=684 and n_envs=1)


Eval num_timesteps=500, episode_reward=279.80 +/- 103.05
Episode length: 279.80 +/- 103.05
---------------------------------
| eval/              |          |
|    mean_ep_length  | 280      |
|    mean_reward     | 280      |
| time/              |          |
|    total_timesteps | 500      |
---------------------------------
New best mean reward!
----------------------------
| time/              |     |
|    fps             | 786 |
|    iterations      | 1   |
|    time_elapsed    | 0   |
|    total_timesteps | 684 |
----------------------------
Eval num_timesteps=1000, episode_reward=428.60 +/- 142.80
Episode length: 428.60 +/- 142.80
----------------------------------------
| eval/                   |            |
|    mean_ep_length       | 429        |
|    mean_reward          | 429        |
| time/                   |            |
|    total_timesteps      | 1000       |
| train/                  |            |
|    approx_kl            | 0.03761311 |
|    clip_fraction        

0,1
eval/mean_reward,▁

0,1
eval/mean_reward,500.0


[34m[1mwandb[0m: Agent Starting Run: socqhpcv with config:
[34m[1mwandb[0m: 	clip_range: 0.28807872877562746
[34m[1mwandb[0m: 	ent_coef: 0.002661551855965269
[34m[1mwandb[0m: 	gae_lambda: 0.905870808126836
[34m[1mwandb[0m: 	gamma: 0.9644004062571392
[34m[1mwandb[0m: 	learning_rate: 0.0035659444257926266
[34m[1mwandb[0m: 	max_grad_norm: 6.450301574876193
[34m[1mwandb[0m: 	n_epochs: 4
[34m[1mwandb[0m: 	n_steps: 1132
[34m[1mwandb[0m: 	total_timesteps: 35969
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


Using cpu device


We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=1132 and n_envs=1)


Eval num_timesteps=500, episode_reward=9.80 +/- 0.98
Episode length: 9.80 +/- 0.98
---------------------------------
| eval/              |          |
|    mean_ep_length  | 9.8      |
|    mean_reward     | 9.8      |
| time/              |          |
|    total_timesteps | 500      |
---------------------------------




New best mean reward!
Eval num_timesteps=1000, episode_reward=9.80 +/- 0.40
Episode length: 9.80 +/- 0.40
---------------------------------
| eval/              |          |
|    mean_ep_length  | 9.8      |
|    mean_reward     | 9.8      |
| time/              |          |
|    total_timesteps | 1000     |
---------------------------------
-----------------------------
| time/              |      |
|    fps             | 1819 |
|    iterations      | 1    |
|    time_elapsed    | 0    |
|    total_timesteps | 1132 |
-----------------------------
Eval num_timesteps=1500, episode_reward=487.80 +/- 24.40
Episode length: 487.80 +/- 24.40
----------------------------------------
| eval/                   |            |
|    mean_ep_length       | 488        |
|    mean_reward          | 488        |
| time/                   |            |
|    total_timesteps      | 1500       |
| train/                  |            |
|    approx_kl            | 0.01986767 |
|    clip_fraction        | 

0,1
eval/mean_reward,▁

0,1
eval/mean_reward,500.0


[34m[1mwandb[0m: Agent Starting Run: k1uhhekk with config:
[34m[1mwandb[0m: 	clip_range: 0.1299608663676622
[34m[1mwandb[0m: 	ent_coef: 0.00969745839950704
[34m[1mwandb[0m: 	gae_lambda: 0.93800791038971
[34m[1mwandb[0m: 	gamma: 0.9113254580941896
[34m[1mwandb[0m: 	learning_rate: 0.00952214721600086
[34m[1mwandb[0m: 	max_grad_norm: 1.362056748161388
[34m[1mwandb[0m: 	n_epochs: 10
[34m[1mwandb[0m: 	n_steps: 1803
[34m[1mwandb[0m: 	total_timesteps: 79321
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


Using cpu device


We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=1803 and n_envs=1)


Eval num_timesteps=500, episode_reward=8.40 +/- 0.80
Episode length: 8.40 +/- 0.80
---------------------------------
| eval/              |          |
|    mean_ep_length  | 8.4      |
|    mean_reward     | 8.4      |
| time/              |          |
|    total_timesteps | 500      |
---------------------------------




New best mean reward!
Eval num_timesteps=1000, episode_reward=8.80 +/- 0.40
Episode length: 8.80 +/- 0.40
---------------------------------
| eval/              |          |
|    mean_ep_length  | 8.8      |
|    mean_reward     | 8.8      |
| time/              |          |
|    total_timesteps | 1000     |
---------------------------------
New best mean reward!
Eval num_timesteps=1500, episode_reward=9.40 +/- 0.49
Episode length: 9.40 +/- 0.49
---------------------------------
| eval/              |          |
|    mean_ep_length  | 9.4      |
|    mean_reward     | 9.4      |
| time/              |          |
|    total_timesteps | 1500     |
---------------------------------
New best mean reward!
-----------------------------
| time/              |      |
|    fps             | 1772 |
|    iterations      | 1    |
|    time_elapsed    | 1    |
|    total_timesteps | 1803 |
-----------------------------
Eval num_timesteps=2000, episode_reward=182.20 +/- 31.22
Episode length: 182.20 

0,1
eval/mean_reward,▁

0,1
eval/mean_reward,251.8


[34m[1mwandb[0m: Agent Starting Run: cjyosavt with config:
[34m[1mwandb[0m: 	clip_range: 0.2473549928026329
[34m[1mwandb[0m: 	ent_coef: 0.0002203437209263696
[34m[1mwandb[0m: 	gae_lambda: 0.9164672597813148
[34m[1mwandb[0m: 	gamma: 0.9498650779535368
[34m[1mwandb[0m: 	learning_rate: 0.009852906142059435
[34m[1mwandb[0m: 	max_grad_norm: 2.184831490047454
[34m[1mwandb[0m: 	n_epochs: 6
[34m[1mwandb[0m: 	n_steps: 1239
[34m[1mwandb[0m: 	total_timesteps: 83769
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


Using cpu device


We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=1239 and n_envs=1)


Eval num_timesteps=500, episode_reward=8.60 +/- 0.49
Episode length: 8.60 +/- 0.49
---------------------------------
| eval/              |          |
|    mean_ep_length  | 8.6      |
|    mean_reward     | 8.6      |
| time/              |          |
|    total_timesteps | 500      |
---------------------------------




New best mean reward!
Eval num_timesteps=1000, episode_reward=9.00 +/- 0.00
Episode length: 9.00 +/- 0.00
---------------------------------
| eval/              |          |
|    mean_ep_length  | 9        |
|    mean_reward     | 9        |
| time/              |          |
|    total_timesteps | 1000     |
---------------------------------
New best mean reward!
-----------------------------
| time/              |      |
|    fps             | 1402 |
|    iterations      | 1    |
|    time_elapsed    | 0    |
|    total_timesteps | 1239 |
-----------------------------
Eval num_timesteps=1500, episode_reward=104.00 +/- 7.40
Episode length: 104.00 +/- 7.40
-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 104         |
|    mean_reward          | 104         |
| time/                   |             |
|    total_timesteps      | 1500        |
| train/                  |             |
|    approx_kl            | 0.024947602 |


0,1
eval/mean_reward,▁

0,1
eval/mean_reward,154.9


[34m[1mwandb[0m: Agent Starting Run: 31fzcdem with config:
[34m[1mwandb[0m: 	clip_range: 0.20957637228720696
[34m[1mwandb[0m: 	ent_coef: 0.0034818015362356627
[34m[1mwandb[0m: 	gae_lambda: 0.9378237002052218
[34m[1mwandb[0m: 	gamma: 0.98153159353979
[34m[1mwandb[0m: 	learning_rate: 0.001194324075765231
[34m[1mwandb[0m: 	max_grad_norm: 8.788583518270771
[34m[1mwandb[0m: 	n_epochs: 6
[34m[1mwandb[0m: 	n_steps: 517
[34m[1mwandb[0m: 	total_timesteps: 23736
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


Using cpu device


We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=517 and n_envs=1)


Eval num_timesteps=500, episode_reward=333.80 +/- 203.89
Episode length: 333.80 +/- 203.89
---------------------------------
| eval/              |          |
|    mean_ep_length  | 334      |
|    mean_reward     | 334      |
| time/              |          |
|    total_timesteps | 500      |
---------------------------------
New best mean reward!
----------------------------
| time/              |     |
|    fps             | 444 |
|    iterations      | 1   |
|    time_elapsed    | 1   |
|    total_timesteps | 517 |
----------------------------
Eval num_timesteps=1000, episode_reward=113.20 +/- 7.93
Episode length: 113.20 +/- 7.93
-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 113         |
|    mean_reward          | 113         |
| time/                   |             |
|    total_timesteps      | 1000        |
| train/                  |             |
|    approx_kl            | 0.016470388 |
|    clip_fraction    

0,1
eval/mean_reward,▁

0,1
eval/mean_reward,500.0


[34m[1mwandb[0m: Agent Starting Run: x7wb0wjf with config:
[34m[1mwandb[0m: 	clip_range: 0.1374481607323551
[34m[1mwandb[0m: 	ent_coef: 0.009735425180169654
[34m[1mwandb[0m: 	gae_lambda: 0.9756959002856878
[34m[1mwandb[0m: 	gamma: 0.9003759089141894
[34m[1mwandb[0m: 	learning_rate: 0.005230985670726304
[34m[1mwandb[0m: 	max_grad_norm: 0.9788839789054828
[34m[1mwandb[0m: 	n_epochs: 1
[34m[1mwandb[0m: 	n_steps: 1853
[34m[1mwandb[0m: 	total_timesteps: 64297
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


Using cpu device


We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=1853 and n_envs=1)


Eval num_timesteps=500, episode_reward=9.60 +/- 0.49
Episode length: 9.60 +/- 0.49
---------------------------------
| eval/              |          |
|    mean_ep_length  | 9.6      |
|    mean_reward     | 9.6      |
| time/              |          |
|    total_timesteps | 500      |
---------------------------------




New best mean reward!
Eval num_timesteps=1000, episode_reward=9.40 +/- 0.80
Episode length: 9.40 +/- 0.80
---------------------------------
| eval/              |          |
|    mean_ep_length  | 9.4      |
|    mean_reward     | 9.4      |
| time/              |          |
|    total_timesteps | 1000     |
---------------------------------
Eval num_timesteps=1500, episode_reward=9.20 +/- 0.40
Episode length: 9.20 +/- 0.40
---------------------------------
| eval/              |          |
|    mean_ep_length  | 9.2      |
|    mean_reward     | 9.2      |
| time/              |          |
|    total_timesteps | 1500     |
---------------------------------
-----------------------------
| time/              |      |
|    fps             | 1053 |
|    iterations      | 1    |
|    time_elapsed    | 1    |
|    total_timesteps | 1853 |
-----------------------------
Eval num_timesteps=2000, episode_reward=15.00 +/- 6.26
Episode length: 15.00 +/- 6.26
--------------------------------------

0,1
eval/mean_reward,▁

0,1
eval/mean_reward,204.9


[34m[1mwandb[0m: Agent Starting Run: z7wtrzz1 with config:
[34m[1mwandb[0m: 	clip_range: 0.196986950898157
[34m[1mwandb[0m: 	ent_coef: 0.0031477170199456597
[34m[1mwandb[0m: 	gae_lambda: 0.9184080285739772
[34m[1mwandb[0m: 	gamma: 0.915802251699358
[34m[1mwandb[0m: 	learning_rate: 0.005392630155081313
[34m[1mwandb[0m: 	max_grad_norm: 9.547772011181069
[34m[1mwandb[0m: 	n_epochs: 2
[34m[1mwandb[0m: 	n_steps: 1711
[34m[1mwandb[0m: 	total_timesteps: 69478
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


Using cpu device


We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=1711 and n_envs=1)


Eval num_timesteps=500, episode_reward=9.40 +/- 0.80
Episode length: 9.40 +/- 0.80
---------------------------------
| eval/              |          |
|    mean_ep_length  | 9.4      |
|    mean_reward     | 9.4      |
| time/              |          |
|    total_timesteps | 500      |
---------------------------------




New best mean reward!
Eval num_timesteps=1000, episode_reward=9.20 +/- 0.75
Episode length: 9.20 +/- 0.75
---------------------------------
| eval/              |          |
|    mean_ep_length  | 9.2      |
|    mean_reward     | 9.2      |
| time/              |          |
|    total_timesteps | 1000     |
---------------------------------
Eval num_timesteps=1500, episode_reward=8.20 +/- 0.40
Episode length: 8.20 +/- 0.40
---------------------------------
| eval/              |          |
|    mean_ep_length  | 8.2      |
|    mean_reward     | 8.2      |
| time/              |          |
|    total_timesteps | 1500     |
---------------------------------
-----------------------------
| time/              |      |
|    fps             | 1494 |
|    iterations      | 1    |
|    time_elapsed    | 1    |
|    total_timesteps | 1711 |
-----------------------------
Eval num_timesteps=2000, episode_reward=160.20 +/- 145.12
Episode length: 160.20 +/- 145.12
--------------------------------

0,1
eval/mean_reward,▁

0,1
eval/mean_reward,498.6


[34m[1mwandb[0m: Agent Starting Run: jkwh301z with config:
[34m[1mwandb[0m: 	clip_range: 0.1849857630551267
[34m[1mwandb[0m: 	ent_coef: 0.007180567750631555
[34m[1mwandb[0m: 	gae_lambda: 0.9942681549969496
[34m[1mwandb[0m: 	gamma: 0.9506654249791108
[34m[1mwandb[0m: 	learning_rate: 0.004436602202663294
[34m[1mwandb[0m: 	max_grad_norm: 8.707682615770127
[34m[1mwandb[0m: 	n_epochs: 3
[34m[1mwandb[0m: 	n_steps: 1748
[34m[1mwandb[0m: 	total_timesteps: 81218
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


Using cpu device


We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=1748 and n_envs=1)


Eval num_timesteps=500, episode_reward=8.80 +/- 0.75
Episode length: 8.80 +/- 0.75
---------------------------------
| eval/              |          |
|    mean_ep_length  | 8.8      |
|    mean_reward     | 8.8      |
| time/              |          |
|    total_timesteps | 500      |
---------------------------------




New best mean reward!
Eval num_timesteps=1000, episode_reward=9.40 +/- 0.80
Episode length: 9.40 +/- 0.80
---------------------------------
| eval/              |          |
|    mean_ep_length  | 9.4      |
|    mean_reward     | 9.4      |
| time/              |          |
|    total_timesteps | 1000     |
---------------------------------
New best mean reward!
Eval num_timesteps=1500, episode_reward=10.00 +/- 0.00
Episode length: 10.00 +/- 0.00
---------------------------------
| eval/              |          |
|    mean_ep_length  | 10       |
|    mean_reward     | 10       |
| time/              |          |
|    total_timesteps | 1500     |
---------------------------------
New best mean reward!
-----------------------------
| time/              |      |
|    fps             | 1378 |
|    iterations      | 1    |
|    time_elapsed    | 1    |
|    total_timesteps | 1748 |
-----------------------------
Eval num_timesteps=2000, episode_reward=83.60 +/- 11.72
Episode length: 83.60 

0,1
eval/mean_reward,▁

0,1
eval/mean_reward,500.0


[34m[1mwandb[0m: Agent Starting Run: ll06qkgr with config:
[34m[1mwandb[0m: 	clip_range: 0.29314870202467325
[34m[1mwandb[0m: 	ent_coef: 0.009570412220468533
[34m[1mwandb[0m: 	gae_lambda: 0.958532008728457
[34m[1mwandb[0m: 	gamma: 0.9248538784674732
[34m[1mwandb[0m: 	learning_rate: 0.007857759340494057
[34m[1mwandb[0m: 	max_grad_norm: 8.553789880818755
[34m[1mwandb[0m: 	n_epochs: 4
[34m[1mwandb[0m: 	n_steps: 1282
[34m[1mwandb[0m: 	total_timesteps: 16904
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


Using cpu device


We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=1282 and n_envs=1)


Eval num_timesteps=500, episode_reward=8.80 +/- 0.75
Episode length: 8.80 +/- 0.75
---------------------------------
| eval/              |          |
|    mean_ep_length  | 8.8      |
|    mean_reward     | 8.8      |
| time/              |          |
|    total_timesteps | 500      |
---------------------------------




New best mean reward!
Eval num_timesteps=1000, episode_reward=9.40 +/- 0.49
Episode length: 9.40 +/- 0.49
---------------------------------
| eval/              |          |
|    mean_ep_length  | 9.4      |
|    mean_reward     | 9.4      |
| time/              |          |
|    total_timesteps | 1000     |
---------------------------------
New best mean reward!
-----------------------------
| time/              |      |
|    fps             | 638  |
|    iterations      | 1    |
|    time_elapsed    | 2    |
|    total_timesteps | 1282 |
-----------------------------
Eval num_timesteps=1500, episode_reward=154.80 +/- 117.34
Episode length: 154.80 +/- 117.34
-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 155         |
|    mean_reward          | 155         |
| time/                   |             |
|    total_timesteps      | 1500        |
| train/                  |             |
|    approx_kl            | 0.03319135

0,1
eval/mean_reward,▁

0,1
eval/mean_reward,500.0


[34m[1mwandb[0m: Agent Starting Run: boi8yjct with config:
[34m[1mwandb[0m: 	clip_range: 0.1508130227534313
[34m[1mwandb[0m: 	ent_coef: 0.009426977350216113
[34m[1mwandb[0m: 	gae_lambda: 0.9757205918062372
[34m[1mwandb[0m: 	gamma: 0.9058556536144868
[34m[1mwandb[0m: 	learning_rate: 0.008185328776173572
[34m[1mwandb[0m: 	max_grad_norm: 3.0563725417889787
[34m[1mwandb[0m: 	n_epochs: 1
[34m[1mwandb[0m: 	n_steps: 653
[34m[1mwandb[0m: 	total_timesteps: 37824
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


Using cpu device


We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=653 and n_envs=1)


Eval num_timesteps=500, episode_reward=18.80 +/- 7.52
Episode length: 18.80 +/- 7.52
---------------------------------
| eval/              |          |
|    mean_ep_length  | 18.8     |
|    mean_reward     | 18.8     |
| time/              |          |
|    total_timesteps | 500      |
---------------------------------




New best mean reward!
----------------------------
| time/              |     |
|    fps             | 507 |
|    iterations      | 1   |
|    time_elapsed    | 1   |
|    total_timesteps | 653 |
----------------------------
Eval num_timesteps=1000, episode_reward=317.20 +/- 100.74
Episode length: 317.20 +/- 100.74
------------------------------------------
| eval/                   |              |
|    mean_ep_length       | 317          |
|    mean_reward          | 317          |
| time/                   |              |
|    total_timesteps      | 1000         |
| train/                  |              |
|    approx_kl            | 0.0039205356 |
|    clip_fraction        | 0.0923       |
|    clip_range           | 0.151        |
|    entropy_loss         | -0.689       |
|    explained_variance   | -0.00968     |
|    learning_rate        | 0.00819      |
|    loss                 | 6.86         |
|    n_updates            | 1            |
|    policy_gradient_loss | -0.00152  

0,1
eval/mean_reward,▁

0,1
eval/mean_reward,450.6


[34m[1mwandb[0m: Agent Starting Run: eqxegvos with config:
[34m[1mwandb[0m: 	clip_range: 0.2996068659099387
[34m[1mwandb[0m: 	ent_coef: 0.0012493095907159064
[34m[1mwandb[0m: 	gae_lambda: 0.9262006170642736
[34m[1mwandb[0m: 	gamma: 0.9623579638081332
[34m[1mwandb[0m: 	learning_rate: 0.00860166487035158
[34m[1mwandb[0m: 	max_grad_norm: 0.63623071112113
[34m[1mwandb[0m: 	n_epochs: 3
[34m[1mwandb[0m: 	n_steps: 756
[34m[1mwandb[0m: 	total_timesteps: 54292
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


Using cpu device


We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=756 and n_envs=1)


Eval num_timesteps=500, episode_reward=40.60 +/- 7.61
Episode length: 40.60 +/- 7.61
---------------------------------
| eval/              |          |
|    mean_ep_length  | 40.6     |
|    mean_reward     | 40.6     |
| time/              |          |
|    total_timesteps | 500      |
---------------------------------




New best mean reward!
-----------------------------
| time/              |      |
|    fps             | 1153 |
|    iterations      | 1    |
|    time_elapsed    | 0    |
|    total_timesteps | 756  |
-----------------------------
Eval num_timesteps=1000, episode_reward=9.60 +/- 0.49
Episode length: 9.60 +/- 0.49
-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 9.6         |
|    mean_reward          | 9.6         |
| time/                   |             |
|    total_timesteps      | 1000        |
| train/                  |             |
|    approx_kl            | 0.021646874 |
|    clip_fraction        | 0.063       |
|    clip_range           | 0.3         |
|    entropy_loss         | -0.68       |
|    explained_variance   | 0.00395     |
|    learning_rate        | 0.0086      |
|    loss                 | 1.64        |
|    n_updates            | 3           |
|    policy_gradient_loss | 0.000283    |
|    value_l

0,1
eval/mean_reward,▁

0,1
eval/mean_reward,315.0


[34m[1mwandb[0m: Agent Starting Run: vnf3c7eq with config:
[34m[1mwandb[0m: 	clip_range: 0.23728679947549733
[34m[1mwandb[0m: 	ent_coef: 0.0016631842259084272
[34m[1mwandb[0m: 	gae_lambda: 0.9134182622221212
[34m[1mwandb[0m: 	gamma: 0.942158218417942
[34m[1mwandb[0m: 	learning_rate: 0.00979225000275359
[34m[1mwandb[0m: 	max_grad_norm: 4.928934873366703
[34m[1mwandb[0m: 	n_epochs: 5
[34m[1mwandb[0m: 	n_steps: 879
[34m[1mwandb[0m: 	total_timesteps: 69361
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


Using cpu device


We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=879 and n_envs=1)


Eval num_timesteps=500, episode_reward=9.20 +/- 0.40
Episode length: 9.20 +/- 0.40
---------------------------------
| eval/              |          |
|    mean_ep_length  | 9.2      |
|    mean_reward     | 9.2      |
| time/              |          |
|    total_timesteps | 500      |
---------------------------------




New best mean reward!
-----------------------------
| time/              |      |
|    fps             | 1632 |
|    iterations      | 1    |
|    time_elapsed    | 0    |
|    total_timesteps | 879  |
-----------------------------
Eval num_timesteps=1000, episode_reward=189.40 +/- 37.59
Episode length: 189.40 +/- 37.59
-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 189         |
|    mean_reward          | 189         |
| time/                   |             |
|    total_timesteps      | 1000        |
| train/                  |             |
|    approx_kl            | 0.024092585 |
|    clip_fraction        | 0.24        |
|    clip_range           | 0.237       |
|    entropy_loss         | -0.674      |
|    explained_variance   | -0.0206     |
|    learning_rate        | 0.00979     |
|    loss                 | 0.647       |
|    n_updates            | 5           |
|    policy_gradient_loss | -0.0232     |
|    v

0,1
eval/mean_reward,▁

0,1
eval/mean_reward,500.0


[34m[1mwandb[0m: Agent Starting Run: 3nsyighl with config:
[34m[1mwandb[0m: 	clip_range: 0.1678469335539313
[34m[1mwandb[0m: 	ent_coef: 0.0014188947493194357
[34m[1mwandb[0m: 	gae_lambda: 0.9469893730279674
[34m[1mwandb[0m: 	gamma: 0.9479189901092728
[34m[1mwandb[0m: 	learning_rate: 0.007317704867293711
[34m[1mwandb[0m: 	max_grad_norm: 0.6012951338231856
[34m[1mwandb[0m: 	n_epochs: 9
[34m[1mwandb[0m: 	n_steps: 1782
[34m[1mwandb[0m: 	total_timesteps: 85290
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


Using cpu device


We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=1782 and n_envs=1)


Eval num_timesteps=500, episode_reward=51.20 +/- 13.96
Episode length: 51.20 +/- 13.96
---------------------------------
| eval/              |          |
|    mean_ep_length  | 51.2     |
|    mean_reward     | 51.2     |
| time/              |          |
|    total_timesteps | 500      |
---------------------------------




New best mean reward!
Eval num_timesteps=1000, episode_reward=97.80 +/- 29.69
Episode length: 97.80 +/- 29.69
---------------------------------
| eval/              |          |
|    mean_ep_length  | 97.8     |
|    mean_reward     | 97.8     |
| time/              |          |
|    total_timesteps | 1000     |
---------------------------------
New best mean reward!
Eval num_timesteps=1500, episode_reward=58.80 +/- 21.19
Episode length: 58.80 +/- 21.19
---------------------------------
| eval/              |          |
|    mean_ep_length  | 58.8     |
|    mean_reward     | 58.8     |
| time/              |          |
|    total_timesteps | 1500     |
---------------------------------
-----------------------------
| time/              |      |
|    fps             | 1484 |
|    iterations      | 1    |
|    time_elapsed    | 1    |
|    total_timesteps | 1782 |
-----------------------------
Eval num_timesteps=2000, episode_reward=275.80 +/- 62.69
Episode length: 275.80 +/- 62.69
----

0,1
eval/mean_reward,▁

0,1
eval/mean_reward,253.3


[34m[1mwandb[0m: Agent Starting Run: 7oqe6lm9 with config:
[34m[1mwandb[0m: 	clip_range: 0.2924289164270875
[34m[1mwandb[0m: 	ent_coef: 0.006625633589261951
[34m[1mwandb[0m: 	gae_lambda: 0.9817254868830296
[34m[1mwandb[0m: 	gamma: 0.9217850917543364
[34m[1mwandb[0m: 	learning_rate: 0.009857731630951496
[34m[1mwandb[0m: 	max_grad_norm: 1.053873457159194
[34m[1mwandb[0m: 	n_epochs: 1
[34m[1mwandb[0m: 	n_steps: 1376
[34m[1mwandb[0m: 	total_timesteps: 79389
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


Using cpu device


We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=1376 and n_envs=1)


Eval num_timesteps=500, episode_reward=11.80 +/- 2.32
Episode length: 11.80 +/- 2.32
---------------------------------
| eval/              |          |
|    mean_ep_length  | 11.8     |
|    mean_reward     | 11.8     |
| time/              |          |
|    total_timesteps | 500      |
---------------------------------




New best mean reward!
Eval num_timesteps=1000, episode_reward=9.80 +/- 2.64
Episode length: 9.80 +/- 2.64
---------------------------------
| eval/              |          |
|    mean_ep_length  | 9.8      |
|    mean_reward     | 9.8      |
| time/              |          |
|    total_timesteps | 1000     |
---------------------------------
-----------------------------
| time/              |      |
|    fps             | 665  |
|    iterations      | 1    |
|    time_elapsed    | 2    |
|    total_timesteps | 1376 |
-----------------------------
Eval num_timesteps=1500, episode_reward=219.40 +/- 77.16
Episode length: 219.40 +/- 77.16
-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 219         |
|    mean_reward          | 219         |
| time/                   |             |
|    total_timesteps      | 1500        |
| train/                  |             |
|    approx_kl            | 0.015136882 |
|    clip_fraction  

0,1
eval/mean_reward,▁

0,1
eval/mean_reward,174.0


[34m[1mwandb[0m: Agent Starting Run: p9pbrtbo with config:
[34m[1mwandb[0m: 	clip_range: 0.17538614586038498
[34m[1mwandb[0m: 	ent_coef: 0.004948865621239801
[34m[1mwandb[0m: 	gae_lambda: 0.9022874465577274
[34m[1mwandb[0m: 	gamma: 0.9331839741723236
[34m[1mwandb[0m: 	learning_rate: 0.006928698267084203
[34m[1mwandb[0m: 	max_grad_norm: 8.137605341865102
[34m[1mwandb[0m: 	n_epochs: 3
[34m[1mwandb[0m: 	n_steps: 1181
[34m[1mwandb[0m: 	total_timesteps: 32133
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


Using cpu device


We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=1181 and n_envs=1)


Eval num_timesteps=500, episode_reward=125.60 +/- 62.26
Episode length: 125.60 +/- 62.26
---------------------------------
| eval/              |          |
|    mean_ep_length  | 126      |
|    mean_reward     | 126      |
| time/              |          |
|    total_timesteps | 500      |
---------------------------------
New best mean reward!
Eval num_timesteps=1000, episode_reward=136.80 +/- 66.04
Episode length: 136.80 +/- 66.04
---------------------------------
| eval/              |          |
|    mean_ep_length  | 137      |
|    mean_reward     | 137      |
| time/              |          |
|    total_timesteps | 1000     |
---------------------------------
New best mean reward!
-----------------------------
| time/              |      |
|    fps             | 911  |
|    iterations      | 1    |
|    time_elapsed    | 1    |
|    total_timesteps | 1181 |
-----------------------------
Eval num_timesteps=1500, episode_reward=72.80 +/- 7.86
Episode length: 72.80 +/- 7.86
-----

0,1
eval/mean_reward,▁

0,1
eval/mean_reward,211.1


[34m[1mwandb[0m: Agent Starting Run: 2871tio1 with config:
[34m[1mwandb[0m: 	clip_range: 0.13800909172177428
[34m[1mwandb[0m: 	ent_coef: 0.0006701640118988939
[34m[1mwandb[0m: 	gae_lambda: 0.9937755798080838
[34m[1mwandb[0m: 	gamma: 0.9087862201061372
[34m[1mwandb[0m: 	learning_rate: 0.0025920242175827436
[34m[1mwandb[0m: 	max_grad_norm: 8.878908130756182
[34m[1mwandb[0m: 	n_epochs: 5
[34m[1mwandb[0m: 	n_steps: 189
[34m[1mwandb[0m: 	total_timesteps: 42435
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


Using cpu device
-----------------------------
| time/              |      |
|    fps             | 1171 |
|    iterations      | 1    |
|    time_elapsed    | 0    |
|    total_timesteps | 189  |
-----------------------------


We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=189 and n_envs=1)


------------------------------------------
| time/                   |              |
|    fps                  | 924          |
|    iterations           | 2            |
|    time_elapsed         | 0            |
|    total_timesteps      | 378          |
| train/                  |              |
|    approx_kl            | 0.0010510014 |
|    clip_fraction        | 0.0456       |
|    clip_range           | 0.138        |
|    entropy_loss         | -0.691       |
|    explained_variance   | -0.025       |
|    learning_rate        | 0.00259      |
|    loss                 | 16.6         |
|    n_updates            | 5            |
|    policy_gradient_loss | -0.00191     |
|    value_loss           | 42.3         |
------------------------------------------
Eval num_timesteps=500, episode_reward=36.80 +/- 9.58
Episode length: 36.80 +/- 9.58
-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 36.8        |
|    mean_rewar



-----------------------------------------
| time/                   |             |
|    fps                  | 856         |
|    iterations           | 4           |
|    time_elapsed         | 0           |
|    total_timesteps      | 756         |
| train/                  |             |
|    approx_kl            | 0.003568032 |
|    clip_fraction        | 0.00963     |
|    clip_range           | 0.138       |
|    entropy_loss         | -0.684      |
|    explained_variance   | -0.107      |
|    learning_rate        | 0.00259     |
|    loss                 | 3.4         |
|    n_updates            | 15          |
|    policy_gradient_loss | -0.00328    |
|    value_loss           | 9.74        |
-----------------------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 825         |
|    iterations           | 5           |
|    time_elapsed         | 1           |
|    total_timesteps      | 945   

0,1
eval/mean_reward,▁

0,1
eval/mean_reward,128.9


[34m[1mwandb[0m: Agent Starting Run: 1bahrtuk with config:
[34m[1mwandb[0m: 	clip_range: 0.198388751321804
[34m[1mwandb[0m: 	ent_coef: 0.0035612128843734293
[34m[1mwandb[0m: 	gae_lambda: 0.9935998995645532
[34m[1mwandb[0m: 	gamma: 0.909121931564672
[34m[1mwandb[0m: 	learning_rate: 0.0049397506384296136
[34m[1mwandb[0m: 	max_grad_norm: 6.175786490883611
[34m[1mwandb[0m: 	n_epochs: 2
[34m[1mwandb[0m: 	n_steps: 1286
[34m[1mwandb[0m: 	total_timesteps: 95409
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


Process ForkServerProcess-4:
Process ForkServerProcess-3:
Process ForkServerProcess-2:
Process ForkServerProcess-1:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/usr/local/Cellar/python@3.9/3.9.13_1/Frameworks/Python.framework/Versions/3.9/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/usr/local/Cellar/python@3.9/3.9.13_1/Frameworks/Python.framework/Versions/3.9/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/usr/local/Cellar/python@3.9/3.9.13_1/Frameworks/Python.framework/Versions/3.9/lib/python3.9/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/local/Cellar/python@3.9/3.9.13_1/Frameworks/Python.framework/Versions/3.9/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/Users/cschmidl/phd/repos/wandb-playground/venv/lib/python3.9/site-pack

Run 1bahrtuk errored: EOFError()
[34m[1mwandb[0m: [32m[41mERROR[0m Run 1bahrtuk errored: EOFError()
[34m[1mwandb[0m: Agent Starting Run: 5jnaseew with config:
[34m[1mwandb[0m: 	clip_range: 0.27941671028997983
[34m[1mwandb[0m: 	ent_coef: 0.008131621745903757
[34m[1mwandb[0m: 	gae_lambda: 0.9674789546052766
[34m[1mwandb[0m: 	gamma: 0.9263553900546296
[34m[1mwandb[0m: 	learning_rate: 0.007256660913341136
[34m[1mwandb[0m: 	max_grad_norm: 2.3836737242784025
[34m[1mwandb[0m: 	n_epochs: 3
[34m[1mwandb[0m: 	n_steps: 1741
[34m[1mwandb[0m: 	total_timesteps: 97049
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


Process ForkServerProcess-5:
Traceback (most recent call last):
  File "/usr/local/Cellar/python@3.9/3.9.13_1/Frameworks/Python.framework/Versions/3.9/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/usr/local/Cellar/python@3.9/3.9.13_1/Frameworks/Python.framework/Versions/3.9/lib/python3.9/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/Users/cschmidl/phd/repos/wandb-playground/venv/lib/python3.9/site-packages/stable_baselines3/common/vec_env/subproc_vec_env.py", line 25, in _worker
    env = env_fn_wrapper.var()
  File "/var/folders/w3/204l8n9n6g1c3vgzxxhhh7500000gn/T/ipykernel_87305/3773973735.py", line 4, in _init
  File "/Users/cschmidl/phd/repos/wandb-playground/venv/lib/python3.9/site-packages/gym/core.py", line 301, in seed
    return self.env.seed(seed)
  File "/Users/cschmidl/phd/repos/wandb-playground/venv/lib/python3.9/site-packages/gym/envs/classic_control/cartpole.py", line 100,

Process ForkServerProcess-8:
Traceback (most recent call last):
  File "/usr/local/Cellar/python@3.9/3.9.13_1/Frameworks/Python.framework/Versions/3.9/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/usr/local/Cellar/python@3.9/3.9.13_1/Frameworks/Python.framework/Versions/3.9/lib/python3.9/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/Users/cschmidl/phd/repos/wandb-playground/venv/lib/python3.9/site-packages/stable_baselines3/common/vec_env/subproc_vec_env.py", line 25, in _worker
    env = env_fn_wrapper.var()
  File "/var/folders/w3/204l8n9n6g1c3vgzxxhhh7500000gn/T/ipykernel_87305/3773973735.py", line 4, in _init
  File "/Users/cschmidl/phd/repos/wandb-playground/venv/lib/python3.9/site-packages/gym/core.py", line 301, in seed
    return self.env.seed(seed)
  File "/Users/cschmidl/phd/repos/wandb-playground/venv/lib/python3.9/site-packages/gym/envs/classic_control/cartpole.py", line 100,

Run 5jnaseew errored: EOFError()
[34m[1mwandb[0m: [32m[41mERROR[0m Run 5jnaseew errored: EOFError()
[34m[1mwandb[0m: Agent Starting Run: m7u27qj0 with config:
[34m[1mwandb[0m: 	clip_range: 0.1548381238349878
[34m[1mwandb[0m: 	ent_coef: 0.00863732323016807
[34m[1mwandb[0m: 	gae_lambda: 0.9040056905994543
[34m[1mwandb[0m: 	gamma: 0.9354049976651054
[34m[1mwandb[0m: 	learning_rate: 0.007499545892770019
[34m[1mwandb[0m: 	max_grad_norm: 7.435089408020336
[34m[1mwandb[0m: 	n_epochs: 5
[34m[1mwandb[0m: 	n_steps: 590
[34m[1mwandb[0m: 	total_timesteps: 51464
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


Process ForkServerProcess-10:
Process ForkServerProcess-9:
Traceback (most recent call last):
Traceback (most recent call last):
  File "/usr/local/Cellar/python@3.9/3.9.13_1/Frameworks/Python.framework/Versions/3.9/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/usr/local/Cellar/python@3.9/3.9.13_1/Frameworks/Python.framework/Versions/3.9/lib/python3.9/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/Users/cschmidl/phd/repos/wandb-playground/venv/lib/python3.9/site-packages/stable_baselines3/common/vec_env/subproc_vec_env.py", line 25, in _worker
    env = env_fn_wrapper.var()
  File "/var/folders/w3/204l8n9n6g1c3vgzxxhhh7500000gn/T/ipykernel_87305/3773973735.py", line 4, in _init
  File "/usr/local/Cellar/python@3.9/3.9.13_1/Frameworks/Python.framework/Versions/3.9/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/Users/cschmidl/phd/repos/wandb-playg

Process ForkServerProcess-12:
Traceback (most recent call last):
  File "/usr/local/Cellar/python@3.9/3.9.13_1/Frameworks/Python.framework/Versions/3.9/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/usr/local/Cellar/python@3.9/3.9.13_1/Frameworks/Python.framework/Versions/3.9/lib/python3.9/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/Users/cschmidl/phd/repos/wandb-playground/venv/lib/python3.9/site-packages/stable_baselines3/common/vec_env/subproc_vec_env.py", line 25, in _worker
    env = env_fn_wrapper.var()
  File "/var/folders/w3/204l8n9n6g1c3vgzxxhhh7500000gn/T/ipykernel_87305/3773973735.py", line 4, in _init
  File "/Users/cschmidl/phd/repos/wandb-playground/venv/lib/python3.9/site-packages/gym/core.py", line 301, in seed
    return self.env.seed(seed)
  File "/Users/cschmidl/phd/repos/wandb-playground/venv/lib/python3.9/site-packages/gym/envs/classic_control/cartpole.py", line 100

Run m7u27qj0 errored: EOFError()
[34m[1mwandb[0m: [32m[41mERROR[0m Run m7u27qj0 errored: EOFError()
Detected 3 failed runs in the first 60 seconds, killing sweep.
[34m[1mwandb[0m: [32m[41mERROR[0m Detected 3 failed runs in the first 60 seconds, killing sweep.
[34m[1mwandb[0m: To disable this check set WANDB_AGENT_DISABLE_FLAPPING=true


# Run the sweep: Multi env

In [18]:
wandb.agent(multi_sweep_id, function=sweep_agent_multi, count=20)

[34m[1mwandb[0m: Agent Starting Run: cl7g958b with config:
[34m[1mwandb[0m: 	clip_range: 0.2511996313412903
[34m[1mwandb[0m: 	ent_coef: 0.008940710676994899
[34m[1mwandb[0m: 	gae_lambda: 0.948714940686066
[34m[1mwandb[0m: 	gamma: 0.9043587061935928
[34m[1mwandb[0m: 	learning_rate: 0.0004970508894585218
[34m[1mwandb[0m: 	max_grad_norm: 7.593603572713287
[34m[1mwandb[0m: 	n_epochs: 9
[34m[1mwandb[0m: 	n_steps: 179
[34m[1mwandb[0m: 	total_timesteps: 56851
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


Using cpu device
-----------------------------
| time/              |      |
|    fps             | 3719 |
|    iterations      | 1    |
|    time_elapsed    | 0    |
|    total_timesteps | 716  |
-----------------------------


We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=179 and n_envs=4)


-----------------------------------------
| time/                   |             |
|    fps                  | 2002        |
|    iterations           | 2           |
|    time_elapsed         | 0           |
|    total_timesteps      | 1432        |
| train/                  |             |
|    approx_kl            | 0.008591862 |
|    clip_fraction        | 0.0307      |
|    clip_range           | 0.251       |
|    entropy_loss         | -0.687      |
|    explained_variance   | -0.00978    |
|    learning_rate        | 0.000497    |
|    loss                 | 6.15        |
|    n_updates            | 9           |
|    policy_gradient_loss | -0.00606    |
|    value_loss           | 22.1        |
-----------------------------------------
Eval num_timesteps=2000, episode_reward=45.80 +/- 6.34
Episode length: 45.80 +/- 6.34
-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 45.8        |
|    mean_reward          | 45.8



-----------------------------------------
| time/                   |             |
|    fps                  | 1585        |
|    iterations           | 4           |
|    time_elapsed         | 1           |
|    total_timesteps      | 2864        |
| train/                  |             |
|    approx_kl            | 0.010666304 |
|    clip_fraction        | 0.0693      |
|    clip_range           | 0.251       |
|    entropy_loss         | -0.628      |
|    explained_variance   | 0.0852      |
|    learning_rate        | 0.000497    |
|    loss                 | 4.54        |
|    n_updates            | 27          |
|    policy_gradient_loss | -0.023      |
|    value_loss           | 4.06        |
-----------------------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 1528        |
|    iterations           | 5           |
|    time_elapsed         | 2           |
|    total_timesteps      | 3580  

0,1
eval/mean_reward,▁

0,1
eval/mean_reward,164.1


[34m[1mwandb[0m: Agent Starting Run: htf8wkb2 with config:
[34m[1mwandb[0m: 	clip_range: 0.22142595193361375
[34m[1mwandb[0m: 	ent_coef: 0.004019576161769689
[34m[1mwandb[0m: 	gae_lambda: 0.9939615430120788
[34m[1mwandb[0m: 	gamma: 0.914968521952961
[34m[1mwandb[0m: 	learning_rate: 0.007455090939542673
[34m[1mwandb[0m: 	max_grad_norm: 0.8560975048503298
[34m[1mwandb[0m: 	n_epochs: 5
[34m[1mwandb[0m: 	n_steps: 363
[34m[1mwandb[0m: 	total_timesteps: 31322
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


Using cpu device


We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=363 and n_envs=4)


-----------------------------
| time/              |      |
|    fps             | 3322 |
|    iterations      | 1    |
|    time_elapsed    | 0    |
|    total_timesteps | 1452 |
-----------------------------




Eval num_timesteps=2000, episode_reward=242.80 +/- 34.03
Episode length: 242.80 +/- 34.03
-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 243         |
|    mean_reward          | 243         |
| time/                   |             |
|    total_timesteps      | 2000        |
| train/                  |             |
|    approx_kl            | 0.022166245 |
|    clip_fraction        | 0.213       |
|    clip_range           | 0.221       |
|    entropy_loss         | -0.678      |
|    explained_variance   | -0.00129    |
|    learning_rate        | 0.00746     |
|    loss                 | 0.884       |
|    n_updates            | 5           |
|    policy_gradient_loss | -0.0112     |
|    value_loss           | 7.89        |
-----------------------------------------
New best mean reward!
-----------------------------
| time/              |      |
|    fps             | 1806 |
|    iterations      | 2    |
|    time_el

0,1
eval/mean_reward,▁

0,1
eval/mean_reward,334.1


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: lt3ngz64 with config:
[34m[1mwandb[0m: 	clip_range: 0.26826561422371387
[34m[1mwandb[0m: 	ent_coef: 0.00045131284383028103
[34m[1mwandb[0m: 	gae_lambda: 0.9190931991195735
[34m[1mwandb[0m: 	gamma: 0.9425235444069028
[34m[1mwandb[0m: 	learning_rate: 0.009254711792238631
[34m[1mwandb[0m: 	max_grad_norm: 5.925643982366641
[34m[1mwandb[0m: 	n_epochs: 2
[34m[1mwandb[0m: 	n_steps: 357
[34m[1mwandb[0m: 	total_timesteps: 76208
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


Using cpu device


We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=357 and n_envs=4)


-----------------------------
| time/              |      |
|    fps             | 1463 |
|    iterations      | 1    |
|    time_elapsed    | 0    |
|    total_timesteps | 1428 |
-----------------------------




Eval num_timesteps=2000, episode_reward=78.20 +/- 6.49
Episode length: 78.20 +/- 6.49
-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 78.2        |
|    mean_reward          | 78.2        |
| time/                   |             |
|    total_timesteps      | 2000        |
| train/                  |             |
|    approx_kl            | 0.025846608 |
|    clip_fraction        | 0.203       |
|    clip_range           | 0.268       |
|    entropy_loss         | -0.672      |
|    explained_variance   | -0.00587    |
|    learning_rate        | 0.00925     |
|    loss                 | 0.717       |
|    n_updates            | 2           |
|    policy_gradient_loss | -0.0288     |
|    value_loss           | 7.54        |
-----------------------------------------
New best mean reward!
-----------------------------
| time/              |      |
|    fps             | 1191 |
|    iterations      | 2    |
|    time_elapse

0,1
eval/mean_reward,▁

0,1
eval/mean_reward,500.0


[34m[1mwandb[0m: Agent Starting Run: j4f9y0h0 with config:
[34m[1mwandb[0m: 	clip_range: 0.28679502905891463
[34m[1mwandb[0m: 	ent_coef: 0.003492115143496844
[34m[1mwandb[0m: 	gae_lambda: 0.969664421079512
[34m[1mwandb[0m: 	gamma: 0.9447246576943796
[34m[1mwandb[0m: 	learning_rate: 0.008435304788672422
[34m[1mwandb[0m: 	max_grad_norm: 1.2858542687842636
[34m[1mwandb[0m: 	n_epochs: 6
[34m[1mwandb[0m: 	n_steps: 1078
[34m[1mwandb[0m: 	total_timesteps: 39522
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


Using cpu device


We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=1078 and n_envs=4)


Eval num_timesteps=2000, episode_reward=9.80 +/- 0.40
Episode length: 9.80 +/- 0.40
---------------------------------
| eval/              |          |
|    mean_ep_length  | 9.8      |
|    mean_reward     | 9.8      |
| time/              |          |
|    total_timesteps | 2000     |
---------------------------------




New best mean reward!
Eval num_timesteps=4000, episode_reward=10.00 +/- 0.63
Episode length: 10.00 +/- 0.63
---------------------------------
| eval/              |          |
|    mean_ep_length  | 10       |
|    mean_reward     | 10       |
| time/              |          |
|    total_timesteps | 4000     |
---------------------------------
New best mean reward!
-----------------------------
| time/              |      |
|    fps             | 3697 |
|    iterations      | 1    |
|    time_elapsed    | 1    |
|    total_timesteps | 4312 |
-----------------------------
Eval num_timesteps=6000, episode_reward=294.20 +/- 111.05
Episode length: 294.20 +/- 111.05
-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 294         |
|    mean_reward          | 294         |
| time/                   |             |
|    total_timesteps      | 6000        |
| train/                  |             |
|    approx_kl            | 0.036308

0,1
eval/mean_reward,▁

0,1
eval/mean_reward,492.2


[34m[1mwandb[0m: Agent Starting Run: qfezt17m with config:
[34m[1mwandb[0m: 	clip_range: 0.10160280177355592
[34m[1mwandb[0m: 	ent_coef: 0.004913550107822112
[34m[1mwandb[0m: 	gae_lambda: 0.9267262165397676
[34m[1mwandb[0m: 	gamma: 0.9301180889797372
[34m[1mwandb[0m: 	learning_rate: 0.003544989206473057
[34m[1mwandb[0m: 	max_grad_norm: 7.1837055551636935
[34m[1mwandb[0m: 	n_epochs: 1
[34m[1mwandb[0m: 	n_steps: 480
[34m[1mwandb[0m: 	total_timesteps: 60222
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


Using cpu device




-----------------------------
| time/              |      |
|    fps             | 2734 |
|    iterations      | 1    |
|    time_elapsed    | 0    |
|    total_timesteps | 1920 |
-----------------------------




Eval num_timesteps=2000, episode_reward=290.00 +/- 20.95
Episode length: 290.00 +/- 20.95
------------------------------------------
| eval/                   |              |
|    mean_ep_length       | 290          |
|    mean_reward          | 290          |
| time/                   |              |
|    total_timesteps      | 2000         |
| train/                  |              |
|    approx_kl            | 0.0024749634 |
|    clip_fraction        | 0.13         |
|    clip_range           | 0.102        |
|    entropy_loss         | -0.691       |
|    explained_variance   | 0.00328      |
|    learning_rate        | 0.00354      |
|    loss                 | 1.76         |
|    n_updates            | 1            |
|    policy_gradient_loss | -0.00428     |
|    value_loss           | 14.8         |
------------------------------------------
New best mean reward!
-----------------------------
| time/              |      |
|    fps             | 1885 |
|    iterations      | 2

0,1
eval/mean_reward,▁

0,1
eval/mean_reward,488.0


[34m[1mwandb[0m: Agent Starting Run: l5fce2w5 with config:
[34m[1mwandb[0m: 	clip_range: 0.15913465986628403
[34m[1mwandb[0m: 	ent_coef: 0.008312410652083501
[34m[1mwandb[0m: 	gae_lambda: 0.9308009965780228
[34m[1mwandb[0m: 	gamma: 0.9259713195649272
[34m[1mwandb[0m: 	learning_rate: 0.003006980532605324
[34m[1mwandb[0m: 	max_grad_norm: 8.199661090328787
[34m[1mwandb[0m: 	n_epochs: 6
[34m[1mwandb[0m: 	n_steps: 654
[34m[1mwandb[0m: 	total_timesteps: 94998
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


Using cpu device


We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=654 and n_envs=4)


Eval num_timesteps=2000, episode_reward=8.80 +/- 0.75
Episode length: 8.80 +/- 0.75
---------------------------------
| eval/              |          |
|    mean_ep_length  | 8.8      |
|    mean_reward     | 8.8      |
| time/              |          |
|    total_timesteps | 2000     |
---------------------------------




New best mean reward!
-----------------------------
| time/              |      |
|    fps             | 3180 |
|    iterations      | 1    |
|    time_elapsed    | 0    |
|    total_timesteps | 2616 |
-----------------------------
Eval num_timesteps=4000, episode_reward=103.00 +/- 14.44
Episode length: 103.00 +/- 14.44
-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 103         |
|    mean_reward          | 103         |
| time/                   |             |
|    total_timesteps      | 4000        |
| train/                  |             |
|    approx_kl            | 0.010460696 |
|    clip_fraction        | 0.229       |
|    clip_range           | 0.159       |
|    entropy_loss         | -0.686      |
|    explained_variance   | -0.0114     |
|    learning_rate        | 0.00301     |
|    loss                 | 0.279       |
|    n_updates            | 6           |
|    policy_gradient_loss | -0.0185     |
|    v

0,1
eval/mean_reward,▁

0,1
eval/mean_reward,500.0


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: ketp8uju with config:
[34m[1mwandb[0m: 	clip_range: 0.273765101423342
[34m[1mwandb[0m: 	ent_coef: 0.00948908942503587
[34m[1mwandb[0m: 	gae_lambda: 0.9577863453207472
[34m[1mwandb[0m: 	gamma: 0.92706750717221
[34m[1mwandb[0m: 	learning_rate: 0.0030432410046592
[34m[1mwandb[0m: 	max_grad_norm: 6.504317384180399
[34m[1mwandb[0m: 	n_epochs: 3
[34m[1mwandb[0m: 	n_steps: 695
[34m[1mwandb[0m: 	total_timesteps: 74874
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


Using cpu device


We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=695 and n_envs=4)


Eval num_timesteps=2000, episode_reward=308.80 +/- 98.99
Episode length: 308.80 +/- 98.99
---------------------------------
| eval/              |          |
|    mean_ep_length  | 309      |
|    mean_reward     | 309      |
| time/              |          |
|    total_timesteps | 2000     |
---------------------------------
New best mean reward!
-----------------------------
| time/              |      |
|    fps             | 1984 |
|    iterations      | 1    |
|    time_elapsed    | 1    |
|    total_timesteps | 2780 |
-----------------------------
Eval num_timesteps=4000, episode_reward=305.20 +/- 132.91
Episode length: 305.20 +/- 132.91
----------------------------------------
| eval/                   |            |
|    mean_ep_length       | 305        |
|    mean_reward          | 305        |
| time/                   |            |
|    total_timesteps      | 4000       |
| train/                  |            |
|    approx_kl            | 0.02632936 |
|    clip_fraction  

0,1
eval/mean_reward,▁

0,1
eval/mean_reward,350.6


[34m[1mwandb[0m: Agent Starting Run: 2fnqjy2r with config:
[34m[1mwandb[0m: 	clip_range: 0.18351907492643563
[34m[1mwandb[0m: 	ent_coef: 0.005240068358000306
[34m[1mwandb[0m: 	gae_lambda: 0.9145273563275678
[34m[1mwandb[0m: 	gamma: 0.9217249690439528
[34m[1mwandb[0m: 	learning_rate: 0.00789566652468114
[34m[1mwandb[0m: 	max_grad_norm: 1.35229940196693
[34m[1mwandb[0m: 	n_epochs: 2
[34m[1mwandb[0m: 	n_steps: 437
[34m[1mwandb[0m: 	total_timesteps: 13545
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


Using cpu device


We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=437 and n_envs=4)


-----------------------------
| time/              |      |
|    fps             | 2768 |
|    iterations      | 1    |
|    time_elapsed    | 0    |
|    total_timesteps | 1748 |
-----------------------------




Eval num_timesteps=2000, episode_reward=87.20 +/- 9.74
Episode length: 87.20 +/- 9.74
-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 87.2        |
|    mean_reward          | 87.2        |
| time/                   |             |
|    total_timesteps      | 2000        |
| train/                  |             |
|    approx_kl            | 0.011893997 |
|    clip_fraction        | 0.228       |
|    clip_range           | 0.184       |
|    entropy_loss         | -0.682      |
|    explained_variance   | 0.0356      |
|    learning_rate        | 0.0079      |
|    loss                 | 0.099       |
|    n_updates            | 2           |
|    policy_gradient_loss | -0.0188     |
|    value_loss           | 4.74        |
-----------------------------------------
New best mean reward!
-----------------------------
| time/              |      |
|    fps             | 1997 |
|    iterations      | 2    |
|    time_elapse

0,1
eval/mean_reward,▁

0,1
eval/mean_reward,234.5


[34m[1mwandb[0m: Agent Starting Run: ve275hep with config:
[34m[1mwandb[0m: 	clip_range: 0.12762246823316145
[34m[1mwandb[0m: 	ent_coef: 0.0022087445784009785
[34m[1mwandb[0m: 	gae_lambda: 0.965095559154621
[34m[1mwandb[0m: 	gamma: 0.9718520958571006
[34m[1mwandb[0m: 	learning_rate: 0.00657375422189462
[34m[1mwandb[0m: 	max_grad_norm: 5.906243614399937
[34m[1mwandb[0m: 	n_epochs: 8
[34m[1mwandb[0m: 	n_steps: 1302
[34m[1mwandb[0m: 	total_timesteps: 59091
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


Using cpu device


We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=1302 and n_envs=4)


Eval num_timesteps=2000, episode_reward=11.80 +/- 1.47
Episode length: 11.80 +/- 1.47
---------------------------------
| eval/              |          |
|    mean_ep_length  | 11.8     |
|    mean_reward     | 11.8     |
| time/              |          |
|    total_timesteps | 2000     |
---------------------------------




New best mean reward!
Eval num_timesteps=4000, episode_reward=11.60 +/- 1.36
Episode length: 11.60 +/- 1.36
---------------------------------
| eval/              |          |
|    mean_ep_length  | 11.6     |
|    mean_reward     | 11.6     |
| time/              |          |
|    total_timesteps | 4000     |
---------------------------------
-----------------------------
| time/              |      |
|    fps             | 3335 |
|    iterations      | 1    |
|    time_elapsed    | 1    |
|    total_timesteps | 5208 |
-----------------------------
Eval num_timesteps=6000, episode_reward=227.60 +/- 64.54
Episode length: 227.60 +/- 64.54
------------------------------------------
| eval/                   |              |
|    mean_ep_length       | 228          |
|    mean_reward          | 228          |
| time/                   |              |
|    total_timesteps      | 6000         |
| train/                  |              |
|    approx_kl            | 0.0069551235 |
|    clip_

0,1
eval/mean_reward,▁

0,1
eval/mean_reward,433.8


[34m[1mwandb[0m: Agent Starting Run: g9fxmo60 with config:
[34m[1mwandb[0m: 	clip_range: 0.11318092032959608
[34m[1mwandb[0m: 	ent_coef: 0.008298592588749622
[34m[1mwandb[0m: 	gae_lambda: 0.9351013122112336
[34m[1mwandb[0m: 	gamma: 0.925946884007824
[34m[1mwandb[0m: 	learning_rate: 0.002454851484432049
[34m[1mwandb[0m: 	max_grad_norm: 4.494691393776664
[34m[1mwandb[0m: 	n_epochs: 10
[34m[1mwandb[0m: 	n_steps: 897
[34m[1mwandb[0m: 	total_timesteps: 45031
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


Using cpu device


We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=897 and n_envs=4)


Eval num_timesteps=2000, episode_reward=9.60 +/- 0.49
Episode length: 9.60 +/- 0.49
---------------------------------
| eval/              |          |
|    mean_ep_length  | 9.6      |
|    mean_reward     | 9.6      |
| time/              |          |
|    total_timesteps | 2000     |
---------------------------------




New best mean reward!
-----------------------------
| time/              |      |
|    fps             | 2865 |
|    iterations      | 1    |
|    time_elapsed    | 1    |
|    total_timesteps | 3588 |
-----------------------------
Eval num_timesteps=4000, episode_reward=178.40 +/- 13.50
Episode length: 178.40 +/- 13.50
------------------------------------------
| eval/                   |              |
|    mean_ep_length       | 178          |
|    mean_reward          | 178          |
| time/                   |              |
|    total_timesteps      | 4000         |
| train/                  |              |
|    approx_kl            | 0.0052151084 |
|    clip_fraction        | 0.281        |
|    clip_range           | 0.113        |
|    entropy_loss         | -0.689       |
|    explained_variance   | -0.0845      |
|    learning_rate        | 0.00245      |
|    loss                 | 0.183        |
|    n_updates            | 10           |
|    policy_gradient_loss | -0.01

0,1
eval/mean_reward,▁

0,1
eval/mean_reward,500.0


[34m[1mwandb[0m: Agent Starting Run: aptx9k77 with config:
[34m[1mwandb[0m: 	clip_range: 0.18942248420702665
[34m[1mwandb[0m: 	ent_coef: 0.007734372299321099
[34m[1mwandb[0m: 	gae_lambda: 0.9985021406700644
[34m[1mwandb[0m: 	gamma: 0.983130716988123
[34m[1mwandb[0m: 	learning_rate: 0.006468469170415138
[34m[1mwandb[0m: 	max_grad_norm: 7.488949179445153
[34m[1mwandb[0m: 	n_epochs: 10
[34m[1mwandb[0m: 	n_steps: 941
[34m[1mwandb[0m: 	total_timesteps: 65228
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


Using cpu device


We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=941 and n_envs=4)


Eval num_timesteps=2000, episode_reward=9.60 +/- 0.49
Episode length: 9.60 +/- 0.49
---------------------------------
| eval/              |          |
|    mean_ep_length  | 9.6      |
|    mean_reward     | 9.6      |
| time/              |          |
|    total_timesteps | 2000     |
---------------------------------




New best mean reward!
-----------------------------
| time/              |      |
|    fps             | 3385 |
|    iterations      | 1    |
|    time_elapsed    | 1    |
|    total_timesteps | 3764 |
-----------------------------
Eval num_timesteps=4000, episode_reward=294.60 +/- 123.28
Episode length: 294.60 +/- 123.28
-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 295         |
|    mean_reward          | 295         |
| time/                   |             |
|    total_timesteps      | 4000        |
| train/                  |             |
|    approx_kl            | 0.015596774 |
|    clip_fraction        | 0.332       |
|    clip_range           | 0.189       |
|    entropy_loss         | -0.679      |
|    explained_variance   | 3.1e-05     |
|    learning_rate        | 0.00647     |
|    loss                 | 15.2        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0256     |
|   

0,1
eval/mean_reward,▁

0,1
eval/mean_reward,500.0


[34m[1mwandb[0m: Agent Starting Run: jyopb7jo with config:
[34m[1mwandb[0m: 	clip_range: 0.18257293913711908
[34m[1mwandb[0m: 	ent_coef: 0.004567995142655914
[34m[1mwandb[0m: 	gae_lambda: 0.9234485169818264
[34m[1mwandb[0m: 	gamma: 0.9269724036588955
[34m[1mwandb[0m: 	learning_rate: 0.009243973278178472
[34m[1mwandb[0m: 	max_grad_norm: 4.043145996792058
[34m[1mwandb[0m: 	n_epochs: 8
[34m[1mwandb[0m: 	n_steps: 1078
[34m[1mwandb[0m: 	total_timesteps: 13981
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


Using cpu device


We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=1078 and n_envs=4)


Eval num_timesteps=2000, episode_reward=10.00 +/- 0.89
Episode length: 10.00 +/- 0.89
---------------------------------
| eval/              |          |
|    mean_ep_length  | 10       |
|    mean_reward     | 10       |
| time/              |          |
|    total_timesteps | 2000     |
---------------------------------




New best mean reward!
Eval num_timesteps=4000, episode_reward=9.60 +/- 0.49
Episode length: 9.60 +/- 0.49
---------------------------------
| eval/              |          |
|    mean_ep_length  | 9.6      |
|    mean_reward     | 9.6      |
| time/              |          |
|    total_timesteps | 4000     |
---------------------------------
-----------------------------
| time/              |      |
|    fps             | 3353 |
|    iterations      | 1    |
|    time_elapsed    | 1    |
|    total_timesteps | 4312 |
-----------------------------
Eval num_timesteps=6000, episode_reward=148.00 +/- 46.54
Episode length: 148.00 +/- 46.54
------------------------------------------
| eval/                   |              |
|    mean_ep_length       | 148          |
|    mean_reward          | 148          |
| time/                   |              |
|    total_timesteps      | 6000         |
| train/                  |              |
|    approx_kl            | 0.0134493485 |
|    clip_fr

0,1
eval/mean_reward,▁

0,1
eval/mean_reward,427.9


[34m[1mwandb[0m: Agent Starting Run: abqbvt0e with config:
[34m[1mwandb[0m: 	clip_range: 0.2629417284083698
[34m[1mwandb[0m: 	ent_coef: 0.0080704071807796
[34m[1mwandb[0m: 	gae_lambda: 0.9807192781563856
[34m[1mwandb[0m: 	gamma: 0.9665813556395996
[34m[1mwandb[0m: 	learning_rate: 0.00923101118782979
[34m[1mwandb[0m: 	max_grad_norm: 5.4812598804832815
[34m[1mwandb[0m: 	n_epochs: 6
[34m[1mwandb[0m: 	n_steps: 542
[34m[1mwandb[0m: 	total_timesteps: 81263
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


Using cpu device


We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=542 and n_envs=4)


Eval num_timesteps=2000, episode_reward=44.40 +/- 15.24
Episode length: 44.40 +/- 15.24
---------------------------------
| eval/              |          |
|    mean_ep_length  | 44.4     |
|    mean_reward     | 44.4     |
| time/              |          |
|    total_timesteps | 2000     |
---------------------------------
New best mean reward!
-----------------------------
| time/              |      |
|    fps             | 3080 |
|    iterations      | 1    |
|    time_elapsed    | 0    |
|    total_timesteps | 2168 |
-----------------------------




Eval num_timesteps=4000, episode_reward=111.60 +/- 21.15
Episode length: 111.60 +/- 21.15
-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 112         |
|    mean_reward          | 112         |
| time/                   |             |
|    total_timesteps      | 4000        |
| train/                  |             |
|    approx_kl            | 0.030720886 |
|    clip_fraction        | 0.338       |
|    clip_range           | 0.263       |
|    entropy_loss         | -0.666      |
|    explained_variance   | -0.00859    |
|    learning_rate        | 0.00923     |
|    loss                 | 3.16        |
|    n_updates            | 6           |
|    policy_gradient_loss | -0.0399     |
|    value_loss           | 13.7        |
-----------------------------------------
New best mean reward!
-----------------------------
| time/              |      |
|    fps             | 2088 |
|    iterations      | 2    |
|    time_el

0,1
eval/mean_reward,▁

0,1
eval/mean_reward,151.8


[34m[1mwandb[0m: Agent Starting Run: u69qp0wd with config:
[34m[1mwandb[0m: 	clip_range: 0.26596196603759936
[34m[1mwandb[0m: 	ent_coef: 0.004627668821370098
[34m[1mwandb[0m: 	gae_lambda: 0.9123813606025336
[34m[1mwandb[0m: 	gamma: 0.988776311014806
[34m[1mwandb[0m: 	learning_rate: 0.00961163052151704
[34m[1mwandb[0m: 	max_grad_norm: 6.731477170539769
[34m[1mwandb[0m: 	n_epochs: 2
[34m[1mwandb[0m: 	n_steps: 388
[34m[1mwandb[0m: 	total_timesteps: 18049
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


Using cpu device


We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=388 and n_envs=4)


-----------------------------
| time/              |      |
|    fps             | 3731 |
|    iterations      | 1    |
|    time_elapsed    | 0    |
|    total_timesteps | 1552 |
-----------------------------




Eval num_timesteps=2000, episode_reward=264.00 +/- 120.79
Episode length: 264.00 +/- 120.79
-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 264         |
|    mean_reward          | 264         |
| time/                   |             |
|    total_timesteps      | 2000        |
| train/                  |             |
|    approx_kl            | 0.023725554 |
|    clip_fraction        | 0.229       |
|    clip_range           | 0.266       |
|    entropy_loss         | -0.67       |
|    explained_variance   | 0.00217     |
|    learning_rate        | 0.00961     |
|    loss                 | 2.39        |
|    n_updates            | 2           |
|    policy_gradient_loss | -0.0274     |
|    value_loss           | 12.2        |
-----------------------------------------
New best mean reward!
-----------------------------
| time/              |      |
|    fps             | 2152 |
|    iterations      | 2    |
|    time_

0,1
eval/mean_reward,▁

0,1
eval/mean_reward,500.0


[34m[1mwandb[0m: Agent Starting Run: vzmg1gup with config:
[34m[1mwandb[0m: 	clip_range: 0.10153188144231572
[34m[1mwandb[0m: 	ent_coef: 0.007510599381233399
[34m[1mwandb[0m: 	gae_lambda: 0.9761938505946344
[34m[1mwandb[0m: 	gamma: 0.9678233537586322
[34m[1mwandb[0m: 	learning_rate: 0.009191032051214965
[34m[1mwandb[0m: 	max_grad_norm: 4.123543122282363
[34m[1mwandb[0m: 	n_epochs: 8
[34m[1mwandb[0m: 	n_steps: 313
[34m[1mwandb[0m: 	total_timesteps: 78798
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


Using cpu device


We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=313 and n_envs=4)


-----------------------------
| time/              |      |
|    fps             | 2610 |
|    iterations      | 1    |
|    time_elapsed    | 0    |
|    total_timesteps | 1252 |
-----------------------------




Eval num_timesteps=2000, episode_reward=150.60 +/- 34.27
Episode length: 150.60 +/- 34.27
------------------------------------------
| eval/                   |              |
|    mean_ep_length       | 151          |
|    mean_reward          | 151          |
| time/                   |              |
|    total_timesteps      | 2000         |
| train/                  |              |
|    approx_kl            | 0.0042414083 |
|    clip_fraction        | 0.296        |
|    clip_range           | 0.102        |
|    entropy_loss         | -0.69        |
|    explained_variance   | 0.00478      |
|    learning_rate        | 0.00919      |
|    loss                 | 3.33         |
|    n_updates            | 8            |
|    policy_gradient_loss | -0.0134      |
|    value_loss           | 16.3         |
------------------------------------------
New best mean reward!
-----------------------------
| time/              |      |
|    fps             | 1487 |
|    iterations      | 2

0,1
eval/mean_reward,▁

0,1
eval/mean_reward,500.0


[34m[1mwandb[0m: Agent Starting Run: 6ovf68u4 with config:
[34m[1mwandb[0m: 	clip_range: 0.24883790651029108
[34m[1mwandb[0m: 	ent_coef: 0.004494595353133154
[34m[1mwandb[0m: 	gae_lambda: 0.9999088872395436
[34m[1mwandb[0m: 	gamma: 0.9178702401180036
[34m[1mwandb[0m: 	learning_rate: 0.008270560254136024
[34m[1mwandb[0m: 	max_grad_norm: 9.786493599771026
[34m[1mwandb[0m: 	n_epochs: 10
[34m[1mwandb[0m: 	n_steps: 1287
[34m[1mwandb[0m: 	total_timesteps: 15858
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


Using cpu device


We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=1287 and n_envs=4)


Eval num_timesteps=2000, episode_reward=10.20 +/- 0.40
Episode length: 10.20 +/- 0.40
---------------------------------
| eval/              |          |
|    mean_ep_length  | 10.2     |
|    mean_reward     | 10.2     |
| time/              |          |
|    total_timesteps | 2000     |
---------------------------------




New best mean reward!
Eval num_timesteps=4000, episode_reward=10.00 +/- 0.00
Episode length: 10.00 +/- 0.00
---------------------------------
| eval/              |          |
|    mean_ep_length  | 10       |
|    mean_reward     | 10       |
| time/              |          |
|    total_timesteps | 4000     |
---------------------------------
-----------------------------
| time/              |      |
|    fps             | 3317 |
|    iterations      | 1    |
|    time_elapsed    | 1    |
|    total_timesteps | 5148 |
-----------------------------
Eval num_timesteps=6000, episode_reward=327.00 +/- 108.17
Episode length: 327.00 +/- 108.17
-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 327         |
|    mean_reward          | 327         |
| time/                   |             |
|    total_timesteps      | 6000        |
| train/                  |             |
|    approx_kl            | 0.026757145 |
|    clip_fracti

0,1
eval/mean_reward,▁

0,1
eval/mean_reward,500.0


[34m[1mwandb[0m: Agent Starting Run: 9lvhih7g with config:
[34m[1mwandb[0m: 	clip_range: 0.2339620347449421
[34m[1mwandb[0m: 	ent_coef: 0.007951593204867432
[34m[1mwandb[0m: 	gae_lambda: 0.96011773201551
[34m[1mwandb[0m: 	gamma: 0.9301124364054616
[34m[1mwandb[0m: 	learning_rate: 0.0023176078361614753
[34m[1mwandb[0m: 	max_grad_norm: 7.335907288409155
[34m[1mwandb[0m: 	n_epochs: 4
[34m[1mwandb[0m: 	n_steps: 774
[34m[1mwandb[0m: 	total_timesteps: 21308
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


Using cpu device


We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=774 and n_envs=4)


Eval num_timesteps=2000, episode_reward=8.60 +/- 0.49
Episode length: 8.60 +/- 0.49
---------------------------------
| eval/              |          |
|    mean_ep_length  | 8.6      |
|    mean_reward     | 8.6      |
| time/              |          |
|    total_timesteps | 2000     |
---------------------------------




New best mean reward!
-----------------------------
| time/              |      |
|    fps             | 3534 |
|    iterations      | 1    |
|    time_elapsed    | 0    |
|    total_timesteps | 3096 |
-----------------------------
Eval num_timesteps=4000, episode_reward=143.40 +/- 44.71
Episode length: 143.40 +/- 44.71
-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 143         |
|    mean_reward          | 143         |
| time/                   |             |
|    total_timesteps      | 4000        |
| train/                  |             |
|    approx_kl            | 0.020733824 |
|    clip_fraction        | 0.201       |
|    clip_range           | 0.234       |
|    entropy_loss         | -0.677      |
|    explained_variance   | -0.0104     |
|    learning_rate        | 0.00232     |
|    loss                 | 1.38        |
|    n_updates            | 4           |
|    policy_gradient_loss | -0.0254     |
|    v

0,1
eval/mean_reward,▁

0,1
eval/mean_reward,494.5


[34m[1mwandb[0m: Agent Starting Run: 6cmd6ony with config:
[34m[1mwandb[0m: 	clip_range: 0.27991576072267776
[34m[1mwandb[0m: 	ent_coef: 0.0078585395978037
[34m[1mwandb[0m: 	gae_lambda: 0.9485517953171548
[34m[1mwandb[0m: 	gamma: 0.9601266208563708
[34m[1mwandb[0m: 	learning_rate: 0.009867358841191297
[34m[1mwandb[0m: 	max_grad_norm: 7.429725433343909
[34m[1mwandb[0m: 	n_epochs: 2
[34m[1mwandb[0m: 	n_steps: 1261
[34m[1mwandb[0m: 	total_timesteps: 98898
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


Using cpu device


We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=1261 and n_envs=4)


Eval num_timesteps=2000, episode_reward=8.80 +/- 0.98
Episode length: 8.80 +/- 0.98
---------------------------------
| eval/              |          |
|    mean_ep_length  | 8.8      |
|    mean_reward     | 8.8      |
| time/              |          |
|    total_timesteps | 2000     |
---------------------------------




New best mean reward!
Eval num_timesteps=4000, episode_reward=9.40 +/- 0.49
Episode length: 9.40 +/- 0.49
---------------------------------
| eval/              |          |
|    mean_ep_length  | 9.4      |
|    mean_reward     | 9.4      |
| time/              |          |
|    total_timesteps | 4000     |
---------------------------------
New best mean reward!
-----------------------------
| time/              |      |
|    fps             | 3447 |
|    iterations      | 1    |
|    time_elapsed    | 1    |
|    total_timesteps | 5044 |
-----------------------------
Eval num_timesteps=6000, episode_reward=82.80 +/- 14.62
Episode length: 82.80 +/- 14.62
----------------------------------------
| eval/                   |            |
|    mean_ep_length       | 82.8       |
|    mean_reward          | 82.8       |
| time/                   |            |
|    total_timesteps      | 6000       |
| train/                  |            |
|    approx_kl            | 0.03305888 |
|    cli

0,1
eval/mean_reward,▁

0,1
eval/mean_reward,500.0


[34m[1mwandb[0m: Agent Starting Run: olbe60x1 with config:
[34m[1mwandb[0m: 	clip_range: 0.20951198176323985
[34m[1mwandb[0m: 	ent_coef: 0.00041650238399878374
[34m[1mwandb[0m: 	gae_lambda: 0.9000154154766176
[34m[1mwandb[0m: 	gamma: 0.9631464580727594
[34m[1mwandb[0m: 	learning_rate: 0.001321431661525495
[34m[1mwandb[0m: 	max_grad_norm: 2.8364356798447186
[34m[1mwandb[0m: 	n_epochs: 9
[34m[1mwandb[0m: 	n_steps: 902
[34m[1mwandb[0m: 	total_timesteps: 96329
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


Using cpu device


We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=902 and n_envs=4)


Eval num_timesteps=2000, episode_reward=35.20 +/- 11.20
Episode length: 35.20 +/- 11.20
---------------------------------
| eval/              |          |
|    mean_ep_length  | 35.2     |
|    mean_reward     | 35.2     |
| time/              |          |
|    total_timesteps | 2000     |
---------------------------------




New best mean reward!
-----------------------------
| time/              |      |
|    fps             | 3278 |
|    iterations      | 1    |
|    time_elapsed    | 1    |
|    total_timesteps | 3608 |
-----------------------------
Eval num_timesteps=4000, episode_reward=372.20 +/- 105.32
Episode length: 372.20 +/- 105.32
-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 372         |
|    mean_reward          | 372         |
| time/                   |             |
|    total_timesteps      | 4000        |
| train/                  |             |
|    approx_kl            | 0.015421468 |
|    clip_fraction        | 0.207       |
|    clip_range           | 0.21        |
|    entropy_loss         | -0.68       |
|    explained_variance   | -0.00165    |
|    learning_rate        | 0.00132     |
|    loss                 | 0.206       |
|    n_updates            | 9           |
|    policy_gradient_loss | -0.0217     |
|   

0,1
eval/mean_reward,▁

0,1
eval/mean_reward,489.0


[34m[1mwandb[0m: Agent Starting Run: ilaj49sp with config:
[34m[1mwandb[0m: 	clip_range: 0.2894012100094645
[34m[1mwandb[0m: 	ent_coef: 0.005047333190850075
[34m[1mwandb[0m: 	gae_lambda: 0.947050244754861
[34m[1mwandb[0m: 	gamma: 0.9236209779100624
[34m[1mwandb[0m: 	learning_rate: 0.00786776114966432
[34m[1mwandb[0m: 	max_grad_norm: 1.945041517355801
[34m[1mwandb[0m: 	n_epochs: 10
[34m[1mwandb[0m: 	n_steps: 345
[34m[1mwandb[0m: 	total_timesteps: 49451
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


Using cpu device


We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=345 and n_envs=4)


-----------------------------
| time/              |      |
|    fps             | 3301 |
|    iterations      | 1    |
|    time_elapsed    | 0    |
|    total_timesteps | 1380 |
-----------------------------




Eval num_timesteps=2000, episode_reward=340.00 +/- 111.86
Episode length: 340.00 +/- 111.86
-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 340         |
|    mean_reward          | 340         |
| time/                   |             |
|    total_timesteps      | 2000        |
| train/                  |             |
|    approx_kl            | 0.034938835 |
|    clip_fraction        | 0.234       |
|    clip_range           | 0.289       |
|    entropy_loss         | -0.665      |
|    explained_variance   | 0.013       |
|    learning_rate        | 0.00787     |
|    loss                 | 0.573       |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0316     |
|    value_loss           | 2.89        |
-----------------------------------------
New best mean reward!
-----------------------------
| time/              |      |
|    fps             | 1316 |
|    iterations      | 2    |
|    time_

0,1
eval/mean_reward,▁

0,1
eval/mean_reward,240.6
