In [1]:
import gymnasium as gym
import psutil
import gc
from stable_baselines3.sac.policies import MlpPolicy
from stable_baselines3 import SAC

def monitor_memory():
    process = psutil.Process()
    return process.memory_info().rss / (1024 ** 2)  # Memory usage in MB

env = gym.make("CarRacing-v2", render_mode="human")

model = SAC(
    MlpPolicy, env, verbose=1, buffer_size=10000, learning_starts=1000,
    batch_size=64, learning_rate=0.0003, gamma=0.99, tau=0.005,
    ent_coef="auto", target_update_interval=1, gradient_steps=1,
    action_noise=None, optimize_memory_usage=False, policy_kwargs=None,
    device='auto', _init_setup_model=True
)

# Perform learning with memory monitoring
total_timesteps = 2000
log_interval = 25

for step in range(0, total_timesteps, log_interval):
    model.learn(total_timesteps=log_interval, reset_num_timesteps=False)
    mem_usage = monitor_memory()
    print(f"Step {step + log_interval}: Memory usage: {mem_usage} MB")
    
    # Force garbage collection to manage memory
    gc.collect()

model.save("sac_car_racing")


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Wrapping the env in a VecTransposeImage.
Step 25: Memory usage: 465.25390625 MB
Step 50: Memory usage: 466.5625 MB
Step 75: Memory usage: 467.90234375 MB
Step 100: Memory usage: 469.26171875 MB
Step 125: Memory usage: 470.57421875 MB
Step 150: Memory usage: 471.89453125 MB
Step 175: Memory usage: 473.2109375 MB
Step 200: Memory usage: 474.5234375 MB
Step 225: Memory usage: 475.8359375 MB
Step 250: Memory usage: 477.2109375 MB
Step 275: Memory usage: 478.47265625 MB
Step 300: Memory usage: 479.84765625 MB
Step 325: Memory usage: 481.09765625 MB
Step 350: Memory usage: 482.57421875 MB
Step 375: Memory usage: 483.89453125 MB
Step 400: Memory usage: 485.21484375 MB
Step 425: Memory usage: 486.52734375 MB
Step 450: Memory usage: 487.83984375 MB
Step 475: Memory usage: 489.171875 MB
Step 500: Memory usage: 490.48828125 MB
Step 525: Memory usage: 491.86328125 MB
Step 550: Memory usage: 493.11328125 M

KeyboardInterrupt: 

In [None]:
env = gym.make("CarRacing-v2", render_mode="human")

model = SAC.load("sac_car_racing")

obs, info = env.reset()
while True:
    action, _states = model.predict(obs, deterministic=True)
    obs, reward, terminated, truncated, info = env.step(action)
    if terminated or truncated:
        obs, info = env.reset()



KeyboardInterrupt: 

In [3]:
LR = 0.0003
GAMMA = 0.99
TAU = 0.005
BATCH_SIZE = 128

In [7]:
import gymnasium as gym
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv

# Create the CarRacing-v2 environment
env = gym.make('CarRacing-v2')

# Wrap the environment
env = DummyVecEnv([lambda: env])

# Create the PPO model
model = PPO('CnnPolicy', env, verbose=1, tensorboard_log="./ppo_car_racing_tensorboard/")

# Train the model
model.learn(total_timesteps=2000, log_interval=4)

# Save the model
model.save("ppo_car_racing_v2")

# To use the model later, load it with:
# model = PPO.load("ppo_car_racing_v2")


Using cpu device
Wrapping the env in a VecTransposeImage.
Logging to ./ppo_car_racing_tensorboard/PPO_2


In [3]:
LR = 0.0003
GAMMA = 0.99
TAU = 0.005

In [2]:
import gymnasium as gym
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.callbacks import BaseCallback
from stable_baselines3.common.monitor import Monitor
import numpy as np


# Create the CarRacing-v2 environment
env = gym.make('CarRacing-v2')
# env = Monitor(env)  # To log episode rewards

# Wrap the environment
# env = DummyVecEnv([lambda: env])

# Create the PPO model
model = PPO('CnnPolicy', env, verbose=1, tensorboard_log="./ppo_car_racing_tensorboard/", n_steps=256, learning_rate=LR, gamma=GAMMA, tau=TAU)

# Train the model with the custom callback
model.learn(total_timesteps=100_000, tb_log_name="Experiment_1", progress_bar=True, log_interval=1)

# Save the model
model.save("ppo_car_racing_v2")

# To use the model later, load it with:
# model = PPO.load("ppo_car_racing_v2")


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Wrapping the env in a VecTransposeImage.
Logging to ./ppo_car_racing_tensorboard/Test_1_7


Output()

----------------------------
| time/              |     |
|    fps             | 37  |
|    iterations      | 1   |
|    time_elapsed    | 3   |
|    total_timesteps | 128 |
----------------------------


----------------------------------------
| time/                   |            |
|    fps                  | 34         |
|    iterations           | 2          |
|    time_elapsed         | 7          |
|    total_timesteps      | 256        |
| train/                  |            |
|    approx_kl            | 0.01339897 |
|    clip_fraction        | 0.102      |
|    clip_range           | 0.2        |
|    entropy_loss         | -4.25      |
|    explained_variance   | -0.00327   |
|    learning_rate        | 0.0003     |
|    loss                 | 0.374      |
|    n_updates            | 10         |
|    policy_gradient_loss | -0.0121    |
|    std                  | 0.998      |
|    value_loss           | 1.25       |
----------------------------------------


-----------------------------------------
| time/                   |             |
|    fps                  | 35          |
|    iterations           | 3           |
|    time_elapsed         | 10          |
|    total_timesteps      | 384         |
| train/                  |             |
|    approx_kl            | 0.011698784 |
|    clip_fraction        | 0.0859      |
|    clip_range           | 0.2         |
|    entropy_loss         | -4.25       |
|    explained_variance   | -0.0205     |
|    learning_rate        | 0.0003      |
|    loss                 | 0.273       |
|    n_updates            | 20          |
|    policy_gradient_loss | -0.00726    |
|    std                  | 0.996       |
|    value_loss           | 0.721       |
-----------------------------------------


-----------------------------------------
| time/                   |             |
|    fps                  | 35          |
|    iterations           | 4           |
|    time_elapsed         | 14          |
|    total_timesteps      | 512         |
| train/                  |             |
|    approx_kl            | 0.018160833 |
|    clip_fraction        | 0.138       |
|    clip_range           | 0.2         |
|    entropy_loss         | -4.24       |
|    explained_variance   | -0.0384     |
|    learning_rate        | 0.0003      |
|    loss                 | 0.399       |
|    n_updates            | 30          |
|    policy_gradient_loss | -0.011      |
|    std                  | 0.995       |
|    value_loss           | 1.08        |
-----------------------------------------


-----------------------------------------
| time/                   |             |
|    fps                  | 33          |
|    iterations           | 5           |
|    time_elapsed         | 19          |
|    total_timesteps      | 640         |
| train/                  |             |
|    approx_kl            | 0.015161884 |
|    clip_fraction        | 0.0695      |
|    clip_range           | 0.2         |
|    entropy_loss         | -4.24       |
|    explained_variance   | 0.00694     |
|    learning_rate        | 0.0003      |
|    loss                 | 0.486       |
|    n_updates            | 40          |
|    policy_gradient_loss | -0.00839    |
|    std                  | 0.995       |
|    value_loss           | 1.15        |
-----------------------------------------


-----------------------------------------
| time/                   |             |
|    fps                  | 33          |
|    iterations           | 6           |
|    time_elapsed         | 22          |
|    total_timesteps      | 768         |
| train/                  |             |
|    approx_kl            | 0.005453637 |
|    clip_fraction        | 0.0453      |
|    clip_range           | 0.2         |
|    entropy_loss         | -4.24       |
|    explained_variance   | 0.0485      |
|    learning_rate        | 0.0003      |
|    loss                 | 0.244       |
|    n_updates            | 50          |
|    policy_gradient_loss | 0.00135     |
|    std                  | 0.995       |
|    value_loss           | 0.724       |
-----------------------------------------


----------------------------------------
| time/                   |            |
|    fps                  | 33         |
|    iterations           | 7          |
|    time_elapsed         | 26         |
|    total_timesteps      | 896        |
| train/                  |            |
|    approx_kl            | 0.01607234 |
|    clip_fraction        | 0.102      |
|    clip_range           | 0.2        |
|    entropy_loss         | -4.24      |
|    explained_variance   | 0.00292    |
|    learning_rate        | 0.0003     |
|    loss                 | 0.28       |
|    n_updates            | 60         |
|    policy_gradient_loss | -0.00301   |
|    std                  | 0.995      |
|    value_loss           | 0.79       |
----------------------------------------


------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 1e+03        |
|    ep_rew_mean          | -47          |
| time/                   |              |
|    fps                  | 33           |
|    iterations           | 8            |
|    time_elapsed         | 30           |
|    total_timesteps      | 1024         |
| train/                  |              |
|    approx_kl            | 0.0038284832 |
|    clip_fraction        | 0.106        |
|    clip_range           | 0.2          |
|    entropy_loss         | -4.24        |
|    explained_variance   | 0.0535       |
|    learning_rate        | 0.0003       |
|    loss                 | 0.144        |
|    n_updates            | 70           |
|    policy_gradient_loss | -0.00497     |
|    std                  | 0.994        |
|    value_loss           | 0.614        |
------------------------------------------


-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 1e+03       |
|    ep_rew_mean          | -47         |
| time/                   |             |
|    fps                  | 33          |
|    iterations           | 9           |
|    time_elapsed         | 34          |
|    total_timesteps      | 1152        |
| train/                  |             |
|    approx_kl            | 0.013005823 |
|    clip_fraction        | 0.11        |
|    clip_range           | 0.2         |
|    entropy_loss         | -4.24       |
|    explained_variance   | 0.36        |
|    learning_rate        | 0.0003      |
|    loss                 | 0.11        |
|    n_updates            | 80          |
|    policy_gradient_loss | -0.0104     |
|    std                  | 0.993       |
|    value_loss           | 0.52        |
-----------------------------------------


-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 1e+03       |
|    ep_rew_mean          | -47         |
| time/                   |             |
|    fps                  | 32          |
|    iterations           | 10          |
|    time_elapsed         | 38          |
|    total_timesteps      | 1280        |
| train/                  |             |
|    approx_kl            | 0.012856516 |
|    clip_fraction        | 0.148       |
|    clip_range           | 0.2         |
|    entropy_loss         | -4.23       |
|    explained_variance   | -0.363      |
|    learning_rate        | 0.0003      |
|    loss                 | 0.241       |
|    n_updates            | 90          |
|    policy_gradient_loss | -0.0064     |
|    std                  | 0.991       |
|    value_loss           | 0.719       |
-----------------------------------------


-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 1e+03       |
|    ep_rew_mean          | -47         |
| time/                   |             |
|    fps                  | 32          |
|    iterations           | 11          |
|    time_elapsed         | 42          |
|    total_timesteps      | 1408        |
| train/                  |             |
|    approx_kl            | 0.011434725 |
|    clip_fraction        | 0.174       |
|    clip_range           | 0.2         |
|    entropy_loss         | -4.23       |
|    explained_variance   | -0.00565    |
|    learning_rate        | 0.0003      |
|    loss                 | 0.278       |
|    n_updates            | 100         |
|    policy_gradient_loss | -0.00543    |
|    std                  | 0.991       |
|    value_loss           | 0.855       |
-----------------------------------------


-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 1e+03       |
|    ep_rew_mean          | -47         |
| time/                   |             |
|    fps                  | 33          |
|    iterations           | 12          |
|    time_elapsed         | 46          |
|    total_timesteps      | 1536        |
| train/                  |             |
|    approx_kl            | 0.011789855 |
|    clip_fraction        | 0.184       |
|    clip_range           | 0.2         |
|    entropy_loss         | -4.23       |
|    explained_variance   | 0.114       |
|    learning_rate        | 0.0003      |
|    loss                 | 0.0864      |
|    n_updates            | 110         |
|    policy_gradient_loss | -0.00945    |
|    std                  | 0.991       |
|    value_loss           | 0.544       |
-----------------------------------------


-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 1e+03       |
|    ep_rew_mean          | -47         |
| time/                   |             |
|    fps                  | 32          |
|    iterations           | 13          |
|    time_elapsed         | 50          |
|    total_timesteps      | 1664        |
| train/                  |             |
|    approx_kl            | 0.026126966 |
|    clip_fraction        | 0.387       |
|    clip_range           | 0.2         |
|    entropy_loss         | -4.23       |
|    explained_variance   | -1.74       |
|    learning_rate        | 0.0003      |
|    loss                 | 0.0561      |
|    n_updates            | 120         |
|    policy_gradient_loss | -0.0239     |
|    std                  | 0.989       |
|    value_loss           | 0.561       |
-----------------------------------------


-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 1e+03       |
|    ep_rew_mean          | -47         |
| time/                   |             |
|    fps                  | 32          |
|    iterations           | 14          |
|    time_elapsed         | 55          |
|    total_timesteps      | 1792        |
| train/                  |             |
|    approx_kl            | 0.017537437 |
|    clip_fraction        | 0.134       |
|    clip_range           | 0.2         |
|    entropy_loss         | -4.22       |
|    explained_variance   | 0.00872     |
|    learning_rate        | 0.0003      |
|    loss                 | 0.00723     |
|    n_updates            | 130         |
|    policy_gradient_loss | -0.00266    |
|    std                  | 0.988       |
|    value_loss           | 0.198       |
-----------------------------------------


-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 1e+03       |
|    ep_rew_mean          | -47         |
| time/                   |             |
|    fps                  | 32          |
|    iterations           | 15          |
|    time_elapsed         | 59          |
|    total_timesteps      | 1920        |
| train/                  |             |
|    approx_kl            | 0.001999001 |
|    clip_fraction        | 0.0648      |
|    clip_range           | 0.2         |
|    entropy_loss         | -4.22       |
|    explained_variance   | -0.758      |
|    learning_rate        | 0.0003      |
|    loss                 | 0.0126      |
|    n_updates            | 140         |
|    policy_gradient_loss | -0.000732   |
|    std                  | 0.988       |
|    value_loss           | 0.204       |
-----------------------------------------


-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 1e+03       |
|    ep_rew_mean          | -59.3       |
| time/                   |             |
|    fps                  | 32          |
|    iterations           | 16          |
|    time_elapsed         | 63          |
|    total_timesteps      | 2048        |
| train/                  |             |
|    approx_kl            | 0.014983116 |
|    clip_fraction        | 0.129       |
|    clip_range           | 0.2         |
|    entropy_loss         | -4.22       |
|    explained_variance   | -0.515      |
|    learning_rate        | 0.0003      |
|    loss                 | 0.00689     |
|    n_updates            | 150         |
|    policy_gradient_loss | -0.00459    |
|    std                  | 0.988       |
|    value_loss           | 0.105       |
-----------------------------------------


In [13]:
from stable_baselines3 import A2C

model = A2C("MlpPolicy", "CartPole-v1", verbose=1, tensorboard_log="./ppo_car_racing_tensorboard/")
model.learn(total_timesteps=10_000)

Using cpu device
Creating environment from the given name 'CartPole-v1'
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Logging to ./ppo_car_racing_tensorboard/A2C_1
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 26.9     |
|    ep_rew_mean        | 26.9     |
| time/                 |          |
|    fps                | 553      |
|    iterations         | 100      |
|    time_elapsed       | 0        |
|    total_timesteps    | 500      |
| train/                |          |
|    entropy_loss       | -0.691   |
|    explained_variance | 0.056    |
|    learning_rate      | 0.0007   |
|    n_updates          | 99       |
|    policy_loss        | 1.78     |
|    value_loss         | 9.03     |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 29.8     |
|    ep_rew_mean        | 29.8     |
| time/                 |   

<stable_baselines3.a2c.a2c.A2C at 0x238141cdc10>