In [None]:
import gymnasium as gym
import highway_env
import numpy as np
from stable_baselines3 import TD3
from stable_baselines3.common.noise import NormalActionNoise

# Build the highway environment
env = gym.make("highway-v1")
# Request the continuous action type on the underlying (unwrapped) environment
env.unwrapped.configure(dict(action=dict(type="ContinuousAction")))
# Reset once after reconfiguring; note that nothing is wrapped in this cell
env.reset()

In [6]:
# Gaussian exploration noise: zero mean, 0.2 standard deviation for every
# component of the action vector
n_actions = env.action_space.shape[-1]
action_noise = NormalActionNoise(
    mean=np.zeros(n_actions),
    sigma=np.full(n_actions, 0.2),
)

In [None]:
# Create the TD3 model with Gaussian action noise.
# NOTE: no HER replay buffer is configured here -- `replay_buffer_class` is not
# passed, so the library's default replay buffer is used.

# Fixed seed so that repeated runs of this cell are comparable.
# NOTE(review): bit-for-bit determinism may still depend on the platform and
# PyTorch version -- confirm if exact reproducibility matters.
SEED = 42

model = TD3(
    "MlpPolicy",
    env,
    policy_kwargs=dict(net_arch=[64, 64]),  # two hidden layers of 64 units
    action_noise=action_noise,
    learning_rate=0.001,
    buffer_size=100000,
    learning_starts=5000,
    batch_size=64,
    tau=0.01,
    gamma=0.95,
    train_freq=(1, "step"),
    gradient_steps=1,
    seed=SEED,
    verbose=1,
    tensorboard_log="TD3_tensorboard",
)

# Train the model; TensorBoard runs are written under "TD3_tensorboard"
# using the run name "td3_1".
model.learn(total_timesteps=20000, tb_log_name="td3_1")

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Logging to TD3_tensorboard\td3_1_1
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1.75     |
|    ep_rew_mean     | 0.229    |
| time/              |          |
|    episodes        | 4        |
|    fps             | 4        |
|    time_elapsed    | 1        |
|    total_timesteps | 7        |
---------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1.62     |
|    ep_rew_mean     | 0.218    |
| time/              |          |
|    episodes        | 8        |
|    fps             | 4        |
|    time_elapsed    | 2        |
|    total_timesteps | 13       |
---------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1.75     |
|    ep_rew_mean     | 0.188    |
| time/              |          |
|    episodes        