# Continuous Actor Critic Implementation


In [1]:

"""CarRacing-v2 Gym environment"""
import gymnasium as gym
# Importing necessary libraries
import os
from collections import deque
from typing import Any, Tuple

import numpy as np
import torch

# Make the project's helper scripts importable before loading continuous_ac2:
# unless an entry named "neural_networkds" exists in the working directory,
# "<cwd>/../scripts" is appended to the module search path.
# NOTE(review): "neural_networkds" looks like a typo (perhaps
# "neural_networks") — confirm the intended name; as written the append is
# skipped only when that exact name exists.
from sys import path
import os
if not os.path.exists("neural_networkds"):
    path.append(os.getcwd()+"/../scripts")
from continuous_ac2 import Trainer, ActorCriticAgent, ReplayBuffer

# Setting the seeds for reproducibility. NumPy's global RNG is seeded in
# addition to PyTorch's so that any code drawing from np.random is repeatable
# as well. torch.manual_seed stays last so the cell still displays the
# returned Generator object.
np.random.seed(0)
torch.manual_seed(0)

  logger.warn(f"Overriding environment {new_spec.id} already in registry.")


<torch._C.Generator at 0x7f5a435e09b0>

In [2]:
# Setup device: everything runs on the CPU. The trailing comment keeps the
# expression that would select CUDA when it is available.
device = "cpu" # torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Define the environment
# Passing continuous=True converts the environment to use continuous action.
# The continuous action space has 3 actions: [steering, gas, brake].
env_name: str = "CarRacing-v2"
# Step limit handed to gym.make below.
# NOTE(review): Gymnasium registers CarRacing-v2 with a 1000-step limit, so
# "default" presumably means this project's default — confirm.
max_episode_steps = 600  # default
num_episodes = 10  # forwarded to Trainer as max_episodes

# render_mode="human" requests on-screen rendering while the environment runs;
# domain_randomize=True enables CarRacing's randomized variant (see the
# Gymnasium CarRacing documentation for what is randomized).
env: gym.Env[Any, Any] = gym.make(
    env_name,
    domain_randomize=True,
    continuous=True,
    render_mode="human",
    max_episode_steps=max_episode_steps,
)

# Observation space: refuse to continue without a concrete shape, then take
# the length of its first axis as the state dimension.
# NOTE(review): for multi-dimensional observations this is only the leading
# axis — confirm that is what ActorCriticAgent expects.
state_shape = env.observation_space.shape
if state_shape is None:
    raise ValueError("Observation space shape is None.")
state_dim = int(state_shape[0])

# Action space: only Box (continuous) spaces are supported.
action_space = env.action_space
if not isinstance(action_space, gym.spaces.Box):
    raise ValueError("Action space is not of type Box.")

action_high, action_shape = action_space.high, action_space.shape
if action_shape is None:
    raise ValueError("Action space shape is None.")

# Number of action components, and the upper bound of the first component
# truncated to an int.
action_dim = int(action_shape[0])
max_action = int(action_high[0])

# Convert the NumPy action bounds to tensors for the trainer.
low, high = (
    torch.from_numpy(bound) for bound in (action_space.low, action_space.high)
)

# Actor-Critic hyperparameters
# gamma, lr, value_coef, entropy_coef and hidden_dim are forwarded by keyword
# to ActorCriticAgent; batch_size is forwarded to Trainer.
gamma = 0.99
lr = 0.0001
value_coef = 0.5
entropy_coef = 0.01
hidden_dim = 256
batch_size = 64

# Location to store training agent model checkpoint
# (relative path, passed to Trainer as checkpoint_path)
checkpoint_dir = "model_checkpoints"

# Initialize Actor-Critic network
# All sizes and coefficients come from the variables defined above.
agent = ActorCriticAgent(
    state_dim=state_dim,
    action_dim=action_dim,
    max_action=max_action,
    hidden_dim=hidden_dim,
    gamma=gamma,
    lr=lr,
    value_coef=value_coef,
    entropy_coef=entropy_coef,
    device=device,
)

# Initialize the replay buffer
# NOTE(review): 1024 is a hard-coded capacity; consider naming it alongside
# the other hyperparameters.
memory = ReplayBuffer(buffer_size=1024)

# Create trainer to train agent
# low/high are the action-space bounds as tensors; num_episodes is passed as
# max_episodes and checkpoint_dir as checkpoint_path.
trainer = Trainer(
    env=env,
    agent=agent,
    memory=memory,
    max_episodes=num_episodes,
    checkpoint_path=checkpoint_dir,
    batch_size=batch_size,
    low=low,
    high=high,
)

# Run training. The finally clause guarantees the environment (created with
# render_mode="human") is closed even if training raises or the cell is
# interrupted, so the environment's resources are always released.
try:
    trainer.train()
finally:
    env.close()  # type: ignore

  self.low = torch.tensor(low, device=device)
  self.high = torch.tensor(high, device=device)


Collecting trajectory samples based on random actions.
Episode: 0
Episode 0: Step 0: Total reward = 7.12
Episode 0: Step 1: Total reward = 7.02
Episode 0: Step 2: Total reward = 6.92
Episode 0: Step 3: Total reward = 6.82
Episode 0: Step 4: Total reward = 6.72
Episode 0: Step 5: Total reward = 6.62
Episode 0: Step 6: Total reward = 6.52
Episode 0: Step 7: Total reward = 6.42
Episode 0: Step 8: Total reward = 6.32
Episode 0: Step 9: Total reward = 6.22
Episode 0: Step 10: Total reward = 6.12
Episode 0: Step 11: Total reward = 6.02
Episode 0: Step 12: Total reward = 5.92
Episode 0: Step 13: Total reward = 5.82
Episode 0: Step 14: Total reward = 5.72
Episode 0: Step 15: Total reward = 5.62
Episode 0: Step 16: Total reward = 5.52
Episode 0: Step 17: Total reward = 5.42
Episode 0: Step 18: Total reward = 5.32
Episode 0: Step 19: Total reward = 5.22
Episode 0: Step 20: Total reward = 5.12
Episode 0: Step 21: Total reward = 5.02
Episode 0: Step 22: Total reward = 4.92
Episode 0: Step 23: Tota