In [1]:
import math
import torch
import random
import torch.nn as nn
import torch.optim as optim
import numpy as np
import gymnasium as gym

from torch.utils.tensorboard import SummaryWriter
from torch.distributions.normal import Normal

In [None]:
# Hyperparameters

OBSERVATION_SIZE = 84
NB_FRAMES = 4
NB_ENVS = 4
CUDA = False
SEED = 2307

MAX_EPISODE_LENGTH = 1000 # 12000
LEARNING_RATE = 1e-4
NB_STEPS = 2048
NB_MINIBATCH = 32
TOTAL_TIMESTEPS = 2e6
RUN_NAME = None
LR_SCHEDULING = "Linear"
GAMMA = 0.99
VF_COEF = 0.5

In [3]:
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True


def make_env():
    def thunk():
        env = gym.make("CarRacing-v3", render_mode="rgb_array", lap_complete_percent=0.95, domain_randomize=False, continuous=True, max_episode_steps=MAX_EPISODE_LENGTH)
        env = gym.wrappers.RecordEpisodeStatistics(env)
        env = gym.wrappers.ResizeObservation(env, (OBSERVATION_SIZE, OBSERVATION_SIZE))
        env = gym.wrappers.GrayscaleObservation(env)
        env = gym.wrappers.FrameStackObservation(env, NB_FRAMES)
        return env

    return thunk

envs = gym.vector.SyncVectorEnv([make_env() for _ in range(NB_ENVS)])

In [4]:
# Taken from TP5

def layer_init(layer, std=np.sqrt(2), bias_const=0.0):
    torch.nn.init.orthogonal_(layer.weight, std)
    torch.nn.init.constant_(layer.bias, bias_const)
    return layer

class Agent(nn.Module):
    def __init__(self, envs, nb_frames, image_size):
        super(Agent, self).__init__()
        stride = [4, 2, 1]
        kernel_size = [8, 4, 3]
        input_channels = [nb_frames, 32, 64]
        output_channels = [32, 64, 64]

        self.output_image_size = math.floor(math.floor(((math.floor(((image_size - 8) / 4) + 1) - 4) / 2) + 1) - 3 + 1)
        
        layers = []
        for i in range(len(stride)):
            layers.append(layer_init(nn.Conv2d(input_channels[i], output_channels[i], kernel_size[i], stride=stride[i])))
            layers.append(nn.ReLU())
            image_size = math.floor(((image_size - kernel_size[i]) / stride[i]) + 1)
        
        layers.append(nn.Flatten())
        layers.append(layer_init(nn.Linear(output_channels[-1] * image_size * image_size, 512)))
        layers.append(nn.ReLU())
        
        self.network = nn.Sequential(*layers)

        self.critic = layer_init(nn.Linear(512, 1), std=1)
        
        self.actor_mean = layer_init(
            nn.Linear(512, np.prod(envs.single_action_space.shape)),
            std=0.01
        )

        self.actor_logstd = nn.Parameter(torch.zeros(1, np.prod(envs.single_action_space.shape)))

    def get_value(self, x):
        return self.critic(self.network(x / 255.0))

    def get_action_and_value(self, x, action=None):
        hidden = self.network(x / 255.0)
        action_mean = self.actor_mean(hidden)
        action_logstd = self.actor_logstd.expand_as(action_mean)
        action_std = torch.exp(action_logstd)
        probs = Normal(action_mean, action_std)
        if action is None:
            action = probs.sample()
        return action, probs.log_prob(action).sum(1), probs.entropy().sum(1), self.critic(hidden)

In [5]:
device = torch.device("cuda" if torch.cuda.is_available() and CUDA else "cpu")
agent = Agent(envs, NB_FRAMES, OBSERVATION_SIZE).to(device)
optimizer = optim.Adam(agent.parameters(), lr=LEARNING_RATE, eps=1e-5)

obs = torch.zeros((NB_STEPS, NB_ENVS) + envs.single_observation_space.shape).to(device)
actions = torch.zeros((NB_STEPS, NB_ENVS) + envs.single_action_space.shape).to(device)
logprobs = torch.zeros((NB_STEPS, NB_ENVS)).to(device)
rewards = torch.zeros((NB_STEPS, NB_ENVS)).to(device)
dones = torch.zeros((NB_STEPS, NB_ENVS)).to(device)
values = torch.zeros((NB_STEPS, NB_ENVS)).to(device)

global_step = 0
next_obs, _ = envs.reset(seed=SEED)
next_obs = torch.Tensor(next_obs).to(device)
next_done = torch.zeros(NB_ENVS).to(device)

BATCH_SIZE = int(NB_ENVS * NB_STEPS)
MINIBATCH_SIZE = int(BATCH_SIZE // NB_MINIBATCH)
NB_ITERATIONS = int(TOTAL_TIMESTEPS // BATCH_SIZE)

if RUN_NAME != None:
    import wandb

    wandb.init(
        project="Rl2025-project",
        entity="RL Project",
        sync_tensorboard=True,
        # TODO add config to wandb run
        name=RUN_NAME,
        monitor_gym=True,
    )
    writer = SummaryWriter(f"runs/{RUN_NAME}")

  return torch._C._cuda_getDeviceCount() > 0


In [None]:
for iteration in range(1, NB_ITERATIONS + 1):
    # Annealing the rate if instructed to do so.
    if LR_SCHEDULING == "Linear":
        frac = 1.0 - (iteration - 1.0) / NB_ITERATIONS
        lrnow = frac * LEARNING_RATE
        optimizer.param_groups[0]["lr"] = lrnow

    start_time = global_step
    gradient_actor = 0
    gradient_critic = 0
    for step in range(0, NB_STEPS):
        global_step += NB_STEPS
        obs[step] = next_obs
        dones[step] = next_done

        action, logprob, _, value = agent.get_action_and_value(next_obs)
        values[step] = value.flatten()
        actions[step] = action
        logprobs[step] = logprob

        next_obs, reward, terminations, truncations, infos = envs.step(action.cpu().numpy())
        next_done = np.logical_or(terminations, truncations)
        rewards[step] = torch.tensor(reward).to(device).view(-1)
        next_obs, next_done = torch.Tensor(next_obs).to(device), torch.Tensor(next_done).to(device)
        
        if "episode" in infos:
            completed_episodes = infos["_episode"]
            episodic_returns = infos["episode"]["r"][completed_episodes]
            episodic_lengths = infos["episode"]["l"][completed_episodes]

            for episodic_return, episodic_length in zip(episodic_returns, episodic_lengths):
                print(f"global_step={global_step}, episodic_return={episodic_return}")
                if RUN_NAME != None:
                    writer.add_scalar("charts/episodic_return", episodic_return, global_step)
                    writer.add_scalar("charts/episodic_length", episodic_length, global_step)
        
        # Break when one of the environement as reached a terminal state
        if torch.any(next_done):
            break
    
    # Compute the advantages
    R = torch.Tensor([0 if next_done[i] == True else values[-1][i] for i in range(len(next_done))]).to(device)
    optimizer.zero_grad()
    for i in range(step , 0, -1):
        R = rewards[i] + GAMMA * R

        value_loss = (R - values[i])**2
        actor_loss = logprobs[step] * (R - values[i])
        loss = (actor_loss + VF_COEF * value_loss)
        loss.backward()
    optimizer.step()

global_step=2048000, episodic_return=-63.17280453257879
global_step=2048000, episodic_return=-57.74647887324025
global_step=2048000, episodic_return=-53.06859205776256
global_step=2048000, episodic_return=-61.66134185303601


RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn