In [1]:
import math
import torch
import random
import torch.nn as nn
import torch.optim as optim
import numpy as np
import gymnasium as gym
import os

from torch.utils.tensorboard import SummaryWriter
from torch.distributions.normal import Normal

In [2]:
# Hyperparameters
CONFIG_NUMBER = 2

OBSERVATION_SIZE = 84
NB_FRAMES = 4
NB_ENVS = 1
CUDA = False
SEED = 2307

MAX_EPISODE_LENGTH = 12000
LEARNING_RATE = 1e-4
NB_STEPS = 256
TOTAL_TIMESTEPS = 2e6
RUN_NAME = "Second Config A2C"
LR_SCHEDULING = "Linear"
GAMMA = 0.99
VF_COEF = 0.5
ENT_COEF = 0.01

In [3]:
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

def make_env():
    def thunk():
        env = gym.make("CarRacing-v3", render_mode="rgb_array", lap_complete_percent=0.95, domain_randomize=False, continuous=True, max_episode_steps=MAX_EPISODE_LENGTH)
        env = gym.wrappers.RecordEpisodeStatistics(env)
        env = gym.wrappers.ResizeObservation(env, (OBSERVATION_SIZE, OBSERVATION_SIZE))
        env = gym.wrappers.GrayscaleObservation(env)
        env = gym.wrappers.FrameStackObservation(env, NB_FRAMES)
        return env

    return thunk

envs = gym.vector.SyncVectorEnv([make_env() for _ in range(NB_ENVS)])

In [4]:
# Taken from TP5

def layer_init(layer, std=np.sqrt(2), bias_const=0.0):
    torch.nn.init.orthogonal_(layer.weight, std)
    torch.nn.init.constant_(layer.bias, bias_const)
    return layer

class Agent(nn.Module):
    def __init__(self, envs, nb_frames, image_size):
        super(Agent, self).__init__()
        stride = [4, 2, 1]
        kernel_size = [8, 4, 3]
        input_channels = [nb_frames, 32, 64]
        output_channels = [32, 64, 64]

        self.output_image_size = math.floor(math.floor(((math.floor(((image_size - 8) / 4) + 1) - 4) / 2) + 1) - 3 + 1)
        
        layers = []
        for i in range(len(stride)):
            layers.append(layer_init(nn.Conv2d(input_channels[i], output_channels[i], kernel_size[i], stride=stride[i])))
            layers.append(nn.ReLU())
            image_size = math.floor(((image_size - kernel_size[i]) / stride[i]) + 1)
        
        layers.append(nn.Flatten())
        layers.append(layer_init(nn.Linear(output_channels[-1] * image_size * image_size, 512)))
        layers.append(nn.ReLU())
        
        self.network = nn.Sequential(*layers)

        self.critic = layer_init(nn.Linear(512, 1), std=1)
        
        self.actor_mean = layer_init(
            nn.Linear(512, np.prod(envs.single_action_space.shape)),
            std=0.01
        )

        self.actor_logstd = nn.Parameter(torch.zeros(1, np.prod(envs.single_action_space.shape)))

    def get_value(self, x):
        return self.critic(self.network(x / 255.0))

    def get_action_and_value(self, x, action=None):
        hidden = self.network(x / 255.0)
        action_mean = self.actor_mean(hidden)
        action_logstd = self.actor_logstd.expand_as(action_mean)
        action_std = torch.exp(action_logstd)
        probs = Normal(action_mean, action_std)
        if action is None:
            action = probs.sample()
        return action, probs.log_prob(action).sum(1), probs.entropy().sum(1), self.critic(hidden)

In [5]:
device = torch.device("cuda" if torch.cuda.is_available() and CUDA else "cpu")
agent = Agent(envs, NB_FRAMES, OBSERVATION_SIZE).to(device)
optimizer = optim.Adam(agent.parameters(), lr=LEARNING_RATE, eps=1e-5)

obs = torch.zeros((NB_STEPS, NB_ENVS) + envs.single_observation_space.shape).to(device)
actions = torch.zeros((NB_STEPS, NB_ENVS) + envs.single_action_space.shape).to(device)
rewards = torch.zeros((NB_STEPS, NB_ENVS)).to(device)
values = torch.zeros((NB_STEPS, NB_ENVS)).to(device)

global_step = 0
next_obs, _ = envs.reset(seed=SEED)
next_obs = torch.Tensor(next_obs).to(device)
next_done = torch.zeros(NB_ENVS).to(device)

if RUN_NAME is not None:
    import wandb

    config = {
        "OBSERVATION_SIZE": OBSERVATION_SIZE,
        "NB_FRAMES": NB_FRAMES,
        "NB_ENVS": NB_ENVS,
        "SEED": SEED,
        "MAX_EPISODE_LENGTH": MAX_EPISODE_LENGTH,
        "LEARNING_RATE": LEARNING_RATE,
        "NB_STEPS": NB_STEPS,
        "TOTAL_TIMESTEPS": TOTAL_TIMESTEPS,
        "LR_SCHEDULING": LR_SCHEDULING,
        "GAMMA": GAMMA,
        "VF_COEF": VF_COEF,
        "CUDA": CUDA,
        "ENT_COEF": ENT_COEF,
        "CONFIG_NUMBER": CONFIG_NUMBER
    }

    wandb.init(
        entity="Rl2025-project",
        project="RL Project",
        name=RUN_NAME,
        config=config,
        sync_tensorboard=True,  # auto-upload sb3's tensorboard metrics
        monitor_gym=True,       # auto-upload des vidéos de l'agent
        # save_code=True,       # optionnel
    )
    
    writer = SummaryWriter(f"runs/{RUN_NAME}")

  return torch._C._cuda_getDeviceCount() > 0
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mflo230702[0m to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [6]:
while global_step < TOTAL_TIMESTEPS:
    # Annealing the rate if instructed to do so.
    if LR_SCHEDULING == "Linear":
        frac = 1.0 - global_step / TOTAL_TIMESTEPS
        lrnow = frac * LEARNING_RATE
        optimizer.param_groups[0]["lr"] = lrnow

    for step in range(0, NB_STEPS):
        global_step += NB_ENVS
        obs[step] = next_obs

        with torch.no_grad():
            action, logprob, _, value = agent.get_action_and_value(next_obs)
            values[step] = value.flatten()
        actions[step] = action

        next_obs, reward, terminations, truncations, infos = envs.step(action.cpu().numpy())
        next_done = np.logical_or(terminations, truncations)
        rewards[step] = torch.tensor(reward).to(device).view(-1)
        next_obs, next_done = torch.Tensor(next_obs).to(device), torch.Tensor(next_done).to(device)
        
        if "episode" in infos:
            completed_episodes = infos["_episode"]
            episodic_returns = infos["episode"]["r"][completed_episodes]
            episodic_lengths = infos["episode"]["l"][completed_episodes]

            for episodic_return, episodic_length in zip(episodic_returns, episodic_lengths):
                print(f"global_step={global_step}, episodic_return={episodic_return}")
                if RUN_NAME != None:
                    writer.add_scalar("charts/episodic_return", episodic_return, global_step)
                    writer.add_scalar("charts/episodic_length", episodic_length, global_step)
        
        # Break when one of the environement as reached a terminal state
        if torch.any(next_done):
            break
    
    R = torch.Tensor([0 if next_done[i] == True else values[-1][i] for i in range(len(next_done))]).to(device)
    value_loss = 0
    actor_loss = 0
    entropy_term = 0
    # TODO check that what's done here is coherant with the slides of lecture 6 (slide 17 and 21): the implementation is based on the paper of A3c https://arxiv.org/pdf/1602.01783
    for i in range(step , 0, -1):
        R = rewards[i] + GAMMA * R

        _ , logprob, ent, value = agent.get_action_and_value(obs[i], actions[i])
        value = value.flatten()
        
        # Reuse here the non_grad values as we don't want to update the value network with the actor loss
        actor_loss += -logprob * (R - values[i])
        value_loss += (R - value)**2
        entropy_term += ent

    # TODO add the computation of the entropy, add it to the loss and log the relevant metrics
    loss = (actor_loss + VF_COEF * value_loss - ENT_COEF * entropy_term).mean()
    
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # logging for the losses + learning rate
    if RUN_NAME != None:
        writer.add_scalar("losses/total_loss", loss.item(), global_step)
        writer.add_scalar("losses/actor_loss", actor_loss.mean().item(), global_step)
        writer.add_scalar("losses/value_loss", value_loss.mean().item(), global_step)
        writer.add_scalar("losses/entropy", entropy_term.mean().item(), global_step)
        writer.add_scalar("charts/learning_rate", lrnow, global_step)
    
    
# Save the model at the end of training
save_path = f"trained_models/a2c/a2c_config{CONFIG_NUMBER}.pt"
os.makedirs("trained_models/a2c", exist_ok=True)
torch.save({
    "model_state_dict": agent.state_dict(),
    "optimizer_state_dict": optimizer.state_dict(),
    "config": {
        "OBSERVATION_SIZE": OBSERVATION_SIZE,
        "NB_FRAMES": NB_FRAMES,
        "NB_ENVS": NB_ENVS,
        "SEED": SEED,
        "MAX_EPISODE_LENGTH": MAX_EPISODE_LENGTH,
        "LEARNING_RATE": LEARNING_RATE,
        "NB_STEPS": NB_STEPS,
        "TOTAL_TIMESTEPS": TOTAL_TIMESTEPS,
        "LR_SCHEDULING": LR_SCHEDULING,
        "GAMMA": GAMMA,
        "VF_COEF": VF_COEF
    }
}, save_path)
print(f"Model saved to {save_path}")
if RUN_NAME != None:
    wandb.finish()

global_step=12000, episodic_return=-1134.8441926346227
global_step=24001, episodic_return=-1078.980891719871
global_step=36002, episodic_return=-1164.9122807017907
global_step=48003, episodic_return=-1131.5960912052876
global_step=60004, episodic_return=-1126.8292682927633
global_step=72005, episodic_return=-1059.589041096038
global_step=84006, episodic_return=-1138.110749185727
global_step=96007, episodic_return=-1070.4318936878408
global_step=108008, episodic_return=-1126.1744966443662
global_step=119757, episodic_return=-1108.6483394835607
global_step=122354, episodic_return=-310.71951219511425
global_step=134355, episodic_return=-962.3762376239381
global_step=141255, episodic_return=-713.1128834356565
global_step=153256, episodic_return=-1133.5664335664983
global_step=165257, episodic_return=-1021.969696969896
global_step=177258, episodic_return=-1071.3826366560731
global_step=188195, episodic_return=-980.213286713451
global_step=190655, episodic_return=-305.17499999999256
global_s

AttributeError: 'float' object has no attribute 'mean'