In [1]:
import math
import torch
import random
import torch.nn as nn
import torch.optim as optim
import numpy as np
import gymnasium as gym
import os

from torch.utils.tensorboard import SummaryWriter
from torch.distributions.normal import Normal

In [2]:
# Hyperparameters
CONFIG_NUMBER = 4

OBSERVATION_SIZE = 84
NB_FRAMES = 4
NB_ENVS = 1
CUDA = False
SEED = 2307

MAX_EPISODE_LENGTH = 12000
LEARNING_RATE = 1e-4
NB_STEPS = 256
TOTAL_TIMESTEPS = 2e6
RUN_NAME = "Fourth Config A2C (no ent)"
LR_SCHEDULING = "Linear"
GAMMA = 0.99
VF_COEF = 0.5
ENT_COEF = 0.01

In [3]:
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

def make_env():
    def thunk():
        env = gym.make("CarRacing-v3", render_mode="rgb_array", lap_complete_percent=0.95, domain_randomize=False, continuous=True, max_episode_steps=MAX_EPISODE_LENGTH)
        env = gym.wrappers.RecordEpisodeStatistics(env)
        env = gym.wrappers.ResizeObservation(env, (OBSERVATION_SIZE, OBSERVATION_SIZE))
        env = gym.wrappers.GrayscaleObservation(env)
        env = gym.wrappers.FrameStackObservation(env, NB_FRAMES)
        return env

    return thunk

envs = gym.vector.SyncVectorEnv([make_env() for _ in range(NB_ENVS)])

In [4]:
# Taken from TP5

def layer_init(layer, std=np.sqrt(2), bias_const=0.0):
    torch.nn.init.orthogonal_(layer.weight, std)
    torch.nn.init.constant_(layer.bias, bias_const)
    return layer

class Agent(nn.Module):
    def __init__(self, envs, nb_frames, image_size):
        super(Agent, self).__init__()
        self.image_size = image_size
        self.nb_frames = nb_frames

        # Actor network
        self.actor_network = self.build_network()
        self.actor_mean = layer_init(
            nn.Linear(512, np.prod(envs.single_action_space.shape)),
            std=0.01
        )
        self.actor_logstd = nn.Parameter(torch.zeros(1, np.prod(envs.single_action_space.shape)))

        # Critic network
        self.critic_network = self.build_network()
        self.critic = layer_init(nn.Linear(512, 1), std=1)

    def build_network(self):
        stride = [4, 2, 1]
        kernel_size = [8, 4, 3]
        input_channels = [self.nb_frames, 32, 64]
        output_channels = [32, 64, 64]
        image_size = self.image_size

        layers = []
        for i in range(len(stride)):
            layers.append(layer_init(nn.Conv2d(input_channels[i], output_channels[i], kernel_size[i], stride=stride[i])))
            layers.append(nn.ReLU())
            image_size = math.floor(((image_size - kernel_size[i]) / stride[i]) + 1)

        layers.append(nn.Flatten())
        layers.append(layer_init(nn.Linear(output_channels[-1] * image_size * image_size, 512)))
        layers.append(nn.ReLU())
        
        return nn.Sequential(*layers)

    def get_value(self, x):
        hidden = self.critic_network(x / 255.0)
        return self.critic(hidden)

    def get_action_and_value(self, x, action=None):
        actor_hidden = self.actor_network(x / 255.0)
        action_mean = self.actor_mean(actor_hidden)
        action_logstd = self.actor_logstd.expand_as(action_mean)
        action_std = torch.exp(action_logstd)
        probs = Normal(action_mean, action_std)
        if action is None:
            action = probs.sample()
        
        critic_hidden = self.critic_network(x / 255.0)
        value = self.critic(critic_hidden)

        return action, probs.log_prob(action).sum(1), probs.entropy().sum(1), value

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() and CUDA else "cpu")
agent = Agent(envs, NB_FRAMES, OBSERVATION_SIZE).to(device)
optimizer = optim.Adam(agent.parameters(), lr=LEARNING_RATE, eps=1e-5)

obs = torch.zeros((NB_STEPS, NB_ENVS) + envs.single_observation_space.shape).to(device)
actions = torch.zeros((NB_STEPS, NB_ENVS) + envs.single_action_space.shape).to(device)
rewards = torch.zeros((NB_STEPS, NB_ENVS)).to(device)
values = torch.zeros((NB_STEPS, NB_ENVS)).to(device)
current_logprobs = torch.zeros((NB_STEPS, NB_ENVS)).to(device)
previous_logprobs = torch.zeros((NB_STEPS, NB_ENVS)).to(device)

global_step = 0
next_obs, _ = envs.reset(seed=SEED)
next_obs = torch.Tensor(next_obs).to(device)
next_done = torch.zeros(NB_ENVS).to(device)

if RUN_NAME is not None:
    import wandb

    config = {
        "OBSERVATION_SIZE": OBSERVATION_SIZE,
        "NB_FRAMES": NB_FRAMES,
        "NB_ENVS": NB_ENVS,
        "SEED": SEED,
        "MAX_EPISODE_LENGTH": MAX_EPISODE_LENGTH,
        "LEARNING_RATE": LEARNING_RATE,
        "NB_STEPS": NB_STEPS,
        "TOTAL_TIMESTEPS": TOTAL_TIMESTEPS,
        "LR_SCHEDULING": LR_SCHEDULING,
        "GAMMA": GAMMA,
        "VF_COEF": VF_COEF,
        "CUDA": CUDA,
        "ENT_COEF": ENT_COEF,
        "CONFIG_NUMBER": CONFIG_NUMBER
    }

    wandb.init(
        entity="Rl2025-project",
        project="RL Project",
        name=RUN_NAME,
        config=config,
        sync_tensorboard=True,  # auto-upload sb3's tensorboard metrics
        monitor_gym=True,       # auto-upload des vidéos de l'agent
        # save_code=True,       # optionnel
    )
    
    writer = SummaryWriter(f"runs/{RUN_NAME}")

In [None]:
while global_step < TOTAL_TIMESTEPS:
    # Annealing the rate if instructed to do so.
    if LR_SCHEDULING == "Linear":
        frac = 1.0 - global_step / TOTAL_TIMESTEPS
        lrnow = frac * LEARNING_RATE
        optimizer.param_groups[0]["lr"] = lrnow

    for step in range(0, NB_STEPS):
        global_step += NB_ENVS
        obs[step] = next_obs

        with torch.no_grad():
            action, logprob, _, value = agent.get_action_and_value(next_obs)
            values[step] = value.flatten()
            current_logprobs[step] = logprob

        actions[step] = action

        next_obs, reward, terminations, truncations, infos = envs.step(action.cpu().numpy())
        next_done = np.logical_or(terminations, truncations)
        rewards[step] = torch.tensor(reward).to(device).view(-1)
        next_obs, next_done = torch.Tensor(next_obs).to(device), torch.Tensor(next_done).to(device)
        
        if "episode" in infos:
            completed_episodes = infos["_episode"]
            episodic_returns = infos["episode"]["r"][completed_episodes]
            episodic_lengths = infos["episode"]["l"][completed_episodes]

            for episodic_return, episodic_length in zip(episodic_returns, episodic_lengths):
                print(f"global_step={global_step}, episodic_return={episodic_return}")
                if RUN_NAME != None:
                    writer.add_scalar("charts/episodic_return", episodic_return, global_step)
                    writer.add_scalar("charts/episodic_length", episodic_length, global_step)
        
        # Break when one of the environement as reached a terminal state
        if torch.any(next_done):
            break
    
    R = torch.Tensor([0 if next_done[i] == True else values[-1][i] for i in range(len(next_done))]).to(device)
    returns = torch.zeros_like(rewards)
    advantages = torch.zeros_like(rewards)

    for i in reversed(range(step)):
        R = rewards[i] + GAMMA * R
        returns[i] = R
        advantages[i] = returns[i] - values[i]

    # Normalize the advantages
    advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)

    value_loss = 0
    actor_loss = 0
    entropy_term = 0

    for i in range(step):
        _, logprob, ent, value = agent.get_action_and_value(obs[i], actions[i])
        value = value.flatten()

        actor_loss += -logprob * advantages[i]
        value_loss += (returns[i] - value)**2
        entropy_term += ent

    # TODO add the computation of the entropy, add it to the loss and log the relevant metrics
    loss = (actor_loss + VF_COEF * value_loss).mean()
    approx_kl = (previous_logprobs - current_logprobs).mean() if global_step > NB_ENVS * NB_STEPS else 0.0

    previous_logprobs = current_logprobs.copy()
    
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # logging for the losses + learning rate
    if RUN_NAME != None:
        writer.add_scalar("losses/total_loss", loss.item(), global_step)
        writer.add_scalar("losses/actor_loss", actor_loss.mean().item(), global_step)
        writer.add_scalar("losses/value_loss", value_loss.mean().item(), global_step)
        writer.add_scalar("losses/entropy", entropy_term.mean().item(), global_step)
        writer.add_scalar("losses/approx_kl", approx_kl.item(), global_step)
        writer.add_scalar("charts/learning_rate", lrnow, global_step)
    
    
# Save the model at the end of training
save_path = f"trained_models/a2c/a2c_config{CONFIG_NUMBER}.pt"
os.makedirs("trained_models/a2c", exist_ok=True)
torch.save({
    "model_state_dict": agent.state_dict(),
    "optimizer_state_dict": optimizer.state_dict(),
    "config": {
        "OBSERVATION_SIZE": OBSERVATION_SIZE,
        "NB_FRAMES": NB_FRAMES,
        "NB_ENVS": NB_ENVS,
        "SEED": SEED,
        "MAX_EPISODE_LENGTH": MAX_EPISODE_LENGTH,
        "LEARNING_RATE": LEARNING_RATE,
        "NB_STEPS": NB_STEPS,
        "TOTAL_TIMESTEPS": TOTAL_TIMESTEPS,
        "LR_SCHEDULING": LR_SCHEDULING,
        "GAMMA": GAMMA,
        "VF_COEF": VF_COEF
    }
}, save_path)
print(f"Model saved to {save_path}")
if RUN_NAME != None:
    wandb.finish()

global_step=6773, episodic_return=-712.0441926346306
global_step=8788, episodic_return=-269.4528662420328
global_step=11135, episodic_return=-299.41228070174725
global_step=23136, episodic_return=-1141.3680781759533
global_step=35137, episodic_return=-1112.8919860628105
global_step=47138, episodic_return=-1162.328767123325
global_step=59139, episodic_return=-1131.596091205283
global_step=71140, episodic_return=-1060.4651162792
global_step=83141, episodic_return=-1183.2214765100866
global_step=95142, episodic_return=-1185.2398523985416
global_step=107143, episodic_return=-1187.8048780487957
global_step=119144, episodic_return=-1183.4983498350027
global_step=131145, episodic_return=-1187.7300613497086
global_step=143146, episodic_return=-1182.5174825175027
global_step=155147, episodic_return=-1181.0606060606276
global_step=167148, episodic_return=-1154.9839228296323
global_step=179149, episodic_return=-1182.5174825175027
global_step=191150, episodic_return=-1143.7500000000541
global_step

KeyboardInterrupt: 