In [1]:
import math
import torch
import random
import torch.nn as nn
import torch.optim as optim
import numpy as np
import gymnasium as gym
import os

from torch.utils.tensorboard import SummaryWriter
from torch.distributions.normal import Normal

In [None]:
# Hyperparameters
CONFIG_NUMBER = 11

OBSERVATION_SIZE = 64
NB_FRAMES = 4
NB_ENVS = 12
CUDA = False
SEED = 2307

MAX_EPISODE_LENGTH = 12000
LEARNING_RATE = 1e-4
NB_STEPS = 2048
TOTAL_TIMESTEPS = 2e6
RUN_NAME = "Eleventh Config A2C (LSTM 3-256)"
LR_SCHEDULING = "Linear"
GAMMA = 0.99
VF_COEF = 0.5
ENT_COEF = 0.1
MAX_GRAD_NORM = 0.5
GAE_LAMBDA = 0.95

In [3]:
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

def make_env():
    def thunk():
        env = gym.make("CarRacing-v3", render_mode="rgb_array", lap_complete_percent=0.95, domain_randomize=False, continuous=True, max_episode_steps=MAX_EPISODE_LENGTH)
        env = gym.wrappers.RecordEpisodeStatistics(env)
        env = gym.wrappers.ResizeObservation(env, (OBSERVATION_SIZE, OBSERVATION_SIZE))
        env = gym.wrappers.GrayscaleObservation(env)
        env = gym.wrappers.FrameStackObservation(env, NB_FRAMES)
        return env

    return thunk

envs = gym.vector.SyncVectorEnv([make_env() for _ in range(NB_ENVS)])

In [4]:
# Taken from TP5

def layer_init(layer, std=np.sqrt(2), bias_const=0.0):
    torch.nn.init.orthogonal_(layer.weight, std)
    torch.nn.init.constant_(layer.bias, bias_const)
    return layer

class Agent(nn.Module):
    def __init__(self, envs, nb_frames, image_size):
        super(Agent, self).__init__()
        self.image_size = image_size
        self.nb_frames = nb_frames

        # Actor network
       # Actor: CNN → LSTM → Linear
        self.actor_cnn, self.actor_lstm, _ = self.build_network(use_lstm=True)
        self.actor_linear = layer_init(nn.Linear(256, np.prod(envs.single_action_space.shape)), std=0.01)
        self.actor_logstd = nn.Parameter(torch.zeros(1, np.prod(envs.single_action_space.shape)))

        # Critic: CNN → Linear
        self.critic_cnn, _, self.critic_linear = self.build_network(use_lstm=False)
        self.critic = layer_init(nn.Linear(512, 1), std=1)


    def build_network(self, use_lstm=False):
        stride = [4, 2, 1]
        kernel_size = [8, 4, 3]
        input_channels = [self.nb_frames, 32, 64]
        output_channels = [32, 64, 64]
        image_size = self.image_size

        cnn_layers = []
        for i in range(len(stride)):
            cnn_layers.append(layer_init(
                nn.Conv2d(input_channels[i], output_channels[i], kernel_size[i], stride=stride[i])
            ))
            cnn_layers.append(nn.Tanh())
            image_size = math.floor(((image_size - kernel_size[i]) / stride[i]) + 1)

        cnn_layers.append(nn.Flatten())
        cnn = nn.Sequential(*cnn_layers)

        # Linear input size = 64 * image_size^2
        linear_input_size = output_channels[-1] * image_size * image_size

        if use_lstm:
            lstm = nn.LSTM(input_size=linear_input_size, hidden_size=256, batch_first=True, num_layers=3)
            linear = None  # handled after LSTM
        else:
            lstm = None
            linear = nn.Sequential(
                layer_init(nn.Linear(linear_input_size, 512)),
                nn.Tanh()
            )

        return cnn, lstm, linear


    def get_value(self, x):
        hidden = self.critic_network(x / 255.0)
        return self.critic(hidden)

    def get_action_and_value(self, x, action=None):
        x = x / 255.0

        # Actor pipeline: CNN → LSTM → Linear
        cnn_out = self.actor_cnn(x)               # (B, flat_dim)
        lstm_in = cnn_out.unsqueeze(1)            # (B, 1, flat_dim)
        lstm_out, _ = self.actor_lstm(lstm_in)    # (B, 1, 512)
        actor_hidden = lstm_out.squeeze(1)        # (B, 512)
        action_mean = self.actor_linear(actor_hidden)

        action_logstd = self.actor_logstd.expand_as(action_mean)
        action_std = torch.exp(action_logstd)
        probs = Normal(action_mean, action_std)
        if action is None:
            action = probs.sample()

        # Critic pipeline: CNN → Linear → Value
        critic_hidden = self.critic_linear(self.critic_cnn(x))
        value = self.critic(critic_hidden)

        return action, probs.log_prob(action).sum(1), probs.entropy().sum(1), value


In [None]:
device = torch.device("cuda" if torch.cuda.is_available() and CUDA else "cpu")
agent = Agent(envs, NB_FRAMES, OBSERVATION_SIZE).to(device)
optimizer = optim.Adam(agent.parameters(), lr=LEARNING_RATE, eps=1e-5)

obs = torch.zeros((NB_STEPS, NB_ENVS) + envs.single_observation_space.shape).to(device)
actions = torch.zeros((NB_STEPS, NB_ENVS) + envs.single_action_space.shape).to(device)
rewards = torch.zeros((NB_STEPS, NB_ENVS)).to(device)
values = torch.zeros((NB_STEPS, NB_ENVS)).to(device)
current_logprobs = torch.zeros((NB_STEPS, NB_ENVS)).to(device)
previous_logprobs = torch.zeros((NB_STEPS, NB_ENVS)).to(device)

global_step = 0
next_obs, _ = envs.reset(seed=SEED)
next_obs = torch.Tensor(next_obs).to(device)
next_done = torch.zeros(NB_ENVS).to(device)

if RUN_NAME is not None:
    import wandb

    config = {
        "OBSERVATION_SIZE": OBSERVATION_SIZE,
        "NB_FRAMES": NB_FRAMES,
        "NB_ENVS": NB_ENVS,
        "SEED": SEED,
        "MAX_EPISODE_LENGTH": MAX_EPISODE_LENGTH,
        "LEARNING_RATE": LEARNING_RATE,
        "NB_STEPS": NB_STEPS,
        "TOTAL_TIMESTEPS": TOTAL_TIMESTEPS,
        "LR_SCHEDULING": LR_SCHEDULING,
        "GAMMA": GAMMA,
        "VF_COEF": VF_COEF,
        "CUDA": CUDA,
        "ENT_COEF": ENT_COEF,
        "CONFIG_NUMBER": CONFIG_NUMBER,
        "MAX_GRAD_NORM": MAX_GRAD_NORM,
        "GAE_LAMBDA": GAE_LAMBDA,
    }

    wandb.init(
        entity="Rl2025-project",
        project="RL Project",
        name=RUN_NAME,
        config=config,
        sync_tensorboard=True,  # auto-upload sb3's tensorboard metrics
        monitor_gym=True,       # auto-upload des vidéos de l'agent
        # save_code=True,       # optionnel
    )
    
    writer = SummaryWriter(f"runs/{RUN_NAME}")

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mjimmy-walraff02[0m ([33mTFE-proteomics[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [None]:
while global_step < TOTAL_TIMESTEPS:
    # Annealing the rate if instructed to do so.
    if LR_SCHEDULING == "Linear":
        frac = 1.0 - global_step / TOTAL_TIMESTEPS
        lrnow = frac * LEARNING_RATE
        optimizer.param_groups[0]["lr"] = lrnow

    for step in range(0, NB_STEPS):
        global_step += NB_ENVS
        obs[step] = next_obs

        with torch.no_grad():
            action, logprob, _, value = agent.get_action_and_value(next_obs)
            values[step] = value.flatten()
            current_logprobs[step] = logprob

        actions[step] = action

        next_obs, reward, terminations, truncations, infos = envs.step(action.cpu().numpy())
        next_done = np.logical_or(terminations, truncations)
        rewards[step] = torch.tensor(reward).to(device).view(-1)
        next_obs, next_done = torch.Tensor(next_obs).to(device), torch.Tensor(next_done).to(device)
        
        if "episode" in infos:
            completed_episodes = infos["_episode"]
            episodic_returns = infos["episode"]["r"][completed_episodes]
            episodic_lengths = infos["episode"]["l"][completed_episodes]

            for episodic_return, episodic_length in zip(episodic_returns, episodic_lengths):
                print(f"global_step={global_step}, episodic_return={episodic_return}")
                if RUN_NAME != None:
                    writer.add_scalar("charts/episodic_return", episodic_return, global_step)
                    writer.add_scalar("charts/episodic_length", episodic_length, global_step)
        
        # Break when one of the environement as reached a terminal state
        if torch.any(next_done):
            break
    
    # R = torch.Tensor([0 if next_done[i] == True else values[-1][i] for i in range(len(next_done))]).to(device)
    # returns = torch.zeros_like(rewards)
    # advantages = torch.zeros_like(rewards)

    # for i in reversed(range(step)):
    #     R = rewards[i] + GAMMA * R
    #     returns[i] = R
    #     advantages[i] = returns[i] - values[i]

    next_value = agent.get_value(next_obs).reshape(1, -1)
    advantages = torch.zeros_like(rewards)
    lastgaelam = 0

    for t in reversed(range(step)):
        if t == step - 1:
            nextnonterminal = 1.0 - next_done
            nextvalue = next_value
        else:
            nextnonterminal = 1.0 - torch.zeros_like(next_done)
            nextvalue = values[t + 1]
        
        delta = rewards[t] + GAMMA * nextvalue * nextnonterminal - values[t]
        advantages[t] = lastgaelam = delta + GAMMA * GAE_LAMBDA * nextnonterminal * lastgaelam

    returns = advantages + values

    # Normalize the advantages
    advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)

    value_loss = torch.zeros((step, NB_ENVS))
    actor_loss = torch.zeros((step, NB_ENVS))
    entropy_term = torch.zeros((step, NB_ENVS))

    for i in range(step):
        _, logprob, ent, value = agent.get_action_and_value(obs[i], actions[i])
        value = value.flatten()

        actor_loss[i] = -logprob * advantages[i]
        value_loss[i] = (returns[i] - value)**2
        entropy_term[i] = ent

    actor_loss = actor_loss.mean()
    value_loss = 0.5 * value_loss.mean()
    entropy_term = entropy_term.mean()

    loss = actor_loss + VF_COEF * value_loss - ENT_COEF * entropy_term
    approx_kl = (previous_logprobs - current_logprobs).mean() if global_step > NB_ENVS * NB_STEPS else 0.0

    previous_logprobs = current_logprobs.detach().clone()
    
    optimizer.zero_grad()
    loss.backward()
    nn.utils.clip_grad_norm_(agent.parameters(), MAX_GRAD_NORM)
    optimizer.step()

    # logging for the losses + learning rate
    if RUN_NAME != None:
        writer.add_scalar("losses/total_loss", loss.item(), global_step)
        writer.add_scalar("losses/actor_loss", actor_loss.mean().item(), global_step)
        writer.add_scalar("losses/value_loss", value_loss.mean().item(), global_step)
        writer.add_scalar("losses/entropy", entropy_term.mean().item(), global_step)
        writer.add_scalar("losses/approx_kl", approx_kl, global_step)
        writer.add_scalar("charts/learning_rate", lrnow, global_step)
    
    
# Save the model at the end of training
save_path = f"trained_models/a2c/a2c_config{CONFIG_NUMBER}.pt"
os.makedirs("trained_models/a2c", exist_ok=True)
torch.save({
    "model_state_dict": agent.state_dict(),
    "optimizer_state_dict": optimizer.state_dict(),
    "config": {
        "OBSERVATION_SIZE": OBSERVATION_SIZE,
        "NB_FRAMES": NB_FRAMES,
        "NB_ENVS": NB_ENVS,
        "SEED": SEED,
        "MAX_EPISODE_LENGTH": MAX_EPISODE_LENGTH,
        "LEARNING_RATE": LEARNING_RATE,
        "NB_STEPS": NB_STEPS,
        "TOTAL_TIMESTEPS": TOTAL_TIMESTEPS,
        "LR_SCHEDULING": LR_SCHEDULING,
        "GAMMA": GAMMA,
        "VF_COEF": VF_COEF,
        "MAX_GRAD_NORM": MAX_GRAD_NORM,
        "ENT_COEF": ENT_COEF,
        "GAE_LAMBDA": GAE_LAMBDA,
    }
}, save_path)
print(f"Model saved to {save_path}")
if RUN_NAME != None:
    wandb.finish()

global_step=82044, episodic_return=-725.5354838710409
global_step=83424, episodic_return=-679.157971014553
global_step=86976, episodic_return=-738.6072847682868
global_step=87276, episodic_return=-750.2769230770022
global_step=108768, episodic_return=-931.0311827958186
global_step=126324, episodic_return=-1082.1774647888867
global_step=144000, episodic_return=-1137.6770538244223
global_step=144000, episodic_return=-1116.9675090253497
global_step=144000, episodic_return=-1136.102236421787
global_step=144000, episodic_return=-1128.571428571497
global_step=144000, episodic_return=-1116.3346613546607
global_step=144000, episodic_return=-1123.8754325260236
global_step=159528, episodic_return=-584.9460750853628
global_step=187716, episodic_return=-868.772789115752
global_step=226056, episodic_return=-1120.4545454546217
global_step=227436, episodic_return=-1139.209726443828
global_step=234432, episodic_return=-720.7835125448729
global_step=243096, episodic_return=-844.2440677967095
global_ste

0,1
charts/episodic_length,▂▂▂█▁▄██▅█████▄█▆▂█▂▄▃██████▃█▅█▁██▅▄███
charts/episodic_return,▆▆▄▂▅▂▁▄▂▄▄▂▂▂▆▂▂▂▇▂▂▆▂▂▂▂▂▂▂▂▂▂▅▆▂▁▂█▆▂
charts/learning_rate,██▇▇▇▇▇▇▆▆▆▆▆▆▅▅▅▅▅▅▄▄▄▄▄▃▃▃▃▃▃▃▃▂▂▂▂▂▂▁
global_step,▁▁▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▃▃▄▄▄▅▅▅▅▅▅▅▆▆▆▇▇▇▇▇███
losses/actor_loss,▃▂▁▂█▃▄▃▃▃▇▃▃▃▃▂▃▃▃▃▃▃▃▃▃▂▃█▃▃▃▃▂▃▃▃▃▃▃▃
losses/approx_kl,▅▄█▇▄▅▅▅▅▄▆▅▆▅▄▅▅▅▃▅▄▅█▄▅▆▄▅▅▁▄▅▃▄▆▆▄▅▄▅
losses/entropy,▁▁▂▂▃▃▃▃▃▃▄▄▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇██████████
losses/total_loss,▅▆▄▁▂▄▃▃▃▃█▃▅▃▃▅▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▄▃▃▃▃▃▃▃▃
losses/value_loss,██▁▁▂▁▁▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
charts/episodic_length,12000.0
charts/episodic_return,-1126.2821
charts/learning_rate,0.0
global_step,2004888.0
losses/actor_loss,0.07486
losses/approx_kl,-0.00389
losses/entropy,4.27304
losses/total_loss,-0.12563
losses/value_loss,0.45363
