In [3]:
import time
import gymnasium as gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions import Normal
from torch.distributions.categorical import Categorical
import copy

In [64]:
# FROM TP5
def layer_init(layer, std=np.sqrt(2), bias_const=0.0):
    torch.nn.init.orthogonal_(layer.weight, std)
    torch.nn.init.constant_(layer.bias, bias_const)
    return layer
"""
class Agent(nn.Module):
    def __init__(self, env):
        super().__init__()
        
        obs_dim = int(np.prod(env.observation_space.shape))
        
        action_dim = env.action_space.shape[0]
        
        
        self.actor = nn.Sequential(
            layer_init(nn.Linear(obs_dim, 64)),
            nn.Tanh(),
            layer_init(nn.Linear(64, 64)),
            nn.Tanh(),
            layer_init(nn.Linear(64, action_dim), std=0.01),
        )
        
        self.log_std = nn.Parameter(torch.zeros(action_dim))

    def transform_action(self, action):
        # Permet de garder les bounds du action space correctes
        transformed = action.clone()
        # Pour le steering
        transformed[..., 0] = torch.tanh(transformed[..., 0])
        # Pour le gas et le brake, si présents
        if transformed.shape[-1] > 1:
            transformed[..., 1:] = torch.sigmoid(transformed[..., 1:])
        return transformed
    
    def get_dist(self, obs):
        if obs.dim() > 2:
            obs = obs.view(1, -1)

        # Calcul de la moyenne
        mean = self.actor(obs)
        # Calcul de l'écart type (on exponentie log_std)
        std = torch.exp(self.log_std).expand_as(mean)

        # Création de la distribution Gaussienne 
        dist = Normal(mean, std)
        
        return dist
        
    def get_action(self, obs, action=None):
        dist = self.get_dist(obs)

        if action is None:
            action = dist.sample()   # Échantillonne une action

        # Calcul de la log-probabilité et de l'entropie
        log_prob = dist.log_prob(action).sum(dim=-1)  # somme sur chaque dimension d'action
        entropy = dist.entropy().sum(dim=-1)

        transformed_action = self.transform_action(action)

        return transformed_action.squeeze(), log_prob, entropy
"""  
def print_shape(x):
    print(x.shape)
    return x
class Agent(nn.Module):
    def __init__(self, envs):
        super(Agent, self).__init__()
        
        self.actor_mean = nn.Sequential(
            layer_init(nn.Conv2d(1, 32, 8, stride=4)),
            nn.ReLU(),
            layer_init(nn.Conv2d(32, 64, 4, stride=2)),
            nn.ReLU(),
            layer_init(nn.Conv2d(64, 64, 3, stride=1)),
            nn.ReLU(),
            nn.Flatten(),
            layer_init(nn.Linear(64 * 7 * 7, 512)),
            nn.ReLU(),
            layer_init(nn.Linear(512, np.prod(envs.single_action_space.shape)), std=0.01)
        )

        self.actor_logstd = nn.Parameter(torch.zeros(1, np.prod(envs.single_action_space.shape)))

    def get_action_and_value(self, x, action=None):
        action_mean = self.actor_mean(x)
        action_logstd = self.actor_logstd.expand_as(action_mean)
        action_std = torch.exp(action_logstd)
        probs = Normal(action_mean, action_std)
        if action is None:
            action = probs.sample()
        return action, probs.log_prob(action).sum(1), probs.entropy().sum(1)


# ANCIEN

# NEW

In [None]:
LEARNING_RATE = 3e-4
NUM_EPOCH = 10
N_ITERATION = 10
N_STEPS = 128
N_SAMPLE = 64
BATCH_SIZE = 64
EPSILON = 0.2
BETA = 1.0

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

env = gym.make(
            "CarRacing-v3",
            continuous=True,
            lap_complete_percent=0.95,
            domain_randomize=False,
            render_mode="rgb_array"
        )

print("Observation space", env.observation_space.shape)
print("Action space", env.action_space.shape)

# GRPO algorithm: https://www.youtube.com/watch?v=YCawyzAOg1Y
# Inspired from: https://gist.github.com/infoslack/f0e0aec9a882c2c76e1dc1bdd510f279
# Inspired from: https://github.com/XinJingHao/DRL-Pytorch/tree/main
agent = Agent(env).to(device)
optimizer = optim.Adam(agent.parameters(), lr=LEARNING_RATE)

actions = torch.zeros(N_SAMPLE, N_STEPS, env.action_space.shape[0]).to(device)
log_probs = torch.zeros(N_SAMPLE, N_STEPS).to(device)
entropies = torch.zeros(N_SAMPLE, N_STEPS).to(device)
rewards = torch.zeros(N_SAMPLE, N_STEPS).to(device)
next_observations = torch.zeros((N_SAMPLE, N_STEPS) + env.observation_space.shape, device=device)
dones = torch.zeros(N_SAMPLE, N_STEPS).to(device)
advantages = torch.zeros(N_SAMPLE, N_STEPS).to(device)

global_step = 0
start_time = time.time()
observation, info = env.reset()
observation = torch.tensor(observation, dtype=torch.float32).to(device)
print("Observation shape", observation.shape)

for iter in range(1, N_ITERATION + 1):
    print("New iteration")
    for step in range(0, N_STEPS):
        global_step += 1

        with torch.no_grad():
            # Boucle pour échantillonner N_SAMPLE actions => on a plus le critic model
            for i in range(N_SAMPLE):
                action, log_prob, entropy = agent.get_action(observation)
                next_observation, reward, terminated, truncated, info = env.step(action.cpu().numpy())
                done = terminated or truncated

                rewards[i, step] = reward
                next_observations[i, step] = torch.tensor(next_observation, dtype=torch.float32)
                dones[i, step] = done
                actions[i, step] = action
                log_probs[i, step] = log_prob
                entropies[i, step] = entropy

        
        mean_rewards = rewards[:, step].mean()

        std_rewards = rewards[:, step].std() + 1e-8

        advantages[:, step] = (rewards[:, step] - mean_rewards) / std_rewards  # advantages formula => Ait = (Ri - mean(R)) / std(R) => paper
        break
    break

    # IL FAUT MTN LOOP SUR LES EPOCHS CALCULER LES RATIO / CLIPRATIO / KL ETC PUIS UPDATE LE MODELE
        


Observation space (96, 96, 3)
Action space (3,)


NameError: name 'Agent' is not defined

In [41]:
class CarRacingRewardPredictor(gym.Wrapper):
    env = None
    def __init__(self, env):
        super().__init__(env)
        self.env = env
    
    def create_state_dict(self):
        unwrapped_env = self.env.unwrapped

        hull = unwrapped_env.car.hull
        car_state = {
            "pos":    hull.position.copy(),
            "angle":  hull.angle,
            "linvel": hull.linearVelocity.copy(),
            "angvel": hull.angularVelocity,
        }

        wheel_states = []
        for wheel in unwrapped_env.car.wheels:
            wheel_states.append({
                "pos":    wheel.position.copy(),
                "angle":  wheel.angle,
                "linvel": wheel.linearVelocity.copy(),
                "angvel": wheel.angularVelocity,
                "omega":  wheel.omega,
                "phase":  wheel.phase,
                "tiles":  set(wheel.tiles),
            })

        tile_flags = [bool(t.road_visited) for t in unwrapped_env.road]

        general_info = {
            "tile_count":  unwrapped_env.tile_visited_count,
            "cum_reward":  unwrapped_env.reward,
            "prev_reward": unwrapped_env.prev_reward,
            "time":        unwrapped_env.t,
        }

        return {"car": car_state, "wheels": wheel_states, "tiles": tile_flags, "general_info": general_info}

    def restore_state(self, state_dict):
        unwrapped_env = self.env.unwrapped

        hull = unwrapped_env.car.hull
        
        hull.position = state_dict["car"]["pos"]
        hull.angle = state_dict["car"]["angle"]
        hull.linearVelocity = state_dict["car"]["linvel"]
        hull.angularVelocity = state_dict["car"]["angvel"]

        for wheel, wheel_state in zip(unwrapped_env.car.wheels, state_dict["wheels"]):
            wheel.position = wheel_state["pos"]
            wheel.angle = wheel_state["angle"]
            wheel.linearVelocity = wheel_state["linvel"]
            wheel.angularVelocity = wheel_state["angvel"]
            wheel.omega = wheel_state["omega"]
            wheel.phase = wheel_state["phase"]
            wheel.tiles.clear()
            for edge in wheel.contacts:
                other_body = edge.other
                ud = getattr(other_body, 'userData', None)
                if ud and hasattr(ud, 'road_friction'):
                    wheel.tiles.add(ud)
                    

        for tile, tile_flag in zip(unwrapped_env.road, state_dict["tiles"]):
            tile.road_visited = tile_flag

        unwrapped_env.tile_visited_count = state_dict["general_info"]["tile_count"]
        unwrapped_env.reward = state_dict["general_info"]["cum_reward"]
        unwrapped_env.prev_reward = state_dict["general_info"]["prev_reward"]
        unwrapped_env.t = state_dict["general_info"]["time"]
    
    def compute_single_reward(self, action):
        """Compute the reward of the given action

        Args:
            action (np.array): the action to perform

        Returns:
            float, bool, bool: reward, done flag, truncated flag
        """
        state_dict = self.create_state_dict()
        _, reward, done, truncated, _ = self.env.step(action)
        self.restore_state(state_dict)
        return reward, done, truncated
    
    def compute_rewards(self, actions):
        """Given the list of actions, return the associated reward of the given actions

        Args:
            actions (iterable): actions to perform

        Returns:
            List[float]: index i contain the reward of the action at index i in actions
        """
        
        rewards = []
        state_dict = self.create_state_dict()
        for action in actions:
            _, reward, _, _, _ = self.env.step(action)
            self.restore_state(state_dict)
            rewards.append(reward)
        
        return rewards

In [59]:
from gymnasium.utils.save_video import capped_cubic_video_schedule

LEARNING_RATE = 3e-4
N_ITERATION   = 10
N_STEPS       = 128
N_SAMPLE      = 2
NUM_EPOCH     = 10
EPSILON       = 0.2
BETA          = 1.0

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def make_env(capture_video, run_name):
    if capture_video:
        env = gym.make(
            "CarRacing-v3",
            continuous=True,
            lap_complete_percent=0.95,
            domain_randomize=False,
            render_mode="rgb_array"
        )
        env = gym.wrappers.RecordVideo(env, video_folder=f"videos/{run_name}", name_prefix="eval",
                episode_trigger=capped_cubic_video_schedule)
    else:
        env = gym.make(
        "CarRacing-v3",
        continuous=True,
        lap_complete_percent=0.95,
        domain_randomize=False,
        render_mode="rgb_array"
    )

    env = gym.wrappers.ResizeObservation(env, (84, 84))
    env = gym.wrappers.GrayscaleObservation(env)
    env = gym.wrappers.FrameStackObservation(env, 1)
    env = CarRacingRewardPredictor(env)
    
    return env

env = make_env(False, "")


# GRPO algorithm: https://www.youtube.com/watch?v=YCawyzAOg1Y
# Inspired from: https://gist.github.com/infoslack/f0e0aec9a882c2c76e1dc1bdd510f279
# Inspired from: https://github.com/XinJingHao/DRL-Pytorch/tree/main
agent      = Agent(env).to(device)
policy_old = Agent(env).to(device)
policy_old.load_state_dict(agent.state_dict())  # copie initiale
optimizer  = optim.Adam(agent.parameters(), lr=LEARNING_RATE)


next_observations = torch.zeros((N_SAMPLE, N_STEPS) + env.observation_space.shape, device=device)
actions      = torch.zeros(N_SAMPLE, N_STEPS, env.action_space.shape[0], device=device)
log_probs    = torch.zeros(N_SAMPLE, N_STEPS, device=device)
entropies    = torch.zeros(N_SAMPLE, N_STEPS, device=device)
rewards      = torch.zeros(N_SAMPLE, N_STEPS, device=device)
dones        = torch.zeros(N_SAMPLE, N_STEPS, device=device)
advantages   = torch.zeros(N_SAMPLE, N_STEPS, device=device)

global_step = 0
start_time = time.time()
observation, info = env.reset()
observation = torch.tensor(observation, dtype=torch.float32).to(device)
print("Observation shape", observation.shape)

# --- Boucle principale
for iteration in range(1, N_ITERATION + 1):
    # 2) Collecte des trajectoires par pas de temps puis épisode
    for step in range(N_STEPS):
        """
        DONC ICI IDEALEMENT IL FAUDRAIT FAIRE UNE COPIE DE L'ENVIRONNEMENT
        MAIS QUID DU STEP A FAIRE ? => COMMENT LE CHOISIR PARMI LES N_SAMPLES ?
        ON POURRAIT PEUT-ETRE REGARDER LA MEILLEURE TRAJECTOIRE ET STEP A LA FIN DE LA BOUCLE N_STEPS
        """
        batch_observation = torch.Tensor(np.array([observation.numpy() for _ in range(N_SAMPLE)]))
        action, logp, ent = agent.get_action_and_value(batch_observation)

        reward = env.compute_rewards(action.cpu().numpy())

        actions[:, step]   = action
        log_probs[:, step] = logp
        entropies[:, step] = ent
        rewards[:, step]   = torch.Tensor(reward)
        # next_observations[i, step] = torch.tensor(next_observation, dtype=torch.float32)

    # 3) Calcul des avantages normalisés
    print(f"Calculating advantages...")
    for step in range(N_STEPS):
        r = rewards[:, step]
        advantages[:, step] = (r - r.mean()) / (r.std() + 1e-8)

    # 4) Boucle d'apprentissage sur plusieurs époques
    print(f"Training...")
    for epoch in range(1, NUM_EPOCH + 1):
        # Recalcul des log-probs sous la policy courante
        new_log_probs = []
        for i in range(N_SAMPLE):
            _ , new_log_probs_item, _  = agent.get_action_and_value(next_observations[i, :], actions[i, :])
            new_log_probs.append(new_log_probs_item)

        new_log_probs = torch.stack(new_log_probs, dim = 0)

        # Calcul du ratio et objectif clipped
        ratios = torch.exp(new_log_probs - log_probs)
        clipped = torch.clamp(ratios, 1 - EPSILON, 1 + EPSILON)
        obj = torch.min(ratios * advantages, clipped * advantages)
        
        # Pénalité KL
        with torch.no_grad():
            ratio_kl = torch.exp(log_probs - new_log_probs)
        kl_loss = (ratio_kl - torch.log(ratio_kl) - 1).mean()


        loss_term = (obj + BETA * kl_loss).mean(dim = 0)

        loss = loss_term.mean()
        # Mise à jour
        torch.autograd.set_detect_anomaly(True)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        print(f"Iter {iteration} / Epoch {epoch} - Loss π: {loss.item():.4f}, KL: {kl_loss.item():.4f}")

    # 5) Mise à jour de la policy de référence
    policy_old.load_state_dict(agent.state_dict())

Observation shape torch.Size([1, 84, 84])
Calculating advantages...
Training...
Iter 1 / Epoch 1 - Loss π: 1.1160, KL: 1.1160


  File "/home/florenthervers/miniconda3/envs/car_racing_env/lib/python3.10/runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/home/florenthervers/miniconda3/envs/car_racing_env/lib/python3.10/runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "/home/florenthervers/miniconda3/envs/car_racing_env/lib/python3.10/site-packages/ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "/home/florenthervers/miniconda3/envs/car_racing_env/lib/python3.10/site-packages/traitlets/config/application.py", line 1075, in launch_instance
    app.start()
  File "/home/florenthervers/miniconda3/envs/car_racing_env/lib/python3.10/site-packages/ipykernel/kernelapp.py", line 739, in start
    self.io_loop.start()
  File "/home/florenthervers/miniconda3/envs/car_racing_env/lib/python3.10/site-packages/tornado/platform/asyncio.py", line 205, in start
    self.asyncio_loop.run_forever()
  File "/home/florenthervers/mini

RuntimeError: Trying to backward through the graph a second time (or directly access saved tensors after they have already been freed). Saved intermediate values of the graph are freed when you call .backward() or autograd.grad(). Specify retain_graph=True if you need to backward through the graph a second time or if you need to access saved tensors after calling backward.

In [None]:
import time
import numpy as np
import torch
import torch.optim as optim
import gymnasium as gym
from gymnasium.vector import SyncVectorEnv
from gymnasium.wrappers import ResizeObservation, GrayscaleObservation, FrameStackObservation


LEARNING_RATE = 3e-4
N_ITERATIONS  = 100
N_STEPS       = 128
G             = 8           # group size (formerly N_SAMPLE)
EPSILON       = 0.2
BETA          = 1.0
DEVICE        = torch.device("cuda" if torch.cuda.is_available() else "cpu")


def make_vec_env():
    """
    Create G parallel CarRacing-v3 envs wrapped with preprocessing and reward predictor.
    """
    def make_one_environment():
        env = gym.make(
            "CarRacing-v3",
            continuous=True,
            lap_complete_percent=0.95,
            domain_randomize=False,
            render_mode="rgb_array"
        )
        # Apply your custom reward predictor wrapper
        env = CarRacingRewardPredictor(env)
        # Resize, grayscale, and frame-stack as in your original code
        env = ResizeObservation(env, (84, 84))
        env = GrayscaleObservation(env)
        env = FrameStackObservation(env, 1)
        return env

    return SyncVectorEnv([make_one_environment for _ in range(G)])


# Create vectorized environments
envs = make_vec_env()
obs = envs.reset()[0]                     # returns (obs, infos)
obs = torch.tensor(obs, dtype=torch.float32).to(DEVICE)  # shape: (G,84,84)

# Initialize agent and old policy
agent      = Agent(envs).to(DEVICE)
policy_old = Agent(envs).to(DEVICE)
policy_old.load_state_dict(agent.state_dict())

optimizer = optim.Adam(agent.parameters(), lr=LEARNING_RATE)
start_time = time.time()

for iteration in range(1, N_ITERATIONS + 1):
    # === 1) Collect G trajectories of length N_STEPS ===
    all_obs      = []
    all_actions  = []
    all_logp     = []
    all_rewards  = []

    for t in range(N_STEPS):
        # Sample actions and log-probabilities under current policy
        action, logp, _ = agent.get_action_and_value(obs)

        next_obs, reward, terminations, truncations, info = envs.step(action.cpu().numpy())
        done = np.logical_or(terminations, truncations)
        next_obs = torch.tensor(next_obs, dtype=torch.float32).to(DEVICE)

        all_obs.append(obs)
        all_actions.append(action)
        all_logp.append(logp)
        all_rewards.append(torch.tensor(reward, dtype=torch.float32, device=DEVICE))

        obs = next_obs

    # Stack into tensors of shape (G, T, ...)
    obs_batch      = torch.stack(all_obs,    dim=1)  # (G, T, obs...)
    actions_batch  = torch.stack(all_actions,dim=1)  # (G, T, action_dim)
    logp_old_batch = torch.stack(all_logp,   dim=1)  # (G, T)
    rewards_batch  = torch.stack(all_rewards,dim=1)  # (G, T)

    # === 2) Compute group-relative advantages ===
    # 2a) Full returns per trajectory (no discount)
    returns = rewards_batch.sum(dim=1)             # (G,)

    # 2b) Normalize returns across group
    mean_R = returns.mean()
    std_R  = returns.std(unbiased=False) + 1e-8
    adv    = (returns - mean_R) / std_R            # (G,)

    # 2c) Broadcast to shape (G, T)
    advantages = adv.unsqueeze(1).expand(-1, N_STEPS)

    # === 3) Compute surrogate with integrated forward-KL ===
    # Recompute new log-probs under current policy
    _, new_logp, _ = agent.get_action_and_value(obs_batch.view(-1, *obs_batch.shape[2:]),
                                    actions_batch.view(-1, actions_batch.shape[-1]))
    new_logp = new_logp.view(G, N_STEPS)

    ratio       = torch.exp(new_logp - logp_old_batch)
    clipped_rat = torch.clamp(ratio, 1 - EPSILON, 1 + EPSILON)

    # Surrogate terms
    sur1 = ratio       * advantages
    sur2 = clipped_rat * advantages
    # Forward-KL approx per token
    kl_per_token = new_logp - logp_old_batch

    # Combine surrogate and KL inside the same sum
    surrogate_with_kl = torch.min(sur1, sur2) + BETA * kl_per_token

    # Actor loss: average first over time, then over group (equivalently .mean())
    loss_pi = -surrogate_with_kl.mean()

    # === 4) Update policy ===
    optimizer.zero_grad()
    loss_pi.backward()
    optimizer.step()
    policy_old.load_state_dict(agent.state_dict())

    # Logging
    print(f"Iter {iteration}/{N_ITERATIONS} - loss_pi: {loss_pi.item():.4f}")

duration = time.time() - start_time
print(f"Training completed in {duration:.1f}s")

Iter 1/10 - loss_pi: -0.0000
Iter 2/10 - loss_pi: 0.0000
Iter 3/10 - loss_pi: -0.9896
Iter 4/10 - loss_pi: -0.9896
Iter 5/10 - loss_pi: -0.9896
Iter 6/10 - loss_pi: -0.9896
Iter 7/10 - loss_pi: -0.9896
Iter 8/10 - loss_pi: 0.0000
Iter 9/10 - loss_pi: -0.9896
Iter 10/10 - loss_pi: -0.9896
Training completed in 126.7s
