In [1]:
import copy
import pickle
import time

import gymnasium as gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions import Normal

In [31]:
# FROM TP5
def layer_init(layer, std=np.sqrt(2), bias_const=0.0):
    """Initialise a layer in place with orthogonal weights and a constant bias.

    Args:
        layer: Module exposing a ``weight`` tensor and, optionally, a ``bias``
            tensor (e.g. ``nn.Linear``).
        std: Gain handed to ``torch.nn.init.orthogonal_``.
        bias_const: Value written into every bias entry.

    Returns:
        The very same ``layer`` object, so the call can be nested directly
        inside ``nn.Sequential(...)``.
    """
    torch.nn.init.orthogonal_(layer.weight, std)
    # Layers created with ``bias=False`` carry ``bias = None``; skip them
    # instead of crashing inside ``constant_``.
    if getattr(layer, "bias", None) is not None:
        torch.nn.init.constant_(layer.bias, bias_const)
    return layer

class Agent(nn.Module):
    """Diagonal-Gaussian MLP policy for a continuous action space.

    The actor network maps a flattened observation to the mean of a Normal
    distribution; the standard deviation is a state-independent learnable
    parameter (``log_std``). Sampled actions are squashed by
    ``transform_action`` before being returned to the caller.
    """

    def __init__(self, env):
        """Build the actor from the environment's observation/action spaces.

        Args:
            env: Object exposing ``observation_space.shape`` and
                ``action_space.shape`` (a Gymnasium environment in this
                notebook).
        """
        super().__init__()

        # Size of one observation once flattened; kept on the instance so
        # ``get_dist`` can flatten batches without guessing the batch size.
        self.obs_dim = int(np.prod(env.observation_space.shape))
        action_dim = env.action_space.shape[0]

        self.actor = nn.Sequential(
            layer_init(nn.Linear(self.obs_dim, 64)),
            nn.Tanh(),
            layer_init(nn.Linear(64, 64)),
            nn.Tanh(),
            # Small gain on the output layer keeps the initial means near zero.
            layer_init(nn.Linear(64, action_dim), std=0.01),
        )

        # log(std) of each action dimension; zeros => std = 1 at start.
        self.log_std = nn.Parameter(torch.zeros(action_dim))

    def transform_action(self, action):
        """Squash a raw Gaussian sample into the action-space bounds.

        Component 0 (steering) goes through ``tanh`` -> [-1, 1]; any further
        components (gas, brake) go through ``sigmoid`` -> [0, 1]. The input
        tensor is not modified.
        """
        transformed = action.clone()
        # Steering
        transformed[..., 0] = torch.tanh(transformed[..., 0])
        # Gas and brake, when present
        if transformed.shape[-1] > 1:
            transformed[..., 1:] = torch.sigmoid(transformed[..., 1:])
        return transformed

    def get_dist(self, obs):
        """Return the Normal action distribution for ``obs``.

        Args:
            obs: Tensor that is either already flat (1-D or 2-D, passed to the
                actor unchanged) or has more than two dimensions, in which
                case it is flattened to ``(batch, obs_dim)``. A single
                un-batched image therefore becomes a batch of one.

        Returns:
            ``torch.distributions.Normal`` whose mean comes from the actor and
            whose std is ``exp(log_std)`` broadcast to the mean's shape.
        """
        if obs.dim() > 2:
            # Flatten per observation rather than collapsing everything into a
            # single row, so batches of multi-dimensional observations work.
            # ``reshape`` also accepts non-contiguous inputs.
            obs = obs.reshape(-1, self.obs_dim)

        mean = self.actor(obs)
        std = torch.exp(self.log_std).expand_as(mean)

        return Normal(mean, std)

    def get_action(self, obs, action=None):
        """Sample (or evaluate) an action and return it squashed.

        Args:
            obs: Observation tensor, see ``get_dist``.
            action: Optional RAW (pre-squash) action to evaluate. When omitted
                a fresh action is sampled from the distribution.

        Returns:
            Tuple ``(squashed_action, log_prob, entropy)``. ``log_prob`` and
            ``entropy`` are summed over action dimensions and refer to the raw
            Gaussian sample, not to the squashed action that is returned --
            do not feed the returned action back in through ``action``.
        """
        dist = self.get_dist(obs)

        if action is None:
            action = dist.sample()

        # Sum over the action dimensions (independent components).
        log_prob = dist.log_prob(action).sum(dim=-1)
        entropy = dist.entropy().sum(dim=-1)

        transformed_action = self.transform_action(action)

        return transformed_action.squeeze(), log_prob, entropy


# ANCIEN

# NEW

In [None]:
# Work-in-progress GRPO draft: collects one group of sampled actions for the
# first step only (see the two `break`s) and computes group-normalised
# advantages. No policy update is performed here yet.
LEARNING_RATE = 3e-4
NUM_EPOCH = 10
N_ITERATION = 10
N_STEPS = 128
N_SAMPLE = 64
BATCH_SIZE = 64
EPSILON = 0.2
BETA = 1.0

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

env = gym.make(
    "CarRacing-v3",
    continuous=True,
    lap_complete_percent=0.95,
    domain_randomize=False,
    render_mode="rgb_array",
)

print("Observation space", env.observation_space.shape)
print("Action space", env.action_space.shape)

# GRPO algorithm: https://www.youtube.com/watch?v=YCawyzAOg1Y
# Inspired from: https://gist.github.com/infoslack/f0e0aec9a882c2c76e1dc1bdd510f279
# Inspired from: https://github.com/XinJingHao/DRL-Pytorch/tree/main
agent = Agent(env).to(device)
optimizer = optim.Adam(agent.parameters(), lr=LEARNING_RATE)

# Rollout buffers indexed as [sample, step]; allocated directly on `device`.
actions = torch.zeros(N_SAMPLE, N_STEPS, env.action_space.shape[0], device=device)
log_probs = torch.zeros(N_SAMPLE, N_STEPS, device=device)
entropies = torch.zeros(N_SAMPLE, N_STEPS, device=device)
rewards = torch.zeros(N_SAMPLE, N_STEPS, device=device)
next_observations = torch.zeros((N_SAMPLE, N_STEPS) + env.observation_space.shape, device=device)
dones = torch.zeros(N_SAMPLE, N_STEPS, device=device)
advantages = torch.zeros(N_SAMPLE, N_STEPS, device=device)

global_step = 0
start_time = time.time()
observation, info = env.reset()
observation = torch.tensor(observation, dtype=torch.float32).to(device)
print("Observation shape", observation.shape)

# `iteration` rather than `iter`: the latter would shadow the builtin for the
# rest of the kernel session.
for iteration in range(1, N_ITERATION + 1):
    print("New iteration")
    for step in range(0, N_STEPS):
        global_step += 1

        with torch.no_grad():
            # Sample N_SAMPLE actions => there is no critic model any more.
            # NOTE: `observation` is never reassigned inside this loop, so
            # every action is sampled from the initial observation while the
            # single shared environment keeps advancing underneath.
            for i in range(N_SAMPLE):
                action, log_prob, entropy = agent.get_action(observation)
                next_observation, reward, terminated, truncated, info = env.step(action.cpu().numpy())
                done = terminated or truncated

                rewards[i, step] = reward
                next_observations[i, step] = torch.tensor(next_observation, dtype=torch.float32)
                dones[i, step] = done
                actions[i, step] = action
                log_probs[i, step] = log_prob
                entropies[i, step] = entropy

                if done:
                    # Never step an environment whose episode has ended.
                    env.reset()

        mean_rewards = rewards[:, step].mean()

        std_rewards = rewards[:, step].std() + 1e-8

        # Group-relative advantage: A_i = (R_i - mean(R)) / std(R)
        advantages[:, step] = (rewards[:, step] - mean_rewards) / std_rewards
        break
    break

    # TODO: now loop over the epochs, compute the ratios / clipped ratios / KL etc., then update the model
        


Observation space (96, 96, 3)
Action space (3,)


NameError: name 'Agent' is not defined

In [None]:
# GRPO training loop (critic-free): for every iteration a *group* of
# N_SAMPLE rollouts is collected from one common start state, rewards are
# normalised across the group to obtain advantages, and the policy is updated
# with a clipped surrogate objective plus a KL penalty towards the policy that
# generated the rollouts.
LEARNING_RATE = 3e-4
N_ITERATION   = 10
N_STEPS       = 128
N_SAMPLE      = 64
NUM_EPOCH     = 10
EPSILON       = 0.2
BETA          = 1.0
SEED          = 0    # base seed; iteration k resets the env with SEED + k

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

env = gym.make(
    "CarRacing-v3",
    continuous=True,
    lap_complete_percent=0.95,
    domain_randomize=False,
    render_mode="rgb_array"
)

# GRPO algorithm: https://www.youtube.com/watch?v=YCawyzAOg1Y
# Inspired from: https://gist.github.com/infoslack/f0e0aec9a882c2c76e1dc1bdd510f279
# Inspired from: https://github.com/XinJingHao/DRL-Pytorch/tree/main
agent      = Agent(env).to(device)
policy_old = Agent(env).to(device)
policy_old.load_state_dict(agent.state_dict())  # initial copy
optimizer  = optim.Adam(agent.parameters(), lr=LEARNING_RATE)

obs_dim    = int(np.prod(env.observation_space.shape))
action_dim = env.action_space.shape[0]

# Rollout buffers indexed as [sample, step].
# `observations` holds the (flattened) state each action was sampled FROM and
# `actions` holds the RAW Gaussian sample, so that log-probabilities can be
# re-evaluated consistently during the update.
observations = torch.zeros(N_SAMPLE, N_STEPS, obs_dim, device=device)
actions      = torch.zeros(N_SAMPLE, N_STEPS, action_dim, device=device)
log_probs    = torch.zeros(N_SAMPLE, N_STEPS, device=device)
entropies    = torch.zeros(N_SAMPLE, N_STEPS, device=device)
rewards      = torch.zeros(N_SAMPLE, N_STEPS, device=device)
dones        = torch.zeros(N_SAMPLE, N_STEPS, device=device)
advantages   = torch.zeros(N_SAMPLE, N_STEPS, device=device)
valid        = torch.zeros(N_SAMPLE, N_STEPS, device=device)  # 1 where a step was really taken

global_step = 0
start_time = time.time()
observation, info = env.reset()
observation = torch.tensor(observation, dtype=torch.float32).to(device)
print("Observation shape", observation.shape)

# --- Main loop
for iteration in range(1, N_ITERATION + 1):
    # 2) Collect the group of rollouts.
    # A single environment cannot be forked, so every member of the group is
    # started from the same seeded reset instead: identical start state,
    # different sampled actions.
    for buffer in (observations, actions, log_probs, entropies, rewards, dones, advantages, valid):
        buffer.zero_()

    # No graph is needed while sampling; the stored log-probs are constants
    # in the ratio computed during the update.
    with torch.no_grad():
        for i in range(N_SAMPLE):
            observation, _ = env.reset(seed=SEED + iteration)
            for step in range(N_STEPS):
                global_step += 1
                obs_flat = torch.as_tensor(observation, dtype=torch.float32, device=device).reshape(1, -1)

                dist = policy_old.get_dist(obs_flat)
                raw_action = dist.sample()
                # The environment receives the squashed action; the buffers
                # keep the raw one that the log-prob refers to.
                env_action = policy_old.transform_action(raw_action).squeeze(0)

                observation, reward, terminated, truncated, _ = env.step(env_action.cpu().numpy())
                done = terminated or truncated

                observations[i, step] = obs_flat.squeeze(0)
                actions[i, step]      = raw_action.squeeze(0)
                log_probs[i, step]    = dist.log_prob(raw_action).sum(dim=-1).squeeze(0)
                entropies[i, step]    = dist.entropy().sum(dim=-1).squeeze(0)
                rewards[i, step]      = reward
                dones[i, step]        = done
                valid[i, step]        = 1.0

                if done:
                    # Remaining slots of this rollout stay zero and are masked out.
                    break

    # 3) Group-normalised advantages: A[i, t] = (R[i, t] - mean_i R[:, t]) / std_i R[:, t]
    print(f"Calculating advantages...")
    group_mean = rewards.mean(dim=0, keepdim=True)
    group_std  = rewards.std(dim=0, keepdim=True)
    advantages.copy_((rewards - group_mean) / (group_std + 1e-8) * valid)

    # 4) Several optimisation epochs over the collected group
    print(f"Training...")
    flat_obs     = observations.view(N_SAMPLE * N_STEPS, obs_dim)
    flat_actions = actions.view(N_SAMPLE * N_STEPS, action_dim)
    n_valid      = valid.sum().clamp(min=1.0)
    for epoch in range(1, NUM_EPOCH + 1):
        # Log-probs of the stored raw actions under the current policy,
        # evaluated in one batched forward pass.
        dist = agent.get_dist(flat_obs)
        new_log_probs = dist.log_prob(flat_actions).sum(dim=-1).view(N_SAMPLE, N_STEPS)

        # Probability ratio and clipped surrogate objective
        ratios = torch.exp(new_log_probs - log_probs)
        clipped = torch.clamp(ratios, 1 - EPSILON, 1 + EPSILON)
        obj = torch.min(ratios * advantages, clipped * advantages)
        loss_pi = -(obj * valid).sum() / n_valid

        # KL penalty towards the rollout policy (estimator r - log r - 1 with
        # r = pi_old / pi_new). Gradients must flow through it, otherwise the
        # BETA term has no effect on the update.
        log_ratio_ref = log_probs - new_log_probs
        kl = torch.exp(log_ratio_ref) - log_ratio_ref - 1
        kl_loss = (kl * valid).sum() / n_valid

        # Parameter update
        total_loss = loss_pi + BETA * kl_loss
        optimizer.zero_grad()
        total_loss.backward()
        optimizer.step()

        print(f"Iter {iteration} / Epoch {epoch} - Loss π: {loss_pi.item():.4f}, KL: {kl_loss.item():.4f}")

    # 5) Sync the rollout/reference policy with the freshly updated weights
    policy_old.load_state_dict(agent.state_dict())

Observation shape torch.Size([96, 96, 3])
Calculating advantages...
Training...


In [15]:
# Fresh CarRacing environment used for interactive inspection.
env = gym.make(
    "CarRacing-v3",
    continuous=True,
    lap_complete_percent=0.95,
    domain_randomize=False,
    render_mode="rgb_array",
)

In [16]:
# Display the class of the object returned by gym.make (the outermost wrapper).
type(env)

gymnasium.wrappers.common.TimeLimit

In [27]:
# Display the `car` attribute of the underlying (unwrapped) environment.
env.unwrapped.car

In [29]:
# List every attribute name available on the unwrapped environment.
dir(env.unwrapped)

['__annotations__',
 '__class__',
 '__class_getitem__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__enter__',
 '__eq__',
 '__exit__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__orig_bases__',
 '__parameters__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__slots__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_create_image_array',
 '_create_track',
 '_destroy',
 '_draw_colored_polygon',
 '_ezpickle_args',
 '_ezpickle_kwargs',
 '_init_colors',
 '_is_protocol',
 '_np_random',
 '_np_random_seed',
 '_reinit_colors',
 '_render',
 '_render_indicators',
 '_render_road',
 'action_space',
 'bg_color',
 'car',
 'clock',
 'close',
 'contactListener_keepref',
 'continuous',
 'domain_randomize',
 'fd_tile',
 'get_wrapper_attr',
 'grass_color',
 'has_wrapper_attr',
 'invisible_state_wind

In [None]:
# Experiment: duplicate an environment through a pickle round-trip.
# (`pickle` is imported with the other modules in the notebook's first cell.)
env1 = gym.make(
    "CarRacing-v3",
    continuous=True,
    lap_complete_percent=0.95,
    domain_randomize=False,
    render_mode="human",
)
# NOTE(review): the unwrapped env exposes `_ezpickle_args` / `_ezpickle_kwargs`,
# so unpickling presumably re-runs the constructor with the original arguments
# rather than copying the live simulation state -- verify that `env2` really
# mirrors `env1` before relying on it as a state snapshot.
env2 = pickle.loads(pickle.dumps(env1))

