# LA PREMIÈRE CELLULE SERT À MONTRER COMMENT LES TRANSFORMATIONS FONCTIONNENT ET COMMENT L'ENVIRONNEMENT EST MODIFIÉ

In [None]:
import gymnasium as gym
from sb3_contrib import RecurrentPPO  # PPO récurrent (LSTM)
from stable_baselines3.common.vec_env import DummyVecEnv, VecFrameStack
from gymnasium.wrappers import ResizeObservation, GrayscaleObservation
import torch.nn as nn


# référence pour le dictionnaire d'hyperparamètre et des transformations: https://huggingface.co/sb3/ppo_lstm-CarRacing-v0

"""
J'utilise DummyVecEnv et VecFrameStack car le wrapper FrameStackObservation provoque un bug ; ChatGPT a proposé cette alternative.
Voici sa réponse:
"Le problème vient du fait que Stable-Baselines3 (SB3) et ses algorithmes (y compris RecurrentPPO) s'attendent généralement à un environnement 
vectorisé et utilisent habituellement la classe VecFrameStack pour empiler les observations, plutôt qu'un wrapper Gymnasium standard comme FrameStackObservation

SB3 fonctionne de façon optimale avec un VecEnv (environnement vectorisé), par exemple DummyVecEnv ou SubprocVecEnv

L'empilement d'images se fait alors via VecFrameStack (fournie par SB3) plutôt qu'un wrapper Gymnasium 
(qui n'est pas forcément compatible avec la vérification d'espace d'observation qu'effectue SB3)"


=> C'est simplement l'implémentation des environnements parallèles de Stable-Baselines3 (au lieu de celle de Gymnasium utilisée dans le TP)
"""

def make_env():
    """Return a zero-argument factory that builds one preprocessed CarRacing env.

    The factory creates "CarRacing-v3" with continuous actions and
    ``rgb_array`` rendering, resizes observations to 64x64 and converts them
    to grayscale with ``keep_dim=True``.

    Returns:
        A callable taking no arguments and returning the wrapped environment.
    """
    def _init():
        raw_env = gym.make("CarRacing-v3", continuous=True, render_mode="rgb_array")
        resized_env = ResizeObservation(raw_env, (64, 64))
        return GrayscaleObservation(resized_env, keep_dim=True)

    return _init

# Walk through the preprocessing pipeline on a single environment and print
# the observation space after each transformation.
print("Loading CarRacing-v3 environment")
env = gym.make("CarRacing-v3", continuous=True, render_mode="rgb_array")
print("Observation space size:", env.observation_space.shape)
env = ResizeObservation(env, (64, 64))  # 64 is the value taken from the reference dict
print("Observation space resize:", env.observation_space.shape)
print("Observation space:", env.observation_space)
env = GrayscaleObservation(env, keep_dim=True)
print("Observation space after gray scaling:", env.observation_space)
# Release the single demo environment before `env` is rebound to the
# vectorized environment below; otherwise it is never closed.
env.close()

# Same pipeline, built through make_env() for several environments, with the
# last two frames stacked by VecFrameStack.
n_envs = 8
env = DummyVecEnv([make_env() for _ in range(n_envs)])
env = VecFrameStack(env, n_stack=2)
print(env.observation_space)

Loading CarRacing-v3 environment
Observation space size: (96, 96, 3)
Observation space resize: (64, 64, 3)
Observation space: Box(0, 255, (64, 64, 3), uint8)
Observation space after gray scaling: Box(0, 255, (64, 64, 1), uint8)
Observation space size after frame stacking: Box(0, 255, (64, 64, 1), uint8)
Box(0, 255, (64, 64, 2), uint8)


In [None]:
import gymnasium as gym
from sb3_contrib import RecurrentPPO  # PPO récurrent (LSTM)
from stable_baselines3.common.vec_env import DummyVecEnv, VecFrameStack, VecNormalize
from gymnasium.wrappers import ResizeObservation, GrayscaleObservation
import torch.nn as nn
import wandb
from wandb.integration.sb3 import WandbCallback

# référence pour le dictionnaire d'hyperparamètre et des transformations: https://huggingface.co/sb3/ppo_lstm-CarRacing-v0

"""
DANS LE DICT IL Y A ENCORE

https://rl-baselines3-zoo.readthedocs.io/en/master/guide/config.html

('learning_rate', 'lin_1e-4') => decrease linéaire du lr je suppose (cfr TP5)
('normalize', "{'norm_obs': False, 'norm_reward': True}") => Normalization des reward mais pas des observations
('normalize_kwargs', {'norm_obs': False, 'norm_reward': False})] => à investiguer : c'est étrange, car pas cohérent avec le champ précédent
"""

# Policy keyword arguments taken from the reference hyperparameter dict
policy_kwargs = dict(
    log_std_init=-2,
    ortho_init=False,
    enable_critic_lstm=False,
    activation_fn=nn.GELU,
    lstm_hidden_size=128,
)

# From https://github.com/DLR-RM/rl-baselines3-zoo/blob/325ef5dafe46e483ce9d727d2851aff18b70db7c/rl_zoo3/utils.py#L295
def linear_schedule(progress_remaining, init_lr):
    """Scale the initial learning rate by the remaining-progress factor.

    Args:
        progress_remaining: Multiplier applied to ``init_lr``
            (presumably the fraction of training left, as supplied by the
            training loop -- confirm against the caller).
        init_lr: Initial learning rate.

    Returns:
        The product ``progress_remaining * init_lr``.
    """
    scaled_lr = progress_remaining * init_lr
    return scaled_lr

# Total number of training timesteps; presumably corresponds to n_timesteps
# in the reference dict (n_timesteps = 4e6 there).
total_timesteps = 2_000_000  # reference value: 4000000.0

# Hyperparameters taken from the reference dict
hyperparams = dict(
    batch_size=128,
    clip_range=0.2,
    ent_coef=0.0,
    gae_lambda=0.95,
    gamma=0.99,
    # Learning rate scaled from 1e-4 by linear_schedule
    learning_rate=lambda progress: linear_schedule(progress, 1e-4),
    max_grad_norm=0.5,
    n_epochs=10,
    n_steps=512,
    sde_sample_freq=4,
    use_sde=True,
    vf_coef=0.5,
    policy_kwargs=policy_kwargs,
)

# Start the Weights & Biases run that tracks this training session.
run = wandb.init(
    name="First_full_test",
    project="RL Project",
    entity="Rl2025-project",
    config=hyperparams,
    # Auto-upload the videos of agents playing the game
    monitor_gym=True,
    # Auto-upload SB3's tensorboard metrics
    sync_tensorboard=True,
    # Optional: save_code=True is currently left disabled
)

def make_env():
    """Return a zero-argument factory for one monitored, preprocessed CarRacing env.

    The factory creates "CarRacing-v3" (continuous actions,
    ``lap_complete_percent=0.95``, no domain randomization, ``rgb_array``
    rendering), wraps it in ``Monitor``, resizes observations to 64x64 and
    converts them to grayscale with ``keep_dim=True``.

    Returns:
        A callable taking no arguments and returning the wrapped environment.
    """
    def _init():
        # Imported locally because this cell's import block does not bring
        # Monitor into scope.
        from stable_baselines3.common.monitor import Monitor

        # Training uses "rgb_array" because it is cheaper to compute; the
        # "human" mode displays something on screen.
        env = gym.make(
            "CarRacing-v3",
            continuous=True,
            lap_complete_percent=0.95,
            domain_randomize=False,
            render_mode="rgb_array",
        )
        # Record per-episode return and length. SB3 does not add Monitor
        # itself when it receives an already-built VecEnv, and without it
        # rollout/ep_rew_mean and rollout/ep_len_mean are never logged.
        env = Monitor(env)
        env = ResizeObservation(env, (64, 64))
        env = GrayscaleObservation(env, keep_dim=True)
        return env

    return _init

# Build the vectorized training environment: n_envs copies, two stacked
# frames, rewards normalized but observations left untouched.
print("Loading CarRacing-v3 environment")
n_envs = 8
env = VecFrameStack(DummyVecEnv([make_env() for _ in range(n_envs)]), n_stack=2)
env = VecNormalize(env, norm_obs=False, norm_reward=True)

# Instantiate RecurrentPPO with the 'CnnLstmPolicy' policy
print("Doing PPOLSTM")
model = RecurrentPPO(
    "CnnLstmPolicy",
    env,
    tensorboard_log=f"runs/{run.id}",
    **hyperparams,
)

# Train, reporting to Weights & Biases through the SB3 callback.
model.learn(total_timesteps=total_timesteps, callback=WandbCallback(verbose=2))

wandb.finish()
# Save the final model
model.save("q2_final")

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mflo230702[0m to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Loading CarRacing-v3 environment
Doing PPOLSTM


  return torch._C._cuda_getDeviceCount() > 0


Output()

In [1]:

import gymnasium as gym
from stable_baselines3 import PPO
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.vec_env import DummyVecEnv, VecVideoRecorder
import wandb
from wandb.integration.sb3 import WandbCallback


# Run configuration: policy class name, training budget and environment id.
config = dict(
    policy_type="MlpPolicy",
    total_timesteps=25000,
    env_name="CartPole-v1",
)
# Start the Weights & Biases run for this experiment.
run = wandb.init(
    config=config,
    project="RL Project",
    # Auto-upload the videos of agents playing the game
    monitor_gym=True,
    # Auto-upload SB3's tensorboard metrics
    sync_tensorboard=True,
)


def make_env():
    """Create the environment named by ``config["env_name"]``, wrapped in ``Monitor``.

    Returns:
        The ``Monitor``-wrapped environment.
    """
    # Monitor records stats such as returns
    return Monitor(gym.make(config["env_name"]))


# Single-environment vectorized wrapper around make_env.
env = DummyVecEnv([make_env])

# Build the PPO model; tensorboard logs go under a directory named after the
# wandb run id.
model = PPO(
    config["policy_type"],
    env,
    verbose=1,
    tensorboard_log=f"runs/{run.id}",
)

# Train with the Weights & Biases callback attached.
model.learn(
    total_timesteps=config["total_timesteps"],
    callback=WandbCallback(verbose=2, gradient_save_freq=100),
)

# Close the wandb run.
run.finish()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mflo230702[0m to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


  return torch._C._cuda_getDeviceCount() > 0


Using cpu device
Logging to runs/f7vigkl3/PPO_1
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 22.5     |
|    ep_rew_mean     | 22.5     |
| time/              |          |
|    fps             | 1313     |
|    iterations      | 1        |
|    time_elapsed    | 1        |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 27.9        |
|    ep_rew_mean          | 27.9        |
| time/                   |             |
|    fps                  | 967         |
|    iterations           | 2           |
|    time_elapsed         | 4           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.008340905 |
|    clip_fraction        | 0.111       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.686      |
|    explained_variance 

0,1
global_step,▁▂▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▇▇▇▇▇▇▇███
rollout/ep_len_mean,▁▁▁▂▂▃▄▄▅▆▆▇█
rollout/ep_rew_mean,▁▁▁▂▂▃▄▄▅▆▆▇█
time/fps,█▄▃▂▂▂▂▁▁▁▁▁▁
train/approx_kl,▆▇▆█▆▃▄▃▅▁▁█
train/clip_fraction,█▅▅▆▄▂▄▂▄▁▁▅
train/clip_range,▁▁▁▁▁▁▁▁▁▁▁▁
train/entropy_loss,▁▂▃▄▅▅▅▆▆▇▇█
train/explained_variance,▁▂▃▄▄▆▇▅▇▁▇█
train/learning_rate,▁▁▁▁▁▁▁▁▁▁▁▁

0,1
global_step,26624.0
rollout/ep_len_mean,203.95
rollout/ep_rew_mean,203.95
time/fps,687.0
train/approx_kl,0.01
train/clip_fraction,0.06587
train/clip_range,0.2
train/entropy_loss,-0.50097
train/explained_variance,0.8627
train/learning_rate,0.0003
