### Importation des bibliothèques

In [17]:
# One-time setup: uncomment and run these lines to install the dependencies.
# (Inside a notebook, `%pip install ...` installs into the running kernel's environment.)
# !pip install highway-env
# !pip install git+https://github.com/DLR-RM/stable-baselines3

In [18]:
import warnings
warnings.filterwarnings("ignore", category=UserWarning, message=".*env.compute_reward to get variables from other wrappers is deprecated.*") #ignore warning
import numpy as np
from copy import deepcopy
import os

# Environment
import gymnasium as gym
from config3 import env

# Agent
from stable_baselines3 import HerReplayBuffer, SAC, PPO, A2C, DDPG, TD3

### Wrapper pour aplatir les observations de l'environnement

In [19]:
class FlattenObservation(gym.Wrapper):
    """Environment wrapper that provides an ``observation`` flattening hook.

    NOTE(review): this class derives from ``gym.Wrapper`` rather than
    ``gym.ObservationWrapper``; verify whether ``observation`` is ever invoked
    on ``reset``/``step``. The training cells use ``'MultiInputPolicy'``, which
    suggests observations reach the models un-flattened -- confirm before
    changing the base class.
    """

    def __init__(self, env: gym.Env):
        """Wrap ``env`` without any additional configuration."""
        super().__init__(env)

    def observation(self, observation: np.ndarray) -> np.ndarray:
        """Return ``observation`` collapsed with ``ndarray.flatten``."""
        flattened = observation.flatten()
        return flattened

# Entraînement de modèles d'apprentissage par renforcement pour la tâche de stationnement

In [20]:
# Training budget: passed as `total_timesteps` to `model.learn` in every training cell.
STEPS = 30_000
# Per-algorithm switches: set a flag to True to run the matching training cell below.
train_her_sac = False
train_sac = False
train_ppo = False
train_a2c = False
train_ddpg = False
train_td3 = False


### Modèle SAC

In [21]:
if train_sac:
    # Train on a deep copy so the shared `env` object is never stepped here.
    env_train = FlattenObservation(deepcopy(env))

    # Plain SAC (no custom replay buffer); hyper-parameters kept in one mapping.
    sac_params = dict(
        verbose=1,
        tensorboard_log="logs",
        buffer_size=int(1e6),
        learning_rate=1e-3,
        gamma=0.95,
        batch_size=1024,
        tau=0.05,
        learning_starts=1000,
        device="cpu",
        policy_kwargs={"net_arch": [512, 512, 512]},
    )
    model = SAC("MultiInputPolicy", env_train, **sac_params)

    # Run the configured number of steps, then write the model to disk.
    model.learn(total_timesteps=int(STEPS))
    model.save("parking_SAC/model")

### Modèle SAC et HER

In [22]:
if train_her_sac:
    # Train on a deep copy so the shared `env` object is never stepped here.
    env_train = FlattenObservation(deepcopy(env))

    # SAC combined with a HER replay buffer: 4 relabelled goals per transition,
    # drawn with the 'future' strategy.
    her_kwargs = {"n_sampled_goal": 4, "goal_selection_strategy": "future"}
    her_sac_params = dict(
        replay_buffer_class=HerReplayBuffer,
        replay_buffer_kwargs=her_kwargs,
        verbose=1,
        tensorboard_log="logs",
        buffer_size=int(1e6),
        learning_rate=1e-3,
        gamma=0.95,
        batch_size=1024,
        tau=0.05,
        learning_starts=1000,
        device="cpu",
        policy_kwargs={"net_arch": [512, 512, 512]},
    )
    model = SAC("MultiInputPolicy", env_train, **her_sac_params)

    # Run the configured number of steps, then write the model to disk.
    model.learn(total_timesteps=int(STEPS))
    model.save("parking_SAC_HER/model")

### Modèle PPO

In [23]:
if train_ppo:
    # Train on a deep copy so the shared `env` object is never stepped here.
    env_train = FlattenObservation(deepcopy(env))

    # On-policy PPO; hyper-parameters kept in one mapping.
    ppo_params = dict(
        verbose=1,
        tensorboard_log="logs",
        policy_kwargs={"net_arch": [512, 512, 512]},
        learning_rate=1e-3,
        gamma=0.95,
        n_steps=2048,
        ent_coef=0.0,
        vf_coef=0.5,
        max_grad_norm=0.5,
        batch_size=64,
        n_epochs=10,
        clip_range=0.2,
        device="cpu",
    )
    model = PPO("MultiInputPolicy", env_train, **ppo_params)

    # Run the configured number of steps, then write the model to disk.
    model.learn(total_timesteps=int(STEPS))
    model.save("parking_PPO/model")

### Modèle A2C

In [24]:
if train_a2c:
    # Train on a deep copy so the shared `env` object is never stepped here.
    env_train = FlattenObservation(deepcopy(env))

    # On-policy A2C; hyper-parameters kept in one mapping.
    # NOTE(review): device is "auto" here while the SAC/PPO/DDPG cells pin "cpu"
    # -- confirm this difference is intentional.
    a2c_params = dict(
        verbose=1,
        tensorboard_log="logs",
        policy_kwargs={"net_arch": [512, 512, 512]},
        device="auto",
        n_steps=5,
        ent_coef=0.01,
        learning_rate=1e-3,
        gamma=0.95,
        vf_coef=0.5,
        max_grad_norm=0.5,
    )
    model = A2C("MultiInputPolicy", env_train, **a2c_params)

    # Run the configured number of steps, then write the model to disk.
    model.learn(total_timesteps=int(STEPS))
    model.save("parking_A2C/model")

### Modèle DDPG

In [25]:
if train_ddpg:
    # Train on a deep copy so the shared `env` object is never stepped here.
    env_train = FlattenObservation(deepcopy(env))

    # Off-policy DDPG; hyper-parameters kept in one mapping.
    # NOTE(review): no `action_noise` is supplied -- presumably exploration then
    # relies only on the warm-up phase; confirm against the SB3 DDPG docs.
    ddpg_params = dict(
        verbose=1,
        tensorboard_log="logs",
        buffer_size=int(1e6),
        learning_rate=1e-3,
        gamma=0.95,
        batch_size=1024,
        tau=0.05,
        learning_starts=1000,
        device="cpu",
        policy_kwargs={"net_arch": [512, 512, 512]},
    )
    model = DDPG("MultiInputPolicy", env_train, **ddpg_params)

    # Run the configured number of steps, then write the model to disk.
    model.learn(total_timesteps=int(STEPS))
    model.save("parking_DDPG/model")

### Modèle TD3

In [26]:
if train_td3:
    # Train on a deep copy so the shared `env` object is never stepped here.
    env_train = FlattenObservation(deepcopy(env))

    # Off-policy TD3 with the policy updated every 2nd critic update.
    # NOTE(review): no `action_noise` is supplied and device is "auto" while
    # other cells pin "cpu" -- confirm both choices are intentional.
    td3_params = dict(
        verbose=1,
        tensorboard_log="logs",
        buffer_size=int(1e6),
        learning_rate=1e-3,
        gamma=0.95,
        batch_size=1024,
        tau=0.05,
        policy_delay=2,
        learning_starts=1000,
        device="auto",
        policy_kwargs={"net_arch": [512, 512, 512]},
    )
    model = TD3("MultiInputPolicy", env_train, **td3_params)

    # Run the configured number of steps, then write the model to disk.
    model.learn(total_timesteps=int(STEPS))
    model.save("parking_TD3/model")

# Entraînement du modèle SAC avec HER et différentes architectures de réseau

In [None]:
# One switch per network architecture to train with SAC + HER.
# Naming: `<layers>x<units>` (e.g. 3x512 -> [512, 512, 512]); `multi`/`multi2`
# use layers of differing widths (see the calls further down in this cell).
train_her_sac_3x512 = False
train_her_sac_3x256 = False
train_her_sac_3x128 = False
train_her_sac_2x512 = True
train_her_sac_2x256 = False
train_her_sac_2x128 = False
train_her_sac_4x256 = False
train_her_sac_4x128 = False
train_her_sac_multi = False
train_her_sac_multi2 = False

def train_her_sac_model(env, model_name, net_arch, total_timesteps=None):
    """Train a SAC agent with a HER replay buffer and save it to disk.

    Args:
        env: Environment to train on. Training runs on a deep copy of it.
        model_name: Suffix of the output directory; the model is written to
            ``parking_SAC_HER_{model_name}/model``.
        net_arch: Hidden-layer sizes, forwarded as ``policy_kwargs["net_arch"]``.
        total_timesteps: Number of training steps. When omitted, the
            notebook-level ``STEPS`` constant is used (the original behaviour).

    Returns:
        The trained ``SAC`` model. The environment copy used for training is
        closed before returning, so pass a fresh ``env=`` to ``SAC.load`` for
        further interaction.
    """
    # Resolve the budget up front so a missing/invalid value fails before any
    # environment or network is built.
    budget = int(STEPS if total_timesteps is None else total_timesteps)

    env_train = FlattenObservation(deepcopy(env))
    try:
        her_kwargs = dict(n_sampled_goal=4, goal_selection_strategy='future')
        model = SAC('MultiInputPolicy', env_train, replay_buffer_class=HerReplayBuffer,
            replay_buffer_kwargs=her_kwargs, verbose=1,
            tensorboard_log="logs",
            buffer_size=int(1e6),
            learning_rate=1e-3,
            gamma=0.95, batch_size=1024, tau=0.05,
            learning_starts=1000,
            device="cpu",
            policy_kwargs=dict(net_arch=net_arch))
        model.learn(total_timesteps=budget)
        model.save(f"parking_SAC_HER_{model_name}/model")
    finally:
        # Release the environment copy even if training or saving fails.
        env_train.close()
    return model

# (enabled flag, run name, hidden-layer sizes) for every architecture variant,
# listed in the order in which they are trained.
her_sac_variants = (
    (train_her_sac_3x512, "train_her_sac_3x512", [512, 512, 512]),
    (train_her_sac_3x256, "train_her_sac_3x256", [256, 256, 256]),
    (train_her_sac_3x128, "train_her_sac_3x128", [128, 128, 128]),
    (train_her_sac_2x512, "train_her_sac_2x512", [512, 512]),
    (train_her_sac_2x256, "train_her_sac_2x256", [256, 256]),
    (train_her_sac_2x128, "train_her_sac_2x128", [128, 128]),
    (train_her_sac_4x256, "train_her_sac_4x256", [256, 256, 256, 256]),
    (train_her_sac_4x128, "train_her_sac_4x128", [128, 128, 128, 128]),
    (train_her_sac_multi, "train_her_sac_multi", [128, 256, 128]),
    (train_her_sac_multi2, "train_her_sac_multi2", [256, 512, 256]),
)

# Train only the variants whose flag is switched on.
for is_enabled, run_name, hidden_layers in her_sac_variants:
    if is_enabled:
        train_her_sac_model(env, run_name, hidden_layers)

# Affichage et enregistrement de la voiture

### Affichage de la voiture se garant 

In [27]:
# Set to True to watch the trained SAC + HER agent over 10 episodes.
display_her_sac = False

if display_her_sac:
    env_display = FlattenObservation(deepcopy(env))
    try:
        model = SAC.load("parking_SAC_HER_train_her_sac_3x256/model", env=env_display)
        for i in range(10):
            obs, info = env_display.reset()
            done = truncated = False
            while not (done or truncated):
                # Evaluate the learned policy itself: use the deterministic
                # action instead of sampling from the policy distribution.
                action, _ = model.predict(obs, deterministic=True)
                obs, reward, done, truncated, info = env_display.step(action)
                env_display.render()

            # Episode index and whether the episode ended in success.
            print(i, info['is_success'])
    finally:
        # Always release the environment, even if loading or the rollout fails.
        env_display.close()


### Enregistrement des images des échecs, pour en connaître les causes

In [28]:
# Set to True to roll out 2000 episodes and save the last frame of each failure.
save_fails = False

import os
import cv2

# Create the folder that stores the images (no-op when it already exists).
os.makedirs('failed_images', exist_ok=True)

if save_fails:
    env_display = FlattenObservation(deepcopy(env))
    try:
        model = SAC.load("parking_SAC_HER_train_her_sac_3x256/model", env=env_display)
        for i in range(2000):
            obs, info = env_display.reset()
            done = truncated = False
            j = 0  # number of steps taken in the current episode
            while not (done or truncated):
                j += 1
                # Use the deterministic action so failures reflect the learned
                # policy rather than sampling noise.
                action, _ = model.predict(obs, deterministic=True)
                obs, reward, done, truncated, info = env_display.step(action)

            # Save the last frame when the episode did not succeed.
            if not info['is_success']:
                print(i, j, info['is_success'])
                last_image = env_display.render()
                # render() may return None (presumably when the environment is
                # not in an array-returning render mode) -- nothing to save then.
                if last_image is not None:
                    # Rendered frames are RGB, whereas cv2.imwrite expects BGR;
                    # convert so the saved colours are not swapped.
                    frame_bgr = cv2.cvtColor(last_image, cv2.COLOR_RGB2BGR)
                    cv2.imwrite(os.path.join('failed_images', f'image_{i}.png'), frame_bgr)
    finally:
        # Always release the environment, even if loading or the rollout fails.
        env_display.close()


# Affichage des statistiques d'entraînement avec TensorBoard

In [30]:
# Load the TensorBoard notebook extension and open the dashboard on the
# `logs` directory (the `tensorboard_log` target of the training cells).
%load_ext tensorboard
%tensorboard --logdir logs



# The dashboard can also be opened directly in a browser:
# http://localhost:6006

# Alternative (presumably the editor's command palette -- confirm):
# Ctrl/Cmd + Shift + P
# Python: Launch TensorBoard

# Reload the extension if it was already loaded in this kernel.
%reload_ext tensorboard

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


Reusing TensorBoard on port 6006 (pid 14490), started 21:38:02 ago. (Use '!kill 14490' to kill it.)