## Reinforcement Learning Project: Highway-Env

Dieses Projekt ist in Anlehnung an die Implementierungen von [Edouard Leurent](https://github.com/Farama-Foundation/HighwayEnv/blob/master/scripts/sb3_highway_dqn.ipynb) sowie [Sakshay Mahna](https://www.kaggle.com/code/sakshaymahna/highway-dqn/notebook) entstanden.


In [1]:
# Install ffmpeg through apt. The recorded output shows that `apt` could not be
# found on the Windows machine this notebook was last run on, so this cell only
# succeeds on systems that ship apt (e.g. Debian/Ubuntu).
# NOTE(review): ffmpeg is presumably needed to write the animation videos - confirm.
!apt install -y ffmpeg

Der Befehl "apt" ist entweder falsch geschrieben oder
konnte nicht gefunden werden.


In [4]:
!pip install rich pyvirtualdisplay

Collecting pyvirtualdisplay
  Downloading PyVirtualDisplay-3.0-py3-none-any.whl (15 kB)
Installing collected packages: pyvirtualdisplay
Successfully installed pyvirtualdisplay-3.0



[notice] A new release of pip available: 22.2.2 -> 23.2.1
[notice] To update, run: C:\Users\Tobia\AppData\Local\Programs\Python\Python39\python.exe -m pip install --upgrade pip


### 1. Import of Libraries

In [8]:
# vollständiger Import
import highway_env

# Alias Import
import gymnasium as gym

# Spezifischer Import
from IPython import display
from matplotlib import pyplot as plt, animation
from pathlib import Path
from pyvirtualdisplay import Display
from stable_baselines3 import DQN, PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.callbacks import CheckpointCallback, EvalCallback

In [6]:
# Render matplotlib figures inline in the notebook output
%matplotlib inline

In [11]:
# Register the TensorBoard notebook extension. It provides the `%tensorboard`
# magic used in the evaluation section, so this cell has to run before those
# cells (the recorded "magic function not found" error there is what happens otherwise).
%load_ext tensorboard

### 2. Definition of Constants and Paths
#### a) Constants

In [3]:
# Environment settings: the id handed to gym.make() and the render mode name
ENVIRONMENT_NAME: str = "highway-v0"
ENVIRONMENT_RENDER_MODE: str = "rgb_array"

In [9]:
# Model settings
BASE_MODEL_POLICY: str = "MlpPolicy"  # policy name passed to the DQN/PPO constructors
BASE_MODEL_LIB: str = "stable_baselines3"  # path component of the log/video/model directories
DQN_MODEL_TYPE: str = "dqn"  # model-type key, also used as a path component
PPO_MODEL_TYPE: str = "ppo"  # model-type key, also used as a path component

# Version tags; used as the last component of each model's save directory
DQN_MODEL_VERSION: str = "v3"
PPO_MODEL_VERSION: str = "v1"

In [20]:
# Training settings
TOTAL_TRAINING_STEPS: int = 30000  # total_timesteps passed to model.learn()
SAVE_MODEL_FREQUENZ: int = 3000  # save_freq handed to CheckpointCallback
EVAL_MODEL_FREQUENZ: int = 1500  # eval_freq handed to EvalCallback

#### b) Paths

In [13]:
# General directory layout. The working directory of the notebook is taken as the
# starting point; its parent is treated as the repository root and all log, video
# and model directories hang off that root.
CURRENT_FILE_DIR_PATH = Path.cwd()
REPOSITORY_PATH = CURRENT_FILE_DIR_PATH.parent
LOG_BASE_PATH = REPOSITORY_PATH / "logs" / BASE_MODEL_LIB
VIDEOS_BASE_PATH = REPOSITORY_PATH / "videos" / BASE_MODEL_LIB
MODELS_BASE_PATH = REPOSITORY_PATH / "highway_models"

In [12]:
# DQN-specific log, video and model directories
LOG_DQN_PATH = LOG_BASE_PATH / DQN_MODEL_TYPE
VIDEOS_DQN_PATH = VIDEOS_BASE_PATH / DQN_MODEL_TYPE
MODELS_DQN_PATH = MODELS_BASE_PATH / DQN_MODEL_TYPE / BASE_MODEL_LIB / DQN_MODEL_VERSION

In [14]:
# PPO-specific log, video and model directories
LOG_PPO_PATH = LOG_BASE_PATH / PPO_MODEL_TYPE
VIDEOS_PPO_PATH = VIDEOS_BASE_PATH / PPO_MODEL_TYPE
MODELS_PPO_PATH = MODELS_BASE_PATH / PPO_MODEL_TYPE / BASE_MODEL_LIB / PPO_MODEL_VERSION

### 2. Definition of Objects and Functions
#### a) Objects

In [9]:
# Create and start a pyvirtualdisplay Display object (invisible, 1024x768),
# presumably so that rendering works on a machine without a screen.
# NOTE(review): this rebinds the name `display`, which the import cell bound to
# IPython's `display` module; later `display.<...>` calls will reach this Display
# object instead - confirm that this is intended.
# NOTE(review): the recorded run failed here with FileNotFoundError on Windows,
# presumably because the display backend executable is missing there - verify.
display = Display(visible=0, size=(1024, 768))
display.start()

FileNotFoundError: [WinError 2] Das System kann die angegebene Datei nicht finden

In [21]:
# Model types and model versions in matching order (index 0 = DQN, index 1 = PPO).
# Annotated as list[str]; the previous `[str]` was a list literal, not a type.
LIST_MODEL_TYPE: list[str] = [DQN_MODEL_TYPE, PPO_MODEL_TYPE]
LIST_MODEL_VERSION: list[str] = [DQN_MODEL_VERSION, PPO_MODEL_VERSION]

In [22]:
# Per-model directories in the same order as LIST_MODEL_TYPE (DQN first, PPO second).
# The elements are pathlib.Path objects, so the lists are annotated as list[Path].
LIST_LOG_PATH: list[Path] = [LOG_DQN_PATH, LOG_PPO_PATH]
LIST_VIDEOS_PATH: list[Path] = [VIDEOS_DQN_PATH, VIDEOS_PPO_PATH]
LIST_MODEL_PATH: list[Path] = [MODELS_DQN_PATH, MODELS_PPO_PATH]

#### b) Functions

In [19]:
def prepare_env(
        gymnasium_env: gym.Env,
        log_dir_path: Path,
        print_info: bool = False) -> gym.Env:
    """
    Prepare a Gymnasium environment for training with Stable-Baselines3 (independent of the agent).

    The environment is wrapped in a ``Monitor`` that is given ``log_dir_path`` as its
    log location, and the monitored environment is then placed in a single-environment
    ``DummyVecEnv``.

    :param gymnasium_env: environment to wrap
    :param log_dir_path: path handed (as a string) to the ``Monitor`` wrapper
    :param print_info: if True, print the absolute log path before returning
    :return: the ``DummyVecEnv`` containing the monitored environment
    """
    # Monitor wrapper around the raw environment
    monitored_env = Monitor(gymnasium_env, str(log_dir_path))

    # Vectorise the environment; DummyVecEnv takes a list of factories
    vec_env = DummyVecEnv([lambda: monitored_env])

    if print_info:
        print(f'<Info> : Log will be written to the following dir {log_dir_path.absolute()}')

    # Always hand the environment back; printing the info must not swallow the result
    return vec_env

In [37]:
def train_multi_agents(
        base_policy: str,
        environment: gym.Env,
        eval_env: gym.Env,
        model_list: list[str],
        log_list: list[Path],
        save_path_list: list[Path],
        total_timestep: int,
        save_freq: int,
        eval_freq: int,
        log_info: bool,
        verbose: int = 1,
        process_bar: bool = True
):
    """
    Train one Stable-Baselines3 agent per entry of ``model_list``, one after another.

    For every model type the training environment is prepared with ``prepare_env``,
    the model is created, a ``CheckpointCallback`` and an ``EvalCallback`` are attached
    and ``learn`` is called. All agents share the same ``environment`` and ``eval_env``
    objects.

    :param base_policy: policy name passed to the model constructors
    :param environment: training environment, wrapped anew for every model
    :param eval_env: environment handed to the ``EvalCallback`` of every model
    :param model_list: model-type keys; supported values are ``'dqn'`` and ``'ppo'``
    :param log_list: per-model log directories (monitor, tensorboard and evaluation logs)
    :param save_path_list: per-model directories for checkpoints and the best model
    :param total_timestep: number of timesteps each model is trained for
    :param save_freq: ``save_freq`` of the ``CheckpointCallback``
    :param eval_freq: ``eval_freq`` of the ``EvalCallback``
    :param log_info: forwarded to ``prepare_env`` as ``print_info``
    :param verbose: verbosity level passed to the model constructors
    :param process_bar: whether ``learn`` shows a progress bar
    :return: list of the trained models, in the order of ``model_list``
    :raises ValueError: if the three lists differ in length or a model type is unknown;
        both checks run before any training starts
    """
    # Supported model-type keys and the model class each of them selects
    model_classes = {'dqn': DQN, 'ppo': PPO}

    # zip() would silently drop the surplus entries of unequal lists
    if not (len(model_list) == len(log_list) == len(save_path_list)):
        raise ValueError("model_list, log_list and save_path_list must have the same length")

    # Reject unknown model types up front, so a typo in a later entry does not
    # surface only after the earlier models have already been trained
    for model_type in model_list:
        if model_type not in model_classes:
            raise ValueError("Undefinierter Modell-Type")

    trained_models: list = []
    for model_type, log_path, save_path in zip(model_list, log_list, save_path_list):
        # Prepare the training environment for this model
        env = prepare_env(
            gymnasium_env=environment,
            log_dir_path=log_path,
            print_info=log_info
        )

        # Create the model
        model = model_classes[model_type](
            policy=base_policy,
            env=env,
            verbose=verbose,
            tensorboard_log=log_path
        )

        # Checkpoint callback: stores the model every `save_freq` calls
        checkpoint_callback = CheckpointCallback(
            save_freq=save_freq,
            save_path=save_path,
            name_prefix=f'model_{model_type}')

        # Evaluation callback: evaluates on `eval_env` every `eval_freq` calls
        eval_callback = EvalCallback(
            eval_env,
            best_model_save_path=save_path,
            log_path=log_path,
            eval_freq=eval_freq)

        # Train the model with both callbacks attached
        model.learn(
            total_timesteps=total_timestep,
            callback=[checkpoint_callback, eval_callback],
            progress_bar=process_bar
        )
        trained_models.append(model)
    return trained_models

In [None]:
# Kopiert von: https://www.kaggle.com/code/sakshaymahna/highway-dqn
def create_anim(frames, dpi, fps):
    """
    Build a matplotlib animation that plays ``frames`` as an image sequence.

    :param frames: non-empty sequence of image arrays; ``shape[0]`` / ``shape[1]`` of the
        first frame are used as height / width when sizing the figure
    :param dpi: figure resolution; the figure size in inches is the frame size divided by it
    :param fps: playback speed in frames per second, must be positive
    :return: the ``animation.FuncAnimation`` object
    :raises ValueError: if ``frames`` is empty or ``fps`` is not positive
    """
    if len(frames) == 0:
        raise ValueError("frames must contain at least one image")
    if fps <= 0:
        raise ValueError("fps must be positive")

    first_frame = frames[0]
    fig = plt.figure(figsize=(first_frame.shape[1] / dpi, first_frame.shape[0] / dpi), dpi=dpi)
    # Work on explicit figure/axes objects instead of pyplot's "current" ones, so the
    # animation is unaffected by figures created later
    ax = fig.add_subplot()
    patch = ax.imshow(first_frame)

    def setup():
        ax.axis('off')

    def animate(i):
        patch.set_data(frames[i])

    # FuncAnimation expects the delay between frames in milliseconds, not a frame rate
    interval_ms = max(1, round(1000 / fps))
    return animation.FuncAnimation(fig, animate, init_func=setup, frames=len(frames), interval=interval_ms)


def display_anim(frames, dpi=72, fps=50):
    """
    Build an animation from ``frames`` and return its ``to_jshtml()`` representation.

    :param frames: image frames, forwarded to ``create_anim``
    :param dpi: forwarded to ``create_anim``
    :param fps: forwarded to ``create_anim``
    :return: the value returned by the animation's ``to_jshtml()``
    """
    return create_anim(frames, dpi, fps).to_jshtml()


def save_anim(frames, filename, dpi=72, fps=50):
    """
    Build an animation from ``frames`` and write it to ``filename``.

    :param frames: image frames, forwarded to ``create_anim``
    :param filename: target passed to the animation's ``save`` method
    :param dpi: forwarded to ``create_anim``
    :param fps: forwarded to ``create_anim``
    """
    create_anim(frames, dpi, fps).save(filename)

### 3. Definition der Umgebungen
#### a) Trainingsumgebung

In [16]:
# Training environment: the environment the agents are trained on
train_env = gym.make(ENVIRONMENT_NAME)

  logger.warn(


#### b) Evaluierungsumgebung

In [17]:
# Evaluation environment: handed to the EvalCallback instances
eval_env = gym.make(ENVIRONMENT_NAME)

  logger.warn(


#### c) Visualisierungsumgebung

In [18]:
# Visualisation environment. It is created with the configured render mode so that
# rendering it yields image frames; without a render mode there is nothing to visualise.
vis_env = gym.make(ENVIRONMENT_NAME, render_mode=ENVIRONMENT_RENDER_MODE)

  logger.warn(


### 4. Definition von Callbacks
#### a) Checkpoints (Beispiel - DQN)

In [27]:
# Checkpoint callback for the DQN example: stores the model in the DQN model
# directory, using the configured save frequency
checkpoint_callback_dqn = CheckpointCallback(
    name_prefix="dqn_model",
    save_path=MODELS_DQN_PATH,
    save_freq=SAVE_MODEL_FREQUENZ,
)

#### b) Evaluation (Beispiel - DQN)

In [28]:
# Evaluation callback for the DQN example: evaluates on `eval_env` with the configured
# frequency, logs to the DQN log directory and keeps the best model in the model directory
eval_callback_dqn = EvalCallback(
    eval_env=eval_env,
    eval_freq=EVAL_MODEL_FREQUENZ,
    log_path=LOG_DQN_PATH,
    best_model_save_path=MODELS_DQN_PATH,
)

### 5. Training von Modellen
#### a) DQN - Beispiel

In [30]:
# Create the DQN model on the prepared (monitored and vectorised) training environment
model_dqn = DQN(
    policy=BASE_MODEL_POLICY,
    env=prepare_env(gymnasium_env=train_env, log_dir_path=LOG_DQN_PATH),
    tensorboard_log=LOG_DQN_PATH,
    verbose=1,
)

Using cpu device


In [None]:
# Train the DQN model with the checkpoint and evaluation callbacks attached
model_dqn.learn(
    callback=[checkpoint_callback_dqn, eval_callback_dqn],
    total_timesteps=TOTAL_TRAINING_STEPS,
    progress_bar=True,
)

#### b) PPO - Beispiel

In [31]:
# Create the PPO model on the prepared (monitored and vectorised) training environment
model_ppo = PPO(
    policy=BASE_MODEL_POLICY,
    env=prepare_env(gymnasium_env=train_env, log_dir_path=LOG_PPO_PATH),
    tensorboard_log=LOG_PPO_PATH,
    verbose=1,
)

Using cpu device


In [None]:
# Train the PPO model
model_ppo.learn(
    total_timesteps=TOTAL_TRAINING_STEPS,
    # Callbacks are disabled here: checkpoint_callback_ppo / eval_callback_ppo are not
    # defined in the visible part of this notebook (only the DQN variants are).
    #   callback=[checkpoint_callback_ppo, eval_callback_ppo],
    progress_bar=True)

#### c) Trainiere Multi-Agenten

In [38]:
# Train every configured model type in sequence and collect the trained agents
trained_agents = train_multi_agents(
    # what to train
    base_policy=BASE_MODEL_POLICY,
    model_list=LIST_MODEL_TYPE,
    # where to train and evaluate
    environment=train_env,
    eval_env=eval_env,
    # where to write logs and models
    log_list=LIST_LOG_PATH,
    save_path_list=LIST_MODEL_PATH,
    # schedule
    total_timestep=TOTAL_TRAINING_STEPS,
    save_freq=SAVE_MODEL_FREQUENZ,
    eval_freq=EVAL_MODEL_FREQUENZ,
    log_info=False,
)

Using cpu device
Logging to D:\DHBW\JetBrains\Hand-on-Reinforced-Learning\logs\stable_baselines3\dqn\DQN_1


Output()

----------------------------------
| rollout/            |          |
|    ep_len_mean      | 11       |
|    ep_rew_mean      | 8.27     |
|    exploration_rate | 0.986    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 1        |
|    time_elapsed     | 24       |
|    total_timesteps  | 44       |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 11.2     |
|    ep_rew_mean      | 8.68     |
|    exploration_rate | 0.972    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 1        |
|    time_elapsed     | 50       |
|    total_timesteps  | 90       |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 10       |
|    ep_rew_mean      | 7.63     |
|    exploration_rate | 0.962    |
| time/               |          |
|    episodes       

KeyboardInterrupt: 

### 6. Evaluation der Modelle

In [10]:
# Tensorboard DQN
%tensorboard --logdir {LOG_DQN_PATH.absolute()}

UsageError: Line magic function `%tensorboard` not found.


In [None]:
# Tensorboard PPO
%tensorboard --logdir {LOG_PPO_PATH.absolute()}