# Toys
The goal is to experiment with the capabilities of RL and to document each useful functionality.
This code aims to try:
- different environments
- different agents
- different hardware
- monitoring learning
- evaluating and comparing
- completing a table with the KPIs

## Dependencies
- [Gymnasium](https://gymnasium.farama.org/): Environments
- stable-baselines3: Agents

In [None]:
# Install the notebook's dependencies: Gymnasium (with the box2d extra) and
# Stable-Baselines3.
!pip install gymnasium[box2d]
!pip install stable-baselines3

In [1]:
import gymnasium as gym
from gymnasium.wrappers import HumanRendering

import stable_baselines3
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.results_plotter import load_results, ts2xy, plot_results
from stable_baselines3.common.noise import NormalActionNoise
from stable_baselines3.common.callbacks import BaseCallback, CallbackList
from stable_baselines3.common.logger import HParam
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3 import DQN
from stable_baselines3 import A2C


from matplotlib import pyplot as plt
import numpy as np
import os
import shutil
import torch


from tqdm import tqdm

## Device

In [2]:
# The device is pinned to the CPU. To use a GPU when one is available, use
# torch.device("cuda" if torch.cuda.is_available() else "cpu") instead.
device = torch.device("cpu")
print(f"Using device: {device}")

Using device: cpu


## Setup agent

In [3]:
# Run identifier; the save path and both log locations below are derived
# from it. Change it to start a separate experiment.
name = 'a2c_lunar_v1'

# Policy network type given to the agent: 'MlpPolicy' or 'CnnPolicy'.
policy = 'MlpPolicy'

# Verbosity level given to the agent constructor.
verbose = 0

# Location the agent is saved to and loaded from.
path = f"models/{name}"

# Directory given to the environment Monitor wrapper.
log_dir = f"./logs/{name}/"

# TensorBoard output directory and the run name used inside it.
tensorboard_log = f"./t_logs/{name}/"
tb_log_name = 'runs'

## Monitoring

In [4]:
class TensorboardCallback(BaseCallback):
    """Record the model's exploration rate and learning rate on every step.

    ``exploration_rate`` is only recorded when the model defines it. Some
    algorithms (e.g. the A2C model trained in this notebook) have no such
    attribute, and reading it unconditionally would raise ``AttributeError``.
    """

    def __init__(self, verbose=0):
        super().__init__(verbose)

    def _on_step(self):
        """Log the current values; always returns True so training continues."""
        # Read defensively: not every algorithm exposes an exploration rate.
        exploration_rate = getattr(self.model, "exploration_rate", None)
        if exploration_rate is not None:
            self.logger.record("exploration_rate", exploration_rate)

        learning_rate = self.model.learning_rate
        self.logger.record("learning_rate", learning_rate)
        return True
    
class HParamCallback(BaseCallback):
    """Record the model's hyperparameters once, when training starts."""

    def _on_training_start(self):
        """Build an ``HParam`` record from the model and hand it to the logger."""
        model = self.model

        # Hyperparameters read from the model being trained.
        hyperparameters = {
            "algorithm": model.__class__.__name__,
            "learning rate": model.learning_rate,
            "gamma": model.gamma,
        }

        # Metric tags passed alongside the hyperparameters, with placeholder
        # values of zero.
        tracked_metrics = {
            "rollout/ep_len_mean": 0,
            "train/value_loss": 0.0,
        }

        # The record is excluded from the stdout, log, json and csv outputs.
        skipped_outputs = ("stdout", "log", "json", "csv")
        self.logger.record(
            "hparams",
            HParam(hyperparameters, tracked_metrics),
            exclude=skipped_outputs,
        )

    def _on_step(self):
        """Nothing to do per step; returning True keeps training running."""
        return True
    
class SaveOnBestTrainingRewardCallback(BaseCallback):
    """Save the model whenever the mean training reward reaches a new best.

    Every ``check_freq`` calls, the monitor results found in ``log_dir`` are
    loaded and the mean reward of the last 100 episodes is compared with the
    best value seen so far. On improvement the model is saved to ``path``.

    Args:
        check_freq: Number of callback calls between two checks.
        log_dir: Directory the monitor results are loaded from.
        path: Destination passed to ``model.save``.
        verbose: Progress messages are printed when >= 1.
    """

    def __init__(self, check_freq: int, log_dir: str, path: str, verbose=1):
        super().__init__(verbose)
        self.check_freq = check_freq
        self.log_dir = log_dir
        self.save_path = path
        # Start below any possible reward so the first check always saves.
        self.best_mean_reward = -np.inf

    def _on_step(self):
        """Run the periodic check; always returns True so training continues."""
        # Only act on every check_freq-th call.
        if self.n_calls % self.check_freq != 0:
            return True

        # Retrieve the training rewards recorded so far.
        timesteps, rewards = ts2xy(load_results(self.log_dir), "timesteps")
        if len(timesteps) == 0:
            return True

        # Mean training reward over the last 100 episodes.
        mean_reward = np.mean(rewards[-100:])
        should_print = self.verbose >= 1
        if should_print:
            print(f"Num timesteps: {self.num_timesteps}")
            print(f"Best mean reward: {self.best_mean_reward:.2f} - Last mean reward per episode: {mean_reward:.2f}")

        # New best model: remember the score and save the agent.
        if mean_reward > self.best_mean_reward:
            self.best_mean_reward = mean_reward
            if should_print:
                print(f"Saving new best model to {self.save_path}")
            self.model.save(self.save_path)

        return True
        
# Callbacks passed to model.learn(). Every entry is currently commented out,
# so the list is empty; uncomment the ones to enable.
callbacks = CallbackList([
    #TensorboardCallback(),
    #HParamCallback(),
    #SaveOnBestTrainingRewardCallback(1e4,log_dir,path)
])



## Environments

### Setup environment

In [9]:
# Environment to create: 'LunarLander-v2' or 'CartPole-v1'.
env_id = 'LunarLander-v2'

# Step limit passed to gym.make as max_episode_steps.
max_episode_steps = 10000

# Render mode passed to gym.make: 'human' or 'rgb_array'.
render_mode = 'rgb_array'

# Number of environments in the vectorized environment.
num_envs = 4

# Vectorization mode: 'async' or 'sync'.
vectorization_mode = 'async'

### Make single environment

In [15]:
# Build a single (non-vectorized) environment; the step limit is passed by
# keyword so the call reads the same as the one in make_env.
env = gym.make(
    env_id,
    max_episode_steps=max_episode_steps,
    render_mode=render_mode,
)

### Make vectorized environment

In [20]:
def make_env(rank=None):
    """Create one environment wrapped in a Monitor.

    Args:
        rank: Optional index of the environment inside a vectorized
            environment. When given, the Monitor is pointed at
            ``<log_dir>/<rank>`` so each sub-environment records to its own
            file instead of all of them sharing one path. When ``None``
            (the default), the Monitor is given ``log_dir`` itself, exactly
            as before.

    Returns:
        The monitored environment.
    """
    env1 = gym.make(env_id, max_episode_steps=max_episode_steps, render_mode=render_mode)
    if rank is None:
        monitor_path = f"{log_dir}"
    else:
        monitor_path = os.path.join(log_dir, str(rank))
    env2 = Monitor(env1, monitor_path)
    return env2


# ``rank=rank`` binds the loop value at definition time so every factory
# keeps its own index (a plain closure would see only the last value).
env = DummyVecEnv([lambda rank=rank: make_env(rank) for rank in range(num_envs)])

### Debug

In [9]:
# Show which environment object `env` currently refers to.
print(env)

<stable_baselines3.common.vec_env.dummy_vec_env.DummyVecEnv object at 0x7c7766884700>


## Agents

### Create agent

In [7]:
# Build a fresh agent on the current environment and store it right away so
# the later cells can load it from `path`.
# Algorithm choices: DQN or A2C.
model = A2C(policy, env, verbose=verbose, device=device, tensorboard_log=tensorboard_log)

model.save(path)

### Load agent

In [11]:

# Restore the saved agent and attach it to the current environment.
# Algorithm choices: DQN or A2C.
model = A2C.load(
    path,
    env=env,
    device=device,
    tensorboard_log=tensorboard_log,
)

### Train agent

In [12]:
# Step budget for this training run; learn() takes an integer count.
total_timesteps = 10_000

# Only make sure the monitor directory exists. It is deliberately not wiped
# here: the Monitor wrappers inside `env` were created before this cell and
# already hold files in log_dir, so deleting the directory would discard the
# very logs this run is about to produce.
os.makedirs(log_dir, exist_ok=True)

# Reload the saved agent on the same device as the rest of the notebook.
# Algorithm choices: DQN or A2C.
model = A2C.load(path, env, tensorboard_log=tensorboard_log, device=device)

# reset_num_timesteps=False continues the timestep count of the loaded agent
# instead of starting again from zero.
model.learn(
    total_timesteps,
    progress_bar=True,
    reset_num_timesteps=False,
    tb_log_name=tb_log_name,
    callback=callbacks,
)

model.save(path)

Output()

### Monitor 

In [None]:
!tensorboard --logdir ./logs/tesorboard_log_{name}

### Evaluate agent

In [26]:
# Use a dedicated evaluation environment. Its Monitor gets no file name, so
# evaluation episodes are tracked in memory only and never written into the
# training log directory.
eval_env = DummyVecEnv([
    lambda: Monitor(
        gym.make(env_id, max_episode_steps=max_episode_steps, render_mode=render_mode)
    )
])

# Number of episodes to evaluate over.
n_eval_episodes = 10

mean_reward, std_reward = evaluate_policy(model, eval_env, n_eval_episodes)

print(f"mean_reward={mean_reward:.2f} +/- {std_reward}")

mean_reward=185.92 +/- 52.03781275543837


### Display agent

In [13]:

# Separate environment with on-screen rendering (an alternative would be
# wrapping an existing environment in HumanRendering).
display_env = gym.make(env_id, max_episode_steps=max_episode_steps, render_mode='human')

# Load the saved agent on the same device as the rest of the notebook.
# Algorithm choices: DQN or A2C.
model = A2C.load(path, device=device)

# Play one episode; the environment is closed even if the loop is
# interrupted, so its resources are always released.
try:
    obs, info = display_env.reset()

    while True:
        action, _states = model.predict(obs)
        obs, reward, terminated, truncated, info = display_env.step(action)
        if terminated or truncated:
            break
finally:
    display_env.close()

