# SimpleSim Non-Holonomic Navigation Challenge

This notebook attempts to train an agent solve a simplesim non-holonomic driving navigation problem with 1 target with random spawn location.
This time, however we are teaching the agent to dwell at the goal as well instead of simply ending the episode.


## Install Dependencies and Stable Baselines3 Using Pip

In [1]:
# !pip install "stable-baselines3[extra]>=2.0.0a4"

### Setup Tensorboard Logging

In [2]:
# # Clear any logs from previous runs
# !rm -rf ./logs/

# Load the TensorBoard notebook extension
%load_ext tensorboard

##  Custom Gym Envs

Below are a couple of simpler lower order gridworld type gym environments that can be used as testing and debugging examples ,as well as our main SimpleSIm non-holonomic driving environment (which is imported from the seperate source files env.py and env_gym.py)

In [3]:
import numpy as np
import gymnasium as gym
from gymnasium import spaces

In [4]:
# Import our main environment
from env_gym import SimpleSimGym

### Validate the environment

Stable Baselines3 provides a [helper](https://stable-baselines3.readthedocs.io/en/master/common/env_checker.html) to check that your environment follows the Gym interface. It also optionally checks that the environment is compatible with Stable-Baselines (and emits warning if necessary).

In [5]:
from stable_baselines3.common.env_checker import check_env

In [6]:
env = SimpleSimGym(max_budget=500, max_targets=3, num_classes=10, player_fov=60)

# If the environment don't follow the interface, an error will be thrown
check_env(env, warn=True)

## Setup Callbacks

### Auto Saving of the Best Model Callback

Using the monitoring wrapper, we can save statistics of the environment, and use them to determine the mean training reward. This allows us to save the best model while training.

In [7]:
import os
import numpy as np
from stable_baselines3.common.callbacks import BaseCallback
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.results_plotter import load_results, ts2xy

In [8]:
class SaveOnBestTrainingRewardCallback(BaseCallback):
    """
    Callback for saving a model (the check is done every ``check_freq`` steps)
    based on the training reward (in practice, we recommend using ``EvalCallback``).

    :param check_freq: (int)
    :param log_dir: (str) Path to the folder where the model will be saved.
      It must contains the file created by the ``Monitor`` wrapper.
    :param verbose: (int)
    """

    def __init__(self, check_freq, log_dir, save_dir, model_name, verbose=1):
    # def __init__(self, check_freq, log_dir, save_dir, filename, verbose=1):
        super().__init__(verbose)
        self.check_freq = check_freq
        self.log_dir = log_dir + model_name
        self.save_path = os.path.join(save_dir, f"best_{model_name}")
        self.best_mean_reward = -np.inf

    def _init_callback(self) -> None:
        # Create folder if needed
        if self.log_dir is not None:
            os.makedirs(self.log_dir, exist_ok=True)

    def _on_step(self) -> bool:
        if self.n_calls % self.check_freq == 0:

            # Retrieve training reward
            x, y = ts2xy(load_results(self.log_dir), "timesteps")
            if len(x) > 0:
                # Mean training reward over the last 100 episodes
                mean_reward = np.mean(y[-100:])
                if self.verbose > 0:
                    print("Num timesteps: {}".format(self.num_timesteps))
                    print(
                        "Best mean reward: {:.2f} - Last mean reward per episode: {:.2f}".format(
                            self.best_mean_reward, mean_reward
                        )
                    )

                # New best model, you could save the agent here
                if mean_reward > self.best_mean_reward:
                    self.best_mean_reward = mean_reward
                    # Example for saving best model
                    # if self.verbose > 0:
                    print(f"Saving new best model at {x[-1]} timesteps (saved to {self.save_path}) (reward={mean_reward})")
                    self.model.save(self.save_path)

        return True

### Baseline Testing the Environment

Test the performance of an untrained (random) policy on the environment so that we can get a baseline performance to compare to.

In [9]:
# obs, _ = env.reset()
# env.render()

# print(env.observation_space)
# print(env.action_space)
# print(env.action_space.sample())

#   for step in range(n_steps):
#       env.observation_space.sample()
#       obs, reward, terminated, truncated, info = env.step(action)
#       done = terminated or truncated
#       print("obs=", obs, "reward=", reward, "done=", done)
#       env.render()
#       # Then, go down
#       print(f"Step {step + 1}")
#       obs, reward, terminated, truncated, info = env.step(GO_DOWN)
#       done = terminated or truncated
#       print("obs=", obs, "reward=", reward, "done=", done)
#       env.render()

#       if done:
#           print("Goal reached!", "reward=", reward)
#           break

## Train The Model

In [10]:
from stable_baselines3 import PPO, SAC, DQN
from stable_baselines3.common.env_util import make_vec_env

# Environment Parameters
MAX_BUDGET = 400
MAX_TARGETS = 5
NUM_CLASSES = 10
PLAYER_FOV = 30
RENDER_MODE = "rgb_array"
ACTION_FORMAT = "continuous"

config = {
    "policy": 'MlpPolicy',
    "total_timesteps": 2_000_000,
    "logdir": "logs/",
    "savedir": "saved_models/",
}

# Create log dir
os.makedirs(config["logdir"], exist_ok=True)

# Create save dir
os.makedirs(config["savedir"], exist_ok=True)

# Show Tensorboard Logs
Visualise the live logs on tensorboard as we train

In [11]:
# Open Tensorboard Logging
%tensorboard --logdir logs/ --reload_multifile True --reload_interval 30 --port 6008

### Train DQN

In [12]:
# Instantiate and wrap the env
env_dqn = make_vec_env(SimpleSimGym, 
                   n_envs=1, 
                   monitor_dir=config["logdir"]+"dqn", 
                   env_kwargs=dict(
                       max_budget=MAX_BUDGET, 
                       max_targets=MAX_TARGETS, 
                       num_classes=NUM_CLASSES, 
                       player_fov=PLAYER_FOV, 
                       render_mode=RENDER_MODE, 
                       action_format="discrete"))

# Setup callbacks
auto_save_callback = SaveOnBestTrainingRewardCallback(check_freq=1000, log_dir=config["logdir"], save_dir=config["savedir"], model_name="dqn", verbose=0)

# # Load the model from checkpoint 6M
# model_dqn = DQN.load(f"{config['savedir']}/MlpPolicy_DQN_step8000000", env_dqn)
# Create the agent
model_dqn = DQN(config["policy"], env_dqn, tensorboard_log=config["logdir"], verbose=0)

# Train in tranches
times_trained = 0
num_tranches = 10 * (1+times_trained)
# SHOULD BE: range(10 * times_trained, num_tranches):
for i in range(10 * times_trained, num_tranches+1):
    print(f"^ Tranch {i}/{num_tranches}")
    model_dqn.learn(config["total_timesteps"]//num_tranches, tb_log_name="DQN", callback=auto_save_callback, progress_bar=True, reset_num_timesteps=False)
    model_dqn.save(f"{config['savedir']}/{config['policy']}_DQN_step{i * (config['total_timesteps']//num_tranches)}")

Output()

^ Tranch 0/10


Output()

^ Tranch 1/10


Output()

^ Tranch 2/10


Output()

^ Tranch 3/10


Output()

^ Tranch 4/10


Output()

^ Tranch 5/10


Output()

^ Tranch 6/10


Output()

^ Tranch 7/10


Output()

^ Tranch 8/10


Output()

^ Tranch 9/10


Output()

^ Tranch 10/10


### Train PPO

In [13]:
# Instantiate and wrap the env
env_ppo = make_vec_env(SimpleSimGym, 
                   n_envs=1, 
                   monitor_dir=config["logdir"]+"ppo", 
                   env_kwargs=dict(
                       max_budget=MAX_BUDGET, 
                       max_targets=MAX_TARGETS, 
                       num_classes=NUM_CLASSES, 
                       player_fov=PLAYER_FOV, 
                       render_mode=RENDER_MODE, 
                       action_format=ACTION_FORMAT))

# Setup callbacks
auto_save_callback = SaveOnBestTrainingRewardCallback(check_freq=1000, log_dir=config["logdir"], save_dir=config["savedir"], model_name="ppo", verbose=0)

# # Load the model from checkpoint 6M
# model_ppo = PPO.load(f"{config['savedir']}/MlpPolicy_PPO_step8000000", env_ppo)
# Create the agent
model_ppo = PPO(config["policy"], env_ppo, tensorboard_log=config["logdir"], verbose=0)

# Train in tranches
times_trained = 0
num_tranches = 10 * (1+times_trained)
# SHOULD BE: range(10 * times_trained, num_tranches):
for i in range(10 * times_trained, num_tranches+1):
    print(f"^ Tranch {i}/{num_tranches}")
    model_ppo.learn(config["total_timesteps"]//num_tranches, tb_log_name="PPO", callback=auto_save_callback, progress_bar=True, reset_num_timesteps=False)
    model_ppo.save(f"{config['savedir']}/{config['policy']}_PPO_step{i * (config['total_timesteps']//num_tranches)}")

Output()

^ Tranch 0/10


Output()

^ Tranch 1/10


Output()

^ Tranch 2/10


Output()

^ Tranch 3/10


Output()

^ Tranch 4/10


Output()

^ Tranch 5/10


Output()

^ Tranch 6/10


Output()

^ Tranch 7/10


Output()

^ Tranch 8/10


Output()

^ Tranch 9/10


Output()

^ Tranch 10/10


### Train SAC

In [12]:
# Instantiate and wrap the env
env_sac = make_vec_env(SimpleSimGym, 
                   n_envs=1, 
                   monitor_dir=config["logdir"]+"sac", 
                   env_kwargs=dict(
                       max_budget=MAX_BUDGET, 
                       max_targets=MAX_TARGETS, 
                       num_classes=NUM_CLASSES, 
                       player_fov=PLAYER_FOV, 
                       render_mode=RENDER_MODE, 
                       action_format=ACTION_FORMAT))

# Setup callbacks
auto_save_callback = SaveOnBestTrainingRewardCallback(check_freq=1000, log_dir=config["logdir"], save_dir=config["savedir"], model_name="sac", verbose=0)

# # Load the model from checkpoint 6M
# model_sac = SAC.load(f"{config['savedir']}/MlpPolicy_SAC_step8000000", env_sac)
# Create the agent
model_sac = SAC(config["policy"], env_sac, tensorboard_log=config["logdir"], verbose=0)

# Train in tranches
times_trained = 0
num_tranches = 10 * (1+times_trained)
# SHOULD BE: range(10 * times_trained, num_tranches):
for i in range(10 * times_trained, num_tranches+1):
    print(f"^ Tranch {i}/{num_tranches}")
    if i == 10 * times_trained:
        model_sac.learn(config["total_timesteps"]//num_tranches, tb_log_name="SAC", callback=auto_save_callback, progress_bar=True, reset_num_timesteps=True)
    else:
        model_sac.learn(config["total_timesteps"]//num_tranches, tb_log_name="SAC", callback=auto_save_callback, progress_bar=True, reset_num_timesteps=False)
    model_sac.save(f"{config['savedir']}/{config['policy']}_SAC_step{i * (config['total_timesteps']//num_tranches)}")

Output()

^ Tranch 0/10


Output()

^ Tranch 1/10


Output()

^ Tranch 2/10


Output()

^ Tranch 3/10


Output()

^ Tranch 4/10


Output()

^ Tranch 5/10


Output()

^ Tranch 6/10


Output()

^ Tranch 7/10


Output()

^ Tranch 8/10


Output()

^ Tranch 9/10


Output()

^ Tranch 10/10


### Continue Training or Run Dupliacte Experiments?

This cell can be used to either continue training on an existing model (use `reset_num_timesteps=False`) or to run additional duplicate experiments training from scratch to test training consistency

In [13]:
# model_sac.learn(config["total_timesteps"], tb_log_name="SAC", callback=auto_save_callback, progress_bar=True, reset_num_timesteps=False)
# model_sac.save(f"{config['savedir']}/{config['policy']}_SAC_pt2")

### Load Model?

In [14]:
# Load the best model
# model_sac = SAC.load(f"{config['savedir']}/MlpPolicy_SAC_step4000000")
# model_ppo = PPO.load(f"{config['savedir']}/MlpPolicy_PPO_step4000000")

# best_sac = SAC.load(f"{config['savedir']}/best_sac")
# best_ppo = PPO.load(f"{config['savedir']}/best_ppo")

### Check Performance

Check if the policy can consistently succeed in the environment over multilpe episodes.

In [15]:
# # Instantiate the eval env
# eval_env = make_vec_env(SimpleSimGym, 
#                    n_envs=1, 
#                    monitor_dir=config["logdir"], 
#                    env_kwargs=dict(
#                        max_budget=MAX_BUDGET, 
#                        max_targets=MAX_TARGETS, 
#                        num_classes=NUM_CLASSES, 
#                        player_fov=PLAYER_FOV, 
#                        render_mode=RENDER_MODE, 
#                        action_format=ACTION_FORMAT
#                    )
#                   )

# from stable_baselines3.common.evaluation import evaluate_policy

# # Check performance of best vs last model
# models = {"last_sac": model_sac, "best_sac": best_sac}#, "last_ppo": model_ppo, "best_ppo": best_ppo}
# # models = {"best_sac": best_sac, "best_ppo": best_ppo}

# for key in models.keys():
#     # Reset the eval env
#     eval_env.reset()
#     # Test average reward over multiple episodes
#     mean_reward, std_reward = evaluate_policy(models[key], eval_env, n_eval_episodes=50)
#     print(f"MODEL TYPE: {key}")
#     print(f"mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}\n")

### Prepare Video Recording

In [16]:
# Set up fake display; otherwise rendering will fail
import os
os.system("Xvfb :1 -screen 0 1024x768x24 &")
os.environ['DISPLAY'] = ':1'

import base64
from pathlib import Path

from IPython import display as ipythondisplay


def show_videos(video_path="", prefix=""):
    """
    Taken from https://github.com/eleurent/highway-env

    :param video_path: (str) Path to the folder containing videos
    :param prefix: (str) Filter the video, showing only the only starting with this prefix
    """
    html = []
    for mp4 in Path(video_path).glob("{}*.mp4".format(prefix)):
        video_b64 = base64.b64encode(mp4.read_bytes())
        html.append(
            """<video alt="{}" autoplay
                    loop controls style="height: 400px;">
                    <source src="data:video/mp4;base64,{}" type="video/mp4" />
                </video>""".format(
                mp4, video_b64.decode("ascii")
            )
        )
    ipythondisplay.display(ipythondisplay.HTML(data="<br>".join(html)))


from stable_baselines3.common.vec_env import VecVideoRecorder #, DummyVecEnv

# # Create videos dir
# videos_dir = "./videos/"
# os.makedirs(videos_dir, exist_ok=True)

def record_video(eval_env, model, video_length=500, prefix="", video_folder="videos/"):
    """
    :param eval_env: (vec env)
    :param model: (RL model)
    :param video_length: (int)
    :param prefix: (str)
    :param video_folder: (str)
    """
    # eval_env = DummyVecEnv([lambda: gym.make(env_id, render_mode="rgb_array")])
    # Start the video at step=0 and record 500 steps
    eval_env = VecVideoRecorder(
        eval_env,
        video_folder=video_folder,
        record_video_trigger=lambda step: step == 0,
        video_length=video_length,
        name_prefix=prefix,
    )

    obs = eval_env.reset()
    
    for _ in range(video_length):
        action, _ = model.predict(obs)
        obs, _, _, _ = eval_env.step(action)

    # Close the video recorder
    eval_env.close()

### Visualize Trained Agent with Video


In [17]:
# record_video(eval_env, model_sac, video_length=500*3, prefix="sac-last-simplesim")
# show_videos("videos", prefix="sac-last")

# record_video(eval_env, best_sac, video_length=500*3, prefix="sac-best-simplesim")
# show_videos("videos", prefix="sac-best")

In [18]:
# record_video(eval_env, model_ppo, video_length=500*3, prefix="ppo-last-simplesim")
# show_videos("videos", prefix="ppo-last")

# record_video(eval_env, best_ppo, video_length=500*3, prefix="ppo-best-simplesim")
# show_videos("videos", prefix="ppo-best")