# SimpleSim Non-Holonomic Navigation Challenge

This notebook runs a comparison of learning algorithms, by training PPO and DQN models.

## Install Dependencies and Stable Baselines3 Using Pip

In [1]:
# !pip install "stable-baselines3[extra]>=2.0.0a4"

### Setup Tensorboard Logging

In [2]:
# # Clear any logs from previous runs
# !rm -rf ./logs/

# Load the TensorBoard notebook extension
%load_ext tensorboard

##  Custom Gym Envs

Below are a couple of simpler lower order gridworld type gym environments that can be used as testing and debugging examples ,as well as our main SimpleSIm non-holonomic driving environment (which is imported from the seperate source files env.py and env_gym.py)

In [3]:
import numpy as np
import gymnasium as gym
from gymnasium import spaces

# Import our main environment
from env_gym import SimpleSimGym

### Validate the environment

Stable Baselines3 provides a [helper](https://stable-baselines3.readthedocs.io/en/master/common/env_checker.html) to check that your environment follows the Gym interface. It also optionally checks that the environment is compatible with Stable-Baselines (and emits warning if necessary).

In [4]:
from stable_baselines3.common.env_checker import check_env

env = SimpleSimGym(max_budget=500, max_targets=3, num_classes=10, player_fov=60)

# If the environment don't follow the interface, an error will be thrown
check_env(env, warn=True)

Environment seed: 283


## Import Auto Saving of Best Model Callback

In [5]:
from utils import SaveOnBestTrainingRewardCallback

## Train The Model

In [6]:
from stable_baselines3 import PPO, SAC, DQN
from stable_baselines3.common.env_util import make_vec_env
import os

# Environment Parameters
MAX_BUDGET = 400
MAX_TARGETS = 5
NUM_CLASSES = 10
PLAYER_FOV = 30
RENDER_MODE = "rgb_array"
ACTION_FORMAT = "continuous"

config = {
    "policy": 'MlpPolicy',
    "total_timesteps": 4_000_000,
    "logdir": "logs/",
    "savedir": "saved_models/",
}

# Create log dir
os.makedirs(config["logdir"], exist_ok=True)

# Create save dir
os.makedirs(config["savedir"], exist_ok=True)

# Show Tensorboard Logs
Visualise the live logs on tensorboard as we train

In [7]:
# Open Tensorboard Logging
%tensorboard --logdir logs/ --reload_interval 30 --port=6008

ERROR: Failed to launch TensorBoard (exited with 1).
Contents of stderr:
TensorFlow installation not found - running with reduced feature set.
Address already in use
Port 6007 is in use by another program. Either identify and stop that program, or start the server with a different port.

### Train PPO

In [8]:
# Instantiate and wrap the env
env_ppo = make_vec_env(SimpleSimGym, 
                   n_envs=1, 
                   monitor_dir=config["logdir"]+"ppo", 
                   env_kwargs=dict(
                       max_budget=MAX_BUDGET, 
                       max_targets=MAX_TARGETS, 
                       num_classes=NUM_CLASSES, 
                       player_fov=PLAYER_FOV, 
                       render_mode=RENDER_MODE, 
                       action_format="continuous"))

# Setup callbacks
# ppo_save_callback = SaveOnBestTrainingRewardCallback(check_freq=1000, log_dir=config["logdir"], save_dir=config["savedir"], verbose=0)
ppo_save_callback = SaveOnBestTrainingRewardCallback(check_freq=1000, log_dir=config["logdir"], save_dir=config["savedir"], model_name="ppo", verbose=0)

# # Load the model from checkpoint 2
# model_ppo = PPO.load(f"{config['savedir']}/MlpPolicy_PPO_step2000000", env_ppo)

# Create the agent
model_ppo = PPO(config["policy"], env_ppo, tensorboard_log=config["logdir"], verbose=0)

# # Train the agent
# model_ppo.learn(config["total_timesteps"], tb_log_name="PPO", callback=ppo_save_callback, progress_bar=True)
# model_ppo.save(f"{config['savedir']}/{config['policy']}_PPO")

# Train in tranches
times_trained = 0
num_tranches = 10
tranches = num_tranches*(1+times_trained)

for i in range((num_tranches * times_trained)+1, tranches+1):
    print(f"^ Tranch {i}/{tranches}")
    if i == 1:
        model_ppo.learn(config["total_timesteps"]//tranches, tb_log_name="PPO", callback=ppo_save_callback, progress_bar=True, reset_num_timesteps=True)
    else:
        model_ppo.learn(config["total_timesteps"]//tranches, tb_log_name="PPO", callback=ppo_save_callback, progress_bar=True, reset_num_timesteps=False)
    model_ppo.save(f"{config['savedir']}/{config['policy']}_PPO_step{i * (config['total_timesteps']//tranches)}")

Environment seed: 800


Output()

^ Tranch 1/10


Output()

^ Tranch 2/10


Output()

^ Tranch 3/10


Output()

^ Tranch 4/10


Output()

^ Tranch 5/10


Output()

^ Tranch 6/10


Output()

^ Tranch 7/10


Output()

^ Tranch 8/10


Output()

^ Tranch 9/10


Output()

^ Tranch 10/10


### Train DQN

In [9]:
# Instantiate and wrap the env
env_dqn = make_vec_env(SimpleSimGym, 
                   n_envs=1, 
                   monitor_dir=config["logdir"]+"dqn", 
                   env_kwargs=dict(
                       max_budget=MAX_BUDGET, 
                       max_targets=MAX_TARGETS, 
                       num_classes=NUM_CLASSES, 
                       player_fov=PLAYER_FOV, 
                       render_mode=RENDER_MODE, 
                       action_format="discrete"))

# Setup callbacks
dqn_save_callback = SaveOnBestTrainingRewardCallback(check_freq=1000, log_dir=config["logdir"], save_dir=config["savedir"], model_name="dqn", verbose=0)

# # Load the model from checkpoint 6M
# model_dqn = DQN.load(f"{config['savedir']}/MlpPolicy_DQN_step8000000", env_dqn)
# Create the agent
model_dqn = DQN(config["policy"], env_dqn, tensorboard_log=config["logdir"], verbose=0)

# # # Train the agent
# # model_dqn.learn(config["total_timesteps"], tb_log_name="DQN", callback=dqn_save_callback, progress_bar=True)
# # model_dqn.save(f"{config['savedir']}/{config['policy']}_DQN")

# Train in tranches
times_trained = 0
num_tranches = 10
tranches = num_tranches*(1+times_trained)

for i in range((num_tranches * times_trained)+1, tranches+1):
    print(f"^ Tranch {i}/{tranches}")
    if i == 1:
        model_dqn.learn(config["total_timesteps"]//tranches, tb_log_name="DQN", callback=dqn_save_callback, progress_bar=True, reset_num_timesteps=True)
    else:
        model_dqn.learn(config["total_timesteps"]//tranches, tb_log_name="DQN", callback=dqn_save_callback, progress_bar=True, reset_num_timesteps=False)
    model_dqn.save(f"{config['savedir']}/{config['policy']}_DQN_step{i * (config['total_timesteps']//tranches)}")

Output()

Output()

^ Tranch 2/10


Output()

^ Tranch 3/10


Output()

^ Tranch 4/10


Output()

^ Tranch 5/10


Output()

^ Tranch 6/10


Output()

^ Tranch 7/10


Output()

^ Tranch 8/10


Output()

^ Tranch 9/10


Output()

^ Tranch 10/10


### Continue Training or Run Dupliacte Experiments?

This cell can be used to either continue training on an existing model (use `reset_num_timesteps=False`) or to run additional duplicate experiments training from scratch to test training consistency

In [10]:
# model_sac.learn(config["total_timesteps"], tb_log_name="SAC", callback=auto_save_callback, progress_bar=True, reset_num_timesteps=False)
# model_sac.save(f"{config['savedir']}/{config['policy']}_SAC_pt2")

### Load Model?

In [11]:
# Load the best model
best_ppo = PPO.load(f"{config['savedir']}/best_ppo")
best_dqn = DQN.load(f"{config['savedir']}/best_dqn")

### Check Performance

Check if the policy can consistently succeed in the environment over multilpe episodes.

In [12]:
from stable_baselines3.common.evaluation import evaluate_policy

# Instantiate the eval env
eval_env = make_vec_env(SimpleSimGym, 
                   n_envs=1, 
                   # monitor_dir=config["logdir"], 
                   env_kwargs=dict(
                       max_budget=MAX_BUDGET, 
                       max_targets=MAX_TARGETS, 
                       num_classes=NUM_CLASSES, 
                       player_fov=PLAYER_FOV, 
                       render_mode=RENDER_MODE, 
                       action_format=ACTION_FORMAT
                   )
                  )

# Check performance of best vs last model
models = {"last_sac": model_sac, "best_sac": best_sac}#, "last_ppo": model_ppo, "best_ppo": best_ppo}
# models = {"best_sac": best_sac, "best_ppo": best_ppo}

for key in models.keys():
    # Reset the eval env
    eval_env.reset()
    # Test average reward over multiple episodes
    mean_reward, std_reward = evaluate_policy(models[key], eval_env, n_eval_episodes=50)
    print(f"MODEL TYPE: {key}")
    print(f"mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}\n")

Environment seed: 131


NameError: name 'model_sac' is not defined

### Visualize Trained Agent with Video


In [None]:
from utils import record_video, show_videos

record_video(eval_env, model_sac, video_length=500*3, prefix="jupyter-sac-last-simplesim")
show_videos("videos", prefix="jupyter-sac-last")

record_video(eval_env, best_sac, video_length=500*3, prefix="jupyter-sac-best-simplesim")
show_videos("videos", prefix="jupyter-sac-best")

In [None]:
# record_video(eval_env, model_ppo, video_length=500*3, prefix="jupyter-ppo-last-simplesim")
# show_videos("videos", prefix="jupyter-ppo-last")

# record_video(eval_env, best_ppo, video_length=500*3, prefix="jupyter-ppo-best-simplesim")
# show_videos("videos", prefix="jupyter-ppo-best")