# Ship Sim Gym

Quick overview of training on a custom environment as it is slightly different from a normal one. 

Most cells contain code that is simply loaded from the corresponding `.py` files. You can usually also run those scripts directly through their `main` hooks, as long as you invoke them as modules: `python -m ...`

If you change something in the scripts themselves and would like to see the cell updated here as well, uncomment the first line in the cell that does the IPython magic called %load. 

In [2]:
# Put the parent of the current working directory on the import path so the
# packages that live next to the notebook folder can be imported below.
import os
import sys

nb_dir = os.path.dirname(os.getcwd())
if sys.path.count(nb_dir) == 0:
    sys.path.append(nb_dir)

# The Ship Game

This is a basic top-down ship simulator that uses pygame for the graphics and pymunk for the physics.

In [None]:
# %load ../ship_gym/game.py
import os
import random
import sys
import time

import numpy as np
import pygame

from pymunk import Vec2d, Transform
import pymunk as pm
import pymunk.pygame_util

from ship_gym import game_map
from ship_gym.config import GameConfig
from ship_gym.models import GameObject, Ship, PolyEnv, LiDAR

# Number of goal markers that ShipGame.reset() asks gen_goal_path() to create.
N_GOALS = 5
# NOTE(review): not referenced anywhere in this module; presumably a fallback window size - confirm before removing.
DEFAULT_BOUNDS = (500, 500)


class ShipGame(object):
    """
    Top-down ship simulator: pygame draws the scene and pymunk runs the physics.

    The player ship uses collision type 0 and goal markers use collision type 2. Collision handlers are also
    registered for the pairs (0, 1) and (0, 3); which shapes carry types 1 and 3 is decided outside this class.
    """

    # Class-level defaults. reset() rebinds ships and goals on the instance, so after construction these two
    # lists are not what an instance reads.
    ships = list()
    goals = list()

    frame_counter = 0
    base_dt = 0.1
    colliding = False
    observe_mode = False
    record = False

    def __init__(self, game_config=None):
        """
        Open the game window and build the first level.
        :param game_config: settings holder exposing SPEED, FPS, BOUNDS and DEBUG; GameConfig is used when omitted
        """
        if game_config is None:
            game_config = GameConfig

        # Initialise pygame up front so that no pygame module is used before it has been set up
        pygame.init()

        self.speed = game_config.SPEED
        self.fps = game_config.FPS
        self.bounds = game_config.BOUNDS
        self.screen = pygame.display.set_mode(self.bounds)
        self.clock = pygame.time.Clock()
        self.goal_reached = False
        self.colliding = False

        self.debug_mode = game_config.DEBUG

        pygame.display.set_caption("Ship Sim Gym")
        pygame.key.set_repeat(10, 10)

        print("-"*30)
        print("SHIP GAME INITIALIZED")
        print("DEBUG MODE = ", self.debug_mode)
        print("GAME SPEED = ", self.speed)
        print("GAME FPS   = ", self.fps)
        print("-"*30, "\n")

        self.reset()

    def gen_level(self):
        """
        Generate a level on the fly: build a river polygon with game_map.gen_river_poly, wrap it in a PolyEnv
        and add the resulting pymunk bodies and shapes to the game space
        :return:
        """
        poly = game_map.gen_river_poly(self.bounds)

        self.level = PolyEnv(poly, self.bounds)

        for body, shape in zip(self.level.bodies, self.level.shapes):
            self.space.add(body, shape)

    def invert_p(self, p):
        """
        Flip the Y coordinate of a point against the window height (screen Y=0 is at the top)
        :param p: indexable point (x, y)
        :return: Vec2d with the same x and the mirrored y
        """
        return Vec2d(p[0], self.bounds[1] - p[1])

    def add_goal(self, x, y):
        """
        Add a green circular goal marker at the given position and register it in self.goals
        :param x:
        :param y:
        :return: the GameObject wrapping the new body and shape
        """
        self.total_goals += 1

        mass = 1
        radius = 5
        inertia = pm.moment_for_circle(mass, 0, radius, (0, 0))
        body = pm.Body(mass, inertia)

        body.position = x, y
        shape = pm.Circle(body, radius, (0, 0))
        shape.color = pygame.color.THECOLORS["green"]
        self.space.add(body, shape)
        # Collision type 2 is the one setup_collision_handlers() routes to collide_goal
        shape.collision_type = 2

        goal = GameObject(body, shape)
        self.goals.append(goal)

        return goal

    def add_player_ship(self, x, y, width, height, color):
        """
        Call this after you have created the level!
        Creates a new Ship instance, attaches a lidar that senses the level shapes and adds the ship's shape and
        body to the pymunk space
        :param self:
        :param x:
        :param y:
        :param width:
        :param height:
        :param color:
        :return: the new Ship
        """

        ship = Ship(x, y, width, height, color)
        ship.add_lidar(self.level.shapes)

        self.space.add(ship.body, ship.shape)

        return ship

    def add_ship(self, x, y, width, height, color):
        """
        Creates a new Ship instance (without lidar) and adds a shape and body to the pymunk space
        :param self:
        :param x:
        :param y:
        :param width:
        :param height:
        :param color:
        :return: the new Ship
        """
        ship = Ship(x, y, width, height, color)
        self.space.add(ship.body, ship.shape)

        return ship

    def get_screen(self):
        """
        Returns the game's screen space buffer as a 3D (color) array
        :return:
        """
        return pygame.surfarray.array3d(self.screen)

    def handle_discrete_action(self, action):
        """
        Handle discrete actions: It is possible to move forward, rotate left and right and do nothing.
        Moving backwards is not possible, but is easy to add if needed. See the player definition in models for this.
        :param action: integer value to indicate the action to take; any value other than 0, 1 or 2 does nothing
        """
        if action == 0:
            self.player.move_forward()
        elif action == 1:
            self.player.rotate(-5)
        elif action == 2:
            self.player.rotate(+5)

    def handle_input(self):
        """
        Maps key inputs to actions (via handle_discrete_action) and other utility functions such as quit
        """
        for event in pygame.event.get():

            if event.type == pygame.QUIT:
                sys.exit(0)

            if event.type == pygame.KEYDOWN:
                if event.key == pygame.K_ESCAPE or event.key == pygame.K_q:
                    sys.exit(0)

                elif event.key == pygame.K_w:
                    self.handle_discrete_action(0)
                    print("W pressed. ")
                elif event.key == pygame.K_s:
                    print("S pressed. Button not configured")
                elif event.key == pygame.K_a:
                    self.handle_discrete_action(1)
                    print("A pressed. ", self.player.rudder_angle)
                elif event.key == pygame.K_d:
                    self.handle_discrete_action(2)
                    print("D pressed. ", self.player.rudder_angle)

    def update(self):
        """
        The main update loop, resets certain event states, handles input, sensor routines and updates the game's
        pymunk space
        """
        # The flags are cleared first; the collision callbacks set them again during space.step()
        self.colliding = False
        self.goal_reached = False
        self.handle_input()
        self.player.query_sensors()
        self.space.step(self.speed * self.base_dt)
        self.clock.tick(self.fps)

    def render(self):
        """
        The main render loop clears the screen and draws primitives if requested
        """
        self.screen.fill((0, 0, 200))
        if self.debug_mode:
            options = pm.pygame_util.DrawOptions(self.screen)
            options.flags = pymunk.SpaceDebugDrawOptions.DRAW_SHAPES
            self.space.debug_draw(options)

            for r in self.player.lidar.query_results:
                if r is None:
                    # Nothing was recorded for this ray, so there is no point to draw
                    continue

                # Green circle when the ray did not hit anything, red circle when it hit a shape
                color = (0, 255, 0) if r.shape is None else (255, 0, 0)
                p = self.invert_p(r.point)
                pygame.draw.circle(self.screen, color, (round(p.x), round(p.y)), 10)

        # Yellow marker on the player position, drawn in every mode
        p = self.invert_p(self.player.position)

        pygame.draw.circle(self.screen, (255, 255, 0), (round(p.x), round(p.y)), 10)
        pygame.display.flip()

        self.frame_counter += 1

    def collide_ship(self, arbiter, space, data):
        """
        Collision callback registered for the collision type pair (0, 1). All params are ignored at this point
        :param arbiter:
        :param space:
        :param data:
        :return: True
        """
        self.colliding = True
        return True

    def collide_goal(self, arbiter, space, data):
        """
        Collision callback for when the player ship hits a goal object: the goal is removed from the space and
        from self.goals and goal_reached is raised
        :param arbiter: its second shape is taken to be the goal
        :param space:
        :param data: ignored
        :return: False
        """
        shape = arbiter.shapes[1]
        space.remove(shape, shape.body)

        self.goal_reached = True
        self.goals = [g for g in self.goals if g.body is not shape.body]

        return False

    def reset(self):
        """
        Reset the game. Create a fresh pymunk space, the environment, the goals and the player, which spawns at
        the horizontal centre with y = 25
        :return:
        """
        self.total_goals = 0
        self.ships = list()
        self.goals = list()
        self.space = pm.Space()
        self.space.damping = 0.4
        self.create_environment()
        self.gen_goal_path(N_GOALS)

        spawn_point = Vec2d(self.bounds[0] / 2, 25)
        self.player = self.add_player_ship(spawn_point.x, spawn_point.y, 2, 3, pygame.color.THECOLORS["white"])
        self.player.shape.collision_type = 0
        self.setup_collision_handlers()

    def add_default_traffic(self):
        """
        Add some simple static traffic to the game
        :return:
        """
        self.ships.append(self.add_ship(100, 200, 1, 1, pygame.color.THECOLORS["black"]))
        self.ships.append(self.add_ship(300, 200, 1.5, 2, pygame.color.THECOLORS["black"]))
        self.ships.append(self.add_ship(400, 350, 1, 3, pygame.color.THECOLORS["black"]))

    def setup_collision_handlers(self):
        """
        Add collision handlers to the game space for goal and obstacle interactions.
        """
        h = self.space.add_collision_handler(0, 1)
        h.begin = self.collide_ship

        goal_agent_col = self.space.add_collision_handler(0, 2)
        goal_agent_col.begin = self.collide_goal

        self.space.add_collision_handler(0, 3)

    def gen_goal_path(self, n):
        """
        Generate a path of goals by sampling somewhat randomly the coordinate space. To avoid complete randomness
        where it is hard to even see a path, delta_y is computed according to the game bounds, incremented per
        goal and randomly jittered. The X position is determined by doing a segment query from the midline to the
        left and to the right edge on environmental level shapes (see ShapeFilter). The returned points are used as
        extreme points between which X is drawn uniformly, staying a tolerance away from both.
        When either query returns nothing, X falls back to a jittered point around the midline.
        :param n: number of goals to generate
        """

        y_delta = self.bounds[1] / (n + 1)
        x_middle = self.bounds[0] / 2
        x_jitter = 50
        y_jitter = 20

        tolerance = 60
        # This has not been properly tested!
        shape_filter = pymunk.ShapeFilter(mask=pymunk.ShapeFilter.ALL_MASKS ^ 0b1)

        for i in range(1, n + 1):
            y = y_delta * i + random.randint(-y_jitter, y_jitter)

            left_hits = self.space.segment_query((x_middle, y), (0, y), 10, shape_filter)
            right_hits = self.space.segment_query((x_middle, y), (self.bounds[0], y), 10, shape_filter)

            if left_hits and right_hits:
                x = np.random.uniform(left_hits[0].point.x + tolerance, right_hits[0].point.x - tolerance)
            else:
                # Stay around the midline; scaling the midline by i would push the goal out of the window
                x = x_middle + random.randint(-x_jitter, x_jitter)

            self.add_goal(x, y)

    def closest_goal(self):
        """
        Return the goal with the smallest Euclidean distance to the player. Returns None if there are no goals left.
        :return:
        """
        if not self.goals:
            return None

        player_pos = self.player.body.position
        # min() keeps the first of several equally distant goals
        return min(self.goals, key=lambda goal: goal.body.position.get_distance(player_pos))

    def create_environment(self):
        """
        The hook for creating the environment. Replace the call for gen_level
        :return:
        """
        self.gen_level()


def main():
    """
    Run the game interactively in debug mode at normal speed and 30 FPS, looping over update() and render().
    """
    # GameConfig is used as a plain settings holder; ShipGame reads SPEED, FPS, BOUNDS and DEBUG from it
    GameConfig.SPEED = 1
    GameConfig.FPS = 30
    GameConfig.DEBUG = True

    game = ShipGame(GameConfig)

    while True:
        game.update()
        game.render()


if __name__ == '__main__':
    main()


pygame 1.9.4
Hello from the pygame community. https://www.pygame.org/contribute.html
Loading chipmunk for Linux (64bit) [/home/simons/.miniconda/envs/ship-sim-gym-0.0.1/lib/python3.6/site-packages/pymunk/libchipmunk.so]
------------------------------
SHIP GAME INITIALIZED
DEBUG MODE =  True
GAME SPEED =  1
GAME FPS   =  30
------------------------------ 

W pressed. 
W pressed. 
W pressed. 
W pressed. 
W pressed. 
W pressed. 
W pressed. 
A pressed.  -5
W pressed. 
W pressed. 
W pressed. 
W pressed. 
W pressed. 
W pressed. 
W pressed. 
A pressed.  -10
W pressed. 
W pressed. 
W pressed. 
W pressed. 
W pressed. 
W pressed. 
W pressed. 
W pressed. 
W pressed. 
W pressed. 
D pressed.  -5
W pressed. 
W pressed. 
W pressed. 
W pressed. 
W pressed. 
W pressed. 
W pressed. 
W pressed. 
W pressed. 
W pressed. 
W pressed. 
W pressed. 
W pressed. 
W pressed. 
W pressed. 
D pressed.  0
D pressed.  5
D pressed.  10
D pressed.  10
D pressed.  10


## OpenAI Gym Environment

This creates a wrapper for the ship game I made

The way you create an OpenAI gym is by defining a class that overrides the gym.Env class and override a few properties:

```
action_space
observation_space
```

and a few functions:

```
step()
reset()
render()
```

The step function is the most important one: it takes an action parameter, which is a single integer for a discrete action space or a vector of values for a continuous action space. In our case we use a single integer to indicate which action to take:

- 0 = move forward
- 1 = rotate thruster left
- 2 = rotate thruster right

After every step new states are returned, again this depends on your observation_space, but in our case the layout is as follows:

`Px Py R A Gx Gy L1 L2 ... Ln`

Where

- P is the player position
- A is the player angle
- R is the rudder angle
- G is the nearest goal position
- L are the lidar values
- N is the number of rays lidar uses

Find more information in the code below. Note that the states are critically important. If you are making your own environment, you should test them well: if the states are not set up properly, it is no surprise when the AI doesn't learn anything.

`reset()` resets the environment like respawning the player ship and generating a new environment. It also returns the states as observed at reset.

`render()` is used mostly to get additional debug information. You could for example not render the game to screen, but use a `render()` function explicitly to render only when you want to.

In [1]:
# %load ../ship_gym/ship_env.py
import sys
from collections import deque

import numpy as np

from gym import Env
from gym.spaces import Box, Discrete
from gym.utils import seeding

from ship_gym.game import ShipGame

# Filler value used by ShipEnv.reset() to pre-fill the state history buffer.
DEFAULT_STATE_VAL = -1
# Reward assigned by ShipEnv.determine_reward() on a step where no other reward applies.
STEP_PENALTY = -0.01


class ShipEnv(Env):
    """
    OpenAI Gym wrapper around ShipGame with three discrete actions and a flat observation vector.

    A single time step contributes n_states values laid out as

        Px Py R A Gx Gy L1 L2 ... Ln

    Where
    P is the player position
    R is the rudder angle
    A is the player angle
    G is the nearest goal position (-1, -1 when no goal is left)
    L are the lidar values
    N is the number of rays lidar uses

    The observation returned by step() and reset() holds the last HISTORY_SIZE such frames, oldest first.
    """

    metadata = {'render.modes': ['human', 'rgb_array']}
    action_space = Discrete(3)
    reward_range = (-1, 1)

    # TODO: Derive the discrete actions
    def __init__(self, game_config, env_config):
        """
        :param game_config: settings forwarded to ShipGame
        :param env_config: settings holder exposing HISTORY_SIZE and MAX_STEPS
        :raises ValueError: when env_config.HISTORY_SIZE is smaller than one
        """
        # Validate before the value is used to size anything
        # TODO: Should add some more basic sanity checks (max_steps > 0 etc.)
        if env_config.HISTORY_SIZE < 1:
            raise ValueError("history_size must be greater than zero")

        self.last_action = None
        self.reward = 0
        self.cumulative_reward = 0
        self.step_count = 0
        self.env_config = env_config

        self.game = ShipGame(game_config)
        self.episodes_count = -1  # Because the first reset will increment it to 0

        # position (2) + rudder angle (1) + player angle (1) + goal position (2) + one value per lidar beam
        self.n_states = 2 + 1 + 1 + 2 + self.game.player.lidar.n_beams
        self.states_history = self.n_states * self.env_config.HISTORY_SIZE

        # Observations are real valued and can be negative (the -1 filler, rudder and body angles), so the
        # space is a float box that is symmetric around zero. An unsigned 8 bit box cannot represent them.
        bound = max(self.game.bounds)
        self.observation_space = Box(low=-bound, high=bound, shape=(self.states_history,), dtype=np.float32)

    def seed(self, seed=None):
        """
        Seed numpy random generator
        :param seed: the seed to use
        :return: list holding the seed that was actually used
        """
        self.np_random, seed = seeding.np_random(seed)
        # Important to actually seed it!!! I thought above would work but it's not enough
        np.random.seed(seed)
        return [seed]

    def _player_out_of_bounds(self):
        """
        Tell whether the player has left the rectangle spanned by the game bounds
        """
        player = self.game.player
        width, height = self.game.bounds[0], self.game.bounds[1]
        return player.x < 0 or player.x > width or player.y < 0 or player.y > height

    def _observation(self):
        """
        Return the state history as an array in the dtype announced by observation_space
        """
        return np.array(self.states, dtype=np.float32)

    def determine_reward(self):
        """
        Determines the reward of the current timestep: +1 for reaching a goal, -1 for a collision or for leaving
        the game bounds, and the small step penalty otherwise
        """
        if self.game.goal_reached:
            self.reward = 1.0
        elif self.game.colliding:
            # Part of the same chain as the other cases so the step penalty cannot overwrite it
            self.reward = -1.0
        elif self._player_out_of_bounds():
            self.reward = -1.0
        else:
            self.reward = STEP_PENALTY  # Small penalty

    def __add_states(self):
        """
        Push back some new state information for the current timestep onto the FIFO queue for all history timesteps
        it keeps track of.

        Layout of a single time step state is like this:

        Px Py R A Gx Gy L1 L2 ... Ln

        Where
        P is the player position
        R is the rudder angle
        A is the player angle
        G is the nearest goal position
        L are the lidar values
        N is the number of rays lidar uses
        """

        states = self.n_states * [-1]

        goal = self.game.closest_goal()
        goal_pos = [-1, -1]
        player = self.game.player

        if goal:
            goal_pos = [goal.body.position.x, goal.body.position.y]
        states[:6] = [player.x, player.y, player.rudder_angle, player.body.angle, goal_pos[0], goal_pos[1]]

        states[6:] = player.lidar.vals

        # The deque is bounded, so appending this frame drops the oldest one
        self.states.extend(states)

    def is_done(self):
        """
        Determines whether the episode has finished: on a collision, when no goals are left, when the player
        left the game bounds or when the step budget is used up.
        :return:
        """
        if self.game.colliding:
            return True
        if len(self.game.goals) == 0:
            return True
        if self._player_out_of_bounds():
            return True

        return self.step_count >= self.env_config.MAX_STEPS

    def step(self, action):
        """
        Apply one action, advance and draw the game, and report the outcome
        :param action: 0 = move forward, 1 = rotate by -5, 2 = rotate by +5
        :return: tuple (observation, reward, done, info) where info is an empty dict
        """

        assert self.action_space.contains(action), "%r (%s) invalid" % (action, type(action))
        # Remembered so render() has something to report
        self.last_action = action
        self.game.handle_discrete_action(action)

        self.game.update()
        self.game.render()

        self.determine_reward()
        self.cumulative_reward += self.reward
        self.__add_states()
        self.step_count += 1

        done = self.is_done()

        return self._observation(), self.reward, done, {}

    def render(self, mode='human', close=False):
        """
        Display additional debug information about the state of the environment here. You might also render actual images
        or videos from the game's frame buffer (not currently implemented)
        :param mode: the display mode, currently ignored but might be use to visualise different ways
        :param close: Whether to close the environment. Also ignored in the current version
        """
        out = sys.stdout

        if self.last_action is not None:
            out.write(f'action={self.last_action}, cumm_reward={self.cumulative_reward}\n')

    def reset(self):
        """
        Start a new episode: reset the game and the counters and rebuild the state history
        :return: the observation at reset, with every frame but the newest holding the filler value
        """
        self.game.reset()
        self.last_action = None
        self.reward = 0
        self.cumulative_reward = 0
        self.step_count = 0
        self.episodes_count += 1

        # Setup states
        n = self.states_history
        self.states = deque([DEFAULT_STATE_VAL] * n, maxlen=n)
        self.__add_states()

        return self._observation()


ModuleNotFoundError: No module named 'ship_gym'

## Random Agent

A simple baseline to compare against is a random agent. Note that we run it at a slow speed so you can easily observe the behavior. Try changing FPS and SPEED for funsies.

In [None]:
# %load ../train/random.py
import random
import time

import gym
import ship_gym
import numpy as np
from ship_gym.ship_env import ShipEnv

from ship_gym.config import EnvConfig, GameConfig

# Baseline: drive the environment with uniformly sampled actions and print what comes back.
gc = GameConfig
gc.DEBUG = True
gc.SPEED = 1
gc.FPS = 30

env = ShipEnv(game_config=gc, env_config=EnvConfig)

env.reset()

for _episode in range(1000):

    # Reward collected during this batch of at most 100 steps
    total_reward = 0
    for _step in range(100):
        env.render()

        ret = env.step(env.action_space.sample())  # take a random action

        print(ret)
        _, reward, done, _ = ret
        total_reward += reward
        if done:
            print(f"AGENT IS DONE. TOTAL REWARD = {total_reward}")
            env.reset()
            break

        


## Stable Baselines

[Stable baselines](https://github.com/hill-a/stable-baselines) is a nice fork of OpenAI Baselines. It is better documented, has cleaner code and offers more intuitive ways of training. It is still fairly low level compared to RLlib, but that also gives you some flexibility.

Read more about it in the [docs](https://stable-baselines.readthedocs.io)

In [None]:
# %load ../train/stable_baselines/ppo.py
import multiprocessing
import os
import time
import sys

import numpy as np
from baselines.results_plotter import ts2xy
from stable_baselines.bench import load_results, Monitor
from stable_baselines.common import set_global_seeds
from stable_baselines.common.policies import MlpPolicy
from stable_baselines.common.vec_env import DummyVecEnv, SubprocVecEnv
from stable_baselines import PPO2, ACER

from ship_gym.config import EnvConfig, GameConfig
from ship_gym.ship_env import ShipEnv
from datetime import datetime

from tqdm import tqdm


# Directory the training results are read from by callback() and under which the TensorBoard logs are written.
log_dir = "logs/learning"
# Directory get_model_path() places the saved models in.
model_dir = "models"
# callback() evaluates the logged rewards once every this many invocations.
log_step_interval = 10000

# State carried between successive callback() invocations.
n_steps = 0
best_mean_reward = -np.inf


def callback(_locals, _globals):
    """
    Callback called at each step (for DQN an others) or after n steps (see ACER or PPO2)

    Every log_step_interval calls it reads the logged results from log_dir, prints the mean reward of the last
    100 entries and saves the model whenever that mean is a new best.
    :param _locals: (dict) local variables of the training loop; the model is read from _locals['self']
    :param _globals: (dict) unused
    :return: (bool) always True. stable-baselines stops training when a callback returns False
    """
    global n_steps, best_mean_reward

    if (n_steps + 1) % log_step_interval == 0:

        # Evaluate policy performance
        x, y = ts2xy(load_results(log_dir), 'timesteps')
        if len(x) > 0:
            mean_reward = np.mean(y[-100:])
            print(x[-1], 'timesteps')
            print(
                "Best mean reward: {:.2f} - Last mean reward per episode: {:.2f}".format(best_mean_reward, mean_reward))

            # New best model, you could save the agent here
            if mean_reward > best_mean_reward:
                best_mean_reward = mean_reward
                # Example for saving best model
                print("Saving new best model")
                _locals['self'].save(os.path.join(log_dir, 'best_model.pkl'))
    n_steps += 1
    return True

def make_env():
    """
    Utility function for multiprocessed env.

    :return: a function without arguments that builds a configured ShipEnv
    """

    def _init():
        # The settings are applied here rather than in make_env() so that they are set in the process that
        # actually builds the environment. Attribute changes made on GameConfig in the parent process are not
        # guaranteed to reach a worker that was not forked from it.
        game_config = GameConfig
        game_config.FPS = 1000
        game_config.SPEED = 30
        # This will render more primitives to make it more observable for humans (you), although this is not
        # necessary for training and incurs a small performance hit
        game_config.DEBUG = True
        game_config.BOUNDS = (1000, 1000)

        return ShipEnv(game_config, EnvConfig)

    return _init

def get_model_path(lr):
    """
    Build the path inside model_dir under which a trained model is saved.

    :param lr: a constant learning rate, or a schedule function that maps the remaining training fraction to a
               learning rate
    :return: (str) the path, labelled with the constant or with the rate the schedule starts from
    """
    # Formatting a function directly yields "<function ... at 0x...>", which is unreadable as a file name and
    # differs on every run. stable-baselines calls schedules with a fraction that runs from 1 down to 0, so
    # lr(1.0) is the rate training starts with.
    label = lr(1.0) if callable(lr) else lr
    return os.path.join(model_dir, f"result_lr{label}")


# TensorBoard root for this run, placed under log_dir and named after the launch time in whole seconds.
tb_root_dir = os.path.join(log_dir, "tb", str(int(time.time())))

def train(model_cls, tid, env, lr, steps):
    """
    Train one MlpPolicy model, print how long it took and save the result.

    :param model_cls: model class to instantiate; its name becomes part of the TensorBoard directory
    :param tid: identifier of this training run, also part of the TensorBoard directory
    :param env: environment handed to the model
    :param lr: learning rate passed to the model and to get_model_path
    :param steps: total number of timesteps to learn for
    """
    began = time.time()

    run_name = f"{tid}_{model_cls.__name__}"
    agent = model_cls(
        MlpPolicy,
        env,
        learning_rate=lr,
        verbose=1,
        tensorboard_log=os.path.join(tb_root_dir, run_name),
    )
    agent.learn(total_timesteps=steps, log_interval=10000)

    elapsed = time.time() - began
    minutes = elapsed / 60

    print(f"Trained {steps} steps in {elapsed} seconds")
    print(f"Speed = {steps / minutes} steps/min")
    print()

    agent.save(get_model_path(lr))

def main():
    """
    Train PPO2 once for every starting learning rate, each decayed linearly to zero, on one shared set of
    environments that run in subprocesses.
    """
    os.makedirs(log_dir, exist_ok=True)
    os.makedirs(model_dir, exist_ok=True)

    np.set_printoptions(suppress=True)

    # SET UP YOUR (HYPER)PARAMETERS HERE

    def make_lr_func(start, stop):
        """Return a schedule that equals start for frac == 1 and stop for frac == 0, linear in between."""

        def lr_func(frac):
            return start + (stop - start) * (1 - frac)

        return lr_func

    start_lrs = [1.0e-3, 1.0e-4, 1.0e-5]
    steps = int(1e6)

    # Setting it to the number of CPU's you have is usually optimal
    num_cpu = multiprocessing.cpu_count()
    env = SubprocVecEnv([make_env() for _ in range(num_cpu)])

    try:
        for tid, start_lr in enumerate(start_lrs, start=1):
            # The starting rate is printed because the schedule function itself has no readable form
            print(f"""
Started training at {datetime.now()}
------------------------------------------------
Training Steps 	:\t {steps}
Learning rate   :\t {start_lr} 
            """)

            train(PPO2, tid, env, make_lr_func(start_lr, 0), steps)
    finally:
        # Shut the worker processes down even when a training run fails
        env.close()

    print("*" * 30)
    print(" "*10,"     DONE!     ", " "*10)
    print(" ", datetime.now(), " ")
    print("*" * 30)


if __name__ == '__main__':
    main()

# RLLib

RLlib is one of the few frameworks that I found that actually attempts to solve the scaling up problem effectively. Stable baselines has some basic parallelization, but doesn't really deal with the issue of hyper-parameter search or scaling in terms of hardware / clusters.

RLLib is built on top of ray, which is a high-performance execution engine (sounds awesome right?). In fact this is used for their Tune framework as well which is a hyper parameter search and experimentation tool on top of good old deep learning implementations (but extended for RL as well)

I will show you one neat setup of RLlib with Tune that uses something called population based training. It basically runs an experiment multiple times and evaluates how well each run performs, then either randomly resamples from a start distribution (**exploration**) or slightly perturbs a top performer (**exploitation**).

Because RL is very hyper-parameter sensitive you can imagine that such a tool would be very helpful. There is a lot more to explore, go visit their docs for [more](https://ray.readthedocs.io/en/latest/)

In [None]:
# %load ../train/rllib/pbt 
import random

import ray
from ray.tune import run_experiments, register_env

from ray.tune.schedulers import PopulationBasedTraining

from ship_gym.config import GameConfig, EnvConfig
from ship_gym.ship_env import ShipEnv

import multiprocessing

if __name__ == '__main__':

    def env_creator(env_config):
        """
        Build a ShipEnv for RLlib.
        :param env_config: the env config supplied by RLlib; not used, EnvConfig is passed to ShipEnv instead
        """
        # The settings are applied inside the creator because it runs in the worker processes; attribute
        # changes made on GameConfig in the driver are not guaranteed to reach them.
        game_config = GameConfig
        game_config.FPS = 1000
        game_config.SPEED = 30
        game_config.BOUNDS = (1000, 1000)

        return ShipEnv(game_config, EnvConfig)

    def explore(config):
        """
        Repair a mutated config so PPO can still run with it.
        """
        # Collect enough timesteps per iteration to fill the SGD minibatches
        if config["train_batch_size"] < config["sgd_minibatch_size"] * 2:
            config["train_batch_size"] = config["sgd_minibatch_size"] * 2
        # Run at least one SGD iteration
        if config["num_sgd_iter"] < 1:
            config["num_sgd_iter"] = 1
        return config

    register_env("ShipGym-v1", env_creator)

    pbt = PopulationBasedTraining(
        time_attr="time_total_s",
        reward_attr="episode_reward_mean",
        perturbation_interval=600,  # 10 mins
        resample_probability=0.33,  # Should we start with a new config or modify a good performing one?

        # Specifies the mutations of these hyperparams
        hyperparam_mutations={
            "lambda": lambda: random.uniform(0.9, 1.0),
            "clip_param": lambda: random.uniform(0.01, 0.5),
            "lr": [1e-3, 5e-4, 1e-4, 5e-5, 1e-5],
            "num_sgd_iter": lambda: random.randint(1, 30),
            "sgd_minibatch_size": lambda: random.randint(128, 16384),
            "train_batch_size": lambda: random.randint(2000, 160000),
        },
        custom_explore_fn=explore)

    ray.init()

    run_experiments(
        {
            "pbt_ship_sim_v2": {
                "run": "PPO",
                "env": "ShipGym-v1",
                "num_samples": 120,  # Repeat the experiment this many times
                "checkpoint_at_end": True,
                "checkpoint_freq": 2,
                "config": {
                    "kl_coeff": 1.0,
                    "num_workers": multiprocessing.cpu_count() - 1,
                    "num_gpus": 1,
                    # These params are tuned from a fixed starting value.
                    "lambda": 0.95,
                    "clip_param": 0.2,
                    "lr": 5.0e-4,

                    # These params start off randomly drawn from a set.
                    "num_sgd_iter":
                        lambda spec: random.choice([10, 20, 30]),
                    "sgd_minibatch_size":
                        lambda spec: random.choice([128, 512, 2048]),
                    "train_batch_size":
                        lambda spec: random.choice([10000, 20000, 40000])
                },
            },
        },
        scheduler=pbt)

## Training Progress

Ray / RLlib records progress stats by default in a folder in the home directory called ray_results. It has some CSVs and if enabled is also able to save TensorBoard logs.

Run `tensorboard --logdir ~/ray_results` (this works on Mac and Linux; I am not sure about Windows) and then open the URL:
 
[http://localhost:6006/](http://localhost:6006/)

## PPO with RLlib

It's interesting to train a PPO agent similar to how we trained it with stable-baselines. Everything is a lot easier and much more declarative: simply stating an experiment definition is enough. If you prefer, there are also ways of training it in a more procedural way, similar to stable-baselines.

The below example uses the declarative method. Note that I use an lr schedule below to decrease learning rate linearly over time.

In [None]:
# %load ../train/rllib/ppo.py
import ray

from ray import tune

from ship_gym.config import GameConfig, EnvConfig
from ship_gym.ship_env import ShipEnv

import multiprocessing

if __name__ == "__main__":

    ray.init(num_gpus=1)

    def env_creator(env_config):
        """
        Build a ShipEnv for RLlib.
        :param env_config: the env config Tune passes to every registered creator; not used, EnvConfig is
                           passed to ShipEnv instead
        """
        # The settings are applied inside the creator because it runs in the worker processes; attribute
        # changes made on GameConfig in the driver are not guaranteed to reach them.
        game_config = GameConfig
        game_config.FPS = 100000
        game_config.SPEED = 40
        game_config.BOUNDS = (1000, 1000)

        return ShipEnv(game_config, EnvConfig)

    experiments = {
        "shipgym_best": {
            "run": "PPO",
            "stop": {
                "time_total_s": 12 * 60 * 60  # 12 hours
            },
            "env": "ship-gym-v1",
            "config": {
                "num_gpus": 1,
                "num_workers": multiprocessing.cpu_count() - 1,
                "num_sgd_iter": 10,
                "sgd_minibatch_size": 2048,
                "train_batch_size": 10000,
                # Pairs of [timestep, learning rate]
                "lr_schedule": [[0, 0.001], [5e6, 0.0001], [1e7, 0.00001]]
            },
        },
    }
    tune.register_env("ship-gym-v1", env_creator)
    tune.run_experiments(experiments)
