# Ship Sim Gym

Quick overview of training on a custom environment as it is slightly different from a normal one


In [1]:
# Imports

import os
import sys

# This makes sure we have access to the top level directory and so the containing modules
nb_dir = os.path.split(os.getcwd())[0]
if nb_dir not in sys.path:
    sys.path.append(nb_dir)

from ship_gym.config import EnvConfig, GameConfig
from collections import deque

import numpy as np

from gym import Env
from gym.spaces import Discrete, Box
from gym.utils import seeding

from pymunk import Vec2d

from ship_gym.curriculum import Curriculum
from ship_gym.game import ShipGame

Loading chipmunk for Darwin (64bit) [/Users/gerard/miniconda3/envs/ship-sim-gym-3.6/lib/python3.6/site-packages/pymunk/libchipmunk.dylib]
pygame 1.9.4
Hello from the pygame community. https://www.pygame.org/contribute.html


# The Ship Game

This is a basic top down ship simulator built with pygame for the graphics and uses pymunk for physics.

In [3]:
# The Game
# This doesn't do so well in Jupyter notebook. If it gives you issues run it from commandline

%run ../ship_gym/game.py

Init game at speed =  1
Init game at fps =  30


## OpenAI Gym Environment

This creates a wrapper for the ship game I made

The way you create an OpenAI gym is by defining a class that overrides the gym.Env class and override a few properties:

```
action_space
observation_space
```

and a few functions:

```
step()
reset()
render()
```

In [12]:
# Environment / Gym

DEFAULT_STATE_VAL = -1
STEP_PENALTY = -0.01

class ShipEnv(Env):

    metadata = {'render.modes': ['human', 'rgb_array']}
    action_space = Discrete(5)
    reward_range = (-1, 1)

    # TODO: Derive the discrete actions
    def __init__(self, game_config, env_config):

        # TODO: Should add some basic sanity checks (max_steps > 0 etc.)
        self.last_action = None
        self.last_action = None
        self.reward = 0
        self.cumulative_reward = 0
        self.step_count = 0
        self.env_config = env_config

        self.game = ShipGame(game_config)
        self.episodes_count = -1 # Because the first reset will increment it to 0
        self.n_states = 2 + 1 + 1 + 2 + self.game.player.lidar.n_beams
        self.states_history = self.n_states * self.env_config.HISTORY_SIZE

        if self.env_config.HISTORY_SIZE < 1:
            raise ValueError("history_size must be greater than zero")
        self.observation_space = Box(low=0, high=max(self.game.bounds), shape=(self.states_history,), dtype=np.uint8)

        # print(" *** SHIP-GYM INITIALIZED *** ")

    def seed(self, seed=None):
        """
        Small but extremely important function, this makes sure that every environment you create is slightly different
        otherwise parallelization is useless since the states will be exactly the same!
        """
        self.np_random, seed = seeding.np_random(seed)
        np.random.seed(seed)
        return [seed]

    def determine_reward(self):

        if self.game.colliding:
            self.reward = -1.0
        if self.game.goal_reached:
            self.reward = 1.0

        # TODO: Code duplication with is_done()
        elif self.game.player.x < 0 or self.game.player.x > self.game.bounds[0]:
            self.reward = -1
        elif self.game.player.y < 0 or self.game.player.y > self.game.bounds[1]:
            self.reward = -1
        else:
            self.reward = STEP_PENALTY  # Small penalty

    def _normalized_coords(self, x, y):
        return x / self.game.bounds[0], y / self.game.bounds[1]

    def __add_states(self):
        '''
        Push back some new state information for the current timestep onto the FIFO queue for all history timesteps
        it keeps track of.

        Layout of a single time step state is like this:

        Px Py R Gx Gy L1 L2 ... Ln

        Where
        P is the player position
        A is the player angle
        R is the rudder angle
        G is the nearest goal position
        L are the lidar values
        N is the number of rays lidar uses


        :return: the complete history buffer of states extended with the most recent one
        '''

        states = self.n_states * [-1]
        goal = self.game.closest_goal()
        goal_pos = [-1, -1]
        player = self.game.player

        if goal:
            goal_pos = [goal.body.position.x, goal.body.position.y]
        states[:6] = [player.x, player.y, player.rudder_angle, player.body.angle, goal_pos[0], goal_pos[1]]

        lidar_vals = self.game.player.lidar.vals

        states[6:] = lidar_vals
        self.states.extend(states)

    def is_done(self):
        if self.game.colliding:
            # print("OOPS --- COLLISION")
            return True
        elif len(self.game.goals) == 0:
            print("ALL GOALS REACHED! -- CUMULATIVE REWARD = ", self.cumulative_reward)
            return True

        player = self.game.player
        if player.x < 0 or player.x > self.game.bounds[0]:
            print("X out of bounds")
            return True
        elif player.y < 0 or player.y > self.game.bounds[1]:
            print("Y out of bounds")
            return True

        if self.step_count >= self.env_config.MAX_STEPS:
            print("MAX STEPS")
            return True

        return False

    def step(self, action):
        assert self.action_space.contains(action), "%r (%s) invalid" % (action, type(action))

        self.game.handle_action(action)
        self.game.update()
        self.game.render()

        self.determine_reward()
        self.cumulative_reward += self.reward
        self.__add_states()
        self.step_count += 1

        done = self.is_done()

        return np.array(self.states), self.reward, done, {}

    def render(self, mode='human', close=False):
        """
        This could be a rendered frame or just some stats that are used for debugging
        """
        out = sys.stdout

        if self.last_action is not None:
            out.write(f'action={self.last_action}, cum_reward={self.cumulative_reward}')

        return

    def reset(self):
        self.game.reset()

        self.last_action = None
        self.reward = 0
        self.cumulative_reward = 0
        self.step_count = 0
        self.episodes_count += 1

        n = self.n_states * self.env_config.HISTORY_SIZE
        self.states = deque([DEFAULT_STATE_VAL] * n, maxlen=n)
        self.__add_states()

        return np.array(self.states)


In [13]:
game_config = GameConfig
game_config.FPS = 1000
game_config.SPEED = 30
game_config.BOUNDS = (1000, 1000)

def make_env(rank, game_config, env_config, seed=0):
        """
        Utility function for multiprocessed env.

        :param n_goals:
        :param env_id: (str) the environment ID
        :param num_env: (int) the number of environment you wish to have in subprocesses
        :param seed: (int) the inital seed for RNG
        :param rank: (int) index of the subprocess
        """

        def _init():
            env_config = EnvConfig
            env = ShipEnv(game_config, env_config)
            return env

        return _init

## Random Agent

A simple baseline to compare against is a random agent

In [15]:
gc = GameConfig
gc.FPS = 10
gc.SPEED = 10
gc.BOUNDS = (400,400)

env = ShipEnv(game_config=gc, env_config=EnvConfig)
env.reset()

rewards = list()

for _ in range(1):

    episode_reward = 0
    for _ in range(1000):
        env.render()

        states, reward, done, _ = env.step(env.action_space.sample()) # take a random action
        # ret = env.step(0) # take a random action

        episode_reward += reward
        
        print("Reward = ", reward)
        print("States = ", states)
        
        if done == True:
            print(f"AGENT IS DONE. TOTAL REWARD = {episode_reward}")
            rewards.append(episode_reward)
            env.reset()
            break

Init game at speed =  10
Init game at fps =  10
Reward =  -0.01
States =  [200.          25.           0.           0.         187.19246125
  69.66666667  -1.          -1.          -1.          -1.
  -1.          -1.          -1.          -1.          -1.
  -1.         200.          25.         -10.           0.
 187.19246125  69.66666667  -1.          -1.          -1.
  -1.          -1.          -1.          -1.          -1.
  -1.          -1.        ]
Reward =  -0.01
States =  [200.          25.         -10.           0.         187.19246125
  69.66666667  -1.          -1.          -1.          -1.
  -1.          -1.          -1.          -1.          -1.
  -1.         200.          25.           0.           0.
 187.19246125  69.66666667  -1.          -1.          -1.
  -1.          -1.          -1.          -1.          -1.
  -1.          -1.        ]
Reward =  -0.01
States =  [200.          25.           0.           0.         187.19246125
  69.66666667  -1.          -1.         

In [16]:
game_config = GameConfig
game_config.FPS = 1000
game_config.SPEED = 30
game_config.BOUNDS = (1000, 1000)

def env_creator(env_config):

    env_config = EnvConfig
    env = ShipEnv(game_config, env_config)

    return env

register_env("ShipGym-v1", env_creator)

pbt = PopulationBasedTraining(
time_attr="time_total_s",
reward_attr="episode_reward_mean",
perturbation_interval=300, # 5 mins
resample_probability=0.25,

# Specifies the mutations of these hyperparams
hyperparam_mutations={
    "lambda": lambda: random.uniform(0.9, 1.0),
    "clip_param": lambda: random.uniform(0.01, 0.5),
    "lr": [1e-3, 5e-4, 1e-4, 5e-5, 1e-5],
    "num_sgd_iter": lambda: random.randint(1, 30),
    "sgd_minibatch_size": lambda: random.randint(128, 16384),
    "train_batch_size": lambda: random.randint(2000, 160000),
})

ray.init()

n_goals = 5
reward_done = .9*n_goals

run_experiments(
{
    "pbt_ship_sim": {
        "run": "PPO",
        "env": "ShipGym-v1",
        "num_samples": 12, # Repeat the experiment this many times
        "checkpoint_at_end" : True,
        "checkpoint_freq" : 10,
        "config": {
            "kl_coeff": 1.0,
            "num_workers": multiprocessing.cpu_count() - 1,
            "num_gpus": 1,
            
            # These params are tuned from a fixed starting value.
            "lambda": 0.95,
            "clip_param": 0.2,
            "lr" : 5.0e-4,
            "num_sgd_iter":
                lambda spec: random.choice([10, 20, 30]),
            "sgd_minibatch_size":
                lambda spec: random.choice([128, 512, 2048]),
            "train_batch_size":
                lambda spec: random.choice([10000, 20000, 40000])
        },
    },
}, scheduler=pbt) # Reference the scheduler

NameError: name 'register_env' is not defined

## Training Visualisation

rllib has very nice Tensorboard integration.

http://185.165.71.107:6006/

Insert TensorBoard image

SyntaxError: invalid syntax (<ipython-input-19-0b0dd02cf49b>, line 1)