## Waterworld Game in PLE environment

In [8]:
from ple.games.waterworld import WaterWorld

In [9]:
game = WaterWorld()

In [4]:
from ple import PLE

p = PLE(game, fps=30, display_screen=True, force_fps=False)
p.init()

In [11]:
print (p.game.actions)


{'up': 119, 'right': 100, 'down': 115, 'left': 97}


In [7]:
nb_frames = 1000
reward = 0.0

for f in range(nb_frames):
    if p.game_over(): #check if the game is over
        p.reset_game()

    obs = p.getScreenRGB()
    action=119
    #     action = myAgent.pickAction(reward, obs)
    reward = p.act(action)

In [2]:
import numpy as np
from ple import PLE
from ple.games.waterworld import WaterWorld


# lets adjust the rewards our agent recieves
rewards = {
    "tick": -0.01,  # each time the game steps forward in time the agent gets -0.1
    "positive": 1.0,  # each time the agent collects a green circle
    "negative": -5.0,  # each time the agent bumps into a red circle
}

# make a PLE instance.
# use lower fps so we can see whats happening a little easier
game = WaterWorld(width=256, height=256, num_creeps=8)
p = PLE(game, fps=15, force_fps=False, display_screen=True,
        reward_values=rewards)
# we pass in the rewards and PLE will adjust the game for us

p.init()
actions = p.getActionSet()
for i in range(1000):
    if p.game_over():
        p.reset_game()

    action = actions[np.random.randint(0, len(actions))]  # random actions
    reward = p.act(action)

    print ("Score: {:0.3f} | Reward: {:0.3f} ".format(p.score(), reward))


couldn't import doomish
Couldn't import doom
Score: 0.990 | Reward: 0.990 
Score: 0.980 | Reward: -0.010 
Score: 0.970 | Reward: -0.010 
Score: 0.960 | Reward: -0.010 
Score: 0.950 | Reward: -0.010 
Score: 0.940 | Reward: -0.010 
Score: 0.930 | Reward: -0.010 
Score: 0.920 | Reward: -0.010 
Score: 0.910 | Reward: -0.010 
Score: 0.900 | Reward: -0.010 
Score: 0.890 | Reward: -0.010 
Score: 0.880 | Reward: -0.010 
Score: 0.870 | Reward: -0.010 
Score: 0.860 | Reward: -0.010 
Score: 0.850 | Reward: -0.010 
Score: 0.840 | Reward: -0.010 
Score: 0.830 | Reward: -0.010 
Score: 0.820 | Reward: -0.010 
Score: 0.810 | Reward: -0.010 
Score: 0.800 | Reward: -0.010 
Score: 0.790 | Reward: -0.010 
Score: 0.780 | Reward: -0.010 
Score: 0.770 | Reward: -0.010 
Score: 0.760 | Reward: -0.010 
Score: 0.750 | Reward: -0.010 
Score: 0.740 | Reward: -0.010 
Score: 0.730 | Reward: -0.010 
Score: 0.720 | Reward: -0.010 
Score: 0.710 | Reward: -0.010 
Score: 0.700 | Reward: -0.010 
Score: 0.690 | Reward: -0.

KeyboardInterrupt: 

### DQN agent class 

This class is DQN based agent taking image as input
- 

In [1]:
import numpy as np
from collections import deque

# keras and model related
from keras.models import Sequential
from keras.layers.core import Dense, Flatten
from keras.layers.convolutional import Convolution2D
from keras.optimizers import SGD, Adam, RMSprop
import theano.tensor as T
import os

os.environ["KERAS_BACKEND"] = "theano"
os.environ["CUDA_VISIBLE_DEVICES"]="0"

class ExampleAgent():
    """
        Implements a DQN-ish agent. It has replay memory and epsilon decay. It is missing model freezing. The models are sensitive to the parameters and if applied to other games must be tinkered with.
    """

    def __init__(self, env, batch_size, num_frames,
                 frame_skip, lr, discount, rng, optimizer="adam", frame_dim=None):

        self.env = env
        self.batch_size = batch_size
        self.num_frames = num_frames
        self.frame_skip = frame_skip
        self.lr = lr
        self.discount = discount
        self.rng = rng

        if optimizer == "adam":
            opt = Adam(lr=self.lr)
        elif optimizer == "sgd":
            opt = SGD(lr=self.lr)
        elif optimizer == "sgd_nesterov":
            opt = SGD(lr=self.lr, nesterov=True)
        elif optimizer == "rmsprop":
            opt = RMSprop(lr=self.lr, rho=0.9, epsilon=0.003)
        else:
            raise ValueError("Unrecognized optmizer")

        self.optimizer = opt

        self.frame_dim = self.env.getScreenDims() if frame_dim is None else frame_dim
        self.state_shape = (num_frames,) + self.frame_dim
        self.input_shape = (batch_size,) + self.state_shape

        self.state = deque(maxlen=num_frames)
        self.actions = self.env.getActionSet()
        self.num_actions = len(self.actions)
        self.model = None

    def q_loss(self, y_true, y_pred):
        # assume clip_delta is 1.0
        # along with sum accumulator.
        diff = y_true - y_pred
        _quad = T.minimum(abs(diff), 1.0)
        _lin = abs(diff) - _quad
        loss = 0.5 * _quad ** 2 + _lin
        loss = T.sum(loss)

        return loss

    def build_model(self):

        model = Sequential()
        model.add(Convolution2D(
            16, 8, 8, input_shape=(self.num_frames,) + self.frame_dim,
            subsample=(4, 4), activation="relu", init="he_uniform"
        ))
        model.add(Convolution2D(
            16, 4, 4, subsample=(2, 2), activation="relu", init="he_uniform"
        ))
        model.add(Convolution2D(
            32, 3, 3, subsample=(1, 1), activation="relu", init="he_uniform"
        ))
        model.add(Flatten())
        model.add(Dense(
            512, activation="relu", init="he_uniform"
        ))
        model.add(Dense(
            self.num_actions, activation="linear", init="he_uniform"
        ))

        model.compile(loss=self.q_loss, optimizer=self.optimizer)

        self.model = model

    def predict_single(self, state):
        """
            model is expecting a batch_size worth of data. We only have one states worth of
            samples so we make an empty batch and set our state as the first row.
        """
        states = np.zeros(self.input_shape)
        states[0, ...] = state.reshape(self.state_shape)

        return self.model.predict(states)[0]  # only want the first value

    def _argmax_rand(self, arr):
        # picks a random index if there is a tie
        return self.rng.choice(np.where(arr == np.max(arr))[0])

    def _best_action(self, state):
        q_vals = self.predict_single(state)

        return self._argmax_rand(q_vals)  # the action with the best Q-value

    def act(self, state, epsilon=1.0):
        self.state.append(state)

        action = self.rng.randint(0, self.num_actions)
        if len(self.state) == self.num_frames:  # we havent seen enough frames
            _state = np.array(self.state)

            if self.rng.rand() > epsilon:
                action = self._best_action(_state)  # exploit

        reward = 0.0
        for i in range(self.frame_skip):  # we repeat each action a few times
            # act on the environment
            reward += self.env.act(self.actions[action])

        reward = np.clip(reward, -1.0, 1.0)

        return reward, action

    def start_episode(self, N=3):
        self.env.reset_game()  # reset
        for i in range(self.rng.randint(N)):
            self.env.act(self.env.NOOP)  # perform a NOOP

    def end_episode(self):
        self.state.clear()


class ReplayMemory():

    def __init__(self, max_size, min_size):
        self.min_replay_size = min_size
        self.memory = deque(maxlen=max_size)

    def __len__(self):
        return len(self.memory)

    def add(self, transition):
        self.memory.append(transition)

    def train_agent_batch(self, agent):
        if len(self.memory) > self.min_replay_size:
            states, targets = self._random_batch(agent)  # get a random batch
            return agent.model.train_on_batch(states, targets)  # ERR?
        else:
            return None

    def _random_batch(self, agent):
        inputs = np.zeros(agent.input_shape)
        targets = np.zeros((agent.batch_size, agent.num_actions))

        seen = []
        idx = agent.rng.randint(
            0,
            high=len(
                self.memory) -
            agent.num_frames -
            1)

        for i in range(agent.batch_size):
            while idx in seen:
                idx = agent.rng.randint(0, high=len(
                    self.memory) - agent.num_frames - 1)

            states = np.array([self.memory[idx + j][0]
                               for j in range(agent.num_frames + 1)])
            art = np.array([self.memory[idx + j][1:]
                            for j in range(agent.num_frames)])

            actions = art[:, 0].astype(int)
            rewards = art[:, 1]
            terminals = art[:, 2]

            state = states[:-1]
            state_next = states[1:]

            inputs[i, ...] = state.reshape(agent.state_shape)
            # we could make zeros but pointless.
            targets[i] = agent.predict_single(state)
            Q_prime = np.max(agent.predict_single(state_next))

            targets[i, actions] = rewards + \
                (1 - terminals) * (agent.discount * Q_prime)

            seen.append(idx)

        return inputs, targets


def loop_play_forever(env, agent):
    # our forever play loop
    try:
        # slow it down
        env.display_screen = True
        env.force_fps = False

        while True:
            agent.start_episode()
            episode_reward = 0.0
            while env.game_over() == False:
                state = env.getGameState()
                reward, action = agent.act(state, epsilon=0.05)
                episode_reward += reward

            print ("Agent score {:0.1f} reward for episode.".format(episode_reward))
            agent.end_episode()

    except KeyboardInterrupt:
        print ("Exiting out!")

Using Theano backend.
Using gpu device 0: GeForce GTX TITAN X (CNMeM is enabled with initial size: 90.0% of memory, cuDNN Mixed dnn version. The header is from one version, but we link with a different version (6021, 5110))


In [3]:
# thanks to @edersantana and @fchollet for suggestions & help.

import numpy as np
from ple import PLE  # our environment
from ple.games.catcher import Catcher
from ple.games.waterworld import WaterWorld

from keras.models import Sequential
from keras.layers.core import Dense
from keras.optimizers import SGD

# from example_support import ExampleAgent, ReplayMemory, loop_play_forever


class Agent(ExampleAgent):
    """
        Our agent takes 1D inputs which are flattened.
        We define a full connected model below.
    """

    def __init__(self, *args, **kwargs):
        ExampleAgent.__init__(self, *args, **kwargs)

        self.state_dim = self.env.getGameStateDims()
        self.state_shape = np.prod((num_frames,) + self.state_dim)
        self.input_shape = (batch_size, self.state_shape)

    def build_model(self):
        model = Sequential()
        model.add(Dense(
            input_dim=self.state_shape, output_dim=256, activation="relu", init="he_uniform"
        ))
        model.add(Dense(
            512, activation="relu", init="he_uniform"
        ))
        model.add(Dense(
            self.num_actions, activation="linear", init="he_uniform"
        ))

        model.compile(loss=self.q_loss, optimizer=SGD(lr=self.lr))

        self.model = model


def nv_state_preprocessor(state):
    """
        This preprocesses our state from PLE. We rescale the values to be between
        0,1 and -1,1.
    """
    # taken by inspection of source code. Better way is on its way!
    max_values = np.array([128.0, 20.0, 128.0, 128.0])
#     import pdb;pdb.set_trace()
    state = np.array(list(state.values())) / max_values

    return state.flatten()

if __name__ == "__main__":
    # this takes about 15 epochs to converge to something that performs decently.
    # feel free to play with the parameters below.

    # training parameters
    num_epochs = 15
    num_steps_train = 15000  # steps per epoch of training
    num_steps_test = 3000
    update_frequency = 4  # step frequency of model training/updates

    # agent settings
    batch_size = 32
    num_frames = 4  # number of frames in a 'state'
    frame_skip = 2
    # percentage of time we perform a random action, help exploration.
    epsilon = 0.15
    epsilon_steps = 30000  # decay steps
    epsilon_min = 0.1
    lr = 0.01
    discount = 0.95  # discount factor
    rng = np.random.RandomState(24)

    # memory settings
    max_memory_size = 100000
    min_memory_size = 1000  # number needed before model training starts

    epsilon_rate = (epsilon - epsilon_min) / epsilon_steps

    # PLE takes our game and the state_preprocessor. It will process the state
    # for our agent.
    game = WaterWorld(width=128, height=128)
    env = PLE(game, fps=60, state_preprocessor=nv_state_preprocessor, display_screen=True, force_fps=False)

    agent = Agent(env, batch_size, num_frames, frame_skip, lr,
                  discount, rng, optimizer="sgd_nesterov")
    agent.build_model()

    memory = ReplayMemory(max_memory_size, min_memory_size)

    env.init()

    for epoch in range(1, num_epochs + 1):
        steps, num_episodes = 0, 0
        losses, rewards = [], []
        env.display_screen = False

        # training loop
        while steps < num_steps_train:
            episode_reward = 0.0
            agent.start_episode()

            while env.game_over() == False and steps < num_steps_train:
                state = env.getGameState()
                reward, action = agent.act(state, epsilon=epsilon)
                memory.add([state, action, reward, env.game_over()])

                if steps % update_frequency == 0:
                    loss = memory.train_agent_batch(agent)

                    if loss is not None:
                        losses.append(loss)
                        epsilon = np.max([epsilon_min, epsilon - epsilon_rate])

                episode_reward += reward
                steps += 1

            if num_episodes % 5 == 0:
                print ("Episode {:01d}: Reward {:0.1f}".format(num_episodes, episode_reward))

            rewards.append(episode_reward)
            num_episodes += 1
            agent.end_episode()

        print ("\nTrain Epoch {:02d}: Epsilon {:0.4f} | Avg. Loss {:0.3f} | Avg. Reward {:0.3f}".format(epoch, epsilon, np.mean(losses), np.sum(rewards) / num_episodes)
)
        steps, num_episodes = 0, 0
        losses, rewards = [], []

        # display the screen
        env.display_screen = True

        # slow it down so we can watch it fail!
        env.force_fps = False

        # testing loop
        while steps < num_steps_test:
            episode_reward = 0.0
            agent.start_episode()

            while env.game_over() == False and steps < num_steps_test:
                state = env.getGameState()
                reward, action = agent.act(state, epsilon=0.05)

                episode_reward += reward
                steps += 1

                # done watching after 500 steps.
                if steps > 500:
                    env.force_fps = True
                    env.display_screen = False

            if num_episodes % 5 == 0:
                print ("Episode {:01d}: Reward {:0.1f}".format(num_episodes, episode_reward))

            rewards.append(episode_reward)
            num_episodes += 1
            agent.end_episode()

        print ("Test Epoch {:02d}: Best Reward {:0.3f} | Avg. Reward {:0.3f}".format(epoch, np.max(rewards), np.sum(rewards) / num_episodes))

    print ("\nTraining complete. Will loop forever playing!")
    loop_play_forever(env, agent)

ValueError: operands could not be broadcast together with shapes (6,) (4,) 

In [5]:
np.max([0.12, -.3333])

0.12