# Gym Test Environment

In [None]:
# The typical imports
import gym
from gym.wrappers import Monitor
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Imports specifically so we can render outputs in Jupyter.
from JSAnimation.IPython_display import display_animation
from matplotlib import animation
from IPython.display import display

import keras
import itertools
from tqdm import tqdm_notebook as tqdm

In [None]:
def display_frames_as_gif(frames):
    """Show a sequence of image frames as a looping animation with playback controls.

    Params:
    - frames: indexable, non-empty sequence of images that ``plt.imshow`` can draw.
      The first frame is drawn right away and the remaining ones replace its
      pixel data one at a time.
    """
    image = plt.imshow(frames[0])
    plt.axis('off')
    figure = plt.gcf()

    def show_frame(index):
        # Swap the pixel data of the already drawn image instead of plotting again.
        image.set_data(frames[index])

    movie = animation.FuncAnimation(figure, show_frame, frames=len(frames), interval=50)
    widget = display_animation(movie, default_mode='loop')
    display(widget)

In [None]:

# Play one complete episode with random actions and show it as an animation.
env = gym.make('BreakoutDeterministic-v4')
frames = []
# reset() returns the starting frame of the episode.
frame = env.reset()

is_done = False
while not is_done:
    # step() returns the new frame, the reward and whether the game is over.
    frame, reward, is_done, _ = env.step(env.action_space.sample())
    # Keep the RGB rendering of every step for the animation below.
    frames.append(env.render(mode = 'rgb_array'))
env.close()
display_frames_as_gif(frames)


# Preprocessing

In [None]:
def to_grayscale(img):
    """Average ``img`` over axis 2 (the colour channels) and cast the result to uint8."""
    channel_average = np.mean(img, axis=2)
    return channel_average.astype(np.uint8)

def downsample(img):
    """Keep every second row and every second column of ``img``; other axes are untouched."""
    every_other = slice(None, None, 2)
    return img[every_other, every_other]

def preprocess(img):
    """Shrink a frame with ``downsample`` and then reduce it with ``to_grayscale``."""
    smaller = downsample(img)
    return to_grayscale(smaller)

In [None]:
def transform_reward(reward):
    """Reduce a reward to its sign: -1 for negative, 0 for zero, +1 for positive values."""
    clipped = np.sign(reward)
    return clipped

# The Model

In [None]:
def fit_batch(model, gamma, start_states, actions, rewards, next_states, is_terminal):
    """Do one deep Q learning iteration.

    Params:
    - model: The DQN. It is called as ``model.predict([states, mask])`` and
      ``model.fit([states, mask], targets, ...)``.
    - gamma: Discount factor (should be 0.99)
    - start_states: numpy array of starting states
    - actions: numpy array of one-hot encoded actions corresponding to the start states
    - rewards: numpy array of rewards corresponding to the start states and actions
    - next_states: numpy array of the resulting states corresponding to the start states and actions
    - is_terminal: per-sample flags of whether the resulting state is terminal. Any
      array-like of truthy/falsy values is accepted; it is converted to a boolean mask.

    Returns nothing; the model is updated in place by ``model.fit``.
    """
    # First, predict the Q values of the next states. Passing ones as the mask
    # makes the model return the Q value of every action.
    next_Q_values = model.predict([next_states, np.ones(actions.shape)])
    # The Q values of the terminal states are 0 by definition, so override them.
    # Coerce to bool first: an integer 0/1 array would otherwise be interpreted as
    # row *indices* (zeroing rows 0 and 1) instead of as a per-row mask.
    terminal_mask = np.asarray(is_terminal, dtype=bool)
    next_Q_values[terminal_mask] = 0
    # The Q value of each start state is the reward + gamma * the max next state Q value
    Q_values = rewards + gamma * np.max(next_Q_values, axis=1)
    # Fit the keras model. The actions are passed as the mask and the targets are
    # multiplied by the actions, so only the taken action contributes to the loss.
    model.fit(
        [start_states, actions], actions * Q_values[:, None],
        epochs=1, batch_size=len(start_states), verbose=0
    )

In [None]:
def atari_model(n_actions):
    """Build and compile the masked DQN used for Atari frames.

    The network takes two inputs: a stack of frames (named ``'frames'``) and a
    per-action mask (named ``'mask'``). Its output is the vector of Q values
    multiplied element-wise by the mask, so a one-hot mask selects the Q value
    of a single action and an all-ones mask returns all of them.

    Params:
    - n_actions: number of valid actions, i.e. the width of the output and of the mask.

    Returns:
    - a compiled ``keras.models.Model`` (RMSprop optimizer, mean squared error loss).
    """
    # We assume a tensorflow backend here, so the "channels" are last.
    ATARI_SHAPE = (105, 80, 4)

    # With the functional API we need to define the inputs.
    frames_input = keras.layers.Input(ATARI_SHAPE, name='frames')
    actions_input = keras.layers.Input((n_actions,), name='mask')

    # Assuming that the input frames are still encoded from 0 to 255. Transforming to [0, 1].
    normalized = keras.layers.Lambda(lambda x: x / 255.0)(frames_input)

    # "The first hidden layer convolves 16 8×8 filters with stride 4 with the input image and applies a rectifier nonlinearity."
    conv_1 = keras.layers.Conv2D(16, (8, 8), strides=(4, 4), activation='relu')(normalized)
    # "The second hidden layer convolves 32 4×4 filters with stride 2, again followed by a rectifier nonlinearity."
    conv_2 = keras.layers.Conv2D(32, (4, 4), strides=(2, 2), activation='relu')(conv_1)
    # Flattening the second convolutional layer.
    conv_flattened = keras.layers.Flatten()(conv_2)
    # "The final hidden layer is fully-connected and consists of 256 rectifier units."
    hidden = keras.layers.Dense(256, activation='relu')(conv_flattened)
    # "The output layer is a fully-connected linear layer with a single output for each valid action."
    output = keras.layers.Dense(n_actions)(hidden)
    # Finally, we multiply the output by the mask. This uses the Keras 2 functional
    # merge helper; the Keras 1 form ``merge(..., mode='mul')`` is no longer available.
    filtered_output = keras.layers.multiply([output, actions_input])

    # Keras 2 names these keyword arguments ``inputs`` / ``outputs``.
    model = keras.models.Model(inputs=[frames_input, actions_input], outputs=filtered_output)
    optimizer = keras.optimizers.RMSprop(lr=0.00025, rho=0.95, epsilon=0.01)
    model.compile(optimizer, loss='mse')
    return model

## Memory Tool

The idea behind experience replay is quite simple: at each Q-learning iteration, you play one step in the game, but instead of updating the model based on that last step, you add all the relevant information from the step you just took (current state, next state, action taken, reward and whether the next state is terminal) to a finite-size memory (of 1,000,000 elements in this case), and then call fit_batch on a sample of that memory (of 32 elements in our case). Before doing any iterations on the neural network, we prefill the memory with a random policy up to a certain number of elements (50,000 in our case).


This ring buffer supports most of what you would expect (iteration, getting an arbitrary item etc.), but won’t work with random.sample. I recommend you simply implement a quick random sampling function for this. It shouldn’t be hard.

In [None]:
class RingBuf:
    """Fixed-capacity FIFO buffer that overwrites its oldest element when full.

    Elements are addressed logically: index 0 is the oldest stored element and
    index ``len(buf) - 1`` the newest, regardless of where they sit in the
    underlying list.
    """

    def __init__(self, size):
        """Create an empty buffer able to hold ``size`` elements."""
        # Pro-tip: when implementing a ring buffer, always allocate one extra element,
        # this way, self.start == self.end always means the buffer is EMPTY, whereas
        # if you allocate exactly the right number of elements, it could also mean
        # the buffer is full. This greatly simplifies the rest of the code.
        self.data = [None] * (size + 1)
        self.start = 0
        self.end = 0

    def append(self, element):
        """Store ``element`` as the newest entry, evicting the oldest one if the buffer is full."""
        self.data[self.end] = element
        self.end = (self.end + 1) % len(self.data)
        # end == start and yet we just added one element. This means the buffer has one
        # too many element. Remove the first element by incrementing start.
        if self.end == self.start:
            self.start = (self.start + 1) % len(self.data)

    def add(self, state, action, new_frame, reward, is_done):
        """Append one transition as the tuple ``(state, action, new_frame, reward, is_done)``."""
        self.append((state, action, new_frame, reward, is_done))

    def sample_batch(self, length):
        """Draw stored elements at random and regroup them field by field.

        With transitions stored through ``add`` the result unpacks as
        states_batch, action_batch, next_states_batch, reward_batch, done_batch.

        Params:
        - length: number of elements to draw (uniformly, with replacement). If it
          exceeds the number of stored elements, every stored element is returned
          exactly once instead.

        Returns:
        - a tuple with one numpy array per field of the stored elements.

        Raises:
        - ValueError: if the buffer is empty.
        """
        count = len(self)
        if count == 0:
            raise ValueError("cannot sample from an empty RingBuf")
        if length > count:
            # Only the live elements: self.data also holds unused None slots.
            samples = list(self)
        else:
            # Sample logical positions so the draw stays valid after the buffer
            # has wrapped around (self.end < self.start).
            indices = np.random.randint(0, count, length)
            samples = [self[i] for i in indices]
        return tuple(map(np.array, zip(*samples)))

    def __getitem__(self, idx):
        """Return the element at logical position ``idx`` (0 is the oldest); no bounds check is done."""
        return self.data[(self.start + idx) % len(self.data)]

    def __len__(self):
        """Number of elements currently stored."""
        if self.end < self.start:
            return self.end + len(self.data) - self.start
        else:
            return self.end - self.start

    def __iter__(self):
        """Iterate over the stored elements from oldest to newest."""
        for i in range(len(self)):
            yield self[i]

In [None]:
# Smoke test: fill a ring buffer of capacity 10 with ten two-element arrays [i, 2*i].
test = RingBuf(10)
for i in range(10):
    test.append(np.array([i, i*2]))

In [None]:
# Draw three entries; sample_batch regroups them field by field, so `i` collects the
# first components and `i2` the second ones (note that `i` rebinds the loop variable).
i, i2 = test.sample_batch(3)
i, i2

# Putting it all together

Here are some functions that you have to implement yourself when following the blog post. These are called in the `q_iteration` function. Also the `sample_batch` function got added to the `RingBuf` class.

In [None]:
def get_epsilon_for_iteration(iteration):
    """Exploration rate for a training iteration.

    Decays linearly from 1 at iteration 0 with slope -9e-7 and is held at 0.1
    from iteration 1,000,000 onwards.
    """
    return 0.1 if iteration >= 1000000 else (-9e-7) * iteration + 1

In [None]:
# Start of the schedule: the linear branch evaluates to 1 at iteration 0.
get_epsilon_for_iteration(0)

In [None]:
# Last iteration on the linear branch: expected to be just above 0.1.
get_epsilon_for_iteration(999999)

In [None]:
# First iteration on the constant branch: returns 0.1.
get_epsilon_for_iteration(1000000)

In [None]:
def choose_best_action(epsilon, model, state):
    """Pick an action epsilon-greedily.

    Params:
    - epsilon: probability of taking a random action sampled from the global
      ``env``'s action space instead of asking the model.
    - model: the DQN; called as ``model.predict([states, mask])``. Its
      ``output_shape`` supplies the number of actions.
    - state: a single (unbatched) state, e.g. the current frame stack.

    Returns:
    - the sampled action, or the index of the largest predicted Q value.
    """
    if np.random.random() < epsilon:
        return env.action_space.sample()

    # The model expects a batch, so wrap the single state into a batch of one.
    state_ext = np.expand_dims(state, axis=0)
    # One mask row per sample in the batch (here exactly one), with one entry
    # per action. All ones means "return the Q value of every action".
    n_actions = model.output_shape[-1]
    action_mask = np.ones((1, n_actions))
    q_values = model.predict([state_ext, action_mask])[0]
    # TODO: do not use argmax but multinomal (needs a softmax layer?)
    return np.argmax(q_values)

The paper suggests feeding the last four frames to the network, so that it can figure out the speed, direction and acceleration of an object. At the beginning of each game, we initialise a buffer with four copies of the very first frame. At each game step we remove the oldest frame and append the new one. To do that we implement a frame buffer class.

In [None]:
class Frame_Buffer:
    """Sliding window over the most recent frames, stacked along axis 2."""

    def __init__(self, size, frame):
        """Start the window with ``size`` copies of ``frame``."""
        self.size = size
        copies = [frame for _ in range(self.size)]
        self.buffer = np.stack(copies, axis = 2)

    def get(self):
        """Return the current stack of frames."""
        return self.buffer

    def add(self, frame):
        """Drop the oldest frame (index 0 on axis 2) and put ``frame`` at the end.

        A new array is bound to ``self.buffer``; arrays handed out earlier by
        ``get`` are left unchanged.
        """
        kept = self.buffer[:, :, 1:]
        incoming = np.expand_dims(frame, axis=2)
        self.buffer = np.concatenate((kept, incoming), axis=2)

In [None]:

# Quick check of Frame_Buffer. This cell relies on the `env` created in an earlier cell.
state = env.reset()
state = preprocess(state)
# Seed the buffer with four copies of the first preprocessed frame.
buffer = Frame_Buffer(4, state)
# Take one step with action 1 and keep only the returned frame.
new_state, *_ = env.step(1)
new_state = preprocess(new_state)
buffer.add(new_state)
# Expected to match ATARI_SHAPE in atari_model, i.e. (105, 80, 4) — TODO confirm.
buffer.get().shape

This function is from the blogpost.

In [None]:
def q_iteration(env, model, gamma, batch_size, iteration, state_buffer, replay_memory):
    """Play one environment step and fit the model on one sampled minibatch.

    Params:
    - env: the gym environment to step.
    - model: the DQN used to choose the action and to be fitted.
    - gamma: discount factor forwarded to ``fit_batch``.
    - batch_size: number of transitions to sample from the replay memory.
    - iteration: global step counter; determines epsilon.
    - state_buffer: ``Frame_Buffer`` holding the current state; it is advanced
      by the new frame.
    - replay_memory: ``RingBuf`` the new transition is stored in and the
      minibatch is sampled from.

    Returns:
    - (is_done, reward) as reported by ``env.step`` for this step.
    """
    # Choose epsilon based on the iteration
    epsilon = get_epsilon_for_iteration(iteration)

    action = choose_best_action(epsilon, model, state_buffer.get())
    one_hot_action = keras.utils.np_utils.to_categorical(action, num_classes=env.action_space.n)

    new_frame, reward, is_done, _ = env.step(action)
    # Grab the pre-step state before advancing the buffer; Frame_Buffer.add binds a
    # new array, so `frames` keeps referring to the old stack.
    frames = state_buffer.get()
    state_buffer.add(preprocess(new_frame))
    # Use the memory that was passed in rather than a notebook-level global.
    # NOTE(review): the raw reward is stored; transform_reward is not applied here.
    replay_memory.add(frames, one_hot_action, state_buffer.get(), reward, is_done)

    # Sample and fit
    states_batch, action_batch, next_states_batch, reward_batch, done_batch = replay_memory.sample_batch(batch_size)
    fit_batch(model, gamma, states_batch, action_batch, reward_batch, next_states_batch, done_batch)

    return is_done, reward

# Training Loop

Check whether a GPU is available to TensorFlow. If only a CPU device is listed, follow these instructions.

If you already had TensorFlow with GPU support set up once, it may help to reinstall TensorFlow. For Python 3.6 the command is:

    pip install --ignore-installed --upgrade https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.5.0-cp36-cp36m-linux_x86_64.whl


In [None]:
# Print the devices TensorFlow reports for this machine.
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())

In [None]:
# Hyperparams
# Number of transitions sampled from the replay memory per training step.
MINIBATCH_SIZE = 32
# Intended capacity of the replay memory, in transitions.
REPLAY_MEMORY_SIZE = 200000
# Intended number of stacked frames that make up one state.
AGENT_HISTORY_LENGTH = 4
# Passed to q_iteration as gamma.
DISCOUNT_FACTOR = 0.99
# NOTE(review): not referenced by any visible cell of this notebook.
ACTION_REPEAT = 4
# Number of random-policy steps used to prefill the replay memory before training.
REPLAY_START_SIZE = 50000

# Number of games played by the training loop.
EPISODES = 1000

# Directory the Monitor wrapper records into; a video is requested for every
# `record_video_every`-th episode.
!mkdir videos
monitor_path = "./videos"
record_video_every = 20

# The model is saved every `save_model_every`-th episode.
save_model_every = 20

In [None]:
# Fresh environment, frame stack, replay memory and model for training.
env = gym.make('BreakoutDeterministic-v4')
first_state = env.reset()
# Size the frame stack and the replay memory from the hyperparameter cell instead of
# repeating literals here (the memory was previously hard-coded to 1000000 entries).
frame_buffer = Frame_Buffer(AGENT_HISTORY_LENGTH, preprocess(first_state))
memory = RingBuf(REPLAY_MEMORY_SIZE)
model = atari_model(env.action_space.n)

### Fill Replay Memory with random actions

In [None]:

# Prefill the replay memory by playing REPLAY_START_SIZE steps with random actions.
for _ in tqdm(range(REPLAY_START_SIZE)):
    action = env.action_space.sample()
    one_hot_action = keras.utils.np_utils.to_categorical(action, num_classes=env.action_space.n)

    new_frame, reward, is_done, _ = env.step(action)
    # Capture the pre-step state before the buffer is advanced by the new frame.
    frames = frame_buffer.get()
    frame_buffer.add(preprocess(new_frame))
    memory.add(frames, one_hot_action, frame_buffer.get(), reward, is_done)

    # Game over: start a new game and restart the frame stack from its first frame.
    if is_done:
        first_state = env.reset()
        frame_buffer = Frame_Buffer(4, preprocess(first_state))

### Train Model while playing

In [None]:
import os

# Wrap a fresh environment in a Monitor that records a video for every
# `record_video_every`-th episode into `monitor_path`.
env = gym.make('BreakoutDeterministic-v4')
env = Monitor(env,
              directory=monitor_path,
              resume=True,
              video_callable=lambda count: count % record_video_every == 0)

# Make sure the checkpoint directory exists before the first model.save below;
# otherwise a fresh "Restart & Run All" fails at episode 0.
os.makedirs("./models", exist_ok=True)

total_reward = 0
reward_history = []
# Global step counter across all episodes; it drives the epsilon schedule.
iteration = 0
for episode in tqdm(range(EPISODES)):
    total_reward = 0
    is_done = False
    first_state = env.reset()
    frame_buffer = Frame_Buffer(4, preprocess(first_state))
    while not is_done:
        is_done, reward = q_iteration(env, model, DISCOUNT_FACTOR, MINIBATCH_SIZE, iteration, frame_buffer, memory)
        total_reward += reward
        # Advance the step counter; without this epsilon stays at its initial
        # value of 1 and the agent only ever takes random actions.
        iteration += 1
    print("Finished game: {} with reward: {}. Replay Memory Size: {}".format(episode + 1, total_reward, len(memory)), end='\r')
    reward_history.append(total_reward)
    if episode % save_model_every == 0:
        model.save("./models/ep_{}.hdf5".format(episode))

# Close the Monitor so pending recordings are finalised.
env.close()

In [None]:
# Create the output directory and save the current model to ./models/test.hdf5.
!mkdir models
model.save("./models/test.hdf5")

In [None]:
# Total reward per finished training episode, in playing order.
plt.plot(reward_history)

In [None]:
# Number of transitions currently held in the replay memory.
len(memory)

# Play a game

In [None]:

# Play one game with the trained model (no exploration) and show it as an animation.
env = gym.make('BreakoutDeterministic-v4')
# Reset it, returns the starting frame
frames = []
frame = env.reset()

frame_buffer = Frame_Buffer(4, preprocess(frame))

is_done = False
while not is_done:
    # epsilon = 0.0: always take the action the model rates highest.
    action = choose_best_action(0.0, model, frame_buffer.get())

    # Step with the chosen action (previously a random action was taken here,
    # so the model's choice was never used).
    frame, reward, is_done, _ = env.step(action)
    # Advance the state and keep the rendering for the animation.
    frame_buffer.add(preprocess(frame))
    frames.append(env.render(mode = 'rgb_array'))
env.close()
display_frames_as_gif(frames)
