In [1]:
import gym
import random
import numpy as np
import tensorflow as tf
from collections import deque
from skimage.color import rgb2gray
from skimage.transform import resize
from keras.models import Model
from keras.optimizers import RMSprop
from keras.layers import Input, Dense, Flatten, Lambda, merge
from keras.layers.convolutional import Conv2D
from keras import backend as K

  return f(*args, **kwds)
Using TensorFlow backend.


In [2]:
# The Deterministic-v4 version of the Atari environments is exactly equivalent to what DeepMind used in their paper.
# Specifically, "-v4" essentially implements frame skipping so that the agent acts on every 4th frame. 
# This reduces the number of frames to process and thus reduces training time 
# without sacrificing agent's game performance.
env = gym.make('BreakoutDeterministic-v4')

[2018-02-11 01:58:09,386] Making new env: BreakoutDeterministic-v4


In [3]:
EPISODES = 50000
MODEL_PATH = "model/breakout_dueling_ddqn.h5"
TENSORBOARD_LOG_PATH = 'summary/breakout_dueling_ddqn'
MODEL_CHECKPOINT_FREQUENCY = 100

# actions:
NO_ACTION_NO_BALL = 0
NO_ACTION = 1
MOVE_RIGHT = 2
MOVE_LEFT = 3

In [4]:
# 210x160x3 (RGB color) -> 84x84 (grayscale)
# also transforms floats to 8-bit ints to conserve memory
def preprocess(observed_state):
    return np.uint8(resize(rgb2gray(observed_state), (84, 84), mode='constant') * 255)

In [5]:
def get_initial_state_history(observed_state):
    # No preceding frames at the start of an episode, so just copy 4 times
    state = preprocess(observed_state)
    history = np.stack((state, state, state, state), axis=2)
    history = np.reshape([history], (1, 84, 84, 4))
    return state, history

In [6]:
class DuelingDDQNAgent:
    def __init__(self, action_size):
        self.render = False
        self.load_model = False

        # environment settings
        self.state_size = (84, 84, 4)
        self.action_size = action_size

        # epsilon-greedy policy parameters
        self.epsilon = 1.
        self.epsilon_start, self.epsilon_end = 1.0, 0.1
        self.exploration_steps = 1000000.
        self.epsilon_decay_step = (self.epsilon_start - self.epsilon_end) / self.exploration_steps

        # training parameters
        self.batch_size = 32
        self.train_start = 50000
        self.update_target_rate = 10000
        self.discount_factor = 0.99
        self.memory = deque(maxlen=400000)
        self.no_op_steps = 30

        # build
        self.model = self.build_model()
        self.target_model = self.build_model()
        self.update_target_model()

        self.optimizer = self.optimizer()

        self.sess = tf.InteractiveSession()
        K.set_session(self.sess)

        self.avg_q_max, self.avg_loss = 0, 0
        self.summary_placeholders, self.update_ops, self.summary_op = self.setup_summary()
        self.summary_writer = tf.summary.FileWriter(TENSORBOARD_LOG_PATH, self.sess.graph)
        self.sess.run(tf.global_variables_initializer())

        if self.load_model:
            self.model.load_weights(MODEL_PATH)

    # if cost is in [-1; 1] cost is quadratic to error, otherwise – linear
    def optimizer(self):
        a = K.placeholder(shape=(None,), dtype='int32')
        y = K.placeholder(shape=(None,), dtype='float32')

        py_x = self.model.output

        a_one_hot = K.one_hot(a, self.action_size)
        q_value = K.sum(py_x * a_one_hot, axis=1)
        error = K.abs(y - q_value)

        quadratic_part = K.clip(error, 0.0, 1.0)
        linear_part = error - quadratic_part
        loss = K.mean(0.5 * K.square(quadratic_part) + linear_part)

        optimizer = RMSprop(lr=0.00025, epsilon=0.01)
        updates = optimizer.get_updates(self.model.trainable_weights, [], loss)
        train = K.function([self.model.input, a, y], [loss], updates=updates)

        return train

    # frame representation -> CNN -> advantage/value streams -> merge two streams -> Q-value of each action
    def build_model(self):
        input = Input(shape=self.state_size)
        shared = Conv2D(32, (8, 8), strides=(4, 4), activation='relu')(input)
        shared = Conv2D(64, (4, 4), strides=(2, 2), activation='relu')(shared)
        shared = Conv2D(64, (3, 3), strides=(1, 1), activation='relu')(shared)
        flatten = Flatten()(shared)

        # stream that estimates state-dependent action advantages
        advantage_fc = Dense(512, activation='relu')(flatten)
        advantage = Dense(self.action_size)(advantage_fc)
        advantage = Lambda(lambda a: a[:, :] - K.mean(a[:, :], keepdims=True),
                           output_shape=(self.action_size,))(advantage)

        # stream which estimates state values
        value_fc = Dense(512, activation='relu')(flatten)
        value = Dense(1)(value_fc)
        value = Lambda(lambda s: K.expand_dims(s[:, 0], -1),
                       output_shape=(self.action_size,))(value)

        # merge two streams to produce Q-value
        q_value = merge([value, advantage], mode='sum')
        model = Model(inputs=input, outputs=q_value)
        model.summary()

        return model

    def update_target_model(self):
        self.target_model.set_weights(self.model.get_weights())

    # epsilon-greedy policy
    def get_action(self, history):
        history = np.float32(history / 255.0)
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        else:
            q_value = self.model.predict(history)
            return np.argmax(q_value[0])

    # save sample <s, a, r, s'> to the replay memory
    def replay_memory(self, history, action, reward, next_history, dead):
        self.memory.append((history, action, reward, next_history, dead))

    # sample from replay memory (batches of specified size)
    def train_replay(self):
        if len(self.memory) < self.train_start:
            return
        if self.epsilon > self.epsilon_end:
            self.epsilon -= self.epsilon_decay_step

        mini_batch = random.sample(self.memory, self.batch_size)

        history = np.zeros((self.batch_size, self.state_size[0],
                            self.state_size[1], self.state_size[2]))
        next_history = np.zeros((self.batch_size, self.state_size[0],
                                 self.state_size[1], self.state_size[2]))
        target = np.zeros((self.batch_size,))
        action, reward, dead = [], [], []

        for i in range(self.batch_size):
            history[i] = np.float32(mini_batch[i][0] / 255.)
            next_history[i] = np.float32(mini_batch[i][3] / 255.)
            action.append(mini_batch[i][1])
            reward.append(mini_batch[i][2])
            dead.append(mini_batch[i][4])

        value = self.model.predict(history)
        target_value = self.target_model.predict(next_history)

        # get max Q-value at s' from target model
        for i in range(self.batch_size):
            if dead[i]:
                target[i] = reward[i]
            else:
                # key DDQN idea: select action from model, update value from target model
                target[i] = reward[i] + self.discount_factor * target_value[i][np.argmax(value[i])]

        loss = self.optimizer([history, action, target])
        self.avg_loss += loss[0]

    def setup_summary(self):
        episode_total_reward = tf.Variable(0.)
        episode_avg_max_q = tf.Variable(0.)
        episode_duration = tf.Variable(0.)
        episode_avg_loss = tf.Variable(0.)

        tf.summary.scalar('Total Reward/Episode', episode_total_reward)
        tf.summary.scalar('Average Max Q/Episode', episode_avg_max_q)
        tf.summary.scalar('Duration/Episode', episode_duration)
        tf.summary.scalar('Average Loss/Episode', episode_avg_loss)

        summary_vars = [episode_total_reward, episode_avg_max_q,
                        episode_duration, episode_avg_loss]
        summary_placeholders = [tf.placeholder(tf.float32) for _ in
                                range(len(summary_vars))]
        update_ops = [summary_vars[i].assign(summary_placeholders[i]) for i in
                      range(len(summary_vars))]
        summary_op = tf.summary.merge_all()
        return summary_placeholders, update_ops, summary_op

In [7]:
agent = DuelingDDQNAgent(action_size=3)

scores, episodes, global_step = [], [], 0

for e in range(EPISODES):
    done = False
    dead = False
    # 1 episode = 5 lives
    step, score, start_life = 0, 0, 5
    observed_state = env.reset()

    # One of the ideas from DeepMind's papers: don't take any action for
    # some random number of steps at the start of an episode
    for _ in range(random.randint(1, agent.no_op_steps)):
        observed_state, _, _, _ = env.step(NO_ACTION)

    state, history = get_initial_state_history(observed_state)

    while not done:
        if agent.render:
            env.render()
        global_step += 1
        step += 1

        # get action for the current history and go one step in environment
        action = agent.get_action(history)

        # model returns one of the three actions (from [0; 2]),
        # but 'real' actions are 1 through 3, because 0 is no-op (no action, no ball)
        real_action = action + 1

        observed_state, reward, done, info = env.step(real_action)

        # pre-process the observation --> history
        next_state = preprocess(observed_state)
        next_state = np.reshape([next_state], (1, 84, 84, 1))
        next_history = np.append(next_state, history[:, :, :, :3], axis=3)

        agent.avg_q_max += np.amax(
            agent.model.predict(np.float32(history / 255.))[0])

        # check if the agent is 'dead' and update remaining lives
        if start_life > info['ale.lives']:
            dead = True
            start_life = info['ale.lives']

        reward = np.clip(reward, -1., 1.)

        # save the sample <s, a, r, s', d> to the replay memory and train model
        agent.replay_memory(history, action, reward, next_history, dead)
        agent.train_replay()

        # every once in a while, update the target model
        if global_step % agent.update_target_rate == 0:
            agent.update_target_model()

        score += reward

        if dead:
            dead = False

            # reset history
            _, history = get_initial_state_history(observed_state)
        else:
            history = next_history

        # log progress to console and TensorBoard on episode end
        if done:
            if global_step > agent.train_start:
                stats = [score, agent.avg_q_max / float(step), step, agent.avg_loss / float(step)]
                for i in range(len(stats)):
                    agent.sess.run(agent.update_ops[i], feed_dict={
                        agent.summary_placeholders[i]: float(stats[i])
                    })
                summary_str = agent.sess.run(agent.summary_op)
                agent.summary_writer.add_summary(summary_str, e + 1)

            print("episode:", e, "  score:", score, "  memory length:",
                    len(agent.memory), "  epsilon:", agent.epsilon,
                    "  global_step:", global_step, "  average_q:",
                    agent.avg_q_max / float(step), "  average loss:",
                    agent.avg_loss / float(step))

            agent.avg_q_max, agent.avg_loss = 0, 0

    if e % MODEL_CHECKPOINT_FREQUENCY == 0:
        agent.model.save_weights(MODEL_PATH)

  name=name)


__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 84, 84, 4)    0                                            
__________________________________________________________________________________________________
conv2d_1 (Conv2D)               (None, 20, 20, 32)   8224        input_1[0][0]                    
__________________________________________________________________________________________________
conv2d_2 (Conv2D)               (None, 9, 9, 64)     32832       conv2d_1[0][0]                   
__________________________________________________________________________________________________
conv2d_3 (Conv2D)               (None, 7, 7, 64)     36928       conv2d_2[0][0]                   
__________________________________________________________________________________________________
flatten_1 

[2018-02-11 01:58:11,678] Summary name Total Reward/Episode is illegal; using Total_Reward/Episode instead.


INFO:tensorflow:Summary name Average Max Q/Episode is illegal; using Average_Max_Q/Episode instead.


[2018-02-11 01:58:11,681] Summary name Average Max Q/Episode is illegal; using Average_Max_Q/Episode instead.


INFO:tensorflow:Summary name Average Loss/Episode is illegal; using Average_Loss/Episode instead.


[2018-02-11 01:58:11,684] Summary name Average Loss/Episode is illegal; using Average_Loss/Episode instead.


episode: 0   score: 1.0   memory length: 126   epsilon: 1.0   global_step: 126   average_q: -0.033501316542   average loss: 0.0
episode: 1   score: 1.0   memory length: 264   epsilon: 1.0   global_step: 264   average_q: -0.0337682404434   average loss: 0.0
episode: 2   score: 2.0   memory length: 464   epsilon: 1.0   global_step: 464   average_q: -0.0361336218007   average loss: 0.0
episode: 3   score: 4.0   memory length: 764   epsilon: 1.0   global_step: 764   average_q: -0.0345563011989   average loss: 0.0
episode: 4   score: 2.0   memory length: 960   epsilon: 1.0   global_step: 960   average_q: -0.0350767627967   average loss: 0.0
episode: 5   score: 2.0   memory length: 1150   epsilon: 1.0   global_step: 1150   average_q: -0.0368856426917   average loss: 0.0
episode: 6   score: 0.0   memory length: 1268   epsilon: 1.0   global_step: 1268   average_q: -0.0353898896631   average loss: 0.0
episode: 7   score: 1.0   memory length: 1418   epsilon: 1.0   global_step: 1418   average_q: 

KeyboardInterrupt: 