
### Done
* Forward pass
* Backwards pass
* Summaries
* Replay buffer
* Target network
* Save model
* Breakout 
* Preprocessing


### Missing
* Action repeats
* TensorBoard Summaries
* Let it train

### Notes
Inspired by: https://medium.com/@awjuliani/simple-reinforcement-learning-with-tensorflow-part-4-deep-q-networks-and-beyond-8438a3e2b8df

# DQN on Breakout



In [None]:
%%javascript

// Register a command-mode keyboard shortcut: pressing 'r' invokes the handler below.
Jupyter.keyboard_manager.command_shortcuts.add_shortcut('r', {
    // Description text attached to the shortcut.
    help : 'run all cells',
    // presumably a sort key for the shortcut help listing — TODO confirm
    help_index : 'zz',
    handler : function (event) {
        // Execute every cell of the current notebook.
        IPython.notebook.execute_all_cells();
        // presumably tells the keyboard manager the event was handled — TODO confirm
        return false;
    }
});

In [None]:
from __future__ import absolute_import, division, print_function, unicode_literals

%matplotlib inline
import os
import sys
import shutil
import numpy as np
import time
import matplotlib.pyplot as plt

import tensorflow as tf
import gym

from tensorflow.contrib.keras.api.keras.layers import Dense, Input, Conv2D, Flatten
from tensorflow.contrib.keras.api.keras.models import Model

import utils

In [None]:
## Hyperparameters

# -- Optimisation --
learning_rate = 1e-4      # passed to create_MSE_train_op
gamma = 0.99              # discount applied to the bootstrapped Q-value
batch_size = 32           # experiences sampled per training step
update_frequency = 4      # a training step runs when f % update_frequency == 0

# -- Training length --
training_time = 50 * 10**6  # frames (alternative for a short run: 10**6)

# -- Observation preprocessing --
target_size = [84, 84]    # list of integers
num_frames = 4            # integer
gray = True

# -- Replay buffer --
replay_buffer_size = 10**5
# No training step runs (and epsilon annealing is offset) until the frame
# counter exceeds this value.
minimum_experience = replay_buffer_size/10

# -- Exploration schedule --
epsilon_init = 1.0
epsilon_end = 0.1
annealing_period = 1e6    # frames (alternative: 5e5)

In [None]:
## Parameters
# Each preset pairs a Gym environment id with the network type built for it.
# Change the key in the trailing subscript to switch preset.
env_name, model_type = {
    'breakout': ('Breakout-v0', 'DQN'),
    'cartpole': ('CartPole-v1', 'dense'),
}['breakout']


In [None]:
## Derived settings
# Clear the default graph before the placeholders and networks are (re)built.
tf.reset_default_graph()
# Exploration schedule built from the epsilon hyperparameters; the training
# loop queries it through annealer.linear(...).
annealer = utils.Annealer(epsilon_init, epsilon_end, annealing_period)

# Create the environment once to read the raw observation shape and the
# number of discrete actions.
env = gym.make(env_name)
obs = env.reset()
s_size = list(obs.shape)
a_size = env.action_space.n
# NOTE(review): 4 channels for a non-gray frame looks suspicious (colour frames
# usually have 3) — confirm against what utils.DataHandler produces.
n_channels = 1 if gray else 4
# Network input shape: target_size plus a channel axis holding all stacked frames.
target_dim = target_size + [num_frames*n_channels]
print('-- Environmental variables --')
print('env_name  ', env_name)
print('model_type', model_type)
print('s_size    ', s_size)
print('a_size    ', a_size)
print('target_dim', target_dim)

# Placeholders shared by both networks; the leading None is the batch axis.
obsPlaceholder = tf.placeholder(tf.float32, shape=[None]+target_dim, name='obsPlaceholder')
# Assume action is encoded as ONE number
actionPlaceholder = tf.placeholder(tf.int32, shape=[None], name='actionPlaceholder')
targetQPlaceholder = tf.placeholder(tf.float32, shape=[None], name='targetQPlaceholder')

print('\n-- Placeholders --')
print('obsPlaceholder    ', obsPlaceholder.get_shape())
print('actionPlaceholder ', actionPlaceholder.get_shape())
print('targetQPlaceholder', targetQPlaceholder.get_shape())
print()

### Create model
from models import DQN
# Two networks are built from the same placeholders.  The training loop trains
# DQN1 and reads DQN2.Qout for the bootstrap target.
# NOTE: updateTargetGraph pairs trainable variables by position, so DQN1 must
# be created before DQN2 — do not reorder these two blocks.
with tf.name_scope('DQN1'):
    DQN1 = DQN(model_type, obsPlaceholder, actionPlaceholder, a_size)
    DQN1.create_MSE_train_op(targetQPlaceholder, learning_rate=learning_rate)

with tf.name_scope('DQN2'):
    DQN2 = DQN(model_type, obsPlaceholder, actionPlaceholder, a_size)
    # NOTE(review): DQN2.train_op is never run in this notebook; it is presumably
    # created only to keep both networks symmetric — confirm before removing.
    DQN2.create_MSE_train_op(targetQPlaceholder, learning_rate=learning_rate)

print('Model summary')
DQN1.model.summary()

In [None]:
## Preprocessor
# Frame handler used by both the training loop and the render loop.
preprocessor = utils.DataHandler()
# Configure its 2-D pipeline from the hyperparameters.
# NOTE(review): presumably resizes to target_size, optionally converts to gray
# and stacks num_frames frames — confirm in utils.DataHandler.
preprocessor.define_preprocess_2d(target_size=target_size, num_frames=num_frames, gray=gray)


In [None]:
## Stuff for saving the model
# Directory the training cell writes checkpoints to and restores them from.
path = "./dqn" #The path to save our model to.
# Saver used by the training cell for both saver.restore and saver.save.
saver = tf.train.Saver()

In [None]:
## Script specific helper functions
def new_episode(env):
    """Reset ``env`` and return a fresh ``(obs, episode_time_step, episode_reward)``.

    ``obs`` is whatever ``env.reset()`` returns; the step counter and the
    accumulated reward both start at 0.
    """
    return env.reset(), 0, 0


## Stuff for updating the target graph
def updateTargetGraph(tfVars,tau):
    """Build the ops that move the second half of ``tfVars`` toward the first half.

    The variable at position ``k`` in the first half is paired with the
    variable at position ``k + len(tfVars)//2``.  For each pair an assign op
    is created that sets the second variable to
    ``tau * first + (1 - tau) * second``.

    Args:
        tfVars: sequence of variables exposing ``value()`` and ``assign()``.
        tau: interpolation weight given to the first-half variable.

    Returns:
        A list with one assign op per pair, in pairing order.
    """
    # TODO: FIX THIS
    # The pairing is purely positional, so it relies on the order in which
    # the two networks were created.
    half = len(tfVars)//2
    sources = tfVars[0:half]
    return [
        tfVars[k+half].assign((src.value()*tau) + ((1-tau)*tfVars[k+half].value()))
        for k, src in enumerate(sources)
    ]


def updateTarget(op_holder,sess):
    """Execute each op in ``op_holder`` on ``sess``, in order, one ``sess.run`` per op."""
    for assign_op in op_holder:
        sess.run(assign_op)

tau = 0.001 #Rate to update target network toward primary network
# Every trainable variable currently in the graph; updateTargetGraph splits
# this list in half by position.
trainables = tf.trainable_variables()
# Assign ops are built once here; the training loop executes them through
# updateTarget after each training step.
targetOps = updateTargetGraph(trainables,tau)

In [None]:
# State that persists across runs of the training cell.
buffer = utils.Experience_buffer(replay_buffer_size)  # replay memory
f = 0                                                 # training-loop iteration counter

# Per-episode statistics, appended to whenever an episode finishes.
training_summaries = {
    'num_ep': 0,
    'ep_rewards': [],
    'epsilon': [],
}

# Start from scratch by default; a later cell may switch this to True.
load_model = False

In [None]:
# Run this cell to make the training cell try to restore the latest checkpoint
# from `path` instead of deleting it.
load_model = True

In [None]:
# Guard for the training loop below.  The condition is unchanged; the message
# now states the same bound the condition enforces and is spelled correctly.
# NOTE(review): the reason update_frequency == 1 breaks dimensions is not
# visible in this notebook — confirm whether `>= 1` would be enough.
assert update_frequency > 1, (
    "The dimensions don't align unless update_frequency is greater than 1, got: "
    + str(update_frequency))


# Close the session left over from a previous run of this cell, if there is
# one, before opening a new session.
try:
    sess.close()
except NameError:
    # First run of the cell: no `sess` exists yet, nothing to close.
    pass
except Exception as err:
    # Closing stays best-effort, but the failure is reported, not hidden.
    print('WARNING: Could not close previous session:', err)
else:
    print('Session restarted')

# Fresh session with every variable initialised; a checkpoint restore (if
# requested) overwrites these initial values afterwards.
sess = tf.Session()
sess.run(tf.global_variables_initializer())

# TODO: Use TF summaries instead of the ad-hoc training_summaries dict

# Either resume from the latest checkpoint in `path` or start with an empty
# checkpoint directory.
if load_model:
    print('Loading Model...')
    ckpt = tf.train.get_checkpoint_state(path)
    # get_checkpoint_state returns None when no checkpoint is found.  Testing
    # for that explicitly (rather than catching AttributeError) lets a genuine
    # error raised inside saver.restore surface.
    if ckpt is not None:
        saver.restore(sess, ckpt.model_checkpoint_path)
    else:
        if not os.path.exists(path):
            os.mkdir(path)
        print('WARNING: Could not load previous model')

else:
    # Fresh start: discard any checkpoints from earlier runs.
    if os.path.exists(path):
        print('Deleting old model')
        shutil.rmtree(path)
    os.mkdir(path)

## Training loop
# Runs until `f` reaches `training_time` or the user interrupts the kernel.
# Each iteration: pick an epsilon-greedy action with DQN1, repeat it in the
# environment, store the transition in the replay buffer, and every
# `update_frequency` iterations (once `f` exceeds `minimum_experience`) train
# DQN1 on a sampled batch and nudge DQN2 toward it.
# NOTE(review): `f` counts loop iterations; with the action repeat below each
# iteration consumes up to `action_repeat` emulator frames — confirm which
# unit the "frames" hyperparameters are meant to use.

# Start the first episode and build the initial stacked observation.
preprocessor.reset_buffer_2d()
obs, ep_t, ep_r = new_episode(env)
preprocessor.add_2d(obs)
obs_input = preprocessor.get_buffer_2d()

action_repeat = 4      # environment steps taken per selected action
track_interval = 1000  # iterations between progress lines

# Column header for the progress lines.  It is printed up front because `f`
# is incremented before the tracking code runs, so it is never 0 there.
print('\n{:>9}  {:>7}  {:>7}  {:>7}'.format('frames', 'epis', 'reward', 'epsilon'))

try:
    while f < training_time:
        f += 1
        epsilon = annealer.linear(f-minimum_experience)

        ## Epsilon-greedy action selection
        if np.random.rand(1) > epsilon:
            action = sess.run(DQN1.action, feed_dict={obsPlaceholder : [obs_input]})
        else:
            action = np.random.randint(a_size)
        # Both branches are converted to a plain Python int for env.step.
        action = int(action)

        ## Repeat the action, summing the clipped rewards
        reward = 0
        for _repeat in range(action_repeat):
            next_obs, r, done, _ = env.step(action)
            r = np.clip(r, -1, 1)
            reward += r
            if done:
                break
        preprocessor.add_2d(next_obs)
        next_obs_input = preprocessor.get_buffer_2d()

        # TODO: wrap in action repeater
        ep_t += 1
        ep_r += reward
        if 'CartPole' in env_name and done and ep_t < 400 :
            # Necessary help for the Q-network to avoid death
            reward = -1

        experience = {'obs':[obs_input], 'action':action, 'reward':reward,
                      'next_obs':[next_obs_input], 'done':done}
        buffer.add(experience)
        obs = next_obs
        # Advance the stacked observation as well; otherwise every action in
        # the episode is chosen from, and every stored 'obs' equals, the
        # episode's first observation.
        obs_input = next_obs_input

        ## Update Weights
        if f % update_frequency == 0 \
                and f > minimum_experience:
            train_batch = buffer.sample(batch_size)

            # DQN1 picks the action for each next observation and DQN2
            # supplies the Q-value of that action.  Both are fetched in one
            # run because they share the same feed.
            next_actions, next_Qs = sess.run(
                [DQN1.action, DQN2.Qout],
                feed_dict={obsPlaceholder : train_batch['next_obs']})

            # The Q value of the chosen action
            actualQ = next_Qs[np.arange(batch_size), next_actions]
            # 0 for terminal transitions, 1 otherwise: removes the bootstrap
            # term when the episode ended.
            zero_if_done = train_batch['done']*(-1) + 1
            target = train_batch['reward'] + gamma*actualQ*zero_if_done

            ## Update DQN1
            DQN1_train_dict = {
                        obsPlaceholder : train_batch['obs'],
                        actionPlaceholder : train_batch['action'],
                        targetQPlaceholder : target
                }
            _ = sess.run(DQN1.train_op, feed_dict=DQN1_train_dict)

            ## Update DQN2
            updateTarget(targetOps,sess)

        ## Track training
        if f % track_interval == 0:
            # Guard against an empty history: no episode may have finished yet.
            recent_rewards = training_summaries['ep_rewards'][-100:]
            mean_reward = np.mean(recent_rewards) if recent_rewards else float('nan')
            # The current epsilon is reported, not the value stored at the
            # last episode end, so this line works before any episode ends.
            print('{:9}, {:7}, {:7.1f}, {:7.3f}'.format(
                  f,
                  training_summaries['num_ep'],
                  mean_reward,
                  epsilon,
                 ), end='')
            if f % (track_interval*5) == 0:
                # Checkpoint files are named after the finished-episode count.
                model_save_str = path+'/model-'+str(training_summaries['num_ep'])+'.cptk'
                saver.save(sess, model_save_str)
                print("\tSaved Model: " + model_save_str)
            else:
                print()

        # Render every step of each 100th episode.
        if training_summaries['num_ep'] % 100 == 0:
            env.render()

        if done:
            # Record the finished episode and start a new one.
            training_summaries['num_ep'] += 1
            training_summaries['ep_rewards'].append(ep_r)
            training_summaries['epsilon'].append(epsilon)
            preprocessor.reset_buffer_2d()
            obs, ep_t, ep_r = new_episode(env)
            preprocessor.add_2d(obs)
            obs_input = preprocessor.get_buffer_2d()

except KeyboardInterrupt:
    # Interrupting the kernel ends training and keeps all state gathered so far.
    pass
        

# Close the render window used during training.
# NOTE(review): the `close=True` keyword may not be accepted by newer gym
# releases — confirm against the installed version.
env.render(close=True)
print('\nTerminated')

In [None]:
# Report how much the replay buffer currently holds.
print(buffer.buffer_size())
# NOTE: sys.getsizeof is shallow — it measures only the container object,
# not the experiences it references.
sys.getsizeof(buffer.buffer) # bytes

In [None]:
class EnvironmentInterface():
    """Placeholder for an environment wrapper; every method is currently a no-op."""

    def __init__(self):
        """Create the placeholder; no state is set."""

    def define_take_action(self):
        """Not implemented yet: does nothing and returns None."""

    def take_action(self):
        """Not implemented yet: does nothing and returns None."""


In [None]:
## Run and render a forward pass
# Plays episodes with DQN1 on a fresh environment, rendering every step,
# until the kernel is interrupted.  The clipped reward sum of each finished
# episode is printed.  `epsilon` is not set here: the value left behind by
# the training cell is reused for the epsilon-greedy choice.
import time

env = gym.make(env_name)
obs = env.reset()
preprocessor.reset_buffer_2d()
preprocessor.add_2d(obs)
obs_input = preprocessor.get_buffer_2d()

env.render()
reward_sum = 0

try:
    while True:
        # Epsilon-greedy choice between the network's action and a random one.
        if np.random.rand(1) > epsilon:
            action = sess.run(DQN1.action, feed_dict={obsPlaceholder : [obs_input]})
        else:
            action = np.random.randint(a_size)
        action = int(action)
        assert 'int' in str(type(action))

        # Slow playback down so the rendering can be followed.
        time.sleep(0.025)

        # Repeat the chosen action for up to 4 steps, summing clipped rewards.
        reward = 0
        for i in range(4):
            obs, r, done, _ = env.step(action)
            r = np.clip(r, -1, 1)
            reward += r
            if done:
                break
        preprocessor.add_2d(obs)
        obs_input = preprocessor.get_buffer_2d()

        env.render()
        reward_sum += reward
        if not done:
            continue

        # Episode finished: report its score and start over.
        print(reward_sum)
        reward_sum = 0
        obs = env.reset()
        preprocessor.reset_buffer_2d()
        preprocessor.add_2d(obs)
        obs_input = preprocessor.get_buffer_2d()
except KeyboardInterrupt:
    # Interrupting the kernel is the only way out of the loop.
    pass

# env.render(close=True)
print('Terminated')

In [None]:
# Close the render window left open by the render loop.
# NOTE(review): the `close=True` keyword may not be accepted by newer gym
# releases — confirm against the installed version.
env.render(close=True)


<br>
<br>
<br>
<br>
<br>
<br>
<br>
<br>
<br>
<br>
<br>
<br>