### Done
* Forward pass
* Backward pass
* Summaries
* Replay buffer
* target network
* Save model
* Breakout 
* preprocessing
* Action repeats
* Set max episode length! (double of the mean?)


### Missing
* TensorBoard Summaries
    * Track episode lengths!
* Encapsulate training in a class (or something!)
* Let it train
* Have a config file that is saved along with the experiment
    * Be able to load config files!
* Use int8 to store the experiences? (~4x memory saving)

### Notes
Inspired by: https://medium.com/@awjuliani/simple-reinforcement-learning-with-tensorflow-part-4-deep-q-networks-and-beyond-8438a3e2b8df

# DQN on Breakout



In [None]:
%%javascript

// Register a command-mode keyboard shortcut: pressing 'r' runs every cell.
Jupyter.keyboard_manager.command_shortcuts.add_shortcut('r', {
    help : 'run all cells',
    help_index : 'zz',
    handler : function (event) {
        // NOTE(review): the handler goes through the `IPython` global while the
        // registration above uses `Jupyter` -- presumably both point at the
        // same notebook app object; confirm for the notebook version in use.
        IPython.notebook.execute_all_cells();
        return false;
    }
});

In [None]:
from __future__ import absolute_import, division, print_function, unicode_literals

%matplotlib inline
import os
import shutil
import numpy as np

import matplotlib.pyplot as plt

import tensorflow as tf
import gym

import utils

In [None]:
# Environment to train on. The last assignment wins; swap the two lines to
# switch between environments.
env_name = 'Breakout-v0'
env_name = 'CartPole-v1'

## Hyperparameters
# Debug switch (last assignment wins). When True, a later cell shrinks the
# tracking interval, replay buffer and annealing period.
IS_DEBUGGING = True # Make program run faster
IS_DEBUGGING = False


# Model
num_frames = 4 # Integer: number of frames to feed the network
action_repeats = 4 # number of env steps to repeat action
gray = True # Make obs grayscale
gamma = 0.99 # Discount factor applied to the next-state Q value in the target

# Training
# training_time = int(50e6) # frames
max_ep_t = 1e3 # Episodes are forced to end once ep_t exceeds this
update_frequency = 4 # How often to update parameters
batch_size = 32
learning_rate = 0.0001
track_interval = 5000 # frames
render_interval = 100 # episodes

# The first value is kept for reference only; the second assignment is the
# one that takes effect.
replay_buffer_size = int(1e6) # DOESN'T FIT in memory
replay_buffer_size = int(2e5)


# Exploration schedule endpoints and length (passed to utils.Annealer)
epsilon_init = 1.0
epsilon_end = 0.1
# annealing_period = 1e6 # frames to wait, after buffer is full
annealing_period = 5e5 # frames

In [None]:
## Parameters
# Pick the log directory, model type, network input shape and helper objects
# for the selected environment. Some hyperparameters set earlier are
# overridden per environment.
if env_name == 'Breakout-v0':
    path = "./logdir/dqn" #The path to save our model to.
    model_type = 'DQN'

    # IO parameters
    n_channels = 1 if gray else 3
    # NOTE(review): the channel dimension below is fixed at 1 while
    # net_in_dim uses n_channels -- confirm this is intended when gray=False.
    target_size = [84, 84, 1] # List of integer
    net_in_dim = target_size[:-1] + [num_frames*n_channels]

    # Create helper objects
    preprocessor = utils.Preprocessor_2d(
            out_shape=target_size, gray=gray)
    envInter = utils.EnvironmentInterface(
            preprocessor=preprocessor, action_repeats=action_repeats,
            merge_frames=True)
    obsBuf = utils.ObsBuffer(obs_shape=target_size, buffer_size=num_frames)

elif env_name == 'CartPole-v1':
    path = './logdir/dense'
    model_type = 'dense'

    # IO parameters
    net_in_dim = [4]

    # Behavior
    max_ep_t = np.inf
    render_interval = 100
    action_repeats = 1 # number of env steps to repeat action

    # Training
    replay_buffer_size = int(5e4)
    annealing_period = 2e5 # frames

    # Create helper objects
    preprocessor = None
    envInter = utils.EnvironmentInterface(
            preprocessor=preprocessor, action_repeats=action_repeats,
            merge_frames=False)
    obsBuf = utils.ObsBuffer(obs_shape=net_in_dim)
else:
    # Fail fast: continuing would either hit a NameError in a later cell or,
    # worse, silently reuse settings left in the kernel by a previous run.
    raise ValueError('ERROR! ' + env_name + ' not understood!!')
    

In [None]:
# Smaller schedule values for quick debugging runs.
if IS_DEBUGGING:
    annealing_period = 1e4  # frames
    replay_buffer_size = 5000
    track_interval = 1000  # frames


In [None]:
## Derived settings
# Start from an empty default graph so re-running this cell does not stack
# duplicate networks on top of the previous ones.
tf.reset_default_graph()

env = gym.make(env_name)
obs = env.reset()
s_size = list(obs.shape) # Just fun fact, not actually used
a_size = env.action_space.n
# The training cell starts updating weights only once the frame counter
# exceeds this value; it is set to the full replay buffer size.
minimum_experience = replay_buffer_size
print('-- Environmental variables --')
print('env_name  ', env_name)
print('model_type', model_type)
print('s_size    ', s_size)
print('net_in_dim', net_in_dim)
print('a_size    ', a_size)

# Inputs shared by both networks; the leading None is the batch dimension.
obsPlaceholder = tf.placeholder(tf.float32, shape=[None]+net_in_dim, name='obsPlaceholder')
# Assume action is encoded as ONE number
actionPlaceholder = tf.placeholder(tf.int32, shape=[None], name='actionPlaceholder')
targetQPlaceholder = tf.placeholder(tf.float32, shape=[None], name='targetQPlaceholder')

print('\n-- Placeholders --')
print('obsPlaceholder    ', obsPlaceholder.get_shape())
print('actionPlaceholder ', actionPlaceholder.get_shape())
print('targetQPlaceholder', targetQPlaceholder.get_shape())
print()

### Create model
# DQN1 is the network that gets trained; DQN2 is the target network that is
# moved toward DQN1 by the target-update ops.
# NOTE: the creation order (DQN1 before DQN2) matters -- updateTargetGraph
# pairs the first half of the trainable variables with the second half.
from models import DQN
with tf.name_scope('DQN1'):
    DQN1 = DQN(model_type, obsPlaceholder, actionPlaceholder, a_size)
    DQN1.create_MSE_train_op(targetQPlaceholder, learning_rate=learning_rate)

with tf.name_scope('DQN2'):
    DQN2 = DQN(model_type, obsPlaceholder, actionPlaceholder, a_size)
    DQN2.create_MSE_train_op(targetQPlaceholder, learning_rate=learning_rate)

print('Model summary')
DQN1.model.summary()

In [None]:
## Initialize helper classes
# Exploration schedule built from the epsilon endpoints and annealing period.
# NOTE(review): exact schedule semantics live in utils.Annealer -- confirm.
annealer = utils.Annealer(epsilon_init, epsilon_end, annealing_period)

# Used by the training cell to save and restore checkpoints under `path`.
saver = tf.train.Saver()

In [None]:
## Script specific helper functions
# Stuff for updating the target graph
def updateTargetGraph(tfVars,tau):
    """Build the assign ops that pull the target variables toward the primary ones.

    The first half of `tfVars` is treated as the primary network and the
    second half as the target network; variables are paired by position.
    Each returned op assigns ``tau*primary + (1-tau)*target`` to the target
    variable.

    Args:
        tfVars: sequence of variables, primary network first, target second.
        tau: interpolation rate toward the primary network.

    Returns:
        List with one assign op per primary/target pair.
    """
    # TODO: FIX THIS
    # Pairing by position relies on the order in which the networks were created.
    half = len(tfVars) // 2
    primaries = tfVars[:half]
    targets = tfVars[half:]
    return [
        target.assign((primary.value() * tau) + ((1 - tau) * target.value()))
        for primary, target in zip(primaries, targets)
    ]

def updateTarget(op_holder,sess):
    """Execute every op in `op_holder` on `sess`, one `run` call per op, in order."""
    for update_op in iter(op_holder):
        sess.run(update_op)


# Each target update computes target <- tau*primary + (1-tau)*target.
tau = 0.0005 #Rate to update target network toward primary network
# All trainable variables of both networks, in creation order.
trainables = tf.trainable_variables()
# Built once here; the training loop runs these ops after each weight update.
targetOps = updateTargetGraph(trainables,tau)

In [None]:
## In case of restart
# State kept outside the training cell: the training cell only increments and
# appends to these, so re-running it continues from the current values.
load_model = False
f = -1  # frame counter; the training loop increments it before first use
training_summaries = {
    'num_ep': 0,
    'ep_rewards': [],
    'ep_length': [],
    'epsilon': [],
}
expBuf = utils.Experience_buffer(replay_buffer_size)

In [None]:
# load_model = True

In [None]:
# Sanity check on the update schedule before any work is done.
assert update_frequency > 1, (
    "The dimensions don't align if update_frequency "
    'is not greater than 1: ' + str(update_frequency))

# Close the session left over from a previous run of this cell, if there is
# one. On the first run `sess` does not exist yet, which is the only error
# that is expected (and ignored) here.
try:
    sess.close()
except NameError:
    pass
else:
    print('Session restarted')
sess = tf.Session()
sess.run(tf.global_variables_initializer())

# TODO: Use TF summaries, this is shit

# Either restore the latest checkpoint from `path` or start from a clean
# log directory.
if load_model:
    print('Loading Model...')
    # get_checkpoint_state returns None when `path` holds no checkpoint.
    ckpt = tf.train.get_checkpoint_state(path)
    if ckpt is not None:
        saver.restore(sess, ckpt.model_checkpoint_path)
    else:
        if not os.path.exists(path):
            # makedirs also creates missing parent directories (e.g. ./logdir).
            os.makedirs(path)
        print('WARNING: Could not load previous model')

else:
    if os.path.exists(path):
        print('Deleting old model')
        shutil.rmtree(path)
    # makedirs rather than mkdir so a missing parent directory is not an error.
    os.makedirs(path)


# Start the first episode: clear the observation buffer, reset the
# environment and build the first network input.
obsBuf.reset()
obs, ep_t, ep_r = envInter.reset(env)
obsBuf.add(obs)
net_input = obsBuf.get()

# Main loop: one iteration per agent step, counted by `f`. It runs until the
# kernel is interrupted (KeyboardInterrupt), which ends the loop cleanly.
try:
    while True:
        f += 1
        if f == minimum_experience:
            print('___/// BEGIN TRAINING \\\\___')

        # Epsilon-greedy action selection; epsilon comes from the annealer.
        epsilon = annealer.linear(f-minimum_experience)
        if np.random.rand(1) > epsilon:
            action = sess.run(DQN1.action, feed_dict={obsPlaceholder : [net_input]})
        else:
            action = np.random.randint(a_size)
        action = int(action)
        assert 'int' in str(type(action)), 'action must be an int, not a ' + str(type(action))

        next_obs, reward, done, _ = envInter.take_action(action, env)
        obsBuf.add(next_obs)
        net_input_next = obsBuf.get()
        ep_t += 1
        ep_r += reward  # episode return is tracked with the unclipped reward
        reward = np.clip(reward, -1, 1)

        # Save experiences
        # NOTE: `done` is stored before the max_ep_t cut-off below, so a
        # forced episode end is not recorded as a terminal transition.
        experience = {'obs':[net_input], 'action':action, 'reward':reward,
                      'next_obs':[net_input_next], 'done':done}
        expBuf.add(experience)
        net_input = net_input_next

        ## Update Weights
        if f % update_frequency == 0 \
                and f > minimum_experience:
            train_batch = expBuf.sample(batch_size)

            # Build the training target: DQN1 chooses the action for each
            # next observation and DQN2 supplies the Q value of that action.
            # (Named next_actions so the loop's own `action` is not shadowed.)
            [next_actions] = sess.run([DQN1.action],
                feed_dict={obsPlaceholder : train_batch['next_obs']})
            [actualQs] = sess.run([DQN2.Qout],
                feed_dict={obsPlaceholder : train_batch['next_obs']})

            actualQ = actualQs[range(batch_size), next_actions]
                # The DQN2 Q value of the action chosen by DQN1
            zero_if_done = train_batch['done']*(-1) + 1 # Used to remove the actualQ
                                                        # When at terminal state
            target = train_batch['reward'] + gamma*actualQ*zero_if_done

            ## Update DQN1
            DQN1_train_dict = {
                        obsPlaceholder : train_batch['obs'],
                        actionPlaceholder : train_batch['action'],
                        targetQPlaceholder : target
                }
            loss, _ = sess.run([DQN1.loss, DQN1.train_op], feed_dict=DQN1_train_dict)

            ## Update DQN2
            updateTarget(targetOps,sess)

        ## Track training
        if f % track_interval == 0:
            # Every 10th tracking step prints the table header (and, after
            # f=0, saves a checkpoint) instead of a stats line.
            is_header_step = f % (track_interval*10) == 0
            if is_header_step:
                print('\n{:5} {:>9}  {:>7}  {:>7}  {:>7}'.format(
                    'time', 'frames', 'epis', 'reward', 'epsilon'))
            else:
                # Guard against the case where no episode has finished yet:
                # the summaries are empty, so averaging would warn and
                # indexing [-1] would raise IndexError.
                recent_rewards = training_summaries['ep_rewards'][-100:]
                if recent_rewards:
                    mean_reward = np.mean(recent_rewards)
                else:
                    mean_reward = float('nan')
                logged_epsilons = training_summaries['epsilon']
                if logged_epsilons:
                    shown_epsilon = logged_epsilons[-1]
                else:
                    shown_epsilon = epsilon
                print('{:5} {:9}, {:7}, {:7.1f}, {:7.3f}'.format(
                      utils.current_time(),
                      f,
                      training_summaries['num_ep'],
                      mean_reward,
                      shown_epsilon,
                     ), end='')
            if is_header_step and f>0:
                i = training_summaries['num_ep']
                model_save_str = path+'/model-'+str(i)+'.cptk'
                saver.save(sess, model_save_str)
                print("Saved Model: " + model_save_str)
            else:
                print()

        # Render every step of each render_interval-th episode.
        if training_summaries['num_ep'] % render_interval == 0:
            env.render()

        # Cut off episodes that run too long.
        if ep_t > max_ep_t:
            done = True

        if done:
            training_summaries['num_ep'] += 1
            training_summaries['ep_rewards'].append(ep_r)
            training_summaries['ep_length'].append(ep_t)
            training_summaries['epsilon'].append(epsilon)

            # Begin the next episode.
            obsBuf.reset()
            obs, ep_t, ep_r = envInter.reset(env)
            obsBuf.add(obs)
            net_input = obsBuf.get()

except KeyboardInterrupt:
    # Manual stop requested; fall through to the clean-up below.
    pass

print('f', f)
env.render(close=True)
print('\nTerminated')

In [None]:
## Visualize training_summaries
# Plots the per-episode reward, length and epsilon recorded by the training
# loop. The bottom-left panel is left empty.

print('num_ep', training_summaries['num_ep'])

fig, ax = plt.subplots(2,2)
# The title previously read 'Preprocessor_2d test', left over from another
# notebook; it now describes what is plotted.
fig.suptitle('Training summaries')

ax[0,0].plot(training_summaries['ep_rewards'])
ax[0,0].set_title('ep_rewards')

ax[0,1].plot(training_summaries['ep_length'])
ax[0,1].set_title('ep_length')

ax[1,1].plot(training_summaries['epsilon'], c='k')
ax[1,1].set_title('epsilon')
ax[1,1].set_xlim([0, None])

fig.tight_layout()
plt.show()

In [None]:
## Run and render a forward pass
# Plays episodes with the current DQN1 weights, taking a random action 10%
# of the time, and renders every step. Runs until the kernel is interrupted.
import time

env = gym.make(env_name)

obs, ep_t, ep_r = envInter.reset(env)
obsBuf.reset()
obsBuf.add(obs)
obs_input = obsBuf.get()
# Render only after the environment has been reset (this call used to come
# before the reset, when there was no valid state to draw yet).
env.render()

try:
    while True:
        if np.random.rand(1) > 0.1:
            action, Qout = sess.run([DQN1.action, DQN1.Qout], feed_dict={obsPlaceholder : [obs_input]})
            print(' ', end=' ')
        else:
            action = np.random.randint(a_size)
        action = int(action)

        assert 'int' in str(type(action))
        time.sleep(.015)  # slow the rollout down so it can be watched

        next_obs, reward, done, _ = envInter.take_action(action, env)
        obsBuf.add(next_obs)
        obs_input = obsBuf.get()
        ep_t += 1
        ep_r += reward

        env.render()
        if ep_t > max_ep_t:
            print('max_ep_t exerted!')
            done = True

        if done:
            # Report the episode return and start a new episode.
            print(ep_r)
            obs, ep_t, ep_r = envInter.reset(env)
            obsBuf.reset()
            obsBuf.add(obs)
            obs_input = obsBuf.get()

except KeyboardInterrupt:
    # Manual stop requested; fall through to the clean-up below.
    pass

env.render(close=True)
print('Terminated')

In [None]:
# Release the TensorFlow session, then ask the environment to close its
# render output (render is called with close=True).
sess.close()
env.render(close=True)


In [None]:
## Some memory evaluation
# Estimates the memory used by one stored experience and by the whole replay
# buffer. sys.getsizeof is shallow, so array-like fields are measured through
# their `nbytes` attribute instead.
import sys

if env_name == 'Breakout-v0':
    def sizeof_fmt(num, suffix='B'):
        """Format a byte count with 1024-based unit prefixes, e.g. 1536 -> '1.5KiB'."""
        for unit in ['','Ki','Mi','Gi','Ti','Pi','Ei','Zi']:
            if abs(num) < 1024.0:
                return "%3.1f%s%s" % (num, unit, suffix)
            num /= 1024.0
        return "%.1f%s%s" % (num, 'Yi', suffix)

    def _payload_bytes(value):
        """Return `value.nbytes` when available, otherwise sys.getsizeof(value)."""
        nbytes = getattr(value, 'nbytes', None)
        if nbytes is None:
            return sys.getsizeof(value)
        return nbytes

    # Size of one experience, measured on the first buffer entry. The fields
    # match the dict stored by the training loop; 'obs' and 'next_obs' are
    # one-element lists wrapping the network input.
    first_experience = expBuf.buffer[0]
    experience_size = 0
    experience_size += _payload_bytes(first_experience['action'])
    experience_size += _payload_bytes(first_experience['done'])
    experience_size += _payload_bytes(first_experience['reward'])
    experience_size += _payload_bytes(first_experience['obs'][0])
    experience_size += _payload_bytes(first_experience['next_obs'][0])

    total_ram = 30 * 1024 * 1024 * 1024
    buffer_len = expBuf.buffer_size()

    print('buffer len', buffer_len)
    print('experience_size', sizeof_fmt(experience_size))
    print('buffer memory', sizeof_fmt(experience_size*buffer_len))
    print('total ram', sizeof_fmt(total_ram))
    print('buffer max', sizeof_fmt(experience_size * replay_buffer_size))
    print('Deepmind memory', sizeof_fmt(experience_size * 1e6))

    print(total_ram)

<br>
<br>
<br>
<br>
<br>
<br>
<br>
<br>
<br>
<br>
<br>
<br>