### Done
* Forward pass
* Backwards pass
* Summaries
* Replay buffer
* target network
* Save model
* Breakout 
* preprocessing
* Action repeats
* Set max episode length! (double the mean?)
* Encapsulate training in a class (or something!)


### Missing
* TensorBoard Summaries
    * Track episode lengths!
    * Create a test and a training mode
* Have a configuration file that is saved along with the experiment
    * Be able to load config files!
* DQN: use int8 to store the experiences? ~4x memory saving
* Let it train + collect data for report!

### Notes
Inspired by: https://medium.com/@awjuliani/simple-reinforcement-learning-with-tensorflow-part-4-deep-q-networks-and-beyond-8438a3e2b8df

# DQN on Breakout



In [None]:
%%javascript

// Register 'r' as a command shortcut that executes every cell in the notebook.
Jupyter.keyboard_manager.command_shortcuts.add_shortcut('r', {
    help : 'run all cells',
    help_index : 'zz',
    handler : function (event) {
        // Use the same `Jupyter` global as the registration above rather than
        // the legacy `IPython` alias, so the cell relies on a single namespace.
        Jupyter.notebook.execute_all_cells();
        // NOTE(review): presumably `false` stops further handling of the key
        // event -- confirm against the notebook keyboard-manager docs.
        return false;
    }
});

In [None]:
from __future__ import absolute_import, division, print_function, unicode_literals

%matplotlib inline
import os
import shutil
import numpy as np

import matplotlib.pyplot as plt

import tensorflow as tf
import gym

import DQNmodel
import utils

In [None]:
# Gym environment id; passed to gym.make() and used in the log directory name.
env_name = 'CartPole-v1'

## Hyperparameters
# Set to True to make the program run faster: the debugging cell below then
# shrinks track_interval, replay_buffer_size and annealing_period.
IS_DEBUGGING = False


# Script behavior
logdir = './logdir/'+env_name+'/DQN/' + utils.time_str()
model_type = 'dense'


# Model
gamma = 0.99
net_in_dim = [4] # Shape of the network input, after (optional) preprocessing
preprocessor = None
action_repeats = 1 # number of env steps to repeat action


# Training
max_ep_t = np.inf # Maximum episode length; np.inf means no limit
update_frequency = 4 # How often to update parameters
batch_size = 32
learning_rate = 1e-3
track_interval = 10000 # frames
render_interval = 500 # episodes
replay_buffer_size = int(1e6)
epsilon_init = 1.0
epsilon_end = 0.1
epsilon_test = 0.01
annealing_period = 1e6 # num frames to anneal over, after buffer is full

## Various tests
# Fail early on an unsupported setting instead of hitting a shape error later.
assert update_frequency > 1, (
    "The dimensions don't align if update_frequency is <= 1. "
    'update_frequency: ' + str(update_frequency))

In [None]:
if IS_DEBUGGING:
    # Debug run: override the tracking interval (frames), the replay buffer
    # size and the annealing period (frames) with smaller values.
    track_interval, replay_buffer_size, annealing_period = 5000, int(5e4), 1e5


In [None]:
## Derived settings
# Start from an empty default graph so re-running this cell does not stack
# a second copy of the model on top of the previous one.
tf.reset_default_graph()

env = gym.make(env_name)
obs = env.reset()
s_size = list(obs.shape) # Only printed below; the network input shape is net_in_dim
a_size = env.action_space.n
minimum_experience = replay_buffer_size

print('-- Environmental variables --')
for _label, _value in (
        ('env_name  ', env_name),
        ('model_type', model_type),
        ('s_size    ', s_size),
        ('net_in_dim', net_in_dim), # Shape after (optional) preprocessing
        ('a_size    ', a_size)):
    print(_label, _value)

# Graph inputs. The action is assumed to be encoded as ONE number per sample.
obsPlaceholder = tf.placeholder(
    tf.float32, shape=[None]+net_in_dim, name='obsPlaceholder')
actionPlaceholder = tf.placeholder(
    tf.int32, shape=[None], name='actionPlaceholder')
targetQPlaceholder = tf.placeholder(
    tf.float32, shape=[None], name='targetQPlaceholder')

print('\n-- Placeholders --')
for _label, _placeholder in (
        ('obsPlaceholder    ', obsPlaceholder),
        ('actionPlaceholder ', actionPlaceholder),
        ('targetQPlaceholder', targetQPlaceholder)):
    print(_label, _placeholder.get_shape())
print()


### Create model
def _build_dqn(scope_name):
    """Build one DQN plus its MSE train op inside the TF name scope `scope_name`."""
    with tf.name_scope(scope_name):
        network = DQNmodel.DQN(model_type, obsPlaceholder, actionPlaceholder, a_size)
        network.create_MSE_train_op(targetQPlaceholder, learning_rate=learning_rate)
    return network


# Two identically configured networks; both are handed to the Trainer later.
DQN1 = _build_dqn('DQN1')
DQN2 = _build_dqn('DQN2')

print('Model summary')
DQN1.model.summary()

In [None]:
# Helper objects that are later handed to the Trainer (except obs/exp buffers
# being cleared or queried directly in other cells).
envInter = utils.EnvironmentInterface(
    preprocessor=preprocessor,
    action_repeats=action_repeats,
    merge_frames=False)
obsBuf = utils.ObsBuffer(obs_shape=net_in_dim)
expBuf = utils.Experience_buffer(replay_buffer_size)
annealer = utils.Annealer(epsilon_init, epsilon_end, annealing_period)
saver = tf.train.Saver()

In [None]:
## In case of restart
# Drop any experiences left over from a previous run of the notebook.
expBuf.clear()

# TODO: replace this ad-hoc dict with TF summaries
training_summaries = {
    'num_ep': 0,
    'ep_rewards': [],
    'ep_length': [],
    'epsilon': [],
}

# Train from scratch by default; a later cell can flip this to True.
load_model = False

In [None]:
# load_model = True

In [None]:
# sess.close()

In [None]:
## Initialize sess and Trainer
def _start_session_and_trainer():
    """Create a fresh TF session, initialize all variables and build a Trainer.

    Returns:
        A `(session, trainer)` pair built from the notebook-level objects
        defined in the earlier cells.
    """
    new_sess = tf.Session()
    new_sess.run(tf.global_variables_initializer())
    new_trainer = DQNmodel.Trainer(obsBuf, expBuf, annealer, envInter, env, logdir, saver, DQN1, DQN2,
        minimum_experience, update_frequency, track_interval, gamma, max_ep_t)
    return new_sess, new_trainer


# Only the probe of `sess` lives inside the try block: a NameError here means
# the cell runs for the first time. A NameError raised while building the
# session/trainer is a real error and must not be mistaken for that case.
try:
    session_is_closed = sess._closed
except NameError:
    print('Starting a new session.')
    sess, trainer = _start_session_and_trainer()
    load_model = False
else:
    if session_is_closed:
        print('Restarting session.')
        sess, trainer = _start_session_and_trainer()
        # Variables were just re-initialized, so there is nothing to resume.
        load_model = False
    else:
        print('Reusing existing session')

# NOTE(review): render_interval is hard-coded to 50 here although the
# hyperparameter cell defines render_interval = 500 -- confirm which is intended.
trainer.train(sess, training_summaries, epsilon_test, render_interval=50,
          obsPlaceholder=obsPlaceholder, actionPlaceholder=actionPlaceholder,
          targetQPlaceholder=targetQPlaceholder, batch_size=batch_size,
          load_model=load_model)

# Re-running this cell with the session still open passes load_model=True.
load_model = True

In [None]:
## Visualize training_summaries
# Plots the per-episode series on a 2x2 grid; the bottom-left panel is
# left empty.

print('num_ep', training_summaries['num_ep'])

fig, ax = plt.subplots(2,2)
fig.suptitle('Preprocessor_2d test')

# Top row: one panel per series, titled with the key of the series.
for _axis, _key in ((ax[0,0], 'ep_rewards'), (ax[0,1], 'ep_length')):
    _axis.plot(training_summaries[_key])
    _axis.set_title(_key)

# Bottom-right: epsilon drawn in black, x-axis starting at 0.
_eps_axis = ax[1,1]
_eps_axis.plot(training_summaries['epsilon'], c='k')
_eps_axis.set_title('epsilon')
_eps_axis.set_xlim([0, None])

fig.tight_layout()
plt.show()

In [None]:
# ## Run and render a forward pass
# import time

# env = gym.make(env_name)
# env.render()

# obs, ep_t, ep_r = envInter.reset(env)
# obsBuf.reset()
# obsBuf.add(obs)
# obs_input = obsBuf.get()


# while True:
#     try:
#         if np.random.rand(1) > epsilon_test:
#             action, Qout = sess.run([DQN1.action, DQN1.Qout], feed_dict={obsPlaceholder : [obs_input]})
#             print(' ', end=' ')
#         else:
#             action = np.random.randint(a_size)
# #             print('R', end=' ')
#         action = int(action)
# #         print('action', action, '  Qout', Qout)
        
#         assert 'int' in str(type(action))
#         time.sleep(.015)

#         next_obs, reward, done, _ = envInter.take_action(action, env)
#         obsBuf.add(next_obs)
#         obs_input = obsBuf.get()
#         ep_t += 1
#         ep_r += reward

#         env.render()
#         if ep_t > max_ep_t:
#             print('max_ep_t exerted!')
#             done = True

#         if done:
#             print(ep_r)
#             obs, ep_t, ep_r = envInter.reset(env)
#             obsBuf.reset()
#             obsBuf.add(obs)
#             obs_input = obsBuf.get()

#     except KeyboardInterrupt:
#         break

# env.render(close=True)
# print('Terminated')

In [None]:
# sess.close()
# env.render(close=True)


In [None]:
# ## Some memory evaluation, that doesn't seem to work...
# import sys

# if env_name == 'Breakout-v0':
#     def sizeof_fmt(num, suffix='B'):
#         for unit in ['','Ki','Mi','Gi','Ti','Pi','Ei','Zi']:
#             if abs(num) < 1024.0:
#                 return "%3.1f%s%s" % (num, unit, suffix)
#             num /= 1024.0
#         return "%.1f%s%s" % (num, 'Yi', suffix)

#     experience_size = 0
#     experience_size += sys.getsizeof(expBuf.buffer[0]['action'])
#     experience_size += sys.getsizeof(expBuf.buffer[0]['done'])
#     experience_size += sys.getsizeof(expBuf.buffer[0]['obs'][0])
#     experience_size += sys.getsizeof(expBuf.buffer[0]['next_obs'][0])

#     total_ram = 30 * 1024 * 1024 * 1024
#     buffer_len = expBuf.buffer_size()

#     print('buffer len', buffer_len)
#     print('experience_size', sizeof_fmt(experience_size))
#     print('buffer memory', sizeof_fmt(experience_size*buffer_len))
#     print('total ram', sizeof_fmt(total_ram))
#     print('buffer max', sizeof_fmt(experience_size * replay_buffer_size))
#     print('Deepmind memory', sizeof_fmt(experience_size * 1e6))

#     print(total_ram)

<br>
<br>
<br>
<br>
<br>
<br>
<br>
<br>
<br>
<br>
<br>
<br>