In [None]:
#=======================================#
# Yes, this notebook is over-commented. #
#=======================================#

In [1]:
# Make notebook span entire screen, horizontally.
# `display` and `HTML` are imported from the public `IPython.display` module;
# importing them from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [28]:
import gym
import numpy as np
import tensorflow as tf

In [45]:
class PolicyAgent(object):
    """Softmax policy network (the "actor") trained with a policy-gradient loss.

    The network is three ReLU dense layers (128, 64, 32 units), each followed
    by dropout, and a linear output layer with one logit per action.

    Args:
        sess: TensorFlow session used to run the graph.
        num_actions: Number of discrete actions (size of the output layer).
        obs_dim: Number of features in a single observation.
    """

    def __init__(self, sess, num_actions=4, obs_dim=8):
        self.num_actions = num_actions
        self.obs_dim = obs_dim
        self._build()
        self.sess = sess

    def _build(self):
        """Create placeholders, the network, the loss and the training op."""
        self.actions      = tf.placeholder(tf.int32, (None, 1))
        self.columns      = tf.placeholder(tf.int32, (None, 1))
        self.e_weight     = tf.placeholder(tf.float32)
        self.l_rate       = tf.placeholder(tf.float32)
        self.observations = tf.placeholder(tf.float32, (None, self.obs_dim))
        self.target       = tf.placeholder(tf.float32, (None, 1))
        self.training     = tf.placeholder(tf.bool)

        # Scope and layer names are kept as-is so variable names do not change.
        with tf.variable_scope('actor-hidden'):
            h1    = tf.layers.dense(self.observations, 128,
                                    activation=tf.nn.relu,
                                    kernel_initializer=tf.contrib.layers.xavier_initializer(),
                                    name='h1')

            drop1 = tf.layers.dropout(h1, training=self.training, name='drop1')

            h2    = tf.layers.dense(drop1, 64,
                                    activation=tf.nn.relu,
                                    kernel_initializer=tf.contrib.layers.xavier_initializer(),
                                    name='h2')

            drop2 = tf.layers.dropout(h2, training=self.training, name='drop2')

            h3    = tf.layers.dense(drop2, 32,
                                    activation=tf.nn.relu,
                                    kernel_initializer=tf.contrib.layers.xavier_initializer(),
                                    name='h3')

            drop3 = tf.layers.dropout(h3, training=self.training, name='drop3')

            self.out = tf.layers.dense(drop3, self.num_actions,
                                       kernel_initializer=tf.random_normal_initializer(),
                                       name='out')

        # Compute probabilities associated with each action.
        # Clipping keeps tf.log() below away from log(0).
        self.probabilities = tf.clip_by_value(tf.nn.softmax(self.out), 1e-10, 1.0)

        # Compute entropy based on action probabilities; shape (batch,).
        self.entropy = -tf.reduce_sum(self.probabilities * tf.log(self.probabilities), 1, name="entropy")

        # Pick, for every row of the batch, the probability of the action taken.
        # `columns` holds the row index and `actions` the column index.
        indices = tf.concat(values=[self.columns, self.actions], axis=1)
        self.picked_action_prob = tf.gather_nd(self.probabilities, indices)

        # Flatten the (batch, 1) target to (batch,) before multiplying.
        # Multiplying a (batch,) tensor by a (batch, 1) tensor would broadcast
        # to a (batch, batch) matrix and pair every log-probability with every
        # target instead of only its own.
        target = tf.reshape(self.target, [-1])
        log_prob = tf.log(self.picked_action_prob)
        self.losses = -(log_prob * target + self.entropy * self.e_weight)

        # Compute batch loss.
        self.loss = tf.reduce_sum(self.losses)

        # Set optimizer.
        self.train_op = tf.train.AdamOptimizer(self.l_rate).minimize(self.loss)

    # NOTE: computing `out` from `self.out` is not necessary -- just for debugging
    def choose_action(self, obs, verbose=False):
        """Sample an action for a single observation.

        Args:
            obs: One observation with `obs_dim` features.
            verbose: When true, print the action probabilities and raw outputs.

        Returns:
            Index of the sampled action.
        """
        out, probs = self.sess.run([self.out, self.probabilities], feed_dict={
            self.observations: np.array(obs).reshape(-1, self.obs_dim),
            self.training:     False
        })

        if verbose: print(probs, out)

        # Clipped float32 probabilities need not sum to exactly one, so
        # renormalise in float64 before handing them to np.random.choice.
        weights = probs.ravel().astype(np.float64)
        weights /= weights.sum()

        # Choose action based on computed probabilities.
        return np.random.choice(range(probs.shape[1]), p=weights)

    def train(self, act, obs, target, l_rate, e_weight):
        """Run one optimisation step on a batch and return the batch loss.

        Args:
            act: Action index (or sequence of indices), one per observation.
            obs: Observation (or sequence of observations).
            target: Policy-gradient weight for each taken action.
            l_rate: Learning rate for this step.
            e_weight: Weight of the entropy bonus.

        Returns:
            The summed loss of the batch, evaluated in the same run as the update.
        """
        actions = np.array(act).reshape(-1, 1)

        _, loss = self.sess.run((self.train_op, self.loss), feed_dict={
            self.actions:      actions,
            self.columns:      np.arange(actions.shape[0]).reshape(-1, 1),
            self.e_weight:     e_weight,
            self.l_rate:       l_rate,
            self.observations: np.array(obs).reshape(-1, self.obs_dim),
            self.target:       np.array(target).reshape(-1, 1),
            self.training:     True
        })

        return loss


In [38]:
class Critic(object):
    """State-value network (the "critic") trained by squared-error regression.

    The network is two ReLU dense layers (128, 64 units) and a single linear
    output unit.

    Args:
        sess: TensorFlow session used to run the graph.
        obs_dim: Number of features in a single observation.
    """

    def __init__(self, sess, obs_dim=8):
        self.sess = sess
        self.obs_dim = obs_dim
        self._build()

    def _build(self):
        """Create placeholders, the network, the loss and the training op."""
        self.l_rate       = tf.placeholder(tf.float32)
        self.observations = tf.placeholder(tf.float32, (None, self.obs_dim))
        self.target       = tf.placeholder(tf.float32, (None, 1))

        # Scope and layer names are kept as-is so variable names do not change.
        with tf.variable_scope('critic-hidden'):
            h1  = tf.layers.dense(self.observations, 128,
                                  activation=tf.nn.relu,
                                  kernel_initializer=tf.contrib.layers.xavier_initializer(),
                                  name='h1')

            h2  = tf.layers.dense(h1, 64,
                                  activation=tf.nn.relu,
                                  kernel_initializer=tf.contrib.layers.xavier_initializer(),
                                  name='h2')

            out = tf.layers.dense(h2, 1,
                                  kernel_initializer=tf.contrib.layers.xavier_initializer(),
                                  name='out')

        # Squeezed view returned by predict(): [[num]] -> num, [[a], [b]] -> [a, b].
        self.value_estimate = tf.squeeze(out)

        # Compare the un-squeezed (batch, 1) output with the (batch, 1) target.
        # Using the squeezed (batch,) estimate here would broadcast against the
        # (batch, 1) target into a (batch, batch) matrix of differences.
        self.losses = tf.squared_difference(out, self.target)

        # Sum to a scalar batch loss (an empty batch then yields 0, not NaN).
        self.loss = tf.reduce_sum(self.losses)
        self.train_op = tf.train.AdamOptimizer(self.l_rate).minimize(self.loss)

    def predict(self, obs):
        """Return the value estimate for one observation or a batch of them.

        A single observation yields a scalar; a batch yields a 1-D array.
        """
        # Use the session given to the constructor, not a notebook global.
        return self.sess.run(self.value_estimate, feed_dict={
            self.observations: np.array(obs).reshape(-1, self.obs_dim)
        })

    def update(self, obs, target, l_rate=0.01):
        """Run one optimisation step towards `target` and return the batch loss.

        Args:
            obs: Observation (or sequence of observations).
            target: Regression target for each observation.
            l_rate: Learning rate for this step.

        Returns:
            The summed squared error of the batch (a scalar).
        """
        _, loss = self.sess.run((self.train_op, self.loss), feed_dict={
            self.l_rate:       l_rate,
            self.observations: np.array(obs).reshape(-1, self.obs_dim),
            self.target:       np.array(target).reshape(-1, 1)
        })

        return loss
    

In [46]:
class ACHandler(object):
    """Drives actor-critic training: rollouts, TD targets, updates, checkpoints.

    Args:
        actor: Policy object providing `choose_action` and `train`.
        critic: Value object providing `predict` and `update`.
        env: Environment with the gym-style `reset`/`step`/`render`/`close` API.
        sess: TensorFlow session holding the variables to save and restore.
        path: Checkpoint path used by `save` and `load`.
    """

    def __init__(self, actor, critic, env, sess, path='./.model.ckpt'):
        self.actor = actor
        self.critic = critic
        self.env = env
        self.sess = sess

        # The Saver is created here, so the networks must already be built.
        self.saver = tf.train.Saver()
        self.path = path

    def init_vars(self):
        """Initialise every global TensorFlow variable in the session."""
        self.sess.run(tf.global_variables_initializer())

    def run(self, train_func, rollout=100, a_rate=0.001, c_rate=0.005, decay=0.99, render=False, e_weight=0.007, **kwargs):
        """Collect `rollout` episodes and train on them with `train_func`.

        Args:
            train_func: Name of a training method on this object; must start
                with 'train_' (e.g. 'train_all' or 'train_rsample').
            rollout: Number of episodes to play before training.
            a_rate: Actor learning rate.
            c_rate: Critic learning rate.
            decay: Discount factor used for the TD targets.
            render: Whether to render the environment while playing.
            e_weight: Weight of the actor's entropy bonus.
            **kwargs: Extra keyword arguments forwarded to the training method.
        """
        assert isinstance(train_func, str) and train_func.startswith('train_'), \
               'invalid train_func name specified'
        trainer = getattr(self, train_func)
        try:
            batch = self.rollout(rollout, render, decay)
            trainer(batch, a_rate, c_rate, e_weight=e_weight, **kwargs)
        finally:
            # Close the display window, even if playing or training failed.
            if render: self.env.close()

    def run_constant_training(self, num_episodes, a_rate=0.001, c_rate=0.005, decay=0.99, e_weight=0.007, render=False):
        """
        Runs training and updates both networks during every time step
        """
        for _ in range(num_episodes):
            obs_curr = self.env.reset()
            done = False

            while not done:

                if render: self.env.render()
                action = self.actor.choose_action(obs_curr)

                # Take action in environment.
                next_obs, reward, done, _ = self.env.step(action)

                # Nothing follows a finished episode, so the final target is
                # the reward alone; otherwise bootstrap from the next state.
                if done:
                    td_target = reward
                else:
                    td_target = reward + decay*self.critic.predict(next_obs)

                # The error is measured before the critic is updated.
                td_error = td_target - self.critic.predict(obs_curr)
                self.critic.update(obs_curr, td_target, c_rate)
                self.actor.train(action, obs_curr, td_error, a_rate, e_weight)

                obs_curr = next_obs

    def play(self):
        """
        Runs a single instance of the game without training or storing training information
        Always displays the game and closes the window afterward
        """
        rewards = 0
        try:
            obs_curr = self.env.reset()
            done = False
            while not done:
                self.env.render()

                # Agent chooses action based on the current observation.
                action = self.actor.choose_action(obs_curr)

                # Take action in environment.
                obs_curr, reward, done, _ = self.env.step(action)

                rewards += reward
            print('Total Reward for Episode: {}'.format(rewards))
        finally:
            self.env.close()

    def train_rsample(self, batch, a_rate, c_rate, e_weight=0.007, num_epochs=50, mini_batch_size=100):
        """
        Performs random mini-batch training on both networks from a given
          set of batch information

        Each of the `num_epochs` steps draws `mini_batch_size` indices with
        replacement. An empty batch is skipped.
        """
        num_samples = len(batch['obs'])
        if num_samples == 0:
            return

        for _ in range(num_epochs):
            indices = np.random.randint(num_samples, size=mini_batch_size)
            sample = {key: [batch[key][i] for i in indices]
                      for key in ('act', 'obs', 'advantage', 'td_target')}
            self.actor.train(sample['act'],
                             sample['obs'],
                             sample['advantage'],
                             a_rate,
                             e_weight)
            self.critic.update(sample['obs'],
                               sample['td_target'],
                               c_rate)

    def train_all(self, batch, a_rate, c_rate, e_weight=0.007):
        """
        Trains both networks on all pieces of information in the batch

        An empty batch is skipped.
        """
        if len(batch['obs']) == 0:
            return

        self.actor.train(batch['act'],
                         batch['obs'],
                         batch['advantage'],
                         a_rate,
                         e_weight)
        self.critic.update(batch['obs'],
                           batch['td_target'],
                           c_rate)

    def compute_advantage(self, obs, rewards, decay, terminal=True):
        """Compute one-step TD errors and TD targets for one episode.

        For step t the target is `rewards[t] + decay * V(obs[t+1])` and the
        error (used as the advantage) is `target - V(obs[t])`.

        Args:
            obs: Observations visited, one per reward; may carry one extra
                trailing entry (the observation after the last step).
            rewards: Reward received at each step.
            decay: Discount factor.
            terminal: When true the last step ends the episode and its target
                is the reward alone. When false and the trailing observation
                is supplied, the last target bootstraps from it.

        Returns:
            A `(td_errors, td_targets)` pair of lists, one entry per reward.

        Raises:
            ValueError: If fewer observations than rewards are supplied.
        """
        num_steps = len(rewards)
        if num_steps == 0:
            return [], []
        if len(obs) < num_steps:
            raise ValueError('expected at least {} observations, got {}'.format(num_steps, len(obs)))

        bootstrap_last = (not terminal) and len(obs) > num_steps
        needed = num_steps + 1 if bootstrap_last else num_steps

        # One batched critic call instead of two calls per step.
        values = np.asarray(self.critic.predict(obs[:needed]), dtype=np.float64).reshape(-1)

        next_values = np.zeros(num_steps, dtype=np.float64)
        next_values[:-1] = values[1:num_steps]
        if bootstrap_last:
            next_values[-1] = values[num_steps]

        value_target = np.asarray(rewards, dtype=np.float64) + decay*next_values
        policy_target = value_target - values[:num_steps]

        return policy_target.tolist(), value_target.tolist()

    def save(self):
        """Write the session's variables to the checkpoint at `self.path`."""
        self.saver.save(self.sess, self.path)

    def load(self):
        """Restore the session's variables from the checkpoint at `self.path`."""
        self.saver.restore(self.sess, self.path)

    def rollout(self, count, render, decay):
        """Play `count` episodes and return the collected training batch.

        Returns:
            Dict of equally long lists under the keys 'act', 'obs', 'rew',
            'advantage' and 'td_target', with all episodes concatenated.
        """
        batch = {'act': [], 'obs': [], 'rew': [], 'advantage':[], 'td_target':[]}

        for _ in range(count):
            # Per-episode record, merged into the batch once the episode ends.
            history = {'act': [], 'obs': [], 'rew': [], 'advantage':[], 'td_target':[]}

            obs_curr = self.env.reset()
            done = False

            while not done:

                if render: self.env.render()
                # Agent chooses action based on the current observation.
                action = self.actor.choose_action(obs_curr, False)

                # Take action in environment.
                next_obs, reward, done, _ = self.env.step(action)

                history['act'].append(action)
                history['obs'].append(obs_curr)
                history['rew'].append(reward)

                obs_curr = next_obs

            # Process rewards per episode. The final observation is appended
            # as a list element: `list + ndarray` would instead add it
            # elementwise to every stored observation.
            history['advantage'], history['td_target'] = self.compute_advantage(
                history['obs'] + [obs_curr], history['rew'], decay)
            # Add episode to batch.
            for key in batch:
                batch[key].extend(history[key])
        return batch

In [47]:
# Clear the default graph before the networks are (re)built in the cells below.
tf.reset_default_graph()
env = gym.make('LunarLander-v2') # observations are fed to the networks as 8-float vectors, not RGB frames

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m


In [48]:
# A single session is shared by the actor, the critic and the handler below.
sess = tf.Session()
actor = PolicyAgent(sess)
critic = Critic(sess)

In [49]:
# The handler creates its tf.train.Saver in the constructor, so this cell must
# run after the networks are built; the last argument is the checkpoint path.
handler = ACHandler(actor, critic, env, sess, '.models/l1.cpt')

In [53]:
# Initialise all variables (runs tf.global_variables_initializer in the session).
handler.init_vars()

In [110]:
# Train with random mini-batches until the kernel is interrupted, saving a
# checkpoint after every 100 rollout-and-train iterations.
while True:
    for _ in range(100):
        handler.run(
            'train_rsample',
            rollout=40,
            a_rate=0.001,
            c_rate=0.01,
            decay=0.97,
            e_weight=0.007,
            render=False,
            num_epochs=100,
            mini_batch_size=100,
        )
        print('-', end='')
    print('\nCompleted 100 Training Iterations\n')
    handler.save()

InvalidArgumentError: You must feed a value for placeholder tensor 'Placeholder_6' with dtype float
	 [[Node: Placeholder_6 = Placeholder[dtype=DT_FLOAT, shape=<unknown>, _device="/job:localhost/replica:0/task:0/device:CPU:0"]()]]

Caused by op 'Placeholder_6', defined at:
  File "/usr/lib/python3.5/runpy.py", line 184, in _run_module_as_main
    "__main__", mod_spec)
  File "/usr/lib/python3.5/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/home/grberlstein/rl_bot/.env/lib/python3.5/site-packages/ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "/home/grberlstein/rl_bot/.env/lib/python3.5/site-packages/traitlets/config/application.py", line 658, in launch_instance
    app.start()
  File "/home/grberlstein/rl_bot/.env/lib/python3.5/site-packages/ipykernel/kernelapp.py", line 486, in start
    self.io_loop.start()
  File "/home/grberlstein/rl_bot/.env/lib/python3.5/site-packages/tornado/platform/asyncio.py", line 127, in start
    self.asyncio_loop.run_forever()
  File "/usr/lib/python3.5/asyncio/base_events.py", line 345, in run_forever
    self._run_once()
  File "/usr/lib/python3.5/asyncio/base_events.py", line 1312, in _run_once
    handle._run()
  File "/usr/lib/python3.5/asyncio/events.py", line 125, in _run
    self._callback(*self._args)
  File "/home/grberlstein/rl_bot/.env/lib/python3.5/site-packages/tornado/platform/asyncio.py", line 117, in _handle_events
    handler_func(fileobj, events)
  File "/home/grberlstein/rl_bot/.env/lib/python3.5/site-packages/tornado/stack_context.py", line 276, in null_wrapper
    return fn(*args, **kwargs)
  File "/home/grberlstein/rl_bot/.env/lib/python3.5/site-packages/zmq/eventloop/zmqstream.py", line 450, in _handle_events
    self._handle_recv()
  File "/home/grberlstein/rl_bot/.env/lib/python3.5/site-packages/zmq/eventloop/zmqstream.py", line 480, in _handle_recv
    self._run_callback(callback, msg)
  File "/home/grberlstein/rl_bot/.env/lib/python3.5/site-packages/zmq/eventloop/zmqstream.py", line 432, in _run_callback
    callback(*args, **kwargs)
  File "/home/grberlstein/rl_bot/.env/lib/python3.5/site-packages/tornado/stack_context.py", line 276, in null_wrapper
    return fn(*args, **kwargs)
  File "/home/grberlstein/rl_bot/.env/lib/python3.5/site-packages/ipykernel/kernelbase.py", line 283, in dispatcher
    return self.dispatch_shell(stream, msg)
  File "/home/grberlstein/rl_bot/.env/lib/python3.5/site-packages/ipykernel/kernelbase.py", line 233, in dispatch_shell
    handler(stream, idents, msg)
  File "/home/grberlstein/rl_bot/.env/lib/python3.5/site-packages/ipykernel/kernelbase.py", line 399, in execute_request
    user_expressions, allow_stdin)
  File "/home/grberlstein/rl_bot/.env/lib/python3.5/site-packages/ipykernel/ipkernel.py", line 208, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "/home/grberlstein/rl_bot/.env/lib/python3.5/site-packages/ipykernel/zmqshell.py", line 537, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "/home/grberlstein/rl_bot/.env/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2662, in run_cell
    raw_cell, store_history, silent, shell_futures)
  File "/home/grberlstein/rl_bot/.env/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2785, in _run_cell
    interactivity=interactivity, compiler=compiler, result=result)
  File "/home/grberlstein/rl_bot/.env/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2903, in run_ast_nodes
    if self.run_code(code, result):
  File "/home/grberlstein/rl_bot/.env/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2963, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-107-c297ec90c424>", line 2, in <module>
    actor = PolicyAgent(sess)
  File "<ipython-input-89-edf74993c741>", line 10, in __init__
    self._build()
  File "<ipython-input-89-edf74993c741>", line 23, in _build
    self.e_weight = tf.placeholder(tf.float32)
  File "/home/grberlstein/rl_bot/.env/lib/python3.5/site-packages/tensorflow/python/ops/array_ops.py", line 1808, in placeholder
    return gen_array_ops.placeholder(dtype=dtype, shape=shape, name=name)
  File "/home/grberlstein/rl_bot/.env/lib/python3.5/site-packages/tensorflow/python/ops/gen_array_ops.py", line 4848, in placeholder
    "Placeholder", dtype=dtype, shape=shape, name=name)
  File "/home/grberlstein/rl_bot/.env/lib/python3.5/site-packages/tensorflow/python/framework/op_def_library.py", line 787, in _apply_op_helper
    op_def=op_def)
  File "/home/grberlstein/rl_bot/.env/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 3392, in create_op
    op_def=op_def)
  File "/home/grberlstein/rl_bot/.env/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 1718, in __init__
    self._traceback = self._graph._extract_stack()  # pylint: disable=protected-access

InvalidArgumentError (see above for traceback): You must feed a value for placeholder tensor 'Placeholder_6' with dtype float
	 [[Node: Placeholder_6 = Placeholder[dtype=DT_FLOAT, shape=<unknown>, _device="/job:localhost/replica:0/task:0/device:CPU:0"]()]]


In [None]:
# Train on whole rollouts until the kernel is interrupted, saving a checkpoint
# after every 100 rollout-and-train iterations.
while True:
    for _ in range(100):
        handler.run(
            'train_all',
            rollout=30,
            a_rate=0.001,
            c_rate=0.005,
            decay=0.99,
            render=False,
        )
        print('-', end='')
    print('Completed 100 Training Iterations\n')
    handler.save()

In [None]:
# Online alternative: update both networks after every environment step.
handler.run_constant_training(10000,render=True, decay=0.99, a_rate=0.002, c_rate=0.01, e_weight=0.01)

In [109]:
# Restore the variables from the handler's checkpoint path.
handler.load()

INFO:tensorflow:Restoring parameters from .models/l1.cpt


In [None]:
# Write the current variables to the handler's checkpoint path.
handler.save()

In [85]:
# Watch the current policy play episode after episode; interrupt the kernel to stop.
while True:
    handler.play()

Total Reward for Episode: -46.34610782563561
Total Reward for Episode: -215.93202158955063
Total Reward for Episode: -61.82729807257837
Total Reward for Episode: 107.70510878849339
Total Reward for Episode: -272.9352045647684
Total Reward for Episode: 84.7154958241119
Total Reward for Episode: -94.80646271212674
Total Reward for Episode: -104.57005108570428
Total Reward for Episode: -68.05701209406867
Total Reward for Episode: 126.80516327854224
Total Reward for Episode: -113.74487616521319
Total Reward for Episode: -245.89870312583298
Total Reward for Episode: 52.90281767657473
Total Reward for Episode: -73.00809814949372
Total Reward for Episode: -92.42599453655066
Total Reward for Episode: -121.58708441703564
Total Reward for Episode: -134.75952039104175
Total Reward for Episode: -59.72341063114468
Total Reward for Episode: -102.5078249677441
Total Reward for Episode: -82.0380089023412
Total Reward for Episode: -93.89252544934656
Total Reward for Episode: -81.26268950021465
Total Re

KeyboardInterrupt: 