In [1]:
#=======================================#
# Yes, this notebook is over-commented. #
#=======================================#

In [2]:
# Make notebook span entire screen, horizontally.
# `display` and `HTML` come from the public IPython.display module; importing
# them from IPython.core.display is deprecated.
from IPython.display import display, HTML
display(HTML('<style>.container { width:100% !important; }</style>'))

In [3]:
import os

import gym
import numpy as np
import tensorflow as tf

In [4]:
def process_rewards(rewards, decay, norm):
    """
    Turn raw per-timestep rewards into discounted returns and maybe normalize.

    Each entry of the result is the reward at that timestep plus the decayed
    sum of every later reward:

        G[t] = rewards[t] + decay * G[t + 1]

    Args:
        rewards: Sequence of per-timestep rewards for one episode.
        decay:   Discount factor applied to future rewards.
        norm:    If truthy, shift the returns to zero mean and, when their
                 standard deviation is non-zero, scale them to unit variance.

    Returns:
        A list of floats with one entry per reward.
    """

    # Always accumulate in floating point. Deriving the dtype from the input
    # would truncate the returns (and break in-place normalization) whenever
    # the rewards happen to be integers.
    discounted = np.zeros(len(rewards), dtype=np.float64)
    running_reward = 0.0

    # Walk backwards so each step can reuse the return of the step after it.
    for idx in reversed(range(len(rewards))):
        running_reward = rewards[idx] + decay * running_reward
        discounted[idx] = running_reward

    # Nothing to normalize for an empty episode.
    if norm and discounted.size:
        discounted -= discounted.mean()
        std = discounted.std()

        # A constant sequence has zero spread; leave it centered at zero.
        if std != 0:
            discounted /= std

    return discounted.tolist()

In [5]:
class Actor(object):
    """
    Policy network: maps an observation to a probability distribution over
    discrete actions and is trained with a policy-gradient style loss.

    Args:
        sess:        TensorFlow session used to run the graph.
        num_actions: Number of discrete actions (width of the output layer).
        obs_dim:     Length of a single observation vector.
    """

    def __init__(self, sess, num_actions=4, obs_dim=8):
        self.num_actions = num_actions
        self.obs_dim = obs_dim
        self.sess = sess

        self._build()

    def _build(self):
        """
        Create the placeholders, the network, the loss and the training op.
        """

        self.actions      = tf.placeholder(tf.int32, (None, 1))
        self.columns      = tf.placeholder(tf.int32, (None, 1))
        self.e_encr       = tf.placeholder(tf.float32)
        self.l_rate       = tf.placeholder(tf.float32)
        self.min_max_w    = tf.placeholder(tf.float32)
        self.observations = tf.placeholder(tf.float32, (None, self.obs_dim))
        self.target       = tf.placeholder(tf.float32, (None, 1))
        self.training     = tf.placeholder(tf.bool)

        with tf.variable_scope('actor-hidden'):
            # Three ReLU layers (h1, h2, h3), each followed by dropout that is
            # only active while the `training` placeholder is fed True.
            hidden = self.observations
            for idx, units in enumerate((256, 128, 64), start=1):
                hidden = tf.layers.dense(
                    hidden,
                    units,
                    activation=tf.nn.relu,
                    kernel_initializer=tf.contrib.layers.xavier_initializer(),
                    name='h{}'.format(idx))

                hidden = tf.layers.dropout(
                    hidden,
                    training=self.training,
                    name='drop{}'.format(idx))

            # Linear output layer with one logit per action; its kernel is
            # L2-regularized.
            out = tf.layers.dense(
                hidden,
                self.num_actions,
                kernel_initializer=tf.contrib.layers.xavier_initializer(),
                kernel_regularizer=tf.contrib.layers.l2_regularizer(scale=0.1),
                name='out')

        # Compute probabilities associated with each action. Clipping avoids
        # passing 0 to log.
        self.probabilities = tf.clip_by_value(tf.nn.softmax(out), 1e-10, 1.0)

        # Compute entropy of action probabilities, one value per timestep.
        self.entropy = -tf.reduce_sum(
            self.probabilities * tf.log(self.probabilities), 1, name="entropy")

        # Compute punishment based on the difference between the min and max
        # action probability to modulate action probability dominance.
        min_max_punishment = (tf.reduce_max(self.probabilities, axis=1) -
                              tf.reduce_min(self.probabilities, axis=1))

        # Collect probability of chosen action for each timestep. Each index
        # row is (row in batch, chosen action).
        indices = tf.concat(values=[self.columns, self.actions], axis=1)
        self.picked_action_prob = tf.gather_nd(self.probabilities, indices)

        # The target is fed as a column of shape (batch, 1) while every other
        # per-timestep term has shape (batch,). Drop the trailing axis so the
        # product below stays elementwise instead of broadcasting to a
        # (batch, batch) matrix.
        target = tf.squeeze(self.target, axis=1)

        self.timestep_losses = (
            # Policy-gradient term.
            -tf.log(self.picked_action_prob) * target

            # Incentivize entropy.
            - self.entropy * self.e_encr

            # Disincentivize dominant action probabilities.
            + min_max_punishment * self.min_max_w

            # Regularize.
            + tf.losses.get_regularization_loss())

        # Compute batch loss.
        self.loss = tf.reduce_mean(self.timestep_losses)

        # Set optimizer.
        self.train_op = tf.train.AdamOptimizer(self.l_rate).minimize(self.loss)

    def choose_action(self, obs, verbose=False):
        """
        Given an observation, use the policy network to sample an action.

        Args:
            obs:     A single observation (anything reshapeable to one row of
                     obs_dim values).
            verbose: If truthy, print the action probabilities and entropy.

        Returns:
            Index of the sampled action.
        """

        debug = (self.entropy,)
        fetches = (self.probabilities,) + (debug if verbose else ())

        # Compute probabilities associated with each action. Dropout is
        # disabled while acting.
        probs, *results = self.sess.run(fetches, feed_dict={
            self.observations: np.array(obs).reshape(-1, self.obs_dim),
            self.training:     False
        })

        if verbose:
            print(probs, *results)

        # Choose action based on computed probabilities.
        return np.random.choice(range(probs.shape[1]), p=probs.ravel())

    def train(self, act, obs, target, l_rate, e_encr, min_max_w, verbose):
        """
        Run one optimization step of the policy network on one or more
        timesteps of data.

        Args:
            act:       Chosen action(s), one per timestep.
            obs:       Observation(s), one per timestep.
            target:    Weight of each timestep's log-probability term.
            l_rate:    Learning rate for this step.
            e_encr:    Weight of the entropy bonus.
            min_max_w: Weight of the max-minus-min probability penalty.
            verbose:   If truthy, also fetch debugging values.

        Returns:
            [] when verbose is falsy, otherwise
            [loss, entropy, probabilities].
        """

        actions = np.array(act).reshape(-1, 1)
        length = actions.shape[0]
        debug = (self.loss, self.entropy, self.probabilities)

        # Only store debugging info in results.
        _, *results = self.sess.run(
            (self.train_op,) + (debug if verbose else ()),
            feed_dict={
                self.actions:      actions,
                # Row index of every timestep, paired with its action to
                # look up the probability that was picked.
                self.columns:      np.arange(length).reshape(-1, 1),
                self.e_encr:       e_encr,
                self.l_rate:       l_rate,
                self.observations: np.array(obs).reshape(-1, self.obs_dim),
                self.target:       np.array(target).reshape(-1, 1),
                self.training:     True,
                self.min_max_w:    min_max_w
            })

        return results

In [6]:
class Critic(object):
    """
    Value network: maps an observation to a single scalar value estimate and
    is trained by regression onto supplied targets.

    Args:
        sess:    TensorFlow session used to run the graph.
        obs_dim: Length of a single observation vector.
    """

    def __init__(self, sess, obs_dim=8):
        self.sess = sess
        self.obs_dim = obs_dim

        self._build()

    def _build(self):
        """
        Create the placeholders, the network, the loss and the training op.
        """

        self.l_rate       = tf.placeholder(tf.float32)
        self.observations = tf.placeholder(tf.float32, (None, self.obs_dim))
        self.target       = tf.placeholder(tf.float32, (None, 1))
        self.training     = tf.placeholder(tf.bool)

        with tf.variable_scope('critic-hidden'):
            # Two ReLU layers (h1, h2), each followed by dropout that is only
            # active while the `training` placeholder is fed True.
            hidden = self.observations
            for idx, units in enumerate((256, 128), start=1):
                hidden = tf.layers.dense(
                    hidden,
                    units,
                    activation=tf.nn.relu,
                    kernel_initializer=tf.contrib.layers.xavier_initializer(),
                    name='h{}'.format(idx))

                hidden = tf.layers.dropout(
                    hidden,
                    name='drop{}'.format(idx),
                    training=self.training)

            out = tf.layers.dense(
                hidden,
                1,
                kernel_initializer=tf.contrib.layers.xavier_initializer(),
                name='out')

        # Output layer is a single node. Squeezing unpacks the value estimation
        # from that node.
        self.value_estimate = tf.squeeze(out)

        # Compute loss for each timestep in batch. The un-squeezed output is
        # compared with the target because both have shape (batch, 1); using
        # the squeezed estimate would broadcast to a (batch, batch) matrix.
        self.losses = tf.squared_difference(out, self.target)

        # Compute batch loss.
        self.loss = tf.reduce_mean(self.losses)

        # Set optimizer.
        self.train_op = tf.train.AdamOptimizer(self.l_rate).minimize(self.loss)

    def predict(self, obs):
        """
        Estimate the value of one or more observations.

        Args:
            obs: Observation(s); reshaped to rows of obs_dim values.

        Returns:
            The squeezed network output, as returned by the session.
        """

        # Use the session this critic was built with rather than whatever
        # `sess` happens to be defined globally. Dropout is disabled.
        return self.sess.run(self.value_estimate, feed_dict={
            self.observations: np.array(obs).reshape(-1, self.obs_dim),
            self.training:     False
        })

    def update(self, obs, target, l_rate, verbose):
        """
        Run one optimization step of the critic on one or more timesteps.

        Args:
            obs:     Observation(s), one per timestep.
            target:  Value target(s), one per timestep.
            l_rate:  Learning rate for this step.
            verbose: If truthy, also fetch the batch loss.

        Returns:
            [] when verbose is falsy, otherwise [loss].
        """

        debug = (self.loss,)

        # Only store debugging info in results.
        _, *results = self.sess.run(
            (self.train_op,) + (debug if verbose else ()),
            feed_dict={
                self.l_rate:       l_rate,
                self.observations: np.array(obs).reshape(-1, self.obs_dim),
                self.target:       np.array(target).reshape(-1, 1),
                self.training:     True
            })

        return results

In [7]:
class Handler(object):
    """
    Ties an actor, a critic and an environment together: collects episodes,
    runs the different training schemes, plays episodes for display and
    saves/restores the session's variables.

    Args:
        actor:  Object providing choose_action(obs, verbose) and train(...).
        critic: Object providing predict(obs) and update(...).
        env:    Environment with reset(), step(action), render() and close();
                step must return a 4-tuple (obs, reward, done, info).
        sess:   TensorFlow session holding the networks' variables.
        path:   Checkpoint path used by save() and restore().
    """

    def __init__(self, actor, critic, env, sess, path='./.model.ckpt'):
        self.actor = actor
        self.critic = critic
        self.env = env
        self.sess = sess

        self.saver = tf.train.Saver()
        self.path = path

    def init_vars(self):
        """Initialize every global TensorFlow variable in the session."""
        self.sess.run(tf.global_variables_initializer())

    def run(self,

            # Specifies which training function to use.
            train_fn_name,

            # Number of episodes per batch. Not used with batch-less training.
            rollout  = 100,

            # Number of total episodes to train on. Not used with batch training.
            episodes = 1000,

            # Learning rate of actor.
            a_rate   = 0.001,

            # Learning rate of critic.
            c_rate   = 0.005,

            # Weight with which to augment entropy-based encouragement.
            e_encr   = 0.007,

            # Reward decay.
            decay    = 0.99,

            # Weight to augment punishment based on maximum and minimum
            # output probability.
            min_max_w = 1,

            # Normalize rewards per episode?
            norm     = False,

            # Render simulation to screen?
            render   = False,

            # Print debugging info?
            verbose  = False,

            # Hyperparameters specific to a training function.
            **kwargs
        ):

        """
        Run an arbitrary training function.

        The run method specifies defaults for hyperparameters common to all
        training functions. Parameters specific to individual training functions
        are passed along in kwargs.

        Episodes are only used for batch-less training and rollout is only used
        for batch-based training.

        Raises:
            AssertionError: if train_fn_name is not a string starting with
                            'train_'.
            AttributeError: if no method with that name exists.
        """

        # All training functions must be prefixed with 'train_'.
        assert isinstance(train_fn_name, str) and train_fn_name.startswith('train_'), \
               'Invalid train_func name specified.'

        # Retrieve training function.
        train_fn = getattr(self, train_fn_name)

        try:
            # Provide total number of episodes to run for batch-less training
            # or do rollout for batch training.
            if train_fn_name == 'train_constant':
                train_fn(episodes, render, a_rate, c_rate, decay, e_encr,
                         min_max_w, verbose, **kwargs)
            else:
                batch = self.rollout(rollout, render, decay, norm)
                train_fn(batch, a_rate, c_rate, e_encr, min_max_w, verbose,
                         **kwargs)
        finally:
            # Close the display window, even if training was interrupted.
            if render:
                self.env.close()

    def train_constant(self,

            # Total number of episodes to train on.
            num_episodes,

            render,

            # Hyperparameters.
            a_rate, c_rate, decay, e_encr, min_max_w,

            verbose
        ):

        """
        Run actor and train both networks on every timestep.

        This training method is dispatched specially by run because run's
        default path builds a rollout batch and this method does not use
        batches.
        """

        for _episode in range(num_episodes):
            # Always use the handler's own environment, not a global one.
            obs_curr = self.env.reset()
            done = False

            a_episode_loss = []
            c_episode_loss = []
            rewards = 0

            while not done:
                if render:
                    self.env.render()

                # Actor chooses action.
                action = self.actor.choose_action(obs_curr)

                # Actor takes action in environment.
                next_obs, reward, done, _ = self.env.step(action)

                # Critic estimates value of next state. Nothing follows the
                # last step of an episode, so no future value is bootstrapped
                # there.
                # NOTE(review): `done` is treated as a true terminal state; if
                # the environment also sets it on time-limit truncation,
                # confirm this is the desired handling.
                next_estimate = 0.0 if done else self.critic.predict(next_obs)

                # Compute value of current state assuming critic is accurate.
                td_target = reward + decay * next_estimate

                # Compare indirect value estimation with direct value estimation of current state.
                td_error = td_target - self.critic.predict(obs_curr)

                # Train actor and critic.
                c_debug = self.critic.update(obs_curr, td_target, c_rate, verbose)
                a_debug = self.actor.train(action, obs_curr, td_error, a_rate,
                                           e_encr, min_max_w, verbose)

                if verbose:
                    # Get loss from debug info.
                    a_episode_loss.append(a_debug[0])
                    c_episode_loss.append(c_debug[0])

                    rewards += reward

                obs_curr = next_obs

            if verbose:
                print('Actor Episode Loss: {:14.7f}'.format(np.mean(a_episode_loss)), end='; ')
                print('Critic Episode Loss: {:14.7f}'.format(np.mean(c_episode_loss)), end='; ')
                print('Episode Reward: {:14.7f}'.format(rewards))

    def train_rsample(self,

            # Default batch-training parameters.
            batch, a_rate, c_rate, e_encr, min_max_w, verbose,

            # Rounds of training to perform.
            num_epochs      = 50,

            # Number of random timesteps to train on.
            mini_batch_size = 100
        ):

        """
        Performs random mini-batch training on both networks given a batch of
        episode information.

        Each epoch draws mini_batch_size timestep indices uniformly with
        replacement from the batch and trains both networks on them.
        """

        a_batch_loss = []
        c_batch_loss = []

        for _ in range(num_epochs):
            # Randomly select timesteps to train on.
            indices = np.random.randint(len(batch['obs']), size=mini_batch_size)

            a_debug = self.actor.train(
                # Batched episode information.
                [batch['act'][i] for i in indices],
                [batch['obs'][i] for i in indices],
                [batch['advantage'][i] for i in indices],

                # Hyperparameters.
                a_rate, e_encr, min_max_w,

                verbose)

            c_debug = self.critic.update(
                # Batched episode information.
                [batch['obs'][i] for i in indices],
                [batch['td_target'][i] for i in indices],

                # Hyperparameters.
                c_rate,

                verbose)

            if verbose:
                a_batch_loss.append(a_debug[0])
                c_batch_loss.append(c_debug[0])

        if verbose:
            print('Actor Batch Loss: {:14.7f}'.format(np.mean(a_batch_loss)), end='; ')
            # Report the critic's own loss under the critic label.
            print('Critic Batch Loss: {:14.7f}'.format(np.mean(c_batch_loss)), end='; ')
            print('Batch Reward: {:14.7f}'.format(batch['avg_rew']))

    def train_all(self,

            # Default batch-training parameters.
            batch, a_rate, c_rate, e_encr, min_max_w, verbose
        ):

        """
        Trains both networks on the entirety of a batch of episode information.
        """

        a_debug = self.actor.train(
            # Batched episode information.
            batch['act'],
            batch['obs'],
            batch['advantage'],

            # Hyperparameters.
            a_rate, e_encr, min_max_w,

            verbose)

        c_debug = self.critic.update(
            # Batched episode information.
            batch['obs'],
            batch['td_target'],

            # Hyperparameters.
            c_rate,

            verbose)

        if verbose:
            print('Actor Batch Loss: {:14.7f}'.format(a_debug[0]), end='; ')
            print('Critic Batch Loss: {:14.7f}'.format(c_debug[0]), end='; ')
            print('Batch Reward: {:14.7f}'.format(batch['avg_rew']))

    def compute_advantage(self, obs, rewards, decay, norm):
        """
        Build the training targets for one episode.

        Args:
            obs:     Observations of the episode; only the first len(rewards)
                     entries are used.
            rewards: Raw reward per timestep.
            decay:   Reward decay passed to process_rewards.
            norm:    Whether process_rewards should normalize.

        Returns:
            (policy_target, value_target): two lists with one float per
            timestep. value_target holds the processed rewards; policy_target
            holds the processed reward minus the critic's estimate.
        """

        # Discount and maybe normalize rewards. These are the critic's targets.
        disc_rewards = process_rewards(rewards, decay, norm)
        steps = len(disc_rewards)

        if steps == 0:
            return [], []

        value_target = np.asarray(disc_rewards, dtype=np.float64)

        # Critic estimates the value of every visited state in one call.
        # Flattening copes with the estimate coming back as a bare scalar
        # when the episode has a single timestep.
        estimates = np.reshape(self.critic.predict(obs[:steps]), -1)

        # Advantage for actor is error in critic's estimation.
        policy_target = value_target - estimates

        return policy_target.tolist(), value_target.tolist()

    def save(self):
        """Write the session's variables to the checkpoint path."""
        # Make sure the checkpoint's parent directory exists before saving.
        directory = os.path.dirname(self.path)
        if directory:
            os.makedirs(directory, exist_ok=True)

        self.saver.save(self.sess, self.path)

    def restore(self):
        """Load the session's variables from the checkpoint path."""
        self.saver.restore(self.sess, self.path)

    def play(self, verbose=False):
        """
        Runs a single instance of the game without training or storing training
        information. Always displays the game and closes the window afterward.
        """

        rewards = 0

        try:
            obs_curr = self.env.reset()
            done = False

            while not done:
                self.env.render()

                # Actor chooses action.
                action = self.actor.choose_action(obs_curr, verbose=verbose)

                # Actor takes action in environment.
                obs_curr, reward, done, _ = self.env.step(action)

                rewards += reward

            if verbose:
                print('Episode Reward: {:14.7f}'.format(rewards))
        finally:
            # Close the window even when the episode is interrupted.
            self.env.close()

    def rollout(self, count, render, decay, norm):
        """
        Play `count` episodes with the current policy and collect them into
        one training batch.

        Returns:
            dict with per-timestep lists 'act', 'obs', 'rew', 'advantage' and
            'td_target' concatenated over all episodes, plus 'avg_rew', the
            total reward divided by count.
        """

        # Holds information about entire rollout of episodes.
        keys = ('act', 'obs', 'rew', 'advantage', 'td_target')
        batch = {key: [] for key in keys}

        rewards = 0

        for _episode in range(count):
            # Episode batch for current episode to be appended to rollout batch.
            history = {key: [] for key in keys}

            # Always use the handler's own environment, not a global one.
            obs_curr = self.env.reset()
            done = False

            while not done:
                if render:
                    self.env.render()

                # Actor chooses action.
                action = self.actor.choose_action(obs_curr, False)

                # Take action in environment.
                next_obs, reward, done, _ = self.env.step(action)

                history['act'].append(action)
                history['obs'].append(obs_curr)
                history['rew'].append(reward)

                rewards += reward

                obs_curr = next_obs

            # Preprocess episode information for training. Only the visited
            # observations are passed on: adding the final observation array
            # to the list with `+` would be broadcast by NumPy and change
            # every stored observation.
            history['advantage'], history['td_target'] = self.compute_advantage(
                history['obs'], history['rew'], decay, norm)

            # Add episode to batch.
            for key in keys:
                batch[key].extend(history[key])

        # Include average reward of rollout batch.
        batch['avg_rew'] = rewards / count

        return batch

In [8]:
# Start from an empty default graph so re-running the cells below does not
# stack a second copy of the networks onto the first.
tf.reset_default_graph()
# Environment the agent acts in; read as the global `env` by later cells.
env = gym.make('LunarLander-v2')

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m


In [9]:
# One session shared by both networks.
sess = tf.Session()

# Constructing each network builds its part of the graph (see _build).
actor = Actor(sess)
critic = Critic(sess)

In [10]:
# Bundle networks, environment and session. The last argument is the
# checkpoint path used by handler.save() and handler.restore().
handler = Handler(actor, critic, env, sess, '.models/l1.cpt')

In [11]:
# Initialize all global variables of both networks. Skip this cell when
# restoring a checkpoint instead.
handler.init_vars()

In [12]:
# Full-batch training. Runs until the kernel is interrupted; a checkpoint is
# written after every 100 training iterations.
while True:
    for _ in range(100):
        # Collect 40 episodes, train once on all of them, then show a game.
        handler.run(
            'train_all',
            rollout=40,
            a_rate=0.001,
            c_rate=0.005,
            decay=0.99,
            render=False,
            verbose=True,
        )
        handler.play()

    print('Completed 100 Training Iterations\n')
    handler.save()

KeyboardInterrupt: 

In [None]:
# Random mini-batch training with the default hyperparameters. Runs until the
# kernel is interrupted; a checkpoint is written after every 100 iterations.
while True:
    for _ in range(100):
        handler.run('train_rsample', verbose=True)

    print('\nCompleted 100 Training Iterations\n')
    handler.save()

In [None]:
# Per-timestep (batch-less) training for a fixed number of episodes.
handler.run(
    'train_constant',
    episodes=200,
    decay=0.98,
    a_rate=0.002,
    c_rate=0.01,
    e_encr=0.005,
    min_max_w=1,
    verbose=True,
    render=False,
)

Actor Episode Loss:      4.7153816; Critic Episode Loss:     96.9322586; Episode Reward:    -32.3958694
Actor Episode Loss:    -12.0564423; Critic Episode Loss:    253.6167450; Episode Reward:   -223.9823998
Actor Episode Loss:    -17.2791958; Critic Episode Loss:     78.2390747; Episode Reward:   -296.8419335
Actor Episode Loss:    -16.1930866; Critic Episode Loss:    127.4874725; Episode Reward:   -245.9365018
Actor Episode Loss:    -48.4692917; Critic Episode Loss:    157.5482025; Episode Reward:   -419.2858078
Actor Episode Loss:    -11.7122879; Critic Episode Loss:     60.8506737; Episode Reward:   -286.5809250
Actor Episode Loss:     -3.6929219; Critic Episode Loss:     63.8653831; Episode Reward:   -263.5226889
Actor Episode Loss:    -88.1345520; Critic Episode Loss:    112.0604782; Episode Reward:   -268.3185656
Actor Episode Loss:   -273.0624695; Critic Episode Loss:   2933.2211914; Episode Reward:   -415.7032026
Actor Episode Loss:    -15.4538879; Critic Episode Loss:    450.

In [None]:
# Write the current variables to the handler's checkpoint path.
handler.save()

In [None]:
# Load variables from the handler's checkpoint path into the session.
handler.restore()

In [None]:
# Watch the current policy play, one episode after another, until the kernel
# is interrupted.
while True:
    handler.play(verbose=True)