In [2]:
#=======================================#
# Yes, this notebook is over-commented. #
#=======================================#

In [1]:
# Make the notebook span the entire screen horizontally.
# `IPython.display` is the public import location; importing `display` from
# `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [26]:
import gym
import numpy as np
import os
import tensorflow as tf

In [65]:
# TODO:
#  -> fix loss function
#  -> check reward processing

class PolicyAgent(object):
    """Policy-gradient agent backed by a small convolutional network (TF 1.x).

    The network maps a batch of image observations to one logit per action.
    Actions are sampled from the softmax of those logits, and training
    minimises the reward-weighted negative log-probability of the actions
    that were taken.

    Args:
        n_actions: Number of discrete actions the policy chooses between.
        obs_shape: Shape of a single observation, `(height, width, channels)`.
    """

    def __init__(self, n_actions=4, obs_shape=(210, 160, 3)):
        self.n_actions = n_actions
        self.obs_shape = tuple(obs_shape)

        # Build graph (i.e. internal neural network).
        self._build()

        # Session in which the graph is evaluated, plus a saver for checkpoints.
        self.sess = tf.Session()
        self.saver = tf.train.Saver()

        # Run the op returned by global_variables_initializer() so that every
        # variable created in _build() has a value before first use.
        self.sess.run(tf.global_variables_initializer())

    def _build(self):
        """Create placeholders, the network, the loss and the training op."""
        self.obs      = tf.placeholder(tf.float32, (None,) + self.obs_shape)
        self.acts     = tf.placeholder(tf.int32, (None,))
        self.rew      = tf.placeholder(tf.float32, (None,))
        self.l_rate   = tf.placeholder(tf.float32)
        self.training = tf.placeholder(tf.bool)

        c1 = tf.layers.conv2d(self.obs, 16, (3, 3), name='c1')
        m1 = tf.layers.max_pooling2d(c1, [2, 2], [2, 2], name='m1')
        c2 = tf.layers.conv2d(m1, 32, [3, 3], name='c2')
        m2 = tf.layers.max_pooling2d(c2, [2, 2], [1, 1], name='m2')
        flat = tf.layers.flatten(m2, name='flat')

        # No explicit kernel_initializer: the layers fall back to TensorFlow's
        # default (Glorot-uniform), which scales weights by fan-in/fan-out.
        # A unit-variance normal initializer on the very wide flattened input
        # (~245k features for 210x160x3 frames) produces logits large enough
        # to saturate the softmax and make the log-probabilities blow up.
        h1 = tf.layers.dense(flat, 64, activation=tf.nn.relu, name='h1')
        h2 = tf.layers.dense(h1,   64, activation=tf.nn.relu, name='h2')

        # Only active when the `training` placeholder is fed True.
        dropout = tf.layers.dropout(h2, training=self.training, name='dropout')

        # Linear activation: `sparse_softmax_cross_entropy_with_logits()`
        # expects unscaled log-probabilities (logits), not probabilities.
        out = tf.layers.dense(dropout, self.n_actions, name='out')

        # Normalized probabilities associated with each action.
        self.probabilities = tf.nn.softmax(out)

        # Cross entropy against the taken action equals -log pi(action | obs);
        # weighting it by the (processed) reward gives the policy-gradient loss.
        neg_log_prob = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=out, labels=self.acts)
        self.loss = tf.reduce_mean(neg_log_prob * self.rew)

        # Set optimizer.
        self.train_op = tf.train.AdamOptimizer(self.l_rate).minimize(self.loss)

    def save(self, path):
        """Write the current variable values to a checkpoint at `path`."""
        self.saver.save(self.sess, path)

    def load(self, path):
        """Restore variable values from the checkpoint at `path`."""
        self.saver.restore(self.sess, path)

    def close(self):
        """Release the resources held by the TensorFlow session."""
        self.sess.close()

    def choose_action(self, obs):
        """Sample an action for a single observation.

        Args:
            obs: One observation; it is reshaped to a batch of size one.

        Returns:
            Index of the sampled action, drawn according to the policy's
            action probabilities.
        """
        prob_weights = self.sess.run(self.probabilities, feed_dict={
            self.obs:      np.array(obs).reshape((-1,) + self.obs_shape),
            self.training: False
        })

        # The network emits float32; summing those can miss 1.0 by more than
        # np.random.choice tolerates, so renormalise in float64 first.
        probs = np.asarray(prob_weights, dtype=np.float64).ravel()
        probs /= probs.sum()

        # Choose action based on computed probabilities.
        return np.random.choice(prob_weights.shape[1], p=probs)

    def train(self, act, obs, rew, l_rate):
        """Run one optimisation step on a batch of experience.

        Args:
            act: Actions taken, one per observation.
            obs: Observations the actions were chosen from.
            rew: Processed rewards weighting each action's log-probability.
            l_rate: Learning rate for this step.

        Returns:
            The loss value computed for the batch.
        """
        _, loss_value = self.sess.run([self.train_op, self.loss], feed_dict={
            self.obs:      np.array(obs).reshape((-1,) + self.obs_shape),
            self.acts:     act,
            self.rew:      rew,
            self.l_rate:   l_rate,
            self.training: True
        })
        return loss_value

In [66]:
class AgentHandler(object):
    """Collects experience from an environment and trains an agent on it.

    Args:
        agent: Object providing `choose_action(obs)`, `train(act, obs, rew,
            l_rate)`, `save(path)` and `load(path)`.
        env: Environment with the classic gym interface (`reset()`,
            `step(action)` returning a 4-tuple, `render()`, `action_space`).
        path: Checkpoint path used by `save()` and `load()`.
    """

    def __init__(self, agent, env, path='./.model.ckpt'):
        self.agent = agent
        self.env = env

        # Saves all variables. Kept as an attribute for existing users;
        # save()/load() below delegate to the agent instead.
        self.saver = tf.train.Saver()
        self.path = path

    def run(self, train_func, rollout=100, l_rate=0.001, **kwargs):
        """Collect `rollout` episodes and hand them to a training method.

        Args:
            train_func: Name of a method on this class; must start with
                'train_' (e.g. 'train_rsample').
            rollout: Number of episodes to play before training.
            l_rate: Learning rate forwarded to the training method.
            **kwargs: Extra keyword arguments for the training method.
        """
        assert isinstance(train_func, str) and train_func.startswith('train_'), \
               'invalid train_func name specified'
        trainer = getattr(self, train_func)
        trainer(self.rollout(rollout), l_rate, **kwargs)

    def train_rsample(self, batch, l_rate, num_epochs=30, mini_batch_size=50):
        """Train on mini-batches sampled uniformly, with replacement.

        Args:
            batch: Dict with equally long 'act', 'obs' and 'rew' lists.
            l_rate: Learning rate forwarded to the agent.
            num_epochs: Number of mini-batches to draw and train on.
            mini_batch_size: Number of samples per mini-batch.
        """
        n_samples = len(batch['rew'])
        if n_samples == 0:
            # Nothing was collected (e.g. every episode ended immediately).
            return

        for _ in range(num_epochs):
            indices = np.random.randint(n_samples, size=mini_batch_size)
            self.agent.train([batch['act'][i] for i in indices],
                             [batch['obs'][i] for i in indices],
                             [batch['rew'][i] for i in indices],
                             l_rate)

    def process_rewards(self, rewards, decay=0.99):
        """Turn per-step rewards into standardised discounted returns.

        Args:
            rewards: Per-step rewards of one episode (ints or floats).
            decay: Discount factor applied per time step.

        Returns:
            List of floats, one per step: the discounted return from that
            step onwards, shifted to zero mean and, unless all values are
            equal, scaled to unit standard deviation. Empty input gives [].
        """
        # Always work in float: an integer buffer would truncate the
        # discounted values and reject the in-place mean subtraction.
        rewards = np.asarray(rewards, dtype=np.float64)
        if rewards.size == 0:
            return []

        discounted = np.zeros_like(rewards)
        running_reward = 0.0
        for idx in reversed(range(len(rewards))):
            running_reward = rewards[idx] + decay * running_reward
            discounted[idx] = running_reward

        discounted -= discounted.mean()
        spread = discounted.std()
        if spread != 0:
            discounted /= spread
        return discounted.tolist()

    def save(self):
        """Checkpoint the agent to `self.path`."""
        self.agent.save(self.path)

    def load(self):
        """Restore the agent from the checkpoint at `self.path`."""
        self.agent.load(self.path)

    @staticmethod
    def _difference_frame(curr, prev):
        """Return `curr - prev` without unsigned 8-bit wrap-around."""
        curr = np.asarray(curr)
        prev = np.asarray(prev)
        if curr.dtype == np.uint8 and prev.dtype == np.uint8:
            # uint8 subtraction wraps modulo 256; int16 holds [-255, 255].
            return curr.astype(np.int16) - prev.astype(np.int16)
        return curr - prev

    def rollout(self, count, render=True):
        """Play `count` episodes with the agent and collect the experience.

        The agent acts on difference frames (current minus previous
        observation). Each stored observation is the difference frame the
        corresponding action was chosen from.

        Args:
            count: Number of episodes to play.
            render: Whether to render the environment on every step.

        Returns:
            Dict with 'act', 'obs' and 'rew' lists covering all episodes;
            'rew' holds the output of `process_rewards`, applied per episode.
        """
        batch = {'act': [], 'obs': [], 'rew': []}

        for _ in range(count):
            # Per-episode storage, merged into `batch` once the episode ends.
            history = {'act': [], 'obs': [], 'rew': []}

            # Get observation of initial state of environment.
            obs_prev = self.env.reset()

            # Randomly choose first action in order to create first difference frame.
            obs_curr, _, done, _ = self.env.step(self.env.action_space.sample())

            while not done:
                # Make difference frame.
                diff_frame = self._difference_frame(obs_curr, obs_prev)
                obs_prev = obs_curr

                if render:
                    self.env.render()

                # Agent chooses action based on difference frame.
                action = self.agent.choose_action(diff_frame)

                # Take action in environment; the result becomes the current
                # observation for the next difference frame.
                obs_curr, reward, done, _ = self.env.step(action)

                history['act'].append(action)
                history['obs'].append(diff_frame)
                history['rew'].append(reward)

            # Process rewards per episode.
            history['rew'] = self.process_rewards(history['rew'])

            # Add episode to batch.
            for key in batch:
                batch[key].extend(history[key])

        return batch

In [67]:
# Start from an empty default graph so that re-running the agent-construction
# cell does not collide with the named layers ('c1', 'h1', ...) left behind by
# a previous run.
tf.reset_default_graph()

In [68]:
# Breakout yields RGB frames; PolicyAgent's observation placeholder is
# (None, 210, 160, 3) by default.
env = gym.make('Breakout-v0') # RGB observation space
agent = PolicyAgent()

# The handler plays episodes in `env` with `agent` and trains it on the result.
handler = AgentHandler(agent, env)

In [69]:
# Collect 3 episodes, then pass the batch to AgentHandler.train_rsample using
# the default learning rate (0.001).
handler.run('train_rsample', rollout=3)