### Done


### ToDo
* Break training up into smaller batches!

# CartPole with Policy Gradient

In [None]:
%%javascript

// Register "r" as a command-mode keyboard shortcut that executes every cell
// of the notebook.
function runAllCells(event) {
    IPython.notebook.execute_all_cells();
    // NOTE(review): returning false presumably tells the keyboard manager the
    // key press was handled - confirm against the notebook version in use.
    return false;
}

Jupyter.keyboard_manager.command_shortcuts.add_shortcut('r', {
    help: 'run all cells',
    help_index: 'zz',
    handler: runAllCells
});

In [None]:
from __future__ import absolute_import, division, print_function, unicode_literals

# Standard libraries

# 3rd party libraries
%matplotlib inline
import matplotlib.pyplot as plt
import time
import tensorflow as tf
import numpy as np
import gym

# Custom libraries
import CartPole_config as config
import utils
import networks
import Logger

# Library setup
# Seed from the wall clock so every run differs, but keep the value in a
# variable and print it so a particular run can be reproduced by hard-coding
# RANDOM_SEED. NumPy is seeded as well because actions are sampled with
# np.random.choice, which TensorFlow's seed does not affect.
RANDOM_SEED = int(time.time())
tf.set_random_seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
print('random seed:', RANDOM_SEED)


In [None]:
# Settings and parameters
# Name of this run; it becomes the last component of the log directory.
# Leave as None to get a fresh name from utils.time_str() in the derived
# settings, or set it to the name of an earlier run (example below) to reuse
# that run's log directory.
experiment_name = None
# experiment_name = '2017-07-09-(10-16-55)'

# Starting learning rate; it is handed to utils.Annealer together with an end
# value of 0 and the frame budget.
LEARNING_RATE = 1e-3

In [None]:
## Derived settings
# Fall back to a generated run name when none was configured.
if not experiment_name:
    experiment_name = utils.time_str()
# Every run logs into its own directory: ./logdir/<env name>/PG/<run name>
logdir = '/'.join(['./logdir', config.env_name, 'PG', experiment_name])
print('logdir: ', logdir)


In [None]:
%%time
from tensorflow.contrib.keras.api.keras.layers import Dense, Input
from tensorflow.contrib.keras.api.keras.models import Model


In [None]:
# Global progress counters. PolicyGradient.run() declares both `global` and
# increments them, so they keep their values between calls to run() unless
# this cell is executed again.
frame = 0  # environment steps taken so far
episode = 0  # episodes finished so far


In [None]:
## Create model
# Start from an empty default graph so that re-running this cell does not add
# a second copy of every op to the previous graph.
tf.reset_default_graph()
# Frame budget: run() stops once the global `frame` counter exceeds it, and it
# is also the third argument given to utils.Annealer for the learning rate.
max_train_frame = 2e6

class PolicyGradient:
    """Monte-Carlo policy-gradient agent for the gym environment named in ``config``.

    Creating an instance opens a TF session, builds the gym environment, the
    input placeholders, the Keras policy network and the training graph, and
    opens a summary writer / logger under the module-level ``logdir``.

    The class relies on these module-level names: ``config``, ``utils``,
    ``networks``, ``Logger``, ``logdir``, ``LEARNING_RATE``,
    ``max_train_frame`` and the global counters ``frame`` and ``episode``.
    """

    def __init__(self, render=False):
        """Build the session, environment, network, training graph and logging.

        Args:
            render: if True, ``run`` renders the environment after every step.
        """
        self.sess = tf.Session()
        self.train_interval = 1 # episodes
        # Learning-rate schedule built from (LEARNING_RATE, 0, max_train_frame);
        # run() queries it with .linear(frame).
        self.annealer = utils.Annealer(LEARNING_RATE, 0, max_train_frame)

        self.env = gym.make(config.env_name)
        self.render = render
        self.should_stop = False  # set by stop() / the frame budget to end run()

        # Batch-first placeholders fed by get_action() and the training step.
        self.obsPH = tf.placeholder(tf.float32, shape=[None]+[config.num_state], name='obsPlaceholder')
        self.actionPH = tf.placeholder(tf.int32, shape=[None], name='actionPlaceholder')
        self.advantagePH = tf.placeholder(tf.float32, shape=[None], name='advantagePlaceholder')
        self.learningRatePH = tf.placeholder(tf.float32, shape=[], name='learningratePlaceholder')

        self.model = self._build_model()
        self.graph = self._build_graph(self.learningRatePH)

        self.saver = tf.train.Saver(max_to_keep=5)
        self.summary_writer = tf.summary.FileWriter(logdir, self.sess.graph)
        self.logger = Logger.Logger(logdir)
        # Replace the logger's writer with ours (presumably so scalar logs and
        # graph summaries end up in the same event file - confirm in Logger).
        self.logger.writer = self.summary_writer
        self.sess.run(tf.global_variables_initializer())

    def load_model(self, path):
        """Restore the variables from the latest checkpoint found in ``path``.

        ``tf.train.get_checkpoint_state`` returns None when the directory holds
        no checkpoint; the attribute access below then raises AttributeError.
        """
        ckpt = tf.train.get_checkpoint_state(path)
        self.saver.restore(self.sess, ckpt.model_checkpoint_path)

    def save_model(self, path):
        """Write a checkpoint of the session's variables using ``path`` as prefix."""
        self.saver.save(self.sess, path)

    def _build_model(self):
        """Create the Keras model that maps the observation placeholder to features.

        The hidden layers come from ``networks.build_dense`` with
        ``config.layers``; the action head is added later in ``_build_graph``.
        """
        input_layer = Input(tensor=self.obsPH)
        model_layers = networks.build_dense(input_layer, config.layers, name_stem='dense_')
        model = Model(inputs=input_layer, outputs=model_layers)
        return model

    def _build_graph(self, learning_rate):
        """Add the action head, losses, optimizer and summaries to the graph.

        Args:
            learning_rate: scalar tensor given to the optimizer (the
                learning-rate placeholder when called from ``__init__``).

        Returns:
            An attribute bag exposing ``action_probs``, ``action_prob``,
            ``loss_policy``, ``loss_entropy``, ``loss_total``, ``train_op``
            and ``summary``.
        """
        class PGGraph: pass
        graph = PGGraph  # the class object itself serves as a plain namespace

        action_hot = tf.one_hot(self.actionPH, config.num_action)
        with tf.variable_scope('actor'):
            logits = Dense(config.num_action, activation='linear')(self.model.output)
            graph.action_probs = tf.nn.softmax(logits)
            # Probability the policy assigned to the action that was fed in.
            graph.action_prob = tf.reduce_sum(graph.action_probs * action_hot,
                                    axis=1, keep_dims=True)

        with tf.variable_scope('training'):
            # Cross-entropy against the one-hot action is -log pi(a|s);
            # weighting it by the advantage gives the policy-gradient loss.
            graph.loss_policy = tf.nn.softmax_cross_entropy_with_logits(
                labels=action_hot, logits=logits)
            graph.loss_policy = tf.reduce_mean(graph.loss_policy * self.advantagePH)

            # Mean of p*log(p), i.e. negative entropy up to a constant factor.
            # With a positive coefficient, minimising it favours higher-entropy
            # policies (assumes config.loss_entropy_coef > 0 - TODO confirm).
            graph.loss_entropy = config.loss_entropy_coef * tf.reduce_mean(
                graph.action_probs * tf.log(graph.action_probs + config.eps))

            graph.loss_total = graph.loss_policy + graph.loss_entropy

            optimizer = tf.train.RMSPropOptimizer(learning_rate, decay=0.99)
            grads_and_vars = optimizer.compute_gradients(graph.loss_total)
            grads, variables = zip(*grads_and_vars)
            # Rescale all gradients jointly so their global norm is at most 1.
            # NOTE(review): the original author left a warning that the clipped
            # output "might be totally wrong"; compare the grad_org/ and
            # grad_clip/ histograms below to check.
            clipped_gradients, _ = (tf.clip_by_global_norm(grads, 1.))

            graph.train_op = optimizer.apply_gradients(zip(clipped_gradients, variables))

        ## Create summaries
        tf.summary.scalar('training/loss_total', graph.loss_total)
        tf.summary.scalar('training/loss_policy', graph.loss_policy)
        tf.summary.scalar('training/loss_entropy', graph.loss_entropy)

        for g, v in grads_and_vars:
            if g is not None:
                # v.name ends in ':0'; strip it for the summary tag.
                tf.summary.histogram('grad_org/'+v.name[:-2], g)
                # Log the variable itself here (the gradient was logged before).
                tf.summary.histogram('var/'+v.name[:-2], v)
        for g, v in zip(clipped_gradients, variables):
            if g is not None:
                tf.summary.histogram('grad_clip/'+v.name[:-2], g)

        graph.summary = tf.summary.merge_all()
        return graph

    def stop(self):
        """Ask ``run`` to leave its loop at the next iteration."""
        self.should_stop = True

    def get_action(self, obs):
        """ Takes a single obs, and returns a single action"""
        obs = [obs]  # the placeholder expects a batch dimension
        [p] = self.sess.run(
                self.graph.action_probs,
                feed_dict={self.obsPH : obs})
        # Sample from the policy distribution instead of taking the argmax.
        a = np.random.choice(config.num_action, p=p)
        return a

    def _episode_advantages(self, rewards):
        """Turn the rewards of one finished episode into per-step training targets.

        An episode whose total reward reaches ``config.env_max_step`` counts as
        a win and gets the same small positive value (0.01) for every step;
        otherwise the targets come from ``utils.discount_rewards``.

        Returns:
            A list with one entry per step of the episode.
        """
        if sum(rewards) >= config.env_max_step: # if we win make the advantage positive for all
            return list(0.01 * np.ones_like(rewards))
        return list(utils.discount_rewards(rewards, config.gamma))

    def _train_on_batch(self, experience, step):
        """Run one optimizer step on the collected experience and log it.

        Args:
            experience: ``[observations, actions, targets]``, three lists of
                equal length.
            step: global step (frame count) used for logging and for the
                learning-rate schedule.
        """
        assert len(experience[0]) == len(experience[1]), \
            "Error: experience lenghts don't allign" + str([len(i) for i in experience])
        assert len(experience[0]) == len(experience[2]), \
            "Error: experience lenghts don't allign" + str([len(i) for i in experience])
        self.logger.log_scalar(tag='training/batch_size',
                               value=len(experience[0]),
                               step=step)

        # stack experience; reshape(-1) keeps a one-sample batch one-dimensional
        # (np.squeeze would collapse it to a 0-d array).
        obs_stack = np.vstack(experience[0])
        action_stack = np.vstack(experience[1]).reshape(-1)
        advantage = np.vstack(experience[2]).reshape(-1)

        # Normalise the targets: always centre them, and scale to unit variance
        # unless the spread is (numerically) zero, e.g. when every step of the
        # batch carries the same value.
        advantage = advantage - np.mean(advantage)
        advantage_std = np.std(advantage)
        if advantage_std > 1e-6:
            advantage = advantage / advantage_std

        learning_rate = self.annealer.linear(step)
        _, summary = self.sess.run(
                [self.graph.train_op, self.graph.summary],
                feed_dict={self.obsPH : obs_stack,
                           self.actionPH : action_stack,
                           self.advantagePH : advantage,
                           self.learningRatePH : learning_rate})
        self.logger.log_scalar('training/learning_rate', learning_rate, step)
        self.summary_writer.add_summary(summary, step)

    def run(self, load_model=False):
        """Interact with the environment and train until stopped.

        Each frame an action is sampled from the policy and applied. When an
        episode ends its targets are computed; every ``train_interval``
        episodes the collected experience is used for one training step. A
        checkpoint is written whenever the episode count is a multiple of 500
        at a training step. The loop ends when ``stop`` was called, the frame
        budget ``max_train_frame`` is exceeded, or on KeyboardInterrupt.

        Args:
            load_model: if True, try to restore the latest checkpoint from
                ``logdir`` first; a failure is reported and training starts
                from the freshly initialised weights.
        """
        global frame, episode

        if load_model:
            try:
                self.load_model(logdir)
            except Exception:
                # Best effort only; KeyboardInterrupt is deliberately not caught.
                print("Could not find model to load.")

        obs = self.env.reset()
        experience = [[], [], []]  # observations, actions, targets
        rewards = []
        try:
            while self.should_stop is False:
                frame += 1

                action = self.get_action(obs)
                # add experience to memory: store the observation the action
                # was chosen from, *before* it is overwritten by env.step().
                experience[0].append(obs)
                experience[1].append(action)

                obs, reward, done, _ = self.env.step(action)
                if self.render:
                    self.env.render()
                rewards.append(reward)

                if not done:
                    continue

                # --- episode finished ---
                self.logger.log_scalar(tag='performance/reward',
                                       value=sum(rewards),
                                       step=frame)
                episode += 1
                experience[2] += self._episode_advantages(rewards)
                rewards = []
                obs = self.env.reset()

                if episode % self.train_interval != 0:
                    continue

                self._train_on_batch(experience, frame)
                experience = [[], [], []]

                if episode % 500 == 0:
                    print(frame, 'model saved', logdir)
                    self.save_model(logdir + '/model_'+str(frame))

                if frame > max_train_frame:
                    print('max_train_frame reached')
                    self.should_stop = True

        except KeyboardInterrupt:
            print('KeyboardInterrupt')
        print('Training ended')
        # Close this agent's own render window (not that of a global `agent`).
        self.env.render(close=True)

# Build the agent without rendering and train it. load_model=True makes run()
# first try to restore a checkpoint from `logdir`; when that fails it prints
# "Could not find model to load." and trains from the initial weights.
agent = PolicyGradient(render=False)
agent.run(load_model=True)
print('done')

In [None]:
## Slow episode run
# Replays the current policy one step at a time: after every step the rendered
# frame and the policy's action probabilities so far are redrawn in place.
# Runs until interrupted; episodes restart automatically.
from IPython.display import clear_output


# setup
obs = agent.env.reset()
done = False
reward_sum = 0  # accumulated int(reward); also shown as "t" in the plot title
action_prob = [] # probability of action 1 at each step (1 = right per the plot title)
action_chosen = []  # action actually sampled at each step
# values = []
# v_max = 0
t_max = 50  # current right edge of the x-axis; grows in steps of 50
try:
    while True:    
        # Query the policy for the current observation (batch of one).
        [p] = agent.sess.run(
                agent.graph.action_probs,
                feed_dict={agent.obsPH : [obs]})
#         print(p, type(p))
        action_prob.append(p[1])
#         values.append(v)
#         v_max = 1.25*v if v > v_max else v_max
        # Sample the action from the policy distribution, as during training.
        a = np.random.choice(config.num_action, p=p)
        action_chosen.append(a)
        obs, reward, done, _ = agent.env.step(a)
        reward_sum += int(reward)
        img = agent.env.render(mode='rgb_array')

        
        # Extend the x-axis before the curve reaches its right edge.
        if t_max - reward_sum < 10:
            t_max += 50
        # Top panel: rendered frame. Bottom panel: action probability and choices.
        fig, ax = plt.subplots(2,1)
        ax[0].imshow(img)
        ax[0].axis('off')
        ax[0].set_title(experiment_name + '\nt = {:5d}, frames = {:7}'.format(reward_sum, frame))

        ax[1].set_title('Action, 1 = right')
        ax[1].plot([0, t_max],[0.5, 0.5],'k',alpha=0.5)  # reference line at p = 0.5
        ax[1].plot(action_prob)
        ax[1].plot(action_chosen, 'bo', markeredgewidth=0.0, markersize=4, alpha=0.25)
        ax[1].set_xlim([0,t_max])
        ax[1].set_ylim([-0.1, 1.1])

        if done:
            # Episode over: the figure above already holds this episode's final
            # state, so the bookkeeping can be reset for the next episode now.
#             ax[1,0].plot(ideal_value(np.ones(reward_sum), config.gamma), c='g', alpha=0.75, label='Discounted Reward')
#             ax[1,0].legend()
#             plt.savefig('tmp/training_graphs' + utils.time_str() + '.png', bbox_inches='tight')
            obs = agent.env.reset()
            done = False
            reward_sum = 0
            action_prob = [] # probability of action 1 at each step
            action_chosen = []
#             values = []
#             v_max = 0
            t_max = 50
            
        # wait=True delays clearing until the new figure is ready to be shown.
        clear_output(wait=True)
        plt.show()


except KeyboardInterrupt:
    print('KeyboardInterrupt')


print('Terminated', reward_sum)

In [None]:
# NOTE(review): leftover scratch cell - it only evaluates the literal 1 and
# nothing else in the notebook depends on it.
1