# Imports

In [None]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import tensorflow as tf
import tensorflow.contrib.layers as layers
from tensorflow.contrib.layers import fully_connected as fc
from tensorflow.contrib.layers import xavier_initializer as xavier

import gym
import gym.spaces

### REINFORCE algorithm

In [None]:
# Create the CartPole-v0 environment used for both training and evaluation.
env = gym.make("CartPole-v0")
# Seed the environment's random number generator for reproducibility.
# NOTE(review): only the environment is seeded here; no NumPy or TensorFlow
# seed is set in the visible code, so action sampling and weight
# initialisation may still differ between runs -- confirm if intended.
env.seed(1)

In [None]:
# environment dimensions (read from the env; these size the network's
# input placeholder and output layer rather than being tunable settings)

# shape tuple of a single observation
state_shape = env.observation_space.shape
# number of discrete actions the agent can choose from
num_actions = env.action_space.n

In [None]:
def fc_module(input_layer, hiddens, activation_fn=tf.nn.relu):
    """ stack of fully connected layers applied to `input_layer`
        hiddens: sequence [n_1, n_2, ..., n_k]; one layer with n_i
        units is appended for each entry, in order
        activation_fn: activation used by every layer (None gives
        a linear layer)
        returns the output of the last layer, or `input_layer`
        unchanged when `hiddens` is empty
    """
    layer = input_layer
    for width in hiddens:
        # a fresh Xavier initializer is created for every layer
        initializer = xavier()
        layer = fc(
            layer,
            num_outputs=width,
            activation_fn=activation_fn,
            weights_initializer=initializer)
    return layer


class Policy:
    """ Softmax policy network with a REINFORCE-style training op.

        Inside variable scope `scope` this builds placeholders for
        states, actions and targets, a fully connected network that
        outputs one logit per action, and an Adam training op that
        minimises  sum_i target_i * (-log pi(a_i | s_i)).
    """

    def __init__(
            self,
            state_shape,
            num_actions,
            hiddens=(16, 16),
            learning_rate=1e-3,
            scope="policy"):
        """ state_shape: shape tuple of a single state
            num_actions: number of discrete actions (size of output layer)
            hiddens: sizes of the hidden layers, in order
            learning_rate: step size passed to the Adam optimizer
            scope: name of the variable scope the graph is built in
        """

        with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):

            # Define placeholders for states, actions and targets.
            # Actions and targets are fed as column vectors of shape (N, 1).
            self.states_ph = tf.placeholder(
                dtype=tf.float32, shape=(None,)+state_shape)
            self.actions_ph = tf.placeholder(
                dtype=tf.int32, shape=(None, 1))
            self.targets_ph = tf.placeholder(
                dtype=tf.float32, shape=(None, 1))

            # Construct network graph: hidden stack followed by a linear
            # layer producing one logit per action
            out = fc_module(self.states_ph, hiddens)
            self.logits = fc_module(out, [num_actions], activation_fn=None)
            # tf.squeeze without an axis removes every size-1 dimension, so
            # a batch holding a single state yields a 1-D probability
            # vector (the sampling code in this file relies on that).
            self.probs = tf.squeeze(tf.nn.softmax(self.logits))

            # Flatten the (N, 1) column vectors to shape (N,).  The cross
            # entropy below returns one value per example, i.e. shape (N,);
            # multiplying that by (N, 1) targets would broadcast to an
            # (N, N) matrix and the summed loss would become
            # sum(neg_likelihoods) * sum(targets) instead of the intended
            # per-step weighted sum.
            actions = tf.reshape(self.actions_ph, [-1])
            targets = tf.reshape(self.targets_ph, [-1])

            # Construct loss as softmax cross entropy with logits, i.e.
            # -log pi(a_i | s_i), weighted element-wise by the targets
            actions_onehot = tf.one_hot(actions, depth=num_actions)
            neg_likelihoods = tf.nn.softmax_cross_entropy_with_logits_v2(
                labels=actions_onehot, logits=self.logits)
            self.loss = tf.reduce_sum(
                tf.multiply(neg_likelihoods, targets))

            # Set optimizer and training operation
            self.optimizer = tf.train.AdamOptimizer(learning_rate)
            self.train_op = self.optimizer.minimize(self.loss)

    def __call__(self, sess, states):
        """ estimate vector of action probabilities (policy)
            for a given batch of states

            sess: session in which the graph is evaluated
            states: array matching the states placeholder, (N,)+state_shape
            returns the softmax probabilities with all size-1 dimensions
            squeezed out (a 1-D vector when N == 1)
        """
        return sess.run(self.probs, {self.states_ph: states})

    def train(self, sess, states, actions, targets):
        """ run one optimisation step on a batch and return its loss

            states: array of shape (N,)+state_shape
            actions: integer array of shape (N, 1) with the actions taken
            targets: float array of shape (N, 1) weighting each step
            returns the loss value evaluated in the same run as the update
        """
        feed_dict = {
            self.states_ph: states,
            self.actions_ph: actions,
            self.targets_ph: targets}
        loss, _ = sess.run([self.loss, self.train_op], feed_dict)
        return loss

In [None]:
# Clear the default graph before building the network in it.
tf.reset_default_graph()
sess = tf.Session()
# Policy network with two hidden layers of 128 units each.
p = Policy(state_shape, num_actions, hiddens=[128, 128], learning_rate=1e-3)
# Initialise all global variables of the graph built above.
sess.run(tf.global_variables_initializer())

In [None]:
def policy(state):
    """ action probabilities of network `p` for one single state """
    # add a leading batch axis of size 1 before feeding the network
    return p(sess, state[None, :])

In [None]:
def _discounted_returns(rewards, gamma):
    """ discounted reward-to-go for every step of one episode:
        G_t = r_t + gamma * r_{t+1} + gamma^2 * r_{t+2} + ...
    """
    returns = []
    running = 0.0
    # walk the episode backwards so each step reuses the return of
    # the step that follows it
    for r in reversed(rewards):
        running = r + gamma * running
        returns.append(running)
    returns.reverse()
    return returns


def generate_batch(env, policy, num_episodes=10, gamma=0.97):
    """ generate batch of transitions for training

        env: environment with `reset()` returning a state and `step(a)`
        returning a `(state, reward, done, info)` tuple
        policy: callable mapping one state to a 1-D vector of action
        probabilities
        num_episodes: number of episodes to roll out
        gamma: discount factor used for the targets

        returns `(states, actions, targets, total_reward)` where
        states is an array with one row per visited state, actions and
        targets are column vectors of shape (T, 1) over all T collected
        steps (targets hold the discounted reward-to-go of each step),
        and total_reward is the undiscounted reward sum of the LAST
        episode only (0 when `num_episodes` is 0)
    """

    states = []
    actions = []
    targets = []
    # defined up front so that rolling out zero episodes still returns
    total_reward = 0

    for _episode in range(num_episodes):
        s = env.reset()
        done = False
        rewards = []
        while not done:
            # evaluate the policy once per step and sample from it
            probs = policy(s)
            a = np.random.choice(len(probs), p=probs)
            s_, r, done, _info = env.step(a)
            states.append(s)
            actions.append(a)
            rewards.append(r)
            s = s_
        total_reward = sum(rewards)
        targets += _discounted_returns(rewards, gamma)

    states = np.array(states)
    actions = np.array(actions)[:, None]
    targets = np.array(targets)[:, None]

    return states, actions, targets, total_reward

In [None]:
# Train the policy: each iteration rolls out `batch_size` episodes and
# performs one gradient step on the collected transitions.
batch_size = 1
# number of iterations between two progress reports
report_every = 50
avg_reward = 0
for ep in range(1000):
    states, actions, targets, total_reward = generate_batch(env, policy, batch_size, gamma=0.99)
    avg_reward += total_reward
    loss = p.train(sess, states, actions, targets)
    # Report only after a full window has been accumulated; testing
    # `ep % 50 == 0` would fire at ep == 0 and divide a single
    # episode's reward by 50.
    if (ep + 1) % report_every == 0:
        print ("Number of episodes:", ep + 1)
        print ("Average reward over last {} episodes:".format(report_every), avg_reward/report_every)
        print ("--------------------------------------------")
        avg_reward = 0

### Test trained policy

In [None]:
def test_policy(env, policy, num_episodes):
    """ roll out `num_episodes` episodes (at most 500 steps each),
        sampling actions from `policy`, and print the reward
        averaged over the episodes
    """
    reward_sum = 0
    for _episode in range(num_episodes):
        s = env.reset()
        steps_taken = 0
        done = False
        # stop at the end of the episode or after 500 steps
        while steps_taken < 500 and not done:
            a = np.random.choice(num_actions, p=policy(s))
            s, r, done, _ = env.step(a)
            reward_sum += r
            steps_taken += 1
    mean_reward = reward_sum / num_episodes
    print ("Average reward over {} episodes is {}".format(num_episodes, mean_reward))

In [None]:
# Roll out 100 episodes with the current policy and print the average reward.
test_policy(env, policy, 100)

### Visualize trained policy

In [None]:
# Roll out one episode (at most 500 steps), rendering a frame after
# every step.
s = env.reset()
for i in range(500):
    action_probs = policy(s)
    a = np.random.choice(num_actions, p=action_probs)
    s_, r, done, _ = env.step(a)
    s = s_
    env.render()
    if done:
        break
# index of the last executed step
print (i)