In [1]:
import tensorflow as tf
from tensorflow.contrib.layers import fully_connected
import gym
import numpy as np

In [2]:
# Create the CartPole-v0 environment and reset it once; the value returned
# by reset() is kept in `obs` (later cells reshape it to (1, n_inputs)
# before feeding it to the network).
env = gym.make("CartPole-v0")
obs = env.reset()
# env.render()  # rendering is left disabled here; uncomment to enable it

[2017-01-26 00:14:05,083] Making new env: CartPole-v0


In [3]:
def discount_rewards(rewards, discount_rate):
    """Compute the discounted return-to-go for every step of one episode.

    Entry ``i`` of the result equals
    ``rewards[i] + discount_rate * rewards[i + 1] + discount_rate**2 * rewards[i + 2] + ...``.

    Args:
        rewards: indexable sequence of per-step rewards for a single episode.
        discount_rate: multiplicative factor applied to the running total
            for each step further into the future.

    Returns:
        A 1-D NumPy array with ``len(rewards)`` entries (default float dtype).
    """
    n_steps = len(rewards)
    returns_to_go = np.empty(n_steps)
    running_total = 0
    # Sweep from the last step to the first so each entry can reuse the
    # already-discounted total of everything that follows it.
    for idx in range(n_steps - 1, -1, -1):
        running_total = rewards[idx] + running_total * discount_rate
        returns_to_go[idx] = running_total
    return returns_to_go

def discount_and_normalize_rewards(all_rewards, discount_rate):
    """Discount each episode's rewards, then standardize across all episodes.

    Every episode is first converted to discounted returns with
    ``discount_rewards``. The mean and standard deviation are then computed
    over the concatenation of all episodes, and each episode is shifted and
    scaled with those shared statistics.

    Args:
        all_rewards: sequence of per-episode reward sequences.
        discount_rate: discount factor forwarded to ``discount_rewards``.

    Returns:
        A list with one NumPy array per episode, in the same order as
        ``all_rewards``, holding the normalized discounted rewards.
    """
    all_discounted_rewards = [discount_rewards(rewards, discount_rate)
                              for rewards in all_rewards]
    flat_rewards = np.concatenate(all_discounted_rewards)
    reward_mean = flat_rewards.mean()
    reward_std = flat_rewards.std()
    # When every discounted reward in the batch is (numerically) identical the
    # standard deviation is zero and the division below would produce NaN/inf.
    # In that case only center the values, which yields all-zero scores.
    negligible = np.finfo(float).eps * max(1.0, abs(reward_mean))
    if not reward_std > negligible:
        reward_std = 1.0
    return [(discounted_rewards - reward_mean) / reward_std
            for discounted_rewards in all_discounted_rewards]

In [4]:
# Build the policy network and the ops used for policy-gradient training.
# The names defined here (X, action, gradients, gradient_placeholders,
# training_op, init, saver, n_inputs) are read by the training and playing
# cells.
n_inputs = 4   # length of one observation vector fed to X
n_hidden = 4   # units in the single hidden layer
n_output = 1   # one logit; its sigmoid is the probability of action index 0
initializer = tf.contrib.layers.variance_scaling_initializer()

# Network: observation -> ELU hidden layer -> single linear logit.
X = tf.placeholder(tf.float32, shape=[None, n_inputs])
hidden = fully_connected(X, n_hidden, activation_fn=tf.nn.elu, 
                         weights_initializer=initializer)
logits = fully_connected(hidden, n_output, activation_fn=None, 
                         weights_initializer=initializer)
outputs = tf.nn.sigmoid(logits)

# Two-column probability row [outputs, 1 - outputs]; an action index (0 or 1)
# is sampled from it. Going by the variable name, index 0 is "left" and
# index 1 is "right" -- TODO confirm against the environment's action coding.
# NOTE(review): the `concat_dim=` keyword and the positional
# (logits, targets) call further down match the pre-1.0 TensorFlow API --
# confirm the targeted TensorFlow version before upgrading.
p_left_right = tf.concat(concat_dim=1, values=[outputs, 1 - outputs])
action = tf.multinomial(tf.log(p_left_right), num_samples=1)

# Treat the sampled action as if it were the correct one: the target is 1
# when index 0 was sampled and 0 when index 1 was sampled.
y = 1. - tf.to_float(action)
cross_entropy = tf.nn.sigmoid_cross_entropy_with_logits(logits, y)

learning_rate = 0.01
optimizer = tf.train.AdamOptimizer(learning_rate)
# optimizer = tf.train.GradientDescentOptimizer(learning_rate)
# Gradients are computed here but not applied directly; the training cell
# evaluates `gradients` at every step and later feeds reward-weighted means
# back in through the placeholders created below.
grads_and_vars = optimizer.compute_gradients(cross_entropy)
gradients = [grad for grad, variable in grads_and_vars]

# One placeholder per gradient tensor (same shape), paired with the variable
# it belongs to, so apply_gradients can consume externally supplied values.
gradient_placeholders = []
grads_and_vars_feed = []
for grad, variable in grads_and_vars:
    gradient_placeholder = tf.placeholder(tf.float32, shape=grad.get_shape())
    gradient_placeholders.append(gradient_placeholder)
    grads_and_vars_feed.append((gradient_placeholder, variable))
    
training_op = optimizer.apply_gradients(grads_and_vars_feed)

init = tf.global_variables_initializer()
saver = tf.train.Saver()

In [None]:
# Train the policy network with policy gradients: play a batch of episodes
# while recording the per-step gradients, weight those gradients by the
# normalized discounted rewards, and apply their mean in one optimizer step.

n_iterations = 500      # number of training iterations
n_max_steps = 1000      # max steps per episode
n_games_per_update = 10 # train the policy every 10 episodes
save_iterations = 10    # save the model every 10 training iterations
discount_rate = 0.95

with tf.Session() as sess:
    init.run()

    for iteration in range(n_iterations):
        all_rewards = []    # all sequences of raw rewards for each episode
        all_gradients = []  # gradients saved at each step of each episode
        for game in range(n_games_per_update):
            current_rewards = []   # all raw rewards from the current episode
            current_gradients = [] # all gradients from the current episode
            obs = env.reset()
            for step in range(n_max_steps):
                # Sample an action for this single observation and, in the
                # same run, evaluate the gradients for that sampled action.
                action_val, gradients_val = sess.run(
                    [action, gradients],
                    feed_dict={X: obs.reshape(1, n_inputs)})
                obs, reward, done, info = env.step(action_val[0][0])
                current_rewards.append(reward)
                current_gradients.append(gradients_val)
                if done:
                    break

            all_rewards.append(current_rewards)
            all_gradients.append(current_gradients)

        # All episodes of this batch have been played: turn the raw rewards
        # into normalized discounted scores and use them to weight the
        # gradients recorded at the corresponding steps.
        all_rewards = discount_and_normalize_rewards(all_rewards, discount_rate=discount_rate)
        feed_dict = {}
        for var_index, grad_placeholder in enumerate(gradient_placeholders):
            # multiply the gradients by the action scores, and compute the mean
            mean_gradients = np.mean(
                [reward * all_gradients[game_index][step][var_index]
                    for game_index, rewards in enumerate(all_rewards)
                    for step, reward in enumerate(rewards)],
                axis=0)
            feed_dict[grad_placeholder] = mean_gradients
        sess.run(training_op, feed_dict=feed_dict)
        if iteration % save_iterations == 0:
            # Paths are written explicitly relative to the working directory.
            # NOTE(review): some early TensorFlow Saver releases are reported
            # to reject bare filenames without a directory part -- confirm.
            saver.save(sess, './my_model_delta.ckpt')

    save_path = saver.save(sess, "./my_model_final_500.ckpt")

# Single-argument call form prints the same text on Python 2 and Python 3.
print('done: {}'.format(save_path))

In [1]:
# Playing: run the trained policy with rendering and report per-episode reward statistics

In [5]:
# Roll out the saved policy for a few rendered episodes and summarize the
# total reward obtained in each one.
n_max_steps = 1000      # max steps per episode
n_games_per_update = 10 # number of episodes to play in this cell
# Checkpoint written at the end of the training cell, addressed relative to
# the working directory instead of a machine-specific absolute path.
checkpoint_path = "./my_model_final_500.ckpt"

env.reset()
env.render()
with tf.Session() as sess:
    saver.restore(sess, checkpoint_path)
    print("restored")

    all_rewards = []    # total reward of each played episode
    for game in range(n_games_per_update):
        current_rewards = []   # all raw rewards from the current episode
        obs = env.reset()
        for step in range(n_max_steps):
            # Sample the next action from the policy for this one observation.
            action_val = sess.run(
                action,
                feed_dict={X: obs.reshape(1, n_inputs)})
            obs, reward, done, info = env.step(action_val[0][0])
            env.render()
            current_rewards.append(reward)
            if done:
                break

        all_rewards.append(sum(current_rewards))

# Single-argument call form prints the same text on Python 2 and Python 3.
print("avg: {} min: {} max: {}".format(
    np.average(all_rewards),
    np.min(all_rewards),
    np.max(all_rewards)
))
        

restored
avg: 185.9 min: 108.0 max: 295.0
