# Solving Pong with Policy Gradients in TensorFlow

Written October 2016 by Sam Greydanus.

Inspired by Karpathy's gist: https://gist.github.com/karpathy/a4166c7fe253700972fcbc77e4ea32c5

In [1]:
%matplotlib notebook
import matplotlib.pyplot as plt

import numpy as np
import gym
import tensorflow as tf

In [2]:
# --- network shape ---
n_obs = 6400           # length of one flattened 80x80 observation
h = 200                # width of the hidden layer
n_actions = 3          # size of the action set the policy chooses from

# --- optimisation ---
learning_rate = 0.0005
gamma = 0.99           # reward discount factor
decay = 0.992          # RMSProp decay rate

# --- checkpointing ---
save_path = 'Pong-v0/pong.ckpt'

In [3]:
# Trainable weight matrices of the two-layer policy network, looked up by name.
tf_model = {}

# Input -> hidden weights: truncated normal with stddev 1/sqrt(n_obs).
with tf.variable_scope('layer_one', reuse=False):
    xavier_l1 = tf.truncated_normal_initializer(
        mean=0,
        stddev=1. / np.sqrt(n_obs),
        dtype=tf.float32,
    )
    tf_model['W1'] = tf.get_variable(
        "W1", [n_obs, h], initializer=xavier_l1)

# Hidden -> action weights: truncated normal with stddev 1/sqrt(h).
with tf.variable_scope('layer_two', reuse=False):
    xavier_l2 = tf.truncated_normal_initializer(
        mean=0,
        stddev=1. / np.sqrt(h),
        dtype=tf.float32,
    )
    tf_model['W2'] = tf.get_variable(
        "W2", [h, n_actions], initializer=xavier_l2)

In [4]:
def tf_discount_rewards(tf_r): #tf_r ~ [game_steps,1]
    """Return the discounted cumulative reward for every row of ``tf_r``.

    The rewards are flipped along the first dimension, folded with
    ``running * gamma + reward`` by ``tf.scan``, and flipped back, so each
    row ends up holding its own reward plus the gamma-discounted sum of
    all rows after it.  ``gamma`` is read from module scope.
    """
    def accumulate(running_sum, reward):
        return running_sum * gamma + reward

    flip_first_dim = [True, False]  # reverse dimension 0 only
    rewards_backwards = tf.reverse(tf_r, flip_first_dim)
    returns_backwards = tf.scan(accumulate, rewards_backwards)
    return tf.reverse(returns_backwards, flip_first_dim)

def tf_policy_forward(x): #x ~ [1,D]
    """Run the two-layer policy network on ``x`` and return action probabilities.

    ``x`` is multiplied by ``tf_model['W1']``, passed through a ReLU,
    multiplied by ``tf_model['W2']`` and normalised with a softmax.
    """
    hidden = tf.nn.relu(tf.matmul(x, tf_model['W1']))
    logits = tf.matmul(hidden, tf_model['W2'])
    return tf.nn.softmax(logits)

def prepro(I):
    """ prepro 210x160x3 uint8 frame into 6400 (80x80) 1D float vector

    Rows 35..194 are kept, every second row and column is sampled, and only
    channel 0 is used.  Pixels equal to 144 or 109 (the two background
    values) or 0 become 0.0; every other pixel (paddles, ball) becomes 1.0.

    The input frame is left untouched: the mask is computed into a new array
    rather than written through a view of ``I``.

    Args:
        I: array-like frame indexable as [row, column, channel].

    Returns:
        1-D ``np.float64`` array holding the flattened binary image.
    """
    # crop + downsample by factor of 2 + take channel 0, all in one slice (a view)
    frame = np.asarray(I)[35:195:2, ::2, 0]
    # comparisons allocate fresh boolean arrays, so the caller's frame is never written
    foreground = (frame != 0) & (frame != 144) & (frame != 109)
    # np.float64 instead of the `np.float` alias, which newer NumPy releases no longer provide
    return foreground.astype(np.float64).ravel()

def plt_dynamic(x, y, ax, colors=('b',)):
    """Plot ``y`` against ``x`` on ``ax`` once per colour, then redraw the figure.

    The canvas that gets redrawn is the one owning ``ax`` (``ax.figure``), so
    the function no longer depends on a module-level ``fig`` existing.

    Args:
        x: x values handed to ``ax.plot``.
        y: y values handed to ``ax.plot``.
        ax: axes object to draw on.
        colors: iterable of format strings; one ``ax.plot`` call is made for each.
    """
    for color in colors:
        ax.plot(x, y, color)
    # redraw through the axes' own figure instead of a global `fig`
    ax.figure.canvas.draw()

In [5]:
# Placeholders fed at run time; the leading None dimension is the number of rows fed.
#   tf_x   - network inputs, n_obs values per row
#   tf_y   - labels, n_actions values per row
#   tf_epr - rewards, one value per row
tf_x = tf.placeholder(dtype=tf.float32, shape=[None, n_obs],name="tf_x")
tf_y = tf.placeholder(dtype=tf.float32, shape=[None, n_actions],name="tf_y")
tf_epr = tf.placeholder(dtype=tf.float32, shape=[None,1], name="tf_epr")

# Reward processing (PG magic): discount the fed rewards, then standardise them by
# subtracting their mean over dimension 0 and dividing by the square root of the
# variance (the 1e-6 keeps the division finite when the variance is zero).
tf_discounted_epr = tf_discount_rewards(tf_epr)
tf_mean, tf_variance= tf.nn.moments(tf_discounted_epr, [0], shift=None, name="reward_moments")
tf_discounted_epr -= tf_mean
tf_discounted_epr /= tf.sqrt(tf_variance + 1e-6)

# Build the rest of the graph: policy output, loss, optimizer and training op.
# The loss is the L2 loss between the labels and the predicted action probabilities.
# The standardised discounted rewards are handed to compute_gradients as `grad_loss`,
# i.e. as the gradient flowing into `loss`, so they weight the back-propagated signal.
# NOTE(review): `loss` comes from tf.nn.l2_loss while grad_loss has shape [None,1];
# presumably this relies on broadcasting inside the gradient ops - confirm against
# the TensorFlow version in use.
tf_aprob = tf_policy_forward(tf_x)
loss = tf.nn.l2_loss(tf_y-tf_aprob)
optimizer = tf.train.RMSPropOptimizer(learning_rate, decay=decay)
tf_grads = optimizer.compute_gradients(loss, var_list=tf.trainable_variables(), grad_loss=tf_discounted_epr)
train_op = optimizer.apply_gradients(tf_grads)

# Open an interactive session and run the variable initializer in it.
sess = tf.InteractiveSession()
tf.initialize_all_variables().run()

In [6]:
# Create the Pong environment and fetch its first raw frame.
env = gym.make("Pong-v0")
observation = env.reset()

# Previously preprocessed frame; stays None until a first frame has been processed.
prev_x = None

# Per-episode history buffers: network inputs, rewards and action labels.
xs = []
rs = []
ys = []

# Reward bookkeeping.  The running mean is seeded at 2 here; the original note
# says it usually starts around -20.48 for Pong.
reward_sum = 0
running_reward = 2
episode_number = 0

[2016-10-17 12:28:25,436] Making new env: Pong-v0
[2016-10-17 12:28:25,495] Creating monitor directory /tmp/gym-results
[2016-10-17 12:28:25,547] Starting new video recorder writing to /tmp/gym-results/openaigym.video.0.31815.video000000.mp4


In [7]:
# Try to resume from the most recent checkpoint in the directory of `save_path`.
# On success `episode_number` is set from the numeric suffix of the checkpoint
# path (the global step it was saved with); otherwise training starts fresh.
saver = tf.train.Saver(tf.all_variables())
save_dir = '/'.join(save_path.split('/')[:-1])
load_path = None
load_was_success = False
try:
    ckpt = tf.train.get_checkpoint_state(save_dir)
    # get_checkpoint_state yields None when the directory holds no checkpoint;
    # test for that explicitly instead of relying on an AttributeError.
    if ckpt is not None and ckpt.model_checkpoint_path:
        load_path = ckpt.model_checkpoint_path
        saver.restore(sess, load_path)
        load_was_success = True
except Exception as err:
    # `Exception` (not a bare except) so Ctrl-C / SystemExit still propagate,
    # and the reason for a failed restore is reported rather than hidden.
    print("failed to restore checkpoint from '{}': {}".format(save_dir, err))

if load_was_success:
    print("loaded model: {}".format(load_path))
    # the existing saver already covers every variable, so it is reused as-is
    episode_number = int(load_path.split('-')[-1])
else:
    print("no saved model to load. starting new session")

loaded model: models/pgpong_nobuff.ckpt-2250


In [8]:
# Live plot of the running mean reward; the x values appended below are episode numbers.
fig, ax = plt.subplots(1, 1)
ax.set_xlabel('steps') ; ax.set_ylabel('reward')
ax.set_xlim(1000, 5000) ; ax.set_ylim(-1, 10)
pxs, pys = [], []

print('ep {}: starting up...'.format(episode_number))
count = 0  # episodes finished during this run of the cell
while count < 100:
    # env.render()  # uncomment to watch the agent play

    # preprocess the observation, set input to network to be difference image
    cur_x = prepro(observation)
    x = cur_x - prev_x if prev_x is not None else np.zeros(n_obs)
    prev_x = cur_x

    # stochastically sample an action from the policy network
    feed = {tf_x: np.reshape(x, (1, -1))}
    aprob = sess.run(tf_aprob, feed)[0, :]
    action = np.random.choice(n_actions, p=aprob)
    label = np.zeros_like(aprob) ; label[action] = 1  # one-hot of the sampled action

    # step the environment and get new measurements
    # (the sampled index is shifted by one before being sent to the environment)
    observation, reward, done, info = env.step(action + 1)
    reward_sum += reward

    # record game history
    xs.append(x) ; ys.append(label) ; rs.append(reward)

    if done:
        count += 1
        running_reward = running_reward * 0.99 + reward_sum * 0.01
        epx = np.vstack(xs)
        epr = np.vstack(rs)
        epy = np.vstack(ys)
        xs, rs, ys = [], [], []  # reset game history

        # parameter update: one optimizer step on the whole episode
        feed = {tf_x: epx, tf_epr: epr, tf_y: epy}
        _ = sess.run(train_op, feed)

        # visualization
        pxs.append(episode_number)
        pys.append(running_reward)
        if episode_number % 10 == 0:
            print('ep: {}, reward: {}, mean reward: {:3f}'.format(episode_number, reward_sum, running_reward))
            plt_dynamic(pxs, pys, ax)
        else:
            print('\tep: {}, reward: {}'.format(episode_number, reward_sum))

        # checkpoint every 50 episodes, tagged with the episode number
        if episode_number % 50 == 0:
            saver.save(sess, save_path, global_step=episode_number)
            print("SAVED MODEL #{}".format(episode_number))

        # bookkeeping for the next episode
        episode_number += 1
        observation = env.reset()  # reset env
        # forget the last frame of the finished episode so the first input of the
        # next one is not a difference taken across the episode boundary
        prev_x = None
        reward_sum = 0

<IPython.core.display.Javascript object>

ep 2250: starting up...
ep: 2250, reward: 8.0, mean reward: 2.060000


[2016-10-17 12:28:55,519] Starting new video recorder writing to /tmp/gym-results/openaigym.video.0.31815.video000001.mp4


SAVED MODEL #2250
	ep: 2251, reward: 7.0
	ep: 2252, reward: -2.0
	ep: 2253, reward: 11.0
	ep: 2254, reward: 4.0
	ep: 2255, reward: 9.0
	ep: 2256, reward: 7.0


[2016-10-17 12:31:22,099] Starting new video recorder writing to /tmp/gym-results/openaigym.video.0.31815.video000008.mp4


	ep: 2257, reward: 4.0
	ep: 2258, reward: 11.0
	ep: 2259, reward: 7.0
ep: 2260, reward: 14.0, mean reward: 2.557459


[2016-10-17 12:32:35,526] Ending episode 12 because it reached the timestep limit of 10000.


	ep: 2261, reward: -2.0
	ep: 2262, reward: -12.0
	ep: 2263, reward: 8.0
	ep: 2264, reward: 7.0
	ep: 2265, reward: 9.0
	ep: 2266, reward: 3.0
	ep: 2267, reward: -3.0
	ep: 2268, reward: -1.0
	ep: 2269, reward: -4.0
ep: 2270, reward: 3.0, mean reward: 2.390286
	ep: 2271, reward: -4.0
	ep: 2272, reward: -5.0
	ep: 2273, reward: 5.0
	ep: 2274, reward: 2.0
	ep: 2275, reward: -9.0


[2016-10-17 12:37:57,198] Starting new video recorder writing to /tmp/gym-results/openaigym.video.0.31815.video000027.mp4


	ep: 2276, reward: 6.0
	ep: 2277, reward: 2.0
	ep: 2278, reward: 12.0
	ep: 2279, reward: 2.0
ep: 2280, reward: 4.0, mean reward: 2.313351
	ep: 2281, reward: 1.0
	ep: 2282, reward: -7.0


[2016-10-17 12:40:36,887] Ending episode 34 because it reached the timestep limit of 10000.


	ep: 2283, reward: 0.0
	ep: 2284, reward: -4.0
	ep: 2285, reward: 6.0
	ep: 2286, reward: 3.0
	ep: 2287, reward: 1.0
	ep: 2288, reward: -10.0
	ep: 2289, reward: 6.0
ep: 2290, reward: 11.0, mean reward: 2.166008
	ep: 2291, reward: 6.0
	ep: 2292, reward: 3.0


[2016-10-17 12:44:07,992] Ending episode 44 because it reached the timestep limit of 10000.


	ep: 2293, reward: 1.0
	ep: 2294, reward: 9.0
	ep: 2295, reward: 8.0
	ep: 2296, reward: 7.0
	ep: 2297, reward: -2.0
	ep: 2298, reward: 12.0
	ep: 2299, reward: 9.0
ep: 2300, reward: 5.0, mean reward: 2.516073
SAVED MODEL #2300
	ep: 2301, reward: -6.0
	ep: 2302, reward: 8.0


[2016-10-17 12:47:19,077] Ending episode 54 because it reached the timestep limit of 10000.


	ep: 2303, reward: 3.0
	ep: 2304, reward: 7.0
	ep: 2305, reward: 1.0
	ep: 2306, reward: 9.0
	ep: 2307, reward: 4.0
	ep: 2308, reward: 10.0
	ep: 2309, reward: 7.0
ep: 2310, reward: -5.0, mean reward: 2.640451
	ep: 2311, reward: 5.0
	ep: 2312, reward: -9.0


[2016-10-17 12:50:44,667] Starting new video recorder writing to /tmp/gym-results/openaigym.video.0.31815.video000064.mp4


	ep: 2313, reward: 7.0
	ep: 2314, reward: -2.0
	ep: 2315, reward: 4.0
	ep: 2316, reward: -9.0
	ep: 2317, reward: -10.0
	ep: 2318, reward: -1.0
	ep: 2319, reward: -4.0
ep: 2320, reward: 9.0, mean reward: 2.292175
	ep: 2321, reward: -7.0
	ep: 2322, reward: 14.0
	ep: 2323, reward: 5.0
	ep: 2324, reward: 8.0
	ep: 2325, reward: 4.0
	ep: 2326, reward: 7.0
	ep: 2327, reward: 4.0
	ep: 2328, reward: -6.0
	ep: 2329, reward: -3.0
ep: 2330, reward: 13.0, mean reward: 2.445749
	ep: 2331, reward: 5.0
	ep: 2332, reward: 4.0
	ep: 2333, reward: 8.0


[2016-10-17 12:58:10,821] Ending episode 85 because it reached the timestep limit of 10000.


	ep: 2334, reward: 2.0


[2016-10-17 12:58:36,849] Ending episode 86 because it reached the timestep limit of 10000.


	ep: 2335, reward: -1.0
	ep: 2336, reward: 3.0
	ep: 2337, reward: 13.0
	ep: 2338, reward: -2.0
	ep: 2339, reward: 3.0
ep: 2340, reward: 11.0, mean reward: 2.653417
	ep: 2341, reward: 10.0
	ep: 2342, reward: 13.0
	ep: 2343, reward: -2.0
	ep: 2344, reward: 9.0
	ep: 2345, reward: 9.0
	ep: 2346, reward: 1.0
	ep: 2347, reward: -4.0
	ep: 2348, reward: 13.0


[2016-10-17 13:03:08,154] Finished writing results. You can upload them to the scoreboard via gym.upload('/tmp/gym-results')


	ep: 2349, reward: 5.0
