## Pong from Pixels using Keras

This is a replication of Karpathy's nice script for playing Pong from pixels, using Keras for the policy network.

https://gist.github.com/karpathy/a4166c7fe253700972fcbc77e4ea32c5

In [1]:
import gym
import keras
import numpy as np

Using TensorFlow backend.


In [2]:
# Hyperparameters, grouped by what they configure.

# -- policy network --
D = 80 * 80  # input dimensionality: flattened 80x80 grid
H = 200  # number of neurons in the hidden layer

# -- optimisation --
batch_size = 10  # number of episodes played between two parameter updates
learning_rate = 1e-3
decay_rate = 0.99  # decay factor for the RMSProp leaky sum of grad^2
gamma = 0.99  # discount factor applied to future rewards

# -- run control --
resume = False  # resume from a previous checkpoint?
render = False  # when True the play loop calls env.render() on every step
running_reward = 21.0  # initial value only; the play cell resets it to None

In [3]:
# Length of the flattened network input. Same value as the `D` assigned in
# the hyperparameter cell; it is used below as np.zeros(D) for the first
# frame of an episode.
D = 80 * 80 # input dimensionality: 80x80 grid

def prepro(I):
  """Turn a 210x160x3 uint8 frame into a flat 6400-element (80x80) float vector.

  The frame is cropped to rows 35..194, downsampled by a factor of 2 along
  rows and columns (first colour channel only) and binarised: the two
  background values (144 and 109) and 0 become 0.0, everything else
  (paddles, ball) becomes 1.0.

  Args:
    I: frame array indexed as [row, column, channel]. It is not modified.

  Returns:
    1D float64 array; 80 * 80 = 6400 entries for a 210x160x3 input.
  """
  # Basic slicing returns a view on the caller's frame, so the result is
  # derived from a boolean mask instead of assigning into the view; the
  # original in-place writes silently altered the observation passed in.
  I = I[35:195:2, ::2, 0]  # crop, then keep every second row/column
  foreground = (I != 0) & (I != 144) & (I != 109)
  # np.float was removed in NumPy 1.24; float64 is the dtype it stood for.
  return foreground.astype(np.float64).ravel()

def discount_rewards(r, discount=None):
  """Compute discounted rewards, walking backwards through time.

  Args:
    r: array of per-step rewards, indexed by time step along its first
      axis (a 1D array, or a column vector as built with np.vstack).
    discount: per-step discount factor. When None, the module-level
      `gamma` hyperparameter is used, which is the original behaviour.

  Returns:
    Array with the same shape as `r` holding the discounted return of each
    step. Floating-point inputs keep their dtype; any other dtype is
    promoted to float64 so the discounted values are not truncated.
  """
  if discount is None:
    discount = gamma
  r = np.asarray(r)
  # zeros_like(r) alone would inherit an integer dtype and round the
  # discounted values towards zero when stored.
  out_dtype = r.dtype if np.issubdtype(r.dtype, np.floating) else np.float64
  discounted_r = np.zeros_like(r, dtype=out_dtype)
  running_add = 0
  # range() instead of xrange() so the function runs on Python 2 and 3.
  for t in reversed(range(r.size)):
    if r[t] != 0: running_add = 0 # reset the sum, since this was a game boundary (pong specific!)
    running_add = running_add * discount + r[t]
    discounted_r[t] = running_add
  return discounted_r


In [5]:
from keras.models import Sequential
from keras.layers import Dense, Activation

# Policy network: one hidden ReLU layer of H units fed with the D-dimensional
# difference frame, followed by a single sigmoid unit. The play loop treats
# the sigmoid output as the probability of choosing action 2.
kmodel = Sequential()
# The layer width is passed positionally, as for the output layer below: the
# `output_dim` keyword is the Keras 1 name (renamed `units` in Keras 2),
# while the positional form is accepted by both. H and D replace the
# previously hard-coded 200 and 6400 (same values).
kmodel.add(Dense(H, input_dim=D, activation="relu"))
kmodel.add(Dense(1, activation="sigmoid"))

# NOTE(review): lr=0.01 and rho=0.9 are hard-coded and differ from the
# `learning_rate` (1e-3) and `decay_rate` (0.99) hyperparameters, which are
# not used here. Values left unchanged; confirm which ones are intended.
kmodel.compile(loss='binary_crossentropy',
               optimizer=keras.optimizers.RMSprop(lr=0.01, rho=0.9, epsilon=1e-08, decay=0.0))

## Play

In [6]:
# Play Pong indefinitely, sampling actions from the policy network and
# updating the network every `batch_size` episodes. The loop has no exit
# condition; interrupt the kernel to stop it.
env = gym.make("Pong-v0")
observation = env.reset()
prev_x = None  # previous preprocessed frame, used in computing the difference frame
xs, y_store, drs = [], [], []  # per-episode inputs, sampled labels and rewards

Y, W, X = [], [], []  # per-batch labels, sample weights and inputs for the update
running_reward = None
reward_sum = 0
episode_number = 0
while True:
    if render:
        env.render()

    # preprocess the observation, set input to network to be difference image
    cur_x = prepro(observation)
    x = cur_x - prev_x if prev_x is not None else np.zeros(D)
    prev_x = cur_x

    # forward the policy network (batch of one row) and sample an action
    # from the returned probability
    aprob = float(kmodel.predict(x.reshape(1, -1))[0][0])
    action = 2 if np.random.uniform() < aprob else 3  # roll the dice!

    # record the input and the sampled action as a "fake label"; both are
    # used as training data once the episode's rewards are known
    xs.append(x)
    y = 1 if action == 2 else 0
    y_store.append(y)

    # step the environment and get new measurements
    observation, reward, done, info = env.step(action)
    reward_sum += reward

    # record reward (has to be done after we call step() to get reward for previous action)
    drs.append(reward)

    if not done:
        continue

    # ---- an episode finished ----
    episode_number += 1

    # compute the discounted reward backwards through time
    discounted_epr = discount_rewards(np.vstack(drs))
    # standardize the rewards to be unit normal (helps control the gradient
    # estimator variance); skip the scaling when the spread is zero so the
    # weights do not turn into NaN/inf
    discounted_epr -= np.mean(discounted_epr)
    epr_std = np.std(discounted_epr)
    if epr_std > 0:
        discounted_epr /= epr_std

    # keep this episode's data until the next parameter update
    W.append(discounted_epr)
    X.append(np.vstack(xs))
    Y.append(np.vstack(y_store))
    xs, y_store, drs = [], [], []  # reset array memory

    # boring book-keeping
    running_reward = reward_sum if running_reward is None else running_reward * 0.99 + reward_sum * 0.01
    # single-argument print(...) behaves the same on Python 2 and Python 3
    print('ep %d: reward total was %f. running mean: %f' % (episode_number, reward_sum, running_reward))
    reward_sum = 0
    observation = env.reset()  # reset env
    prev_x = None

    # update parameters: the standardized discounted rewards are passed as
    # per-sample weights on the cross-entropy of the sampled actions
    if episode_number % batch_size == 0:
        err = kmodel.train_on_batch(np.vstack(X), np.vstack(Y),
                                    sample_weight=np.concatenate(W).flatten())
        print('Parameters updated. Training error: ' + str(err))
        Y, W, X = [], [], []  # delete used variables

[2017-03-14 12:00:29,634] Making new env: Pong-v0


ep 1: reward total was -20.000000. running mean: -20.000000
ep 2: reward total was -21.000000. running mean: -20.010000
ep 3: reward total was -21.000000. running mean: -20.019900
ep 4: reward total was -21.000000. running mean: -20.029701
ep 5: reward total was -21.000000. running mean: -20.039404
ep 6: reward total was -21.000000. running mean: -20.049010
ep 7: reward total was -21.000000. running mean: -20.058520
ep 8: reward total was -20.000000. running mean: -20.057935
ep 9: reward total was -21.000000. running mean: -20.067355
ep 10: reward total was -21.000000. running mean: -20.076682
Parameters updated. Training error: -0.00028408
ep 11: reward total was -21.000000. running mean: -20.085915
ep 12: reward total was -21.000000. running mean: -20.095056
ep 13: reward total was -21.000000. running mean: -20.104105
ep 14: reward total was -21.000000. running mean: -20.113064
ep 15: reward total was -21.000000. running mean: -20.121934
ep 16: reward total was -21.000000. running me

KeyboardInterrupt: 