In [1]:
import gym
import keras
import numpy as np

Using TensorFlow backend.


In [2]:
def prepro(I):
    """ prepro 210x160x3 uint8 frame into 6400 (80x80) 1D float vector """
    I = I[35:195] # crop
    I = I[::2,::2,0] # downsample by factor of 2
    I[I == 144] = 0 # erase background (background type 1)
    I[I == 109] = 0 # erase background (background type 2)
    I[I != 0] = 1 # everything else (paddles, ball) just set to 1
    return I.astype(np.float32)

def run_episode(env,agent, n_episodes=1, ep = 0.05):
    '''
    A pure run_episode with an agent. Does nothing else.
    Outputs 3 lists with one entry for every step: state, action and reward
    '''
    state, action, reward, value = [], [], [], []
    episode_number = 0
    observation = env.reset()
    prev_x = None # used in computing the difference frame

    while True:
        # Process observation
        cur_x = prepro(observation)
        x = cur_x - prev_x if prev_x is not None else np.zeros(80*80)
        x = x.reshape((1, -1))
        prev_x = cur_x

        # Execute a step:
        v, a = agent(x,ep=ep)
        observation, r, done, info = env.step(a)

        # Store variables from run
        state.append(x) # note that this is the state before the action was taken
        action.append(a)
        reward.append(r)
        value.append(v)

        if done:
            episode_number += 1
            observation = env.reset() # reset env
            prev_x = None

            if episode_number >= n_episodes:
                
                reward = np.vstack(reward)
                action = np.vstack(action)
                state = np.vstack(state)
                value = np.vstack(value)
                return state,action,reward, value


# Model

In [3]:
from keras.models import Sequential, Input, Model
from keras.layers import Dense, Activation

inp = Input(shape=(80*80,))
h = Dense(200, activation='relu')(inp)
V = Dense(1,activation="tanh", name = "critic")(h)
pi = Dense(6,activation='softmax',name="policy-output")(h)

model = Model(inputs=inp, 
                  outputs=[V,pi])

import keras.backend as K
from keras.losses import mean_squared_error

def weighted_crossentropy(y_true,y_pred):
    reward_true = y_true[:,0]
    action_true = y_true[:,1:]
    ce = K.categorical_crossentropy(y_pred, action_true)
    wce = ce* reward_true
    print ce
    print wce
    return wce

model.compile(loss=[mean_squared_error, weighted_crossentropy],
               optimizer=keras.optimizers.RMSprop(lr=0.001, rho=0.9, epsilon=1e-08, decay=0.0))

Tensor("Neg:0", shape=(?,), dtype=float32)
Tensor("mul_3:0", shape=(?,), dtype=float32)


In [None]:
def agent(state, ep = 0.05):
    V, aprob = model.predict(state)
    
    if np.random.uniform() < ep:
        action = env.action_space.sample()
    else:
        action = np.argmax(aprob)
    return V,action

## Play

In [None]:
from keras.utils import to_categorical

env = gym.make("Pong-v0")
running_reward = -20.5
batch = 0
batch_ep = 20 #how many episodes to run per batch
ep = 0.1
while True:
    state,action,reward, value = run_episode(env, agent, n_episodes=batch_ep, ep = ep)

    # Update model
    V_true = reward + (reward==0.0)*value # Value is reward if done (same as reward!=0), else it is predicted value else
    pi_true_weights = np.hstack([reward, to_categorical(action)]) # Format truth value to be in accordance to weighted_crossentropy
    totloss, vloss, piloss = model.train_on_batch(state, [V_true, pi_true_weights])
    #print totloss, vloss, piloss
    # Book-keeping
    batch +=1
    reward_sum = np.sum(reward)/batch_ep
    running_reward = running_reward * 0.99 + reward_sum * 0.01
    print 'batch %d: reward total was %f. running mean: %f. value: %f.' % (batch, reward_sum, running_reward, np.mean(value))

[2017-04-05 11:31:10,014] Making new env: Pong-v0


batch 1: reward total was -20.550000. running mean: -20.500500. value: -0.086372.
batch 2: reward total was -21.000000. running mean: -20.505495. value: -0.153853.
batch 3: reward total was -20.750000. running mean: -20.507940. value: -0.246154.
batch 4: reward total was -20.500000. running mean: -20.507861. value: -0.259953.
batch 5: reward total was -20.100000. running mean: -20.503782. value: -0.316833.
batch 6: reward total was -21.000000. running mean: -20.508744. value: -0.262637.
batch 7: reward total was -20.500000. running mean: -20.508657. value: -0.340091.
