In [3]:
import gym
import keras
import numpy as np

Using TensorFlow backend.


In [14]:
def prepro(I):
    """Turn a raw 210x160x3 uint8 frame into an 80x80 float32 binary mask.

    The frame is cropped to rows 35..194, downsampled by a factor of 2 along
    both spatial axes, and only colour channel 0 is kept. Pixels that are 0,
    144 (background type 1) or 109 (background type 2) map to 0.0; everything
    else (paddles, ball) maps to 1.0.

    The result is two-dimensional; callers that need a flat vector reshape it
    themselves. The input array is never written to: the mask is built from
    comparisons instead of assigning into slices of ``I``, because those
    slices are views onto the caller's frame.
    """
    I = I[35:195:2, ::2, 0] # crop + downsample by factor of 2, channel 0 only
    foreground = (I != 0) & (I != 144) & (I != 109) # both background types erased
    return foreground.astype(np.float32)

def run_episode(env,agent, n_episodes=1, ep = 0.05):
    '''
    Roll out `agent` in `env` for `n_episodes` complete episodes and collect
    what happened; no learning takes place here.

    Each step feeds the agent the difference between the current and the
    previous preprocessed frame (all zeros on the first step of an episode),
    flattened to a single row. `ep` is forwarded to the agent unchanged.

    Returns four arrays stacked with np.vstack, one row per step across all
    episodes: state, action, reward, value.
    '''
    states, actions, rewards, values = [], [], [], []
    finished = 0
    frame = env.reset()
    last_frame = None # None marks "no previous frame yet" for the difference

    while True:
        # Build the difference frame the agent sees
        current = prepro(frame)
        if last_frame is None:
            diff = np.zeros(80*80)
        else:
            diff = current - last_frame
        diff = diff.reshape((1, -1))
        last_frame = current

        # Ask the agent, then advance the environment
        v, a = agent(diff, ep=ep)
        frame, r, done, info = env.step(a)

        # Record the step; the stored state is the one the action was chosen from
        states.append(diff)
        actions.append(a)
        rewards.append(r)
        values.append(v)

        if not done:
            continue

        # Episode over: start a fresh one and forget the last frame
        finished += 1
        frame = env.reset()
        last_frame = None

        if finished >= n_episodes:
            reward = np.vstack(rewards)
            action = np.vstack(actions)
            state = np.vstack(states)
            value = np.vstack(values)
            return state, action, reward, value


# Model

In [59]:
from keras.models import Sequential, Input, Model
from keras.layers import Dense, Activation

# Shared trunk: a flattened 80*80 input feeding one hidden ReLU layer of 200 units.
inp = Input(shape=(80*80,))
h = Dense(200, activation='relu')(inp)
# Two heads on the same hidden layer:
#  - "critic": a single tanh unit, so its output is bounded to (-1, 1)
#  - "policy-output": a 6-way softmax (agent() samples one of 6 actions from it)
V = Dense(1,activation="tanh", name = "critic")(h)
pi = Dense(6,activation='softmax',name="policy-output")(h)

# Output order is [V, pi]; the loss list passed to compile() and the targets
# passed to train_on_batch() elsewhere in this notebook follow the same order.
model = Model(inputs=inp, 
                  outputs=[V,pi])

import keras.backend as K
from keras.losses import mean_squared_error

def weighted_crossentropy(y_true,y_pred):
    """Per-sample categorical cross-entropy scaled by a per-sample weight.

    y_true packs two things side by side: column 0 is the weight applied to
    the sample and the remaining columns are the one-hot encoded action.
    y_pred is the predicted action distribution.

    Returns one value per sample: weight * cross-entropy(action, y_pred).
    The weight may be negative, so the resulting loss can be negative too.
    """
    reward_true = y_true[:,0]
    action_true = y_true[:,1:]
    # Pass by keyword on purpose: the positional order of (target, output) in
    # K.categorical_crossentropy has differed between Keras releases, and a
    # silent swap would take the log of the one-hot target instead.
    ce = K.categorical_crossentropy(target=action_true, output=y_pred)
    wce = ce*reward_true
    return wce

# One loss per model output, in output order: mean squared error for the
# critic head, the weighted cross-entropy defined above for the policy head.
model.compile(loss=[mean_squared_error, weighted_crossentropy],
               optimizer=keras.optimizers.RMSprop(lr=0.01, rho=0.9, epsilon=1e-08, decay=0.0))

In [60]:
def agent(state, ep = 0.05):
    """Pick an action for `state` using the global `model`.

    With probability `ep` a uniformly random action is returned; otherwise an
    action is sampled from the policy head's probabilities (sampled, not
    argmax). The critic head's prediction is returned alongside the action.

    The number of actions is taken from the width of the policy output, so the
    function does not depend on an environment object existing at module level
    and keeps working if the policy head is resized.

    Returns (V, action): V is the raw critic prediction for `state`, action is
    an integer in [0, number of policy outputs).
    """
    V, aprob = model.predict(state)
    n_actions = aprob.shape[-1]

    if np.random.uniform() < ep:
        action = np.random.randint(n_actions)
    else:
        # Renormalise in float64: np.random.choice rejects probabilities that
        # do not sum to 1 closely enough, which low-precision softmax output
        # can fail to do.
        p = np.asarray(aprob[0], dtype=np.float64)
        action = np.random.choice(n_actions, p=p / p.sum())
    return V,action

## Play

In [61]:
from keras.utils import to_categorical

env = gym.make("Pong-v0")
running_reward = -20.5 # seed value for the exponential moving average below
batch = 0
batch_ep = 3 #how many episodes to run per batch
ep = 0.1 # probability of random action
# Width of the policy head (second model output). The one-hot action targets
# must have exactly this many columns, whichever actions a batch happens to contain.
n_actions = model.output_shape[1][-1]
while True: # runs until interrupted; one model update per batch of episodes
    state,action,reward, value = run_episode(env, agent, n_episodes=batch_ep, ep = ep)

    # Update model
    # Target is the reward on steps where a reward was received (reward != 0),
    # otherwise the value that was predicted for that step.
    # NOTE(review): `value` is the critic's prediction for the *same* state it
    # is used as a target for, so on zero-reward steps the critic is trained
    # towards its own output - confirm this is intended rather than using the
    # next state's value or a discounted return.
    V_true = reward + (reward==0.0)*value
    # Pack [weight | one-hot action] in the layout weighted_crossentropy expects.
    # num_classes is fixed so a batch that never takes the highest action id
    # still yields a full-width one-hot block.
    pi_true_weights = np.hstack([V_true, to_categorical(action, num_classes=n_actions)])
    totloss, vloss, piloss = model.train_on_batch(state, [V_true, pi_true_weights])
    # Book-keeping
    batch +=1
    reward_sum = np.sum(reward)/batch_ep # mean reward per episode in this batch
    running_reward = running_reward * 0.99 + reward_sum * 0.01
    # Single parenthesised argument: valid as a print statement and as a print() call.
    print('batch %d: total reward: %f. run mean: %f. value: %f. V-loss: %f. pi-loss: %f.' %
          (batch, reward_sum, running_reward, np.mean(value), vloss, piloss))

[2017-04-21 06:03:33,396] Making new env: Pong-v0


batch 1: total reward: -20.666667. run mean: -20.501667. value: -0.005250. V-loss: 0.018649. pi-loss: -0.041914.
batch 2: total reward: -20.333333. run mean: -20.499983. value: -0.582483. V-loss: 0.003276. pi-loss: -0.955680.


KeyboardInterrupt: 

In [58]:
# Shape check on the packed policy target built in the training loop:
# one weight column followed by the one-hot action columns.
pi_true_weights.shape

(3697, 7)

2

In [38]:
# NOTE(review): in the visible cells `aprob` is only assigned inside agent(),
# so this scratch cell presumably relied on leftover kernel state and would
# fail on a fresh kernel - confirm or remove.
aprob[0][0]

0.13845624