In [1]:
import gym
import keras
import numpy as np

Using TensorFlow backend.


In [2]:
def prepro(I):
    """ prepro 210x160x3 uint8 frame into 6400 (80x80) 1D float vector """
    I = I[35:195] # crop
    I = I[::2,::2,0] # downsample by factor of 2
    I[I == 144] = 0 # erase background (background type 1)
    I[I == 109] = 0 # erase background (background type 2)
    I[I != 0] = 1 # everything else (paddles, ball) just set to 1
    return I.astype(np.float32)

def run_episode(env,agent, n_episodes=1, ep = 0.05):
    '''
    A pure run_episode with an agent. Does nothing else.
    Outputs 3 lists with one entry for every step: state, action and reward
    '''
    state, action, reward, value = [], [], [], []
    episode_number = 0
    observation = env.reset()
    prev_x = None # used in computing the difference frame

    while True:
        # Process observation
        cur_x = prepro(observation)
        x = cur_x - prev_x if prev_x is not None else np.zeros(80*80)
        x = x.reshape((1, -1))
        prev_x = cur_x

        # Execute a step:
        v, a = agent(x,ep=ep)
        observation, r, done, info = env.step(a)

        # Store variables from run
        state.append(x) # note that this is the state before the action was taken
        action.append(a)
        reward.append(r)
        value.append(v)

        if done:
            episode_number += 1
            observation = env.reset() # reset env
            prev_x = None

            if episode_number >= n_episodes:
                
                reward = np.vstack(reward)
                action = np.vstack(action)
                state = np.vstack(state)
                value = np.vstack(value)
                return state,action,reward, value


# Model

In [3]:
from keras.models import Sequential, Input, Model
from keras.layers import Dense, Activation

inp = Input(shape=(80*80,))
h = Dense(200, activation='relu')(inp)
V = Dense(1,activation="tanh", name = "critic")(h)
pi = Dense(6,activation='softmax',name="policy-output")(h)

model = Model(inputs=inp, 
                  outputs=[V,pi])

import keras.backend as K
from keras.losses import mean_squared_error

def weighted_crossentropy(y_true,y_pred):
    reward_true = y_true[:,0]
    action_true = y_true[:,1:]
    ce = K.categorical_crossentropy(y_pred, action_true)
    wce = ce*reward_true
    print ce
    print wce
    return wce

model.compile(loss=[mean_squared_error, weighted_crossentropy],
               optimizer=keras.optimizers.RMSprop(lr=0.001, rho=0.9, epsilon=1e-08, decay=0.0))

Tensor("Neg:0", shape=(?,), dtype=float32)
Tensor("mul_3:0", shape=(?,), dtype=float32)


In [4]:
def agent(state, ep = 0.05):
    V, aprob = model.predict(state)
    
    if np.random.uniform() < ep:
        action = env.action_space.sample()
    else:
        action = np.argmax(aprob)
    return V,action

## Play

In [5]:
from keras.utils import to_categorical

env = gym.make("Pong-v0")
running_reward = -20.5
batch = 0
batch_ep = 10 #how many episodes to run per batch
ep = 0.1 # probability of random action
while True:
    state,action,reward, value = run_episode(env, agent, n_episodes=batch_ep, ep = ep)

    # Update model
    V_true = reward + (reward==0.0)*value # Value is reward if done (same as reward!=0), else it is predicted value else
    pi_true_weights = np.hstack([V_true, to_categorical(action)]) # Format truth value to be in accordance to weighted_crossentropy
    totloss, vloss, piloss = model.train_on_batch(state, [V_true, pi_true_weights])
    #print totloss, vloss, piloss
    # Book-keeping
    batch +=1
    reward_sum = np.sum(reward)/batch_ep
    running_reward = running_reward * 0.99 + reward_sum * 0.01
    print 'batch %d: total reward: %f. run mean: %f. value: %f. V-loss: %f. pi-loss: %f.' %\
    (batch, reward_sum, running_reward, np.mean(value), vloss, piloss)

[2017-04-05 12:11:38,720] Making new env: Pong-v0


batch 1: total reward: -20.000000. run mean: -20.495000. value: 0.002332. V-loss: 0.019386. pi-loss: -0.026271.
batch 2: total reward: -20.100000. run mean: -20.491050. value: -0.074268. V-loss: 0.010134. pi-loss: -0.145981.
batch 3: total reward: -20.200000. run mean: -20.488139. value: -0.093988. V-loss: 0.008358. pi-loss: -0.179144.
batch 4: total reward: -20.500000. run mean: -20.488258. value: -0.129062. V-loss: 0.005192. pi-loss: -0.236542.
batch 5: total reward: -19.900000. run mean: -20.482376. value: -0.155695. V-loss: 0.005480. pi-loss: -0.281592.
batch 6: total reward: -19.800000. run mean: -20.475552. value: -0.182572. V-loss: 0.005054. pi-loss: -0.325699.
batch 7: total reward: -19.700000. run mean: -20.467796. value: -0.177014. V-loss: 0.004660. pi-loss: -0.317914.
batch 8: total reward: -20.000000. run mean: -20.463118. value: -0.207326. V-loss: 0.003414. pi-loss: -0.369155.
batch 9: total reward: -20.300000. run mean: -20.461487. value: -0.192339. V-loss: 0.003506. pi-l

KeyboardInterrupt: 