In [None]:
# OpenAI Gym Pong-v0 with A3C on GPU
# ----------------------------------
#
# A3C implementation with GPU optimizer threads.
# 
# Made as part of blog series Let's make an A3C, available at
# https://jaromiru.com/2017/02/16/lets-make-an-a3c-theory/
#
# author: Jaromir Janisch, 2017
import datetime

import numpy as np
import tensorflow as tf

import gym, time, random, threading

from keras.models import *
from keras.layers import *
from keras import backend as K

#-- constants
# Tunable configuration for the whole notebook. Every value below is read as a
# module-level global by the Brain / Agent / Environment code further down.
ENV = 'Pong-v0'  # gym environment id passed to gym.make()

RUN_TIME = 60*60*60  # seconds the main thread sleeps while the workers train (216000 s = 60 h)
THREADS = 8          # number of training Environment threads
OPTIMIZERS = 2       # number of Optimizer threads calling brain.optimize()
THREAD_DELAY = 0.000 # sleep (seconds) inserted before every environment step

GAMMA = 0.99  # per-step discount factor

N_STEP_RETURN = 100                  # n of the n-step return accumulated by Agent.train
GAMMA_N = GAMMA ** N_STEP_RETURN     # discount applied to the bootstrap value of the n-th next state

# Exploration schedule: epsilon is interpolated linearly from EPS_START to
# EPS_STOP over the first EPS_STEPS frames, then stays at EPS_STOP.
EPS_START = 0.4
EPS_STOP  = .05
EPS_STEPS = 750000

MIN_BATCH = 64        # minimum number of queued samples before an optimizer step is run
LEARNING_RATE = 1e-4  # learning rate of the RMSProp optimizer

LOSS_V = .5			# v loss coefficient
LOSS_ENTROPY = .01 	# entropy coefficient


## Brain

In [None]:
def conv_model():
    """Build the convolutional actor-critic network for 80x80x1 inputs.

    Three ReLU convolutions feed a 200-unit dense layer, which is shared by
    two heads: a softmax over NUM_ACTIONS (policy) and a single linear unit
    (state value).

    Returns:
        A Keras Model mapping one image batch to [action probabilities, value],
        with its predict function already created.
    """
    # (filters, kernel size, strides) for each convolution, in order
    conv_specs = (
        (32, (8, 8), (4, 4)),
        (64, (4, 4), (2, 2)),
        (64, (3, 3), (1, 1)),
    )

    frames_in = Input(batch_shape=(None,80,80,1))

    features = frames_in
    for n_filters, kernel, stride in conv_specs:
        features = Convolution2D(n_filters, kernel, activation="relu",
                                 strides=stride, padding='same')(features)

    hidden = Dense(200,activation='relu')(Flatten()(features))

    policy_head = Dense(NUM_ACTIONS, activation='softmax')(hidden)
    value_head  = Dense(1, activation='linear')(hidden)

    net = Model(inputs=[frames_in], outputs=[policy_head, value_head])
    net._make_predict_function()
    return net

In [None]:
class Brain:
    """Shared actor-critic network plus the training queue fed by all agents.

    Agents push (s, a, R, s', terminal-mask) samples with train_push();
    optimizer threads repeatedly call optimize(), which trains on the queued
    samples once at least MIN_BATCH of them are available.
    """
    # NOTE: both attributes are defined on the class, so they start out shared
    # by every Brain instance; optimize() rebinds train_queue on the instance.
    train_queue = [ [], [], [], [], [] ]	# s, a, r, s', s' terminal mask
    lock_queue = threading.Lock()

    def __init__(self,custom_model=None):
        """Create the session, the model and the training graph.

        Args:
            custom_model: optional zero-argument callable returning a Keras
                model with outputs [action probabilities, value]. When None,
                the small dense model from _build_model() is used.
        """
        self.session = tf.Session()
        K.set_session(self.session)
        K.manual_variable_initialization(True)

        if custom_model is not None:
            self.model = custom_model()
        else:
            self.model = self._build_model()
        self.graph = self._build_graph(self.model)

        self.session.run(tf.global_variables_initializer())
        self.default_graph = tf.get_default_graph()

        self.default_graph.finalize()	# avoid modifications

    def _build_model(self):
        """Build the default model: one 200-unit dense layer on a flat NUM_STATE input."""
        l_input = Input( batch_shape=(None, NUM_STATE) )
        h = Dense(200, activation='relu')(l_input)

        out_actions = Dense(NUM_ACTIONS, activation='softmax')(h)
        out_value   = Dense(1, activation='linear')(h)

        model = Model(inputs=[l_input], outputs=[out_actions, out_value])
        model._make_predict_function()	# have to initialize before threading

        return model

    def _build_graph(self, model):
        """Build the loss and the training op for `model`.

        Returns:
            (s_t, a_t, r_t, minimize): the state, one-hot action and n-step
            return placeholders, and the op that performs one RMSProp step.
        """
        # Take the state shape from the model itself so the placeholder matches
        # both the convolutional model (None, 80, 80, 1) and the dense default
        # model (None, NUM_STATE); the previous hard-coded 4-D shape only fit
        # the former.
        s_t = tf.placeholder(tf.float32, shape=model.input_shape)
        a_t = tf.placeholder(tf.float32, shape=(None, NUM_ACTIONS))
        r_t = tf.placeholder(tf.float32, shape=(None, 1)) # not immediate, but discounted n step reward

        p, v = model(s_t)

        # log-probability of the action that was actually taken (a_t is one-hot)
        log_prob = tf.log( tf.reduce_sum(p * a_t, axis=1, keep_dims=True) + 1e-10)
        advantage = r_t - v

        loss_policy = - log_prob * tf.stop_gradient(advantage)									# maximize policy
        loss_value  = LOSS_V * tf.square(advantage)												# minimize value error
        # sum(p * log p) is the negative entropy, so adding it to the loss maximizes entropy
        entropy = LOSS_ENTROPY * tf.reduce_sum(p * tf.log(p + 1e-10), axis=1, keep_dims=True)	# maximize entropy (regularization)

        loss_total = tf.reduce_mean(loss_policy + loss_value + entropy)

        optimizer = tf.train.RMSPropOptimizer(LEARNING_RATE, decay=.99)
        minimize = optimizer.minimize(loss_total)

        return s_t, a_t, r_t, minimize

    def optimize(self):
        """Run one training step if at least MIN_BATCH samples are queued.

        Returns None in every case; when the queue is too short it only
        yields to other threads.
        """
        if len(self.train_queue[0]) < MIN_BATCH:
            time.sleep(0)	# yield
            return

        with self.lock_queue:
            if len(self.train_queue[0]) < MIN_BATCH:	# more thread could have passed without lock
                return 									# we can't yield inside lock

            # take the whole queue and leave a fresh empty one behind
            s, a, r, s_, s_mask = self.train_queue
            self.train_queue = [ [], [], [], [], [] ]

        s = np.stack(s)
        a = np.vstack(a)
        r = np.vstack(r)
        s_ = np.stack(s_)
        s_mask = np.vstack(s_mask)

        if len(s) > 5*MIN_BATCH: print("Optimizer alert! Minimizing batch of %d" % len(s))

        # bootstrap the n-step return with the value of the n-th next state
        v = self.predict_v(s_)
        r = r + GAMMA_N * v * s_mask	# set v to 0 where s_ is terminal state

        s_t, a_t, r_t, minimize = self.graph
        self.session.run(minimize, feed_dict={s_t: s, a_t: a, r_t: r})

    def save_model(self):
        """Save the Keras model to "model.h5" in the working directory."""
        self.model.save("model.h5")

    def train_push(self, s, a, r, s_):
        """Append one sample to the training queue.

        Args:
            s: state.
            a: one-hot action.
            r: discounted n-step reward.
            s_: n-th next state, or None if the episode terminated; a terminal
                state is stored as NONE_STATE with mask 0, otherwise mask 1.
        """
        with self.lock_queue:
            self.train_queue[0].append(s)
            self.train_queue[1].append(a)
            self.train_queue[2].append(r)

            if s_ is None:
                self.train_queue[3].append(NONE_STATE)
                self.train_queue[4].append(0.)
            else:
                self.train_queue[3].append(s_)
                self.train_queue[4].append(1.)

    def predict(self, s):
        """Return (action probabilities, values) for the batch of states `s`."""
        with self.default_graph.as_default():
            p, v = self.model.predict(s)
            return p, v

    def predict_p(self, s):
        """Return only the action probabilities for the batch of states `s`."""
        return self.predict(s)[0]

    def predict_v(self, s):
        """Return only the state values for the batch of states `s`."""
        return self.predict(s)[1]

In [None]:
frames = 0  # number of Agent.act() calls so far, summed over all agents; drives the epsilon schedule
class Agent:
    """Epsilon-greedy actor that turns single transitions into n-step samples.

    Transitions are buffered in `memory`; `R` holds the running discounted
    reward of the buffered transitions so that an n-step return can be pushed
    to the global `brain` without re-summing the buffer.
    """
    def __init__(self, eps_start, eps_end, eps_steps):
        """Store the exploration schedule: epsilon goes linearly from
        eps_start to eps_end over eps_steps frames."""
        self.eps_start = eps_start
        self.eps_end   = eps_end
        self.eps_steps = eps_steps

        self.memory = []	# used for n_step return
        self.R = 0.

    def getEpsilon(self):
        """Return the current exploration rate for the global frame count."""
        if(frames >= self.eps_steps):
            return self.eps_end
        else:
            return self.eps_start + frames * (self.eps_end - self.eps_start) / self.eps_steps	# linearly interpolate

    def act(self, s):
        """Choose an action for state `s`.

        With probability epsilon a uniformly random action is returned,
        otherwise an action is sampled from the policy predicted by `brain`.
        """
        eps = self.getEpsilon()
        # the counter is updated without a lock, so with several agent threads
        # the count is only approximate
        global frames; frames = frames + 1

        if random.random() < eps:
            return random.randint(0, NUM_ACTIONS-1)

        else:
            s = np.array([s])
            p = brain.predict_p(s)[0]

            # a = np.argmax(p)
            a = np.random.choice(NUM_ACTIONS, p=p)

            return a

    def train(self, s, a, r, s_):
        """Record the transition (s, a, r, s_) and push n-step samples to `brain`.

        Args:
            s: state the action was taken in.
            a: index of the action taken.
            r: immediate reward.
            s_: next state, or None if the episode terminated.

        While the buffer holds k transitions, self.R equals
        sum_i r_i * GAMMA ** (N_STEP_RETURN - k + i), so it becomes the proper
        n-step return exactly when k reaches N_STEP_RETURN.
        """
        def get_sample(memory, n):
            # first state/action of the buffer, paired with the n-th next state
            s, a, _, _  = memory[0]
            _, _, _, s_ = memory[n-1]

            return s, a, self.R, s_

        a_cats = np.zeros(NUM_ACTIONS)	# turn action into one-hot representation
        a_cats[a] = 1

        self.memory.append( (s, a_cats, r, s_) )

        self.R = ( self.R + r * GAMMA_N ) / GAMMA

        if s_ is None:
            # Episode over: flush every buffered transition with a shortened return.
            n = len(self.memory)
            if n < N_STEP_RETURN:
                # The episode ended before the buffer filled up, so R still
                # carries a surplus factor GAMMA ** (N_STEP_RETURN - n); remove
                # it so that R is the true discounted return from memory[0].
                self.R = self.R / (GAMMA ** (N_STEP_RETURN - n))

            while len(self.memory) > 0:
                n = len(self.memory)
                s, a, r, s_ = get_sample(self.memory, n)
                brain.train_push(s, a, r, s_)

                # drop the oldest reward and re-base the return on the next transition
                self.R = ( self.R - self.memory[0][2] ) / GAMMA
                self.memory.pop(0)

            self.R = 0

        if len(self.memory) >= N_STEP_RETURN:
            s, a, r, s_ = get_sample(self.memory, N_STEP_RETURN)
            brain.train_push(s, a, r, s_)

            # the division by GAMMA happens on the next call, when a reward is added
            self.R = self.R - self.memory[0][2]
            self.memory.pop(0)



In [None]:
def prepro(I):
    """Turn a 210x160x3 uint8 frame into an (80, 80, 1) float32 binary image.

    The frame is cropped to rows 35..194, downsampled by 2 in both directions
    and reduced to its first colour channel. Pixels equal to 144 or 109 (the
    two background values) or 0 become 0.0; every other pixel (paddles, ball)
    becomes 1.0.

    The input array is not modified.
    """
    # crop, downsample by factor of 2, keep the first channel as a trailing axis
    frame = I[35:195:2, ::2, :1]
    # Build the mask in a new array: writing through `frame` would alter the
    # caller's observation, because basic slicing returns views.
    foreground = (frame != 144) & (frame != 109) & (frame != 0)
    return foreground.astype(np.float32)


In [None]:
class Environment(threading.Thread):
    """Thread that plays episodes of ENV with its own Agent until stopped.

    States handed to the agent are difference images between consecutive
    preprocessed frames (the very first state of an episode is the first
    preprocessed frame itself).
    """
    stop_signal = False

    def __init__(self, render=False, eps_start=EPS_START, eps_end=EPS_STOP, eps_steps=EPS_STEPS,print_progress=False):
        """
        Args:
            render: if true, render the environment on every step.
            eps_start, eps_end, eps_steps: exploration schedule of the agent.
            print_progress: if true, print a line after every episode and save
                the model every 20 episodes.
        """
        threading.Thread.__init__(self)

        self.render = render
        self.env = gym.make(ENV)
        self.print_progress = print_progress
        self.agent = Agent(eps_start, eps_end, eps_steps)
        self.run_reward = -20.5  # initial value of the running mean of episode rewards
        self.step = 0            # number of episodes reported so far

    def runEpisode(self):
        """Play one episode, feeding every transition to the agent.

        The episode ends when the environment reports `done` or when
        stop_signal is set.
        """
        frame = prepro(self.env.reset())
        s = frame
        R = 0
        while True:
            time.sleep(THREAD_DELAY) # yield

            if self.render: self.env.render()

            a = self.agent.act(s)
            observation, r, done, info = self.env.step(a)

            if done: # terminal state
                s_ = None
            else:
                # The next state is the difference between the new frame and the
                # previous *frame*. Subtracting the previous state instead (which
                # is itself a difference) would accumulate old frames, and the
                # s' stored for training must be the same kind of input as s.
                frame_ = prepro(observation)
                s_ = frame_ - frame
                frame = frame_

            self.agent.train(s, a, r, s_)

            s = s_
            R += r

            if done or self.stop_signal:
                if self.print_progress:
                    self.step +=1
                    # exponential running mean of the episode reward
                    self.run_reward = self.run_reward*0.98 + R*0.02
                    # single-argument print() works on both Python 2 and Python 3
                    print('%s  step: %d. total reward: %f. run mean: %f.' %(str(datetime.datetime.now()), self.step,R, self.run_reward))

                    if self.step%20 == 0:
                        brain.save_model()
                break

    def run(self):
        """Thread body: play episodes until stop() is called."""
        while not self.stop_signal:
            self.runEpisode()

    def stop(self):
        """Ask the thread to finish; takes effect after the current step."""
        self.stop_signal = True

    #---------
class Optimizer(threading.Thread):
    """Thread that keeps asking the global `brain` to optimize until stopped."""
    stop_signal = False

    def __init__(self):
        super(Optimizer, self).__init__()

    def run(self):
        """Thread body: call brain.optimize() in a loop until stop() is called."""
        while True:
            if self.stop_signal:
                return
            brain.optimize()

    def stop(self):
        """Request termination; the loop exits before its next optimize call."""
        self.stop_signal = True

In [None]:
#s_, r, done, info = env_test.env.step(1)
#prepro(s_).shape

## Run

In [None]:
#-- main
# The test environment is created first because the action count is read from it.
# It acts greedily with respect to the policy (epsilon fixed at 0) and reports progress.
env_test = Environment(render=False, eps_start=0., eps_end=0., print_progress=True)
NUM_STATE = 80#env_test.env.observation_space.shape[0]
NUM_ACTIONS = env_test.env.action_space.n
# Placeholder stored in place of the next state of terminal transitions.
# np.zeros is required here: np.ndarray(shape) returns uninitialised memory,
# and multiplying that by 0 still yields NaN wherever it happened to hold NaN/inf.
NONE_STATE = np.zeros((NUM_STATE,NUM_STATE,1))

brain = Brain(custom_model=conv_model)	# brain is global in A3C

envs = [Environment() for i in range(THREADS)]
# Also run a test environment:
envs.append(env_test)

opts = [Optimizer() for i in range(OPTIMIZERS)]


In [None]:
for o in opts:
    o.start()

for e in envs:
    e.start()

try:
    time.sleep(RUN_TIME)
finally:
    # Runs on normal completion and on a kernel interrupt alike, so the worker
    # threads never keep running behind the notebook.
    for e in envs:
        e.stop()
    for e in envs:
        e.join()

    for o in opts:
        o.stop()
    for o in opts:
        o.join()

print("Training finished")

# env_test is part of `envs`, so it has just been stopped and env_test.run()
# would return immediately. Clear the flag and play exactly one more episode
# with the trained policy.
env_test.stop_signal = False
env_test.runEpisode()