In [1]:
from keras.layers import Dense, Activation
from keras.models import Sequential, load_model
from keras.optimizers import Adam
import numpy as np

Using TensorFlow backend.


In [2]:
class ReplayBuffer(object):
    """Fixed-capacity circular store of transitions for experience replay.

    Transitions are kept in preallocated NumPy tables (rather than a deque)
    and addressed by a running write counter; once ``max_size`` transitions
    have been stored, the oldest rows are overwritten.
    """

    def __init__(self, max_size, input_shape, n_actions, discrete=False):
        """Preallocate one table per transition component.

        Args:
            max_size: Number of transitions held before the buffer wraps.
            input_shape: Length of one observation vector (e.g. 8 for
                LunarLander).
            n_actions: Width of one stored action row (e.g. 4 for
                LunarLander's discrete action set).
            discrete: If True, ``store_transition`` expects an integer action
                index and stores it one-hot encoded; otherwise the action
                vector is stored as given.
        """
        self.mem_size = max_size

        # total number of transitions ever written; the write slot is this modulo mem_size
        self.mem_counter = 0
        self.discrete = discrete

        # one row per stored transition, one column per observation component
        self.state_memory = np.zeros((self.mem_size, input_shape))
        self.new_state_memory = np.zeros((self.mem_size, input_shape))

        # one-hot rows only need an 8-bit integer; continuous actions need floats
        dtype = np.int8 if self.discrete else np.float32
        self.action_memory = np.zeros((self.mem_size, n_actions), dtype=dtype)

        self.reward_memory = np.zeros(self.mem_size)
        # holds 1 - done: 0.0 marks a terminal transition so the future reward
        # can be masked out by multiplication, 1.0 marks a non-terminal one
        self.terminal_memory = np.zeros(self.mem_size, dtype=np.float32)

    def store_transition(self, state, action, reward, state_, done):
        """Write one transition into the next slot, overwriting the oldest when full.

        Args:
            state: Observation before the action.
            action: Integer action index when the buffer is discrete, otherwise
                an action vector of width ``n_actions``.
            reward: Reward received for the action.
            state_: Observation after the action.
            done: Truthy if ``state_`` is terminal.
        """
        index = self.mem_counter % self.mem_size  # wrap around once capacity is exceeded
        self.state_memory[index] = state
        self.new_state_memory[index] = state_
        if self.discrete:
            # one-hot encode the chosen action,
            # e.g. action 3 of 8 -> [0, 0, 0, 1, 0, 0, 0, 0]
            actions = np.zeros(self.action_memory.shape[1])
            actions[action] = 1.0
            self.action_memory[index] = actions
        else:
            # write to the current slot (indexing by the action value itself was a bug)
            self.action_memory[index] = action
        self.reward_memory[index] = reward
        self.terminal_memory[index] = 1 - int(done)
        self.mem_counter += 1

    def sample_buffer(self, batch_size):
        """Draw a random batch of stored transitions.

        Rows are drawn uniformly (with replacement) from the filled part of
        the buffer so that a batch is not a run of consecutive, correlated
        observations.

        Args:
            batch_size: Number of transitions to draw.

        Returns:
            Tuple ``(states, actions, rewards, new_states, terminal)`` of
            arrays whose first axis has length ``batch_size``.
        """
        # only rows that have actually been written are eligible
        max_mem = min(self.mem_counter, self.mem_size)
        batch = np.random.choice(max_mem, batch_size)

        states = self.state_memory[batch]
        new_states = self.new_state_memory[batch]
        rewards = self.reward_memory[batch]
        actions = self.action_memory[batch]
        terminal = self.terminal_memory[batch]

        return states, actions, rewards, new_states, terminal
    

In [1]:
def build_dqn(learning_rate, n_actions, input_dims, fc1_dims, fc2_dims):
    """Build and compile a fully connected network with two ReLU hidden layers.

    Args:
        learning_rate: Step size handed to the Adam optimizer.
        n_actions: Number of units in the (linear) output layer.
        input_dims: Length of one input vector.
        fc1_dims: Number of units in the first hidden layer.
        fc2_dims: Number of units in the second hidden layer.

    Returns:
        The compiled ``Sequential`` model (Adam optimizer, mean-squared-error loss).
    """
    model = Sequential([
        # input_shape is a 1-tuple; the batch axis is left implicit
        Dense(fc1_dims, input_shape=(input_dims,)),
        Activation('relu'),
        Dense(fc2_dims),
        Activation('relu'),
        # no activation on the output layer
        Dense(n_actions),
    ])
    # loss is an argument of compile(), not of the optimizer
    model.compile(optimizer=Adam(lr=learning_rate), loss='mse')
    return model

In [None]:
class DDQNAction(object):
    """Epsilon-greedy agent holding an evaluation network and a target network.

    Experience is kept in a ``ReplayBuffer``; actions are chosen at random with
    probability ``epsilon`` and greedily from ``q_eval`` otherwise.
    """

    def __init__(self, alpha, gamma, n_actions, epsilon, batch_size, input_dims,
                 epsilon_dec=0.996, epsilon_end=0.01, mem_size=1000000, fname='ddqn_model.h5',
                 replace_target=100):
        """Store the hyper-parameters and build the replay memory and both networks.

        Args:
            alpha: Learning rate passed to ``build_dqn`` for both networks.
            gamma: Discount factor (stored only).
            n_actions: Number of discrete actions.
            epsilon: Initial probability of taking a random action.
            batch_size: Replay batch size (stored only).
            input_dims: Length of one observation vector.
            epsilon_dec: Decay factor for epsilon (stored only).
            epsilon_end: Floor for epsilon, kept as ``self.epsilon_min``.
            mem_size: Capacity of the replay buffer.
            fname: File name for the saved model, kept as ``self.model``.
            replace_target: Interval for refreshing the target network
                weights (stored only).
        """
        # n_actions must be recorded before the action space can be derived from it
        self.n_actions = n_actions
        self.action_space = list(range(n_actions))
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_dec = epsilon_dec
        # the constructor argument is named epsilon_end; the attribute keeps the name epsilon_min
        self.epsilon_min = epsilon_end
        self.batch_size = batch_size
        # NOTE: despite the name this holds the model *file name*, not a model object
        self.model = fname
        self.replace_target = replace_target
        # discrete=True: actions are stored one-hot encoded
        self.memory = ReplayBuffer(mem_size, input_dims, n_actions, True)
        # two identically shaped networks: q_eval is queried in choose_action;
        # q_target is the separate target network
        self.q_eval = build_dqn(alpha, n_actions, input_dims, 256, 256)
        self.q_target = build_dqn(alpha, n_actions, input_dims, 256, 256)

    def remember(self, state, action, reward, new_state, done):
        """Record one transition in the replay memory."""
        self.memory.store_transition(state, action, reward, new_state, done)

    def choose_action(self, state):
        """Pick an action for ``state`` with an epsilon-greedy policy.

        Args:
            state: A single observation vector (array-like).

        Returns:
            A member of ``self.action_space`` when exploring, otherwise the
            index of the largest value predicted by ``q_eval``.
        """
        # add a leading batch axis so a single observation can be fed to the network
        state = np.asarray(state)[np.newaxis, :]
        rand = np.random.random()
        if rand < self.epsilon:
            # explore: uniformly random action
            action = np.random.choice(self.action_space)
        else:
            # exploit: forward pass through the evaluation network, take the best action
            actions = self.q_eval.predict(state)
            action = np.argmax(actions)
        return action