In [1]:
from keras.layers import Dense, Activation
from keras.models import Sequential, load_model
from keras.optimizers import Adam
import numpy as np

Using TensorFlow backend.


In [None]:
class ReplayBuffer(object):
    def __init__(self, max_size, input_shape, n_actions, discrete=False):
        self.mem_size = max_size
        
        #rather than deque, use set np arrays, track index last saved
        #store tuple of reward, q-vals, next state for experience replay
        self.mem_counter = 0 
        self.discrete = discrete
        
        #allocate table equal to rows of mem entries with cols of env observations (e.g. for lunar lander-> 8 possibile results)
        self.state_memory = np.zeros((self.mem_size, input_shape))
        self.new_state_memory = np.zeros((self.mem_size, input_shape))
        dtype = np.int8 if self.discrete else np.float32 #for continuous actions: decimals; discrete space: int w/ 8 possibilities
        
        #set dtype to index np array for experience replay, table will store either int or decimals depending on dtype 
        self.action_memory = np.zeros((self.mem_size, n_actions), dtype=dtype) #possible actions for lunar lander-> 4
        self.new_state_memory = np.zeros((self.mem_size, input_shape))
        self.reward_memory = np.zeros(self.mem_size)
        #sampling transitions for eps, future reward at terminal state is zero, must accomodate and store incase
        self.terminal_memory = np.zeros(self.mem_size, dtype=np.float32)
        
    #add transitions to mem    
    def store_transition(self, state, action, reward, state_, done):
        index = self.mem_counter % self.mem_size #ensure mem overwritten when mem_size surpassed 
        self.state_memory[index] = state
        self.new_state_memory[index] = state_
        if self.discrete:
            #retrieve num actions from cols of action_mem if discrete space
            actions = np.zeros(self.action_memory.shape[1])
            #provide one-hot encoding for selected action
            #ex: [0, 0, 0, 1, 0, 0, 0, 0] -> at state x, agent takes actions[3] = 1, goes to state_ y
            actions[action] = 1.0
            #store entire arr of actions at each index in mem
            self.action_memory[index] = actions
        else:
            self.action_memory[action] = action
        self.reward_memory[index] = reward
        self.terminal_memory[index] = 1 - int(done)
        self.mem_counter += 1
    
    #define sample size for mem; prefer to not have sequential observation samples, else correlations 
    #agent will inaccurately prioritise certain state-action pairs
    def sample_buffer(self, batch_size):
        max_mem = min(self.mem_counter, self.mem_size)
        #select batch_size entries from range: [0, max_mem]
        batch = np.random.choice(max_mem, batch_size) 
        states = self.state_memory[batch]
        new_states = self.action_memory[batch]
        rewards = self.reward_memory[batch]
        actions = self.action_memory[batch]
        terminal = self.terminal_memory[batch]
        
        return states, actions, rewards, new_states, terminal
    

In [None]:
def build_dqn(learning_rate, n_actions, input_dims, fc1_dims, fc2_dims):
    #input shape has empty placeholder; implies batch provided 
    model = Sequential([Dense(fc1_dims, input_shape=(input_dims, )),
                        Activation('relu'),
                        Dense(fc2_dims),
                        Activation('relu'),
                        Dense(n_actions)])
    model.compile(optimizer=Adam(lr=learning_rate), loss='mse')
    return model

In [None]:
class DDQNAgent(object):
    #epsilon continues decreasing over time to ensure less random actions taken as optimal path will be found 
    #replace target net weights every 100 eps to update for action eval
    def __init__(self, alpha, gamma, n_actions, epsilon, batch_size, input_dims, 
                 epsilon_dec=0.996, epsilon_end=0.01, mem_size=1000000, fname='ddqn_model.h5',
                replace_target=100):
        self.n_actions = n_actions
        self.action_space = [i for i in range(self.n_actions)]
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_dec = epsilon_dec
        self.epsilon_min = epsilon_end
        self.batch_size = batch_size
        self.model_file = fname
        self.replace_target = replace_target
        self.memory = ReplayBuffer(mem_size, input_dims, n_actions, discrete=True)
        self.q_eval = build_dqn(alpha, n_actions, input_dims, 256, 256)
        self.q_target = build_dqn(alpha, n_actions, input_dims, 256, 256)
        
    def remember(self, state, action, reward, new_state, done):
        self.memory.store_transition(state, action, reward, new_state, done)
        
    def choose_action(self, state):
        #insert axis along first-dim (row), ensures single-mem entries can be handled in addition to batches for NN input
        state = state[np.newaxis, :]
        rand = np.random.random()
        if rand < self.epsilon:
            action = np.random.choice(self.action_space)
        else:
            #predict defined by keras, runs feed-fwd to compute output (vec of action q-vals)
            actions = self.q_eval.predict(state)
            action = np.argmax(actions)
        return action
    
    def learn(self):
        #ensure mem > batch so samples have sufficient variance
        if self.memory.mem_counter > self.batch_size:
            state, action, reward, new_state, done = self.memory.sample_buffer(self.batch_size)
            action_values = np.array(self.action_space, dtype=np.int8)
            action_indices = np.dot(action, action_values)
            
            #used to calculate q-val for maximal action selection with behavioral net
            q_next = self.q_target.predict(new_state)
            q_eval = self.q_eval.predict(new_state)
            
            #given optimal action, determine q-val with target net
            q_pred = self.q_eval.predict(state) #pred state-vals & store in table
            
            max_actions = np.argmax(q_eval, axis=1) #use behavioral net, q_eval to find best action 
            q_target = q_pred #evaluate behavioral net, attempt to fit it to optimal policy target net
            
            batch_index = np.arange(self.batch_size, dtype=np.int32)
            
            #apply q_target to find state-vals  & update in table for corresponding actions 
            q_target[batch_index, action_indices] = reward + self.gamma * q_next[batch_index, max_actions.astype(int)] * done
            
            #train behavioral net, tune weights to minimise loss (difference); determine q_pred & take diff using it against q_target
            #difference applied across all state-val pairs of q_eval against optimal state-val pair of q_target
            _ = self.q_eval.fit(state, q_target, verbose=0)
            
            #gradually adjust epislon to improve runtime as optimal actions taken
            self.epsilon = self.epsilon * self.epsilon_dec if self.epsilon > self.epsilon_min else self.epsilon_min
            
            #update params at threshold
            if self.memory.mem_counter % self.replace_target == 0:
                self.update_network_parameters()
        
    #copy weights to target for update 
    def update_network_parameters():
        self.q_target.model.set_weights(self.q_eval.model.get_weights())
            
    def save_model(self):
        self.q_eval.save(self.model_file)
        
    def load_model(self):
        self.q_eval = load_model(self.model_file)
        #if model fully trained, update target net
        if self.epsilon <= self.epsilon_min:
            self.update_network_parameters()

In [3]:
def plotLearning(x, scores, epsilons, filename, lines=None):
    fig=plt.figure()
    ax=fig.add_subplot(111, label="1")
    ax2=fig.add_subplot(111, label="2", frame_on=False)

    ax.plot(x, epsilons, color="C0")
    ax.set_xlabel("Game", color="C0")
    ax.set_ylabel("Epsilon", color="C0")
    ax.tick_params(axis='x', colors="C0")
    ax.tick_params(axis='y', colors="C0")

    N = len(scores)
    running_avg = np.empty(N)
    for t in range(N):
	    running_avg[t] = np.mean(scores[max(0, t-20):(t+1)])

    ax2.scatter(x, running_avg, color="C1")
    #ax2.xaxis.tick_top()
    ax2.axes.get_xaxis().set_visible(False)
    ax2.yaxis.tick_right()
    #ax2.set_xlabel('x label 2', color="C1")
    ax2.set_ylabel('Score', color="C1")
    #ax2.xaxis.set_label_position('top')
    ax2.yaxis.set_label_position('right')
    #ax2.tick_params(axis='x', colors="C1")
    ax2.tick_params(axis='y', colors="C1")

    if lines is not None:
        for line in lines:
            plt.axvline(x=line)

    plt.savefig(filename)

In [4]:
import gym

if __name__ == '__main__':
    env = gym.make('LunarLander-v2')
    ddqn_agent = DDQNAgent(alpha=0.0005, gamma=0.99, n_actions=4, epsilon=1.0, batch_size=64, input_dims=8)
    n_games = 500
    
    ddqn_scores = []
    eps_history = []
    
    #env = wrappers.Monitor(env, 'tmp/lunar-lander', video_callable=lambda episode_id: True, force=True)
    
    for i in range(n_games):
        done = False
        score = 0
        observation = env.reset()
        while not done:
            action = ddqn_agent.choose_action(observation)
            observation_, reward, done, info = env.step(action) #state: observation, new_state: observation_
            score += reward
            ddqn_agent.remember(observation, action, reward, observation_, done)
            observation = observation_
            ddqn_agent.learn()
        eps_history.append(ddqn_agent.epsilon)
        ddqn_scores.append(score)
        
        avg_score = np.mean(ddqn_scores[max(0, i-100):(i+1)])
        #print('episode ', i, 'score %.2f' % score, 'average score %.2f', % avg_score)
        print('episode: ', i,'score: %.2f' % score,
              ' average score %.2f' % avg_score)
        
        if i % 10 == 0 and i > 0:
            ddqn_agent.save_model()
    
    filename = 'lunarlander-ddqn.png'
    x = [i + 1 for i in range(n_games)]
    plotLearning(x, ddqn_scores, eps_history, filename)

episode:  0 score: -114.60  average score -114.60




episode:  1 score: -109.20  average score -111.90
episode:  2 score: -123.65  average score -115.82
episode:  3 score: -144.39  average score -122.96
episode:  4 score: -378.90  average score -174.15
episode:  5 score: -288.72  average score -193.24
episode:  6 score: -117.36  average score -182.40
episode:  7 score: -314.41  average score -198.90
episode:  8 score: -225.20  average score -201.82
episode:  9 score: -552.39  average score -236.88
episode:  10 score: -481.56  average score -259.12
episode:  11 score: -276.67  average score -260.59
episode:  12 score: -169.48  average score -253.58
episode:  13 score: -42.38  average score -238.49
episode:  14 score: -331.31  average score -244.68
episode:  15 score: -127.47  average score -237.35
episode:  16 score: -169.68  average score -233.37
episode:  17 score: -153.25  average score -228.92
episode:  18 score: -182.65  average score -226.49
episode:  19 score: -142.60  average score -222.29
episode:  20 score: -161.58  average scor

episode:  165 score: -3.09  average score -14.05
episode:  166 score: -74.62  average score -13.77
episode:  167 score: -304.45  average score -16.54
episode:  168 score: 216.49  average score -13.76
episode:  169 score: -174.83  average score -15.08
episode:  170 score: 160.89  average score -13.37
episode:  171 score: 207.34  average score -10.12
episode:  172 score: 182.79  average score -6.60
episode:  173 score: 234.80  average score -3.83
episode:  174 score: 141.96  average score -0.72
episode:  175 score: 112.55  average score 0.84
episode:  176 score: -18.75  average score 1.60
episode:  177 score: -100.13  average score 1.82
episode:  178 score: 111.18  average score 3.50
episode:  179 score: -9.94  average score 3.04
episode:  180 score: 114.59  average score 4.62
episode:  181 score: -109.75  average score 4.20
episode:  182 score: -80.81  average score 4.07
episode:  183 score: 250.00  average score 6.97
episode:  184 score: 216.53  average score 10.11
episode:  185 score:

episode:  331 score: 294.00  average score 211.67
episode:  332 score: 295.31  average score 212.49
episode:  333 score: 241.41  average score 212.30
episode:  334 score: 251.25  average score 212.45
episode:  335 score: 233.26  average score 212.16
episode:  336 score: 244.64  average score 212.02
episode:  337 score: 235.63  average score 211.86
episode:  338 score: 250.58  average score 212.48
episode:  339 score: 210.88  average score 212.45
episode:  340 score: 218.31  average score 211.86
episode:  341 score: 282.19  average score 213.46
episode:  342 score: 227.87  average score 216.55
episode:  343 score: 232.26  average score 216.06
episode:  344 score: 234.18  average score 215.72
episode:  345 score: 236.96  average score 215.75
episode:  346 score: 259.80  average score 215.91
episode:  347 score: 261.82  average score 215.93
episode:  348 score: 195.81  average score 215.78
episode:  349 score: 255.60  average score 216.01
episode:  350 score: 278.27  average score 216.48


episode:  495 score: 270.76  average score 222.63
episode:  496 score: 252.13  average score 222.32
episode:  497 score: -14.48  average score 219.56
episode:  498 score: 308.67  average score 219.78
episode:  499 score: 265.62  average score 219.71


NameError: name 'plt' is not defined

In [2]:
class ReplayBuffer(object):
    def __init__(self, max_size, input_shape, n_actions, discrete=False):
        self.mem_size = max_size
        self.mem_cntr = 0
        self.discrete = discrete
        self.state_memory = np.zeros((self.mem_size, input_shape))
        self.new_state_memory = np.zeros((self.mem_size, input_shape))
        dtype = np.int8 if self.discrete else np.float32
        self.action_memory = np.zeros((self.mem_size, n_actions), dtype=dtype)
        self.reward_memory = np.zeros(self.mem_size)
        self.terminal_memory = np.zeros(self.mem_size, dtype=np.float32)

    def store_transition(self, state, action, reward, state_, done):
        index = self.mem_cntr % self.mem_size
        self.state_memory[index] = state
        self.new_state_memory[index] = state_
        # store one hot encoding of actions, if appropriate
        if self.discrete:
            actions = np.zeros(self.action_memory.shape[1])
            actions[action] = 1.0
            self.action_memory[index] = actions
        else:
            self.action_memory[index] = action
        self.reward_memory[index] = reward
        self.terminal_memory[index] = 1 - done
        self.mem_cntr += 1

    def sample_buffer(self, batch_size):
        max_mem = min(self.mem_cntr, self.mem_size)
        batch = np.random.choice(max_mem, batch_size)

        states = self.state_memory[batch]
        actions = self.action_memory[batch]
        rewards = self.reward_memory[batch]
        states_ = self.new_state_memory[batch]
        terminal = self.terminal_memory[batch]

        return states, actions, rewards, states_, terminal

def build_dqn(lr, n_actions, input_dims, fc1_dims, fc2_dims):
    model = Sequential([
                Dense(fc1_dims, input_shape=(input_dims,)),
                Activation('relu'),
                Dense(fc2_dims),
                Activation('relu'),
                Dense(n_actions)])

    model.compile(optimizer=Adam(lr=lr), loss='mse')

    return model

class DDQNAgent(object):
    def __init__(self, alpha, gamma, n_actions, epsilon, batch_size,
                 input_dims, epsilon_dec=0.996,  epsilon_end=0.01,
                 mem_size=1000000, fname='ddqn_model.h5', replace_target=100):
        self.action_space = [i for i in range(n_actions)]
        self.n_actions = n_actions
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_dec = epsilon_dec
        self.epsilon_min = epsilon_end
        self.batch_size = batch_size
        self.model_file = fname
        self.replace_target = replace_target
        self.memory = ReplayBuffer(mem_size, input_dims, n_actions,
                                   discrete=True)
        self.q_eval = build_dqn(alpha, n_actions, input_dims, 256, 256)
        self.q_target = build_dqn(alpha, n_actions, input_dims, 256, 256)

    def remember(self, state, action, reward, new_state, done):
        self.memory.store_transition(state, action, reward, new_state, done)

    def choose_action(self, state):
        state = state[np.newaxis, :]
        rand = np.random.random()
        if rand < self.epsilon:
            action = np.random.choice(self.action_space)
        else:
            actions = self.q_eval.predict(state)
            action = np.argmax(actions)

        return action

    def learn(self):
        if self.memory.mem_cntr > self.batch_size:
            state, action, reward, new_state, done = \
                                          self.memory.sample_buffer(self.batch_size)

            action_values = np.array(self.action_space, dtype=np.int8)
            action_indices = np.dot(action, action_values)

            q_next = self.q_target.predict(new_state)
            q_eval = self.q_eval.predict(new_state)
            q_pred = self.q_eval.predict(state)

            max_actions = np.argmax(q_eval, axis=1)

            q_target = q_pred

            batch_index = np.arange(self.batch_size, dtype=np.int32)

            q_target[batch_index, action_indices] = reward + \
                    self.gamma*q_next[batch_index, max_actions.astype(int)]*done

            _ = self.q_eval.fit(state, q_target, verbose=0)

            self.epsilon = self.epsilon*self.epsilon_dec if self.epsilon > \
                           self.epsilon_min else self.epsilon_min
            if self.memory.mem_cntr % self.replace_target == 0:
                self.update_network_parameters()

    def update_network_parameters(self):
        self.q_target.model.set_weights(self.q_eval.model.get_weights())

    def save_model(self):
        self.q_eval.save(self.model_file)

    def load_model(self):
        self.q_eval = load_model(self.model_file)
        # if we are in evaluation mode we want to use the best weights for
        # q_target
        if self.epsilon == 0.0:
            self.update_network_parameters()