In [1]:
import os

from keras.layers import Dense, Activation
from keras.models import Sequential, load_model
from tensorflow.keras.optimizers import Adam
import tensorflow as tf
from tensorflow.keras.regularizers import l2
import numpy as np
import gym
# from utils import plotLearning
from gym import wrappers

import keras.backend as K

In [2]:
class FileLogger():
  """Append per-episode training progress to a semicolon-separated text file.

  Creating a logger resets the target file so that it contains only the
  header line ``episode;reward;average``.
  """

  def __init__(self, file_name='progress.log'):
    """Remember the log path and start the file afresh.

    Args:
      file_name: path of the log file to (re)create.
    """
    self.file_name = file_name
    self.clean_progress_file()

  def log(self, episode, reward, average_reward):
    """Append one ``episode;reward;average_reward`` record to the log."""
    # The context manager closes the handle even if the write fails.
    with open(self.file_name, 'a') as f:
      f.write(f"{episode};{reward};{average_reward}\n")

  def clean_progress_file(self):
    """Discard any previous log content and write the header line."""
    # Mode 'w' truncates an existing file (or creates a new one), which
    # replaces the former remove-then-append sequence in a single step.
    with open(self.file_name, 'w') as f:
      f.write("episode;reward;average\n")

In [3]:
def masked_huber_loss(mask_value, clip_delta):
    """Build a Huber loss that ignores targets equal to ``mask_value``.

    Args:
        mask_value: entries of ``y_true`` equal to this value contribute
            nothing to the loss.
        clip_delta: absolute-error threshold below which the quadratic
            branch is used and at/above which the linear branch is used.

    Returns:
        A function ``f(y_true, y_pred)`` named ``'masked_huber_loss'`` that
        returns the summed Huber loss divided by the number of unmasked
        entries.
    """
    def f(y_true, y_pred):
        error = y_true - y_pred
        is_small = K.abs(error) < clip_delta
        # 1.0 where the target should be learned, 0.0 where it is masked.
        mask_true = K.cast(K.not_equal(y_true, mask_value), K.floatx())
        squared_loss = 0.5 * K.square(mask_true * error)
        linear_loss = mask_true * (clip_delta * K.abs(error) - 0.5 * (clip_delta ** 2))
        huber_loss = tf.where(is_small, squared_loss, linear_loss)
        # Floor the denominator: when every target is masked the sum is 0 and
        # the plain division would yield NaN; here the loss becomes 0 instead.
        n_unmasked = K.maximum(K.sum(mask_true), K.epsilon())
        return K.sum(huber_loss) / n_unmasked
    # A stable name lets a saved model refer to this loss by name.
    f.__name__ = 'masked_huber_loss'
    return f

In [4]:
class ReplayBuffer(object):
    """Fixed-capacity ring buffer of transitions stored in numpy arrays.

    Once ``max_size`` transitions have been stored, new ones overwrite the
    oldest slots. With ``discrete=True`` actions are kept as one-hot int8
    rows; otherwise they are stored as float32 vectors.
    """

    def __init__(self, max_size, input_shape, n_actions, discrete=False):
        """Allocate zero-filled storage.

        Args:
            max_size: number of transitions the buffer can hold.
            input_shape: length of one state vector (second array dimension).
            n_actions: width of one stored action row.
            discrete: store actions one-hot (int8) instead of as float32.
        """
        self.mem_size = max_size
        self.mem_cntr = 0
        self.discrete = discrete

        state_dims = (self.mem_size, input_shape)
        self.state_memory = np.zeros(state_dims)
        self.new_state_memory = np.zeros(state_dims)

        if self.discrete:
            action_dtype = np.int8
        else:
            action_dtype = np.float32
        self.action_memory = np.zeros((self.mem_size, n_actions),
                                      dtype=action_dtype)

        self.reward_memory = np.zeros(self.mem_size)
        self.terminal_memory = np.zeros(self.mem_size, dtype=np.float32)

    def store_transition(self, state, action, reward, state_, done):
        """Write one transition into the next slot, wrapping when full."""
        slot = self.mem_cntr % self.mem_size
        self.state_memory[slot] = state
        self.new_state_memory[slot] = state_

        if not self.discrete:
            self.action_memory[slot] = action
        else:
            # Expand the action index into a one-hot row.
            one_hot = np.zeros(self.action_memory.shape[1])
            one_hot[action] = 1.0
            self.action_memory[slot] = one_hot

        self.reward_memory[slot] = reward
        # Stored inverted: 1 for a non-terminal step, 0 for a terminal one.
        self.terminal_memory[slot] = 1 - done
        self.mem_cntr += 1

    def sample_buffer(self, batch_size):
        """Draw ``batch_size`` random filled slots.

        Returns:
            Tuple ``(states, actions, rewards, new_states, terminal)`` of
            arrays indexed by the same randomly chosen slots.
        """
        filled = min(self.mem_cntr, self.mem_size)
        picks = np.random.choice(filled, batch_size)
        return (
            self.state_memory[picks],
            self.action_memory[picks],
            self.reward_memory[picks],
            self.new_state_memory[picks],
            self.terminal_memory[picks],
        )

def build_dqn(lr, n_actions, input_dims, fc1_dims, fc2_dims):
    """Build and compile the Q-network.

    The network is two ReLU-activated dense hidden layers followed by a
    linear dense layer with one output per action.

    Args:
        lr: learning rate handed to the Adam optimizer.
        n_actions: number of output units.
        input_dims: length of the flat input vector.
        fc1_dims: units in the first hidden layer.
        fc2_dims: units in the second hidden layer.

    Returns:
        The compiled Sequential model, using ``masked_huber_loss(0, 1)``.
    """
    model = Sequential([
                Dense(fc1_dims, input_shape=(input_dims,)),
                Activation('relu'),
                Dense(fc2_dims),
                Activation('relu'),
                Dense(n_actions)])

    # `learning_rate` is the supported keyword; the old `lr` alias is
    # deprecated and triggers a warning (or an error on newer Keras).
    model.compile(optimizer=Adam(learning_rate=lr), loss=masked_huber_loss(0, 1))

    return model

class Agent(object):
    """Epsilon-greedy DQN agent with a replay buffer and a single Q-network.

    The same network (``q_eval``) is used both to pick actions and to
    bootstrap the learning targets; there is no separate target network.
    """

    def __init__(self, alpha, gamma, n_actions, epsilon, batch_size,
                input_dims, epsilon_dec=0.996,  epsilon_end=0.01,
                mem_size=1000000, fname='dqn_model.h5'):
        """Create the replay buffer and the Q-network.

        Args:
            alpha: learning rate for the network optimizer.
            gamma: discount factor applied to the bootstrapped value.
            n_actions: number of discrete actions.
            epsilon: initial probability of taking a random action.
            batch_size: number of transitions sampled per learning step.
            input_dims: length of one observation vector.
            epsilon_dec: multiplicative epsilon decay per learning step.
            epsilon_end: lower bound for epsilon.
            mem_size: replay buffer capacity.
            fname: default path used by ``save_model`` / ``load_model``.
        """
        self.action_space = list(range(n_actions))
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_dec = epsilon_dec
        self.epsilon_min = epsilon_end
        self.batch_size = batch_size
        self.model_file = fname
        self.memory = ReplayBuffer(mem_size, input_dims, n_actions, discrete=True)
        self.q_eval = build_dqn(alpha, n_actions, input_dims, 256, 256)

    def remember(self, state, action, reward, new_state, done):
        """Store one transition in the replay buffer."""
        self.memory.store_transition(state, action, reward, new_state, done)

    def choose_action(self, state):
        """Return a random action with probability epsilon, else the greedy one.

        Args:
            state: a single observation as a 1-D numpy array.
        """
        state = state[np.newaxis, :]  # add the batch dimension
        if np.random.random() < self.epsilon:
            return np.random.choice(self.action_space)
        q_values = self.q_eval.predict(state, verbose=0)
        return np.argmax(q_values)

    def learn(self):
        """Run one Q-learning update on a sampled batch, then decay epsilon.

        Does nothing until more than ``batch_size`` transitions are stored.
        """
        if self.memory.mem_cntr <= self.batch_size:
            return

        # The buffer stores `1 - done`, so the last array is 1 for
        # non-terminal transitions and 0 for terminal ones.
        state, action, reward, new_state, not_done = \
            self.memory.sample_buffer(self.batch_size)

        # Actions are stored one-hot; recover the integer action per row.
        action_indices = np.argmax(action, axis=1)

        q_eval = self.q_eval.predict(state, verbose=0)
        q_next = self.q_eval.predict(new_state, verbose=0)

        # Only the entry of the action actually taken is moved towards the
        # Bellman target; every other entry keeps the current prediction.
        q_target = q_eval.copy()
        batch_index = np.arange(self.batch_size, dtype=np.int32)
        q_target[batch_index, action_indices] = reward + \
            self.gamma * np.max(q_next, axis=1) * not_done

        # NOTE(review): fit() is called without batch_size, so Keras' own
        # default mini-batch size applies -- confirm that is intended.
        self.q_eval.fit(state, q_target, verbose=0)

        # Clamp so the decayed value never drops below the configured floor.
        self.epsilon = max(self.epsilon * self.epsilon_dec, self.epsilon_min)

    def save_model(self, model_file=None):
        """Save the Q-network to ``model_file`` (default: ``self.model_file``)."""
        if model_file is None:
            model_file = self.model_file
        self.q_eval.save(model_file)

    def load_model(self):
        """Replace the Q-network with the model stored at ``self.model_file``."""
        # The network is compiled with a custom loss, which Keras can only
        # restore when it is supplied by name through `custom_objects`.
        self.q_eval = load_model(
            self.model_file,
            custom_objects={'masked_huber_loss': masked_huber_loss(0, 1)})

In [5]:
# Run the DQN agent on LunarLander-v2 for n_games episodes, logging the
# per-episode score and rolling average and checkpointing every 5 episodes.
env = gym.make('LunarLander-v2')
lr = 0.0005
n_games = 500
# NOTE(review): epsilon=0.0 together with epsilon_end=0.0 disables random
# exploration entirely -- confirm this greedy-only run is intended.
agent = Agent(gamma=0.99, epsilon=0.0, alpha=lr, input_dims=8,
                n_actions=4, mem_size=1000000, batch_size=64, epsilon_end=0.0)

# agent.load_model()
scores = []
eps_history = []
logger = FileLogger('DQN_History.log')

# Make sure the checkpoint directory exists before the first save.
model_dir = 'Models/DQN'
os.makedirs(model_dir, exist_ok=True)

#env = wrappers.Monitor(env, "tmp/lunar-lander-6",
#                         video_callable=lambda episode_id: True, force=True)

for i in range(n_games):
    done = False
    score = 0
    observation = env.reset()
    while not done:
        action = agent.choose_action(observation)
        observation_, reward, done, info = env.step(action)
        score += reward
        agent.remember(observation, action, reward, observation_, int(done))
        observation = observation_
        agent.learn()

    eps_history.append(agent.epsilon)
    scores.append(score)

    # Rolling mean over the most recent 100 episodes (the previous slice
    # `max(0, i-100):(i+1)` covered 101 once enough episodes had run).
    avg_score = np.mean(scores[-100:])
    print('episode: ', i,'score: %.2f' % score,
            ' average score %.2f' % avg_score)

    if i % 5 == 0 and i > 0:
        agent.save_model(f'{model_dir}/episode{i}.h5')
    logger.log(i, score, avg_score)


  super(Adam, self).__init__(name, **kwargs)


episode:  0 score: -263.75  average score -263.75
episode:  1 score: -462.09  average score -362.92
episode:  2 score: -107.21  average score -277.68
episode:  3 score: -399.87  average score -308.23
episode:  4 score: -393.01  average score -325.19
episode:  5 score: -220.47  average score -307.73
episode:  6 score: -262.93  average score -301.33
episode:  7 score: -46.14  average score -269.43
episode:  8 score: -173.41  average score -258.77
episode:  9 score: -128.90  average score -245.78
episode:  10 score: 97.87  average score -214.54
episode:  11 score: -229.66  average score -215.80
episode:  12 score: -97.91  average score -206.73
episode:  13 score: -377.40  average score -218.92
episode:  14 score: -164.32  average score -215.28
episode:  15 score: -185.68  average score -213.43
episode:  16 score: -94.54  average score -206.44
episode:  17 score: -195.71  average score -205.84
episode:  18 score: -77.49  average score -199.09
episode:  19 score: -97.40  average score -194.

In [None]:
# Train the DQN agent on LunarLander-v2 with epsilon-greedy exploration
# decaying from 1 towards 0.01, logging progress and checkpointing every
# 5 episodes.
env = gym.make('LunarLander-v2')
epochs = 500
# Agent.__init__ names the exploration floor `epsilon_end`; passing
# `epsilon_min` raised a TypeError.
agent = Agent(alpha=0.0005, gamma=0.99, n_actions=4, epsilon=1, batch_size=64,
              input_dims=8, epsilon_dec=0.995, epsilon_end=0.01,
              mem_size=1000000)

# agent.load_model()
scores = []
eps_history = []
logger = FileLogger('DQN_History.log')

# Make sure the checkpoint directory exists before the first save.
model_dir = 'Models/DQN'
os.makedirs(model_dir, exist_ok=True)

#env = wrappers.Monitor(env, "tmp/lunar-lander-6",
#                         video_callable=lambda episode_id: True, force=True)

for i in range(epochs):
    done = False
    score = 0
    observation = env.reset()
    while not done:
        action = agent.choose_action(observation)
        observation_, reward, done, info = env.step(action)
        score += reward
        agent.remember(observation, action, reward, observation_, int(done))
        observation = observation_
        agent.learn()

    eps_history.append(agent.epsilon)
    scores.append(score)

    # Rolling mean over the most recent 100 episodes (the previous slice
    # `max(0, i-100):(i+1)` covered 101 once enough episodes had run).
    avg_score = np.mean(scores[-100:])
    print('episode: ', i,'score: %.2f' % score,
            ' average score %.2f' % avg_score)

    if i % 5 == 0 and i > 0:
        agent.save_model(f'{model_dir}/episode{i}.h5')
    logger.log(i, score, avg_score)

  super(Adam, self).__init__(name, **kwargs)


episode:  0 score: -254.97  average score -254.97
episode:  1 score: -170.89  average score -212.93
episode:  2 score: -318.22  average score -248.02
episode:  3 score: -204.08  average score -237.04
episode:  4 score: -417.39  average score -273.11
episode:  5 score: -201.78  average score -261.22
episode:  6 score: -223.66  average score -255.85
episode:  7 score: -93.87  average score -235.61
episode:  8 score: -92.74  average score -219.73
episode:  9 score: -110.91  average score -208.85
episode:  10 score: -100.28  average score -198.98
episode:  11 score: -74.51  average score -188.61
episode:  12 score: -36.19  average score -176.88
episode:  13 score: -33.53  average score -166.64
episode:  14 score: -36.02  average score -157.94
episode:  15 score: -23.21  average score -149.51
episode:  16 score: -71.28  average score -144.91
episode:  17 score: -118.38  average score -143.44
episode:  18 score: -63.73  average score -139.24
episode:  19 score: -165.44  average score -140.55