In [3]:
#Notes taken from https://gym.openai.com/docs/

#Lunar Lander
#The Discrete space allows a fixed range of non-negative integers (env.action_space).
#In this case there are 4 valid actions (do nothing, fire left engine, fire main engine, fire right engine)

#The Box space represents an n-dimensional box, valid observations will be an array of 8 numbers.
#Which are these numbers?  (Ref. https://github.com/openai/gym/blob/master/gym/envs/box2d/lunar_lander.py)
#(1-2) position in x axis and y axis(height)
#(3-4) x,y axis velocity terms
#(5-6) lander angle and angular velocity
#(7-8) left and right contact points (bool -> 1 in case of contact, 0 otherwise)

#We can also check the Box's bounds (env.observation_space): from -inf to +inf  [Box(-inf, inf, (8,), float32)]
#Print the structure of the observations the environment will return: learning agents usually
#need to know this before they start running, in order to set up the policy function.

#hyperparameters partially taken from https://arxiv.org/pdf/2011.11850.pdf

In [None]:
import gym
import gnwrapper
import random
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import time
from keras import Sequential
from keras.layers import Dense
from keras.optimizers import Adam
from collections import deque
from keras.activations import relu, linear

#LunarLander environment wrapped by gnwrapper.LoopAnimation (used for rendering inside the notebook)
env = gnwrapper.LoopAnimation(gym.make('LunarLander-v2'))

print(env.observation_space)  
print(env.action_space) 

#fix the seeds so that a run can be repeated.
#The agent draws from BOTH numpy's RNG (np.random.rand in act) and Python's `random`
#module (random.randrange in act, random.sample in learn_from_exp), so both are seeded.
#NOTE: TensorFlow's own RNG (network weight initialisation) is not seeded here.
env.seed(0)
np.random.seed(0)
random.seed(0)

In [None]:
class ddqn:
    """Double DQN agent with an experience-replay memory and a hard-updated target network.

    Two identical networks are kept: ``model`` (trained every learning step and
    used to pick actions) and ``target_model`` (used to evaluate the picked
    action in the learning target, refreshed every ``update_target_freq`` calls
    to ``update_target_model``).
    """

    def __init__(self, action_space, state_space):
        """Store the hyperparameters and build both networks.

        action_space: number of discrete actions (size of the network output).
        state_space:  number of values in one observation (size of the network input).
        """
        #hyperparameters
        self.action_space = action_space
        self.state_space = state_space
        self.epsilon = 1.0                           #initial exploration probability
        self.epsilon_decay = 0.996                   #multiplicative decay applied after every learning step
        self.epsilon_min = 0.01                      #decay stops once epsilon is no longer above this value
        self.discount = 0.99
        self.batch_size = 64                         
        self.lr = 0.001                       
        self.memory = deque(maxlen=1000000)
        #build_model reads action_space/state_space, so it must be called after they are set
        self.model = self.build_model()
        self.target_model = self.build_model()
        self.update_target_freq = 100
        self.counter = 0

        
    def build_model(self):
        """Return a compiled network: state_space inputs -> 128 -> 128 -> action_space linear outputs."""
        model = Sequential()
        #layer sizes follow the spaces given to the constructor instead of hard-coded 8 / 4
        model.add(Dense(128, input_dim=self.state_space, activation=relu))
        model.add(Dense(128, activation=relu))
        model.add(Dense(self.action_space, activation=linear))
        #`learning_rate` is the current name of the argument formerly spelled `lr`
        model.compile(loss='mse', optimizer=Adam(learning_rate=self.lr))
        return model 
    
    #hard update method (every n-step)
    def update_target_model(self):
        """Count one call; every ``update_target_freq`` calls copy the main weights into the target network."""
        self.counter += 1
        if self.counter >= self.update_target_freq:
            print('Updating weights...')
            self.counter = 0
            #overwrite target_model weights with the ones from main model
            self.target_model.set_weights(self.model.get_weights())
    
    #append the acquired experience into the memory
    def memorize(self, state, action, reward, next_state, done):
        """Append one (state, action, reward, next_state, done) transition to the replay memory."""
        self.memory.append((state, action, reward, next_state, done))

    #based on e-greedy policy
    #if random is smaller than e, take a random action, otherwise returns the index of maximum Q-value
    def act(self, state):
        """Pick an action for ``state`` (passed straight to ``model.predict``) with the epsilon-greedy rule."""
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_space)
        act_values = self.model.predict(state)
        return np.argmax(act_values[0])

    #trains the model using randomly selected experiences in the replay memory
    def learn_from_exp(self):
        """Fit the main network on one random minibatch using Double DQN targets.

        Does nothing until the memory holds at least ``batch_size`` transitions.
        After fitting, epsilon is decayed once (while it is above ``epsilon_min``).
        """
        if len(self.memory) < self.batch_size:
            return

        minibatch = random.sample(self.memory, self.batch_size)
        states, actions, rewards, next_states, dones = zip(*minibatch)

        #stack the stored states into (batch, features).
        #reshape (rather than np.squeeze) keeps the batch axis even when batch_size == 1
        n_samples = len(minibatch)
        states = np.asarray(states).reshape(n_samples, -1)
        next_states = np.asarray(next_states).reshape(n_samples, -1)
        actions = np.asarray(actions, dtype=int)
        rewards = np.asarray(rewards, dtype=float)
        dones = np.asarray(dones, dtype=bool)

        main_pred = self.model.predict(states)
        main_next_pred = self.model.predict(next_states)
        target_next_pred = self.target_model.predict(next_states) 

        # current Q Network selects the actions
        # target Q Network evaluates the actions
        # target -> r + discount * Q_target(s', argmax_a' Q_main(s', a'))
        rows = np.arange(n_samples)
        best_next_actions = np.argmax(main_next_pred, axis=1)
        bootstrapped = rewards + self.discount * target_next_pred[rows, best_next_actions]
        #terminal transitions use the bare reward (no bootstrapping)
        targets = np.where(dones, rewards, bootstrapped)

        #Q value correction based on the taken action; the other actions keep their predicted values
        main_pred[rows, actions] = targets
                
        #fit -> updating Q-values in main_pred for the actual states (1 iteration)
        self.model.fit(states, main_pred, epochs=1, verbose=0) 
        
        #exploration decay: one multiplicative step per learning call
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay


In [None]:
#agent sized from the environment: number of discrete actions and length of one observation
agent = ddqn(action_space=env.action_space.n, state_space=env.observation_space.shape[0])
#per-episode scores and their mean over the last 100 episodes, both filled in by training()
rewards, m_rewards = [], []

In [None]:
def training(episode):
    """Train the module-level ``agent`` on ``env`` for at most ``episode`` episodes.

    After every environment step the transition is stored, one learning step is
    run and the target-network update counter is advanced. Each episode's score
    is appended to the module-level ``rewards`` list and the mean of the last
    100 scores to ``m_rewards``. Training stops early once that mean exceeds 200.

    Returns the ``rewards`` list (one score per completed episode). The previous
    version returned only the reward of the very last step, and raised a
    NameError when ``episode`` was 0.
    """
    #rewards / m_rewards are only mutated in place (append), so no `global` statement is needed

    for e in range(episode):
        #environment reset; (1, -1) adds the batch axis without hard-coding the observation size
        state = np.reshape(env.reset(), (1, -1))
        #initializing our score, steps
        score = 0
        steps = 0
        while True:
            steps += 1
            #if random is smaller than e, take a random action, otherwise returns the index of maximum Q-value
            action = agent.act(state)
            #from env.step(action), take these 3 values (ignoring "info")
            next_state, reward, done, _ = env.step(action)
            score += reward
            next_state = np.reshape(next_state, (1, -1))
            #append in memory
            agent.memorize(state, action, reward, next_state, done)
            #go to next state
            state = next_state
            #trains the model using randomly selected experiences in the replay memory
            #performs a gradient descent step
            agent.learn_from_exp()
            agent.update_target_model()
            if done:
                print("\nEpisode {}/{} concluded after {} steps. Score: {:.3f}".format(e, episode, steps, score))
                break
        rewards.append(score)
        m_rewards.append(np.mean(rewards[-100:]))

        #Mean reward on last 100 episode
        if m_rewards[-1] > 200:
            print('\n ### Train has finished ###\n\n Final mean reward: {:.3f} \n'.format(m_rewards[-1]))
            break
        print("Mean reward (last 100 episodes): {:.3f} \n".format(m_rewards[-1]))
    return rewards

In [None]:
#run up to 1000 training episodes (training() stops earlier once the 100-episode mean exceeds 200)
start_training = training(episode=1000)

In [None]:
#plot of the results
#explicit Figure/Axes interface; line widths are given as numbers rather than strings
fig, ax = plt.subplots()
ax.plot(rewards, label='Double DQN', linewidth=1)
ax.plot(m_rewards, label='Average reward', color='r', linewidth=3)

#saving the curves (np.savez appends '.npz'; each array is stored under the default key 'arr_0')
np.savez('rewards_DDQN', rewards)
np.savez('m_rewards_DDQN', m_rewards)

ax.set_title('Double DQN')
ax.set_xlabel('Episodes')
ax.set_ylabel('Rewards')
ax.grid()
#loc=4 is matplotlib's numeric code for 'lower right'
ax.legend(loc=4)
fig.savefig('Double_DQN.png')
plt.show()

In [None]:
#saving the trained model
#only the main network (agent.model) is written to 'Double_DQN.h5'; agent.target_model is not saved
agent.model.save('Double_DQN.h5')

In [None]:
#at restore, run this cell after executing ln[2-3-4-5].
#NOTE(review): this loads 'Double_DQN_200.h5', while the save cell writes 'Double_DQN.h5' -- confirm the intended file exists.
#only agent.model is replaced here; agent.target_model keeps whatever weights it already had
agent.model = tf.keras.models.load_model('Double_DQN_200.h5')

In [None]:
#evaluation

#uncomment 'env.render()' and 'env.display()' to render the episodes

#Greedy evaluation: whole episodes are played with the action of highest predicted Q-value
#until at least eval_time seconds have elapsed; the time limit is only checked between episodes.
state = env.reset()
steps = 0
eval_time = 60*15
starting_time = time.time()


def stop():
    #True once the (whole) seconds elapsed since starting_time reach eval_time
    return int(time.time() - starting_time) >= eval_time


rewards_eval = []
ep_reward = 0

while True:
    #play one full episode
    done = False
    while not done:
        state = np.reshape(state, (1, 8))
        #returns index of action with maximum value
        q_values = agent.model.predict(state)
        action = np.argmax(q_values)
        #env.render()
        next_state, reward, done, _ = env.step(action)
        state = next_state
        steps += 1
        ep_reward += reward

    #episode finished: record it and reset the per-episode counters
    rewards_eval.append(ep_reward)
    print("Episode ended in {} time steps".format(steps))
    #env.display()
    steps = 0
    ep_reward = 0
    state = env.reset()
    if stop(): 
        print('Mean reward after the evaluation: {}'.format(np.mean(rewards_eval)))
        break