# Lunar Landing - Single DQN
Code adapted from: https://github.com/fakemonk1/Reinforcement-Learning-Lunar_Lander

## 1. Import Libraries 

In [None]:
import gym
import numpy as np
import pandas as pd
from collections import deque
import random
from keras import Sequential
from keras.layers import Dense
from keras.activations import relu, linear
from keras.optimizers import Adam
from keras.losses import mean_squared_error as mse
from keras.models import load_model
import pickle
from matplotlib import pyplot as plt
from tqdm import tqdm


## 2. Create a DQN Agent  

In [None]:
class DQN:
    """Single-network deep Q-learning agent with experience replay.

    The same network is used both to pick actions and to compute the
    bootstrap targets (there is no separate target network).

    Args:
        env: Environment exposing the classic gym API used below
            (``reset()``, ``step(action)`` returning a 4-tuple, ``render()``,
            ``action_space.n`` and ``observation_space.shape``).
        learning_rate: Step size handed to the Adam optimizer.
        gamma: Discount factor applied to the bootstrapped next-state value.
        epsilon: Initial exploration probability for epsilon-greedy actions.
        epsilon_decay: Multiplicative decay applied to ``epsilon`` after
            every episode until it reaches ``epsilon_min``.
    """

    def __init__(self, env, learning_rate, gamma, epsilon, epsilon_decay):
        # set parameters
        self.env = env
        self.learning_rate = learning_rate
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.total_step = 0  # environment steps taken so far
        self.step_size = 5  # train once every `step_size` environment steps
        self.rewards_list = []  # total reward of each finished episode
        self.hidden_dim = 128
        self.replay_memory_buffer = deque(maxlen=1000000)
        self.batch_size = 64
        self.epsilon_min = 0.01
        self.n_action = env.action_space.n
        self.n_observation = env.observation_space.shape[0]
        self.model = self.reset_model()
        self.render = True

    def reset_model(self):
        """Build and compile a fresh Q-network.

        Returns:
            A compiled ``Sequential`` model mapping an observation of size
            ``n_observation`` to one linear output per action.
        """
        model = Sequential()
        model.add(Dense(self.hidden_dim, input_dim=self.n_observation, activation=relu))
        model.add(Dense(self.hidden_dim, activation=relu))
        model.add(Dense(self.n_action, activation=linear))
        # `learning_rate` is the supported keyword; `lr` is a deprecated alias.
        model.compile(loss=mse, optimizer=Adam(learning_rate=self.learning_rate))
        return model

    def add_memory(self, state, action, reward, next_state, done):
        """Append one transition tuple to the replay buffer."""
        self.replay_memory_buffer.append((state, action, reward, next_state, done))

    def get_action(self, state):
        """Pick an action epsilon-greedily.

        Args:
            state: Observation already shaped ``(1, n_observation)``.

        Returns:
            A random action index with probability ``epsilon``, otherwise the
            index of the largest predicted Q-value.
        """
        if np.random.rand() < self.epsilon:
            return random.randrange(self.n_action)
        return np.argmax(self.model.predict(state, verbose=0)[0])

    def train(self, num_episodes=1000):
        """Run the training loop on ``self.env``.

        Stops early once the mean reward over the last 100 episodes exceeds
        200. Per-episode totals are appended to ``self.rewards_list``.

        Args:
            num_episodes: Maximum number of episodes to play.
        """
        # Use the environment the agent was constructed with so that any
        # seeding or configuration applied by the caller takes effect.
        env = self.env
        num_steps = 1000  # hard cap on steps per episode
        for episode in tqdm(range(num_episodes)):
            state = np.reshape(env.reset(), [1, self.n_observation])
            reward_one = 0
            for _ in range(num_steps):
                if self.render:
                    env.render()
                action = self.get_action(state)
                next_state, reward, done, _ = env.step(action)
                next_state = np.reshape(next_state, [1, self.n_observation])
                # save memory
                self.add_memory(state, action, reward, next_state, done)
                # reward
                reward_one += reward
                state = next_state
                self.update_net()
                self.total_step += 1
                if done:
                    break
            self.rewards_list.append(reward_one)

            # Decay the epsilon after each episode
            if self.epsilon > self.epsilon_min:
                self.epsilon *= self.epsilon_decay

            # Avg reward over the most recent 100 episodes
            rewards_mean = np.mean(self.rewards_list[-100:])
            if rewards_mean > 200:
                print("DQN Training Finish...")
                break
            print("\t Episode :", episode, "\t  Reward: ",reward_one, "\t Average Reward: ",rewards_mean)

    def update_net(self):
        """Perform one replay-based training step when one is due.

        Nothing happens unless the step counter is a multiple of
        ``step_size`` and the buffer holds at least ``batch_size``
        transitions. Training is also skipped while the mean of the last ten
        episode rewards is above 180.
        """
        if self.total_step % self.step_size != 0:
            return
        if len(self.replay_memory_buffer) < self.batch_size:
            return
        # Guard the empty case: np.mean([]) yields nan plus a RuntimeWarning.
        recent_rewards = self.rewards_list[-10:]
        if recent_rewards and np.mean(recent_rewards) > 180:
            return

        sample_data = random.sample(self.replay_memory_buffer, self.batch_size)
        states, actions, rewards, next_states, done = self.reshape_data(sample_data)

        # Bootstrapped target; the (1 - done) factor removes the future term
        # for terminal transitions.
        next_q = np.asarray(self.model.predict_on_batch(next_states))
        q_target = rewards + self.gamma * np.amax(next_q, axis=1) * (1 - done)

        # Only the Q-value of the action actually taken is moved towards the
        # target; the other outputs keep their current predictions.
        q_total = np.array(self.model.predict_on_batch(states))
        rows = np.arange(self.batch_size)
        q_total[rows, actions] = q_target
        self.model.fit(states, q_total, epochs=1, verbose=0)

    def reshape_data(self, random_sample):
        """Split a list of transitions into per-field arrays.

        Args:
            random_sample: Sequence of ``(state, action, reward, next_state,
                done)`` tuples as stored by ``add_memory``.

        Returns:
            ``(states, actions, rewards, next_states, done)`` where the two
            state arrays are flattened to one row per transition.
        """
        count = len(random_sample)
        states = np.array([item[0] for item in random_sample]).reshape(count, -1)
        actions = np.array([item[1] for item in random_sample])
        rewards = np.array([item[2] for item in random_sample])
        next_states = np.array([item[3] for item in random_sample]).reshape(count, -1)
        done = np.array([item[4] for item in random_sample])
        return states, actions, rewards, next_states, done

    def save(self, name):
        """Save the Q-network to the path ``name``."""
        self.model.save(name)

def test(lr):
    """Train a DQN agent on LunarLander-v2 and persist the results.

    The trained model and the per-episode reward list are written to files
    whose names start with ``"saved_models" + str(lr)``.

    Args:
        lr: Learning rate passed to the agent's optimizer.

    Returns:
        The list of total rewards, one entry per training episode.
    """
    env = gym.make('LunarLander-v2')
    # Seed every random source involved: the environment handed to the agent,
    # NumPy (epsilon draw) and the stdlib `random` module, which the agent
    # uses for random actions and for sampling the replay buffer.
    env.seed(21)
    np.random.seed(21)
    random.seed(21)
    epsilon = 1.0
    epsilon_decay = 0.995
    gamma = 0.99
    training_episodes = 1000
    model = DQN(env, lr, gamma, epsilon, epsilon_decay)
    model.train(training_episodes)
    save_dir = "saved_models"
    model.save(save_dir + str(lr) + "trained_model.h5")
    # Context manager guarantees the file is flushed and closed.
    with open(save_dir + str(lr) + "train_rewards_list.p", "wb") as rewards_file:
        pickle.dump(model.rewards_list, rewards_file)
    return model.rewards_list

# Train with one learning rate at a time; uncomment the other calls to
# produce the reward lists the (commented-out) comparison plot refers to.
#rewards_list1 = test(0.0001)
rewards_list2 = test(0.0003)
#rewards_list3 = test(0.0005)
#rewards_list4 = test(0.001)


## 4. Plot - Single DQN Agent Learning Curve 

In [None]:
#plt.figure()
#plt.title("Learning Rate for single DQN")
#plt.plot(rewards_list1,label = 'LR=0.0001')
#plt.plot(rewards_list2, 'blue',label = 'LR=0.0003')
#plt.plot(rewards_list3, 'yellow',label = 'LR=0.0005')
#plt.plot(rewards_list4, 'green',label = 'LR=0.001')
#plt.legend()
#plt.xlabel("Episode")
#plt.ylabel("Mean_Reward")
#plt.show()


# Lunar Landing - Double DQN

Code adapted from: https://github.com/anh-nn01/Lunar-Lander-Double-Deep-Q-Networks \
Reference paper: https://arxiv.org/abs/1509.06461

## 1. Import Libraries 

In [None]:
import gym

import numpy as np
from collections import deque
import random
import time
from tqdm import tqdm

import tensorflow.compat.v1 as tf
from tensorflow.keras import Model, Sequential
from tensorflow.keras.layers import Dense, Embedding, Reshape
from tensorflow.keras.optimizers import Adam
tf.disable_v2_behavior() 

import matplotlib.pyplot as plt

## 2. Double DQN Agent  

In [None]:
class Agent:
    """Double DQN agent with a policy network and a target network.

    Args:
        env: Accepted for interface compatibility; not used by the agent.
        gamma: Discount factor for the bootstrapped next-state value.
        lr: Learning rate handed to the Adam optimizers.
        input_size: Number of values in one observation.
        number_of_actions: Number of discrete actions.
    """

    def __init__(self, env, gamma, lr, input_size,  number_of_actions):

        # Parameters
        self.gamma = gamma
        self.epsilon = 1.0
        self.lr = lr
        self.batch_size = 64
        self.memory_space = deque(maxlen = 1000000)
        self.input_size = input_size
        self.number_of_actions = number_of_actions

        # Two identically shaped dense networks: one trained every step,
        # one refreshed from it via update_network().
        self.policy_network = self._build_network()
        self.target_network = self._build_network()

        self.update_network()

    def _build_network(self):
        """Return a compiled two-hidden-layer Q-network."""
        network = Sequential()
        network.add(Dense(128, input_dim = self.input_size, activation = "relu"))
        network.add(Dense(128 , activation = "relu"))
        network.add(Dense(self.number_of_actions, activation = "linear"))
        network.compile(loss = "mean_squared_error", optimizer = Adam(self.lr))
        return network

    def memory_step(self, state, action, reward, next_state, done):
        """Append one transition tuple to the replay memory."""
        self.memory_space.append((state, action, reward, next_state, done))

    def update_network(self):
        """Copy the policy network's weights into the target network."""
        return self.target_network.set_weights(self.policy_network.get_weights())

    def choose_action(self, state):
        """Pick an action with an epsilon-greedy approach.

        Args:
            state: A single observation with ``input_size`` values.

        Returns:
            A random action index with probability ``epsilon``, otherwise the
            index of the policy network's largest Q-value.
        """
        if np.random.uniform(0.0, 1.0) < self.epsilon:
            return np.random.choice(self.number_of_actions)

        state = np.reshape(state, [1,self.input_size])
        # Q values of every action in this state
        Qs = self.policy_network.predict(state)
        return np.argmax(Qs[0])

    @staticmethod
    def _double_q_targets(q_current, q_next_policy, q_next_target,
                          actions, rewards, dones, gamma):
        """Compute Double DQN regression targets for one batch.

        The next action is selected by the policy network's estimates
        (``q_next_policy``) and evaluated with the target network's estimates
        (``q_next_target``). Terminal transitions get no bootstrap term.

        Args:
            q_current: ``(batch, actions)`` policy Q-values of the states.
            q_next_policy: ``(batch, actions)`` policy Q-values of next states.
            q_next_target: ``(batch, actions)`` target Q-values of next states.
            actions: ``(batch,)`` integer indices of the actions taken.
            rewards: ``(batch,)`` rewards received.
            dones: ``(batch,)`` terminal flags (truthy means terminal).
            gamma: Discount factor.

        Returns:
            A copy of ``q_current`` in which only the entries of the actions
            taken are replaced by their targets.
        """
        targets = np.array(q_current, copy=True)
        rows = np.arange(len(actions))
        best_next = np.argmax(q_next_policy, axis=1)
        not_terminal = 1.0 - np.asarray(dones, dtype=np.float64)
        bootstrap = np.asarray(q_next_target)[rows, best_next] * not_terminal
        targets[rows, actions] = np.asarray(rewards) + gamma * bootstrap
        return targets

    def learn(self):
        """Fit the policy network on one random batch from memory.

        Uses up to ``batch_size`` stored transitions; does nothing when the
        memory is empty.
        """
        if not self.memory_space:
            return

        # Choose a sample batch
        current_batch_size = min(len(self.memory_space), self.batch_size)
        sample_batch = random.sample(self.memory_space, current_batch_size)

        # Split the transitions into per-field arrays
        states, actions, rewards, next_states, dones = zip(*sample_batch)
        sample_states = np.asarray(states, dtype=np.float64).reshape(
            current_batch_size, self.input_size)
        sample_next_states = np.asarray(next_states, dtype=np.float64).reshape(
            current_batch_size, self.input_size)
        sample_actions = np.asarray(actions, dtype=np.int64)
        sample_rewards = np.asarray(rewards, dtype=np.float64)
        sample_dones = np.asarray(dones, dtype=np.float64)

        q_current = self.policy_network.predict(sample_states)
        q_next_policy = self.policy_network.predict(sample_next_states)
        q_next_target = self.target_network.predict(sample_next_states)

        q_target = self._double_q_targets(
            q_current, q_next_policy, q_next_target,
            sample_actions, sample_rewards, sample_dones, self.gamma)
        self.policy_network.fit(sample_states, q_target, epochs = 1, verbose = 0)

## 3. Initialize Lunar Landing Environment and Play Episodes 

In [None]:
# Main loop for playing episodes          
def Main(number_of_episodes=1000, render=True):
    """Train the agent on LunarLander-v2 and report per-episode rewards.

    After every episode the target network is refreshed and epsilon is
    decayed by a factor of 0.995, never dropping below 0.1.

    Args:
        number_of_episodes: How many episodes to play.
        render: Whether to draw the environment after every step.

    Returns:
        ``(scores, avg_scores)``: the total reward of each episode and the
        running mean over the most recent 100 episodes.
    """
    env = gym.make("LunarLander-v2")
    agent = Agent(env, gamma = 0.99, lr = 0.0003, input_size = 8, number_of_actions = 4)
    avg = deque(maxlen=100)  # sliding window for the running average
    scores = []
    avg_scores = []

    try:
        for episode in range(number_of_episodes):
            done = False
            total = 0
            state = env.reset()

            while not done:
                action = agent.choose_action(state)
                next_state, reward, done, info = env.step(action)
                total += reward
                if render:
                    env.render()

                agent.memory_step(state, action, reward, next_state, done)
                agent.learn()

                state = next_state

            avg.append(total)
            avg_scores.append(np.mean(avg))
            scores.append(total)

            agent.update_network()
            # epsilon decay
            agent.epsilon = max(0.1, 0.995 * agent.epsilon)
            print("Episode:", episode, "Reward:", total)
    finally:
        # Release the environment (and its render window) even on errors.
        env.close()

    return scores, avg_scores
# Start training as soon as this cell runs.
Main()

## 4. Plot - Double DQN Agent Learning Curve 

In [None]:
#plt.title("Learning Curve - Double DQN")
#plt.xlabel("Episode")
#plt.ylabel("Reward")
#plt.plot(rewards)
#plt.title("Average Learning Curve - Double DQN")
#plt.xlabel("Episode")
#plt.ylabel("Reward")
#plt.plot(aver_reward)

# Lunar Landing - Dueling DQN
Code adapted from: https://github.com/philtabor

## 1. Import libraries

In [None]:
# Restart the kernel before running this part;
# otherwise TensorFlow might throw an error due to the mix of Keras imports.

import numpy as np
import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras.optimizers import Adam
import gym
import pickle

## 2. Create Agent

In [None]:
class DuelingDQN(keras.Model):
    """Dueling Q-network: a shared trunk feeding value and advantage heads."""

    def __init__(self,number_of_actions):
        super().__init__()

        # Shared trunk: two dense layers with 128 units each
        self.first_layer = keras.layers.Dense(128, activation='relu')
        self.second_layer = keras.layers.Dense(128, activation='relu')

        # Heads specific to the dueling architecture: one scalar state value
        # and one advantage per action, both without activation
        self.values = keras.layers.Dense(1, activation=None)
        self.advantages = keras.layers.Dense(number_of_actions, activation=None)

    def _shared_features(self, state):
        """Run ``state`` through the two trunk layers."""
        return self.second_layer(self.first_layer(state))

    def call(self, state):
        """Return Q-values for every action of each state in the batch.

        The advantages are centred by subtracting their per-state mean
        before the state value is added.
        """
        hidden = self._shared_features(state)
        state_value = self.values(hidden)
        action_advantages = self.advantages(hidden)
        mean_advantage = tf.math.reduce_mean(action_advantages, axis=1, keepdims=True)
        return state_value + (action_advantages - mean_advantage)

    def advantage(self, state):
        """Return only the raw advantage-head output for ``state``."""
        return self.advantages(self._shared_features(state))
    
# Replay buffer based on the https://github.com/philtabor implementation
class Replay():
    """Fixed-size circular replay buffer backed by preallocated arrays.

    Args:
        maximum_size: Number of transitions the buffer can hold.
        input_shape: Shape of one observation, as an iterable of ints.
    """

    def __init__(self, maximum_size, input_shape):
        self.memory_size = maximum_size
        self.memory_counter = 0  # total transitions ever stored

        state_shape = (maximum_size, *input_shape)
        self.state_memory = np.zeros(state_shape, dtype=np.float32)
        self.state_memory_new = np.zeros(state_shape, dtype=np.float32)

        self.action_memory = np.zeros(maximum_size, dtype=np.int32)
        self.reward_memory = np.zeros(maximum_size, dtype=np.float32)
        self.terminal_memory = np.zeros(maximum_size, dtype=bool)

    def transition(self, state, action, reward, state_next, done):
        """Store one transition, overwriting the oldest slot once full."""
        slot = self.memory_counter % self.memory_size
        self.memory_counter += 1

        self.state_memory[slot] = state
        self.state_memory_new[slot] = state_next
        self.action_memory[slot] = action
        self.reward_memory[slot] = reward
        self.terminal_memory[slot] = done

    def sample(self, batch_size):
        """Draw ``batch_size`` distinct stored transitions at random.

        Returns:
            ``(states, actions, rewards, states_next, dones)`` arrays, each
            with ``batch_size`` rows.
        """
        filled = min(self.memory_counter, self.memory_size)
        picks = np.random.choice(filled, batch_size, replace = False)

        return (
            self.state_memory[picks],
            self.action_memory[picks],
            self.reward_memory[picks],
            self.state_memory_new[picks],
            self.terminal_memory[picks],
        )
    
    
class Agent():
    """Dueling Double-DQN agent with a replay buffer and a target network.

    Args:
        lr: Learning rate for the Adam optimizers.
        gamma: Discount factor for the bootstrapped next-state value.
        number_of_actions: Number of discrete actions.
        epsilon: Initial exploration probability.
        batch_size: Number of transitions per training batch.
        input_size: Shape of one observation, as an iterable of ints.
        memory_size: Capacity of the replay buffer.
    """

    def __init__(self, lr, gamma, number_of_actions, epsilon, batch_size, input_size, memory_size = 100000):

        # Parameters
        self.actions = list(range(number_of_actions))
        self.gamma = gamma
        self.epsilon = epsilon
        self.eps_decay = 0.001  # subtracted from epsilon per learning step
        self.eps_min = 0.01
        self.replace = 100  # learning steps between target-network syncs
        self.batch_size = batch_size
        self.step_counter = 0

        self.memory = Replay(memory_size,input_size)

        # initiate and compile double Q system
        self.q_current = DuelingDQN(number_of_actions)
        self.q_next = DuelingDQN(number_of_actions)

        self.q_current.compile(optimizer=Adam(learning_rate=lr), loss = 'mean_squared_error')
        self.q_next.compile(optimizer=Adam(learning_rate=lr),loss ='mean_squared_error')

        # Subclassed Keras models create their variables on the first call.
        # Run one dummy batch through both networks and sync them now, so the
        # target network starts as an exact copy of the online network rather
        # than relying on the first sync inside learn(), which runs before
        # either network has been called on real data.
        dummy_batch = np.zeros((1, *input_size), dtype=np.float32)
        self.q_current(dummy_batch)
        self.q_next(dummy_batch)
        self.q_next.set_weights(self.q_current.get_weights())

    def transition( self, state, action, reward, state_new, done):
        """Store one transition in the replay buffer."""
        self.memory.transition(state, action, reward, state_new, done)

    def choose_action(self, observation):
        """Pick an action epsilon-greedily.

        The greedy branch ranks actions by the advantage head alone; the
        state value and the mean advantage are the same for every action of
        a state, so the ranking matches that of the full Q-values.
        """
        if np.random.random() < self.epsilon:
            return np.random.choice(self.actions)

        state = np.array([observation])
        actions = self.q_current.advantage(state)
        return tf.math.argmax(actions, axis=1).numpy()[0]

    @staticmethod
    def _td_targets(pred_q, next_q, next_actions, actions, rewards, dones, gamma):
        """Build regression targets for one batch.

        Args:
            pred_q: ``(batch, actions)`` online Q-values of the states.
            next_q: ``(batch, actions)`` target-network Q-values of the
                next states.
            next_actions: ``(batch,)`` next-state actions chosen by the
                online network.
            actions: ``(batch,)`` indices of the actions taken.
            rewards: ``(batch,)`` rewards received.
            dones: ``(batch,)`` terminal flags.
            gamma: Discount factor.

        Returns:
            A copy of ``pred_q`` where only the entries of the actions taken
            are replaced; terminal transitions get no bootstrap term.
        """
        targets = np.array(pred_q, copy=True)
        rows = np.arange(len(actions))
        not_terminal = 1.0 - np.asarray(dones, dtype=np.float32)
        bootstrap = np.asarray(next_q)[rows, next_actions] * not_terminal
        targets[rows, actions] = rewards + gamma * bootstrap
        return targets

    def _decay_epsilon(self):
        """Lower epsilon by ``eps_decay`` without going below ``eps_min``."""
        self.epsilon = max(self.eps_min, self.epsilon - self.eps_decay)

    def learn(self):
        """Run one training step on a random batch from the replay buffer.

        Does nothing until the buffer holds at least ``batch_size``
        transitions. Every ``replace`` learning steps the target network is
        overwritten with the online network's weights.
        """
        if self.memory.memory_counter < self.batch_size:
            return

        if self.step_counter % self.replace == 0:
            self.q_next.set_weights(self.q_current.get_weights())

        states, actions, rewards, states_new, dones =  self.memory.sample(self.batch_size)

        pred_q = self.q_current(states).numpy()
        next_q = self.q_next(states_new).numpy()
        # Double DQN: the online network selects the next action, the target
        # network supplies its value.
        max_actions = tf.math.argmax(self.q_current(states_new),axis=1).numpy()

        target_q = self._td_targets(
            pred_q, next_q, max_actions, actions, rewards, dones, self.gamma)

        self.q_current.train_on_batch(states, target_q)
        self._decay_epsilon()

        self.step_counter += 1

    
# Main loop for playing episodes          
def Main(number_of_episodes=1000):
    """Train the dueling agent on LunarLander-v2 and save the scores.

    Per-episode scores are pickled to ``scores.p`` and their running
    100-episode means to ``avg_scores.p`` in the working directory.

    Args:
        number_of_episodes: How many episodes to play.

    Returns:
        ``(scores, a_score)``: the per-episode scores and the running means.
    """
    env = gym.make('LunarLander-v2')
    agent = Agent(lr = 0.0003, gamma = 0.99,number_of_actions = 4, epsilon=1,  batch_size = 64, input_size = [8])

    scores = []
    a_score = []

    try:
        for episode in range(number_of_episodes):
            done = False
            score = 0
            state = env.reset()
            while not done:
                action = agent.choose_action(state)
                next_state, reward, done, info = env.step(action)
                score += reward
                agent.transition(state, action, reward, next_state,done)
                state = next_state
                agent.learn()

            scores.append(score)
            avg_score = np.mean(scores[-100:])
            a_score.append(avg_score)
            print('episode ', episode, 'score %.1f' %score,
                   'average score %.1f'% avg_score)
    finally:
        # Release the environment even if training is interrupted.
        env.close()

    # Context managers guarantee both files are flushed and closed.
    with open('scores.p', 'wb') as scores_file:
        pickle.dump(scores, scores_file)

    with open('avg_scores.p', 'wb') as avg_file:
        pickle.dump(a_score, avg_file)

    return scores, a_score

# Start training as soon as this cell runs.
Main()
     