# Double Deep Q-Network

Reference paper: "Deep Reinforcement Learning with Double Q-learning" (van Hasselt, Guez, Silver), https://arxiv.org/pdf/1509.06461.pdf

**Environment:** CartPole-v1

**Actions:** `0` pushes the cart to the left;
             `1` pushes the cart to the right.

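To "solve" this task, the agent must reach an average reward greater than 195 over 100 consecutive episodes. (195 is the CartPole-v0 criterion; it is used here because the training loop below caps every episode at 200 steps. The official CartPole-v1 threshold is 475 with 500-step episodes.)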

In [None]:
#Imports and gym creation
import gym
import numpy as np
import matplotlib.pyplot as plt
from collections import deque
import tensorflow as tf
from tensorflow import keras
import random

#Create Gym
#NOTE(review): `wrappers` is not referenced anywhere in the visible code -
#confirm it is still needed before removing the import.
from gym import wrappers
#Single shared CartPole-v1 environment instance used by the cells below
envCartPole = gym.make('CartPole-v1')

In [None]:
#Number of training episodes; also used as the x-axis limit of the results plot
EPISODES = 500
#Only reassigned by the commented-out early-stopping code in the training cell,
#so in the visible code it stays 0
TRAIN_END = 0

def discount_rate(): #Gamma
    """Return the discount factor (gamma) passed to the agent."""
    gamma = 0.95
    return gamma

def learning_rate(): #Alpha
    """Return the optimizer learning rate (alpha) passed to the agent."""
    alpha = 0.001
    return alpha

def default_batch_size():
    """Return the number of transitions sampled per experience-replay step."""
    return 24

nS = envCartPole.observation_space.shape[0] #This is only 4
nA = envCartPole.action_space.n #Actions

#The helper deliberately has a different name than the variable: the original
#`batch_size = batch_size()` replaced the function with an int, so running this
#cell a second time failed with "'int' object is not callable".
#Later cells keep reading the plain int `batch_size`.
batch_size = default_batch_size()

In [None]:
class DoubleDeepQNetwork():
    """Double Deep Q-Network agent for an environment with discrete actions.

    Two identically shaped networks are kept: ``model`` (online), which is
    trained and used to choose actions, and ``model_target``, which only ever
    receives copied weights and is used to evaluate the action the online
    network picked for the next state.

    Args:
        states: Number of values in one observation (network input size).
        actions: Number of discrete actions (network output size).
        alpha: Learning rate handed to the Adam optimizer.
        gamma: Discount factor applied to the bootstrapped next-state value.
        epsilon: Initial probability of taking a random action.
        epsilon_min: Epsilon stops decaying once it is no longer above this.
        epsilon_decay: Factor epsilon is multiplied by after each replay step.
    """

    def __init__(self, states, actions, alpha, gamma, epsilon,epsilon_min, epsilon_decay):
        self.nS = states
        self.nA = actions
        #Replay buffer; once 2500 transitions are stored the oldest are dropped
        self.memory = deque([], maxlen=2500)
        self.alpha = alpha
        self.gamma = gamma
        #Explore/Exploit
        self.epsilon = epsilon
        self.epsilon_min = epsilon_min
        self.epsilon_decay = epsilon_decay
        self.model = self.build_model()
        self.model_target = self.build_model() #Second (target) neural network
        self.update_target_from_model() #Start both networks from equal weights
        self.loss = [] #Training loss of every replay step, in order

    def build_model(self):
        """Build and compile a 24-24-nA dense network mapping a state to Q-values."""
        model = keras.Sequential()
        model.add(keras.layers.Dense(24, input_dim=self.nS, activation='relu'))
        model.add(keras.layers.Dense(24, activation='relu'))
        model.add(keras.layers.Dense(self.nA, activation='linear'))
        #`learning_rate` is the supported argument name; the old `lr` alias is
        #deprecated and rejected by newer Keras releases
        model.compile(loss='mean_squared_error',
                      optimizer=keras.optimizers.Adam(learning_rate=self.alpha))
        return model

    def update_target_from_model(self):
        """Copy the online network's weights into the target network."""
        self.model_target.set_weights( self.model.get_weights() )

    def action(self, state):
        """Pick an action epsilon-greedily for a ``(1, nS)`` state.

        With probability ``epsilon`` a uniformly random action is returned
        (explore); otherwise the online network's best action (exploit).
        """
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.nA) #Explore
        return self.test_action(state) #Exploit

    def test_action(self, state): #Exploit
        """Return the online network's highest-valued action for a ``(1, nS)`` state."""
        #verbose=0 keeps Keras from printing a progress bar on every call
        action_vals = self.model.predict(state, verbose=0)
        return np.argmax(action_vals[0])

    def store(self, state, action, reward, nstate, done):
        """Append one transition ``(state, action, reward, nstate, done)`` to memory."""
        self.memory.append( (state, action, reward, nstate, done) )

    def experience_replay(self, batch_size):
        """Train the online network on one random minibatch from memory.

        For every sampled transition the target for the taken action is
        ``reward`` when the transition is terminal, otherwise
        ``reward + gamma * Q_target(nstate, argmax_a Q_online(nstate, a))``.
        The other actions keep the online network's current predictions, so
        they contribute no error. Afterwards the loss is recorded and epsilon
        is decayed while it is above ``epsilon_min``.

        Args:
            batch_size: Number of transitions to sample.

        Raises:
            ValueError: If memory holds fewer than ``batch_size`` transitions
                (raised by ``random.sample``).
        """
        minibatch = random.sample( self.memory, batch_size ) #Randomly sample from memory

        #Unzip column-wise. Building one np.array from the raw tuples would mix
        #arrays with scalars, which newer NumPy rejects as a ragged array.
        states, actions, rewards, nstates, dones = zip(*minibatch)
        st = np.vstack(states).reshape(batch_size, self.nS) #States
        nst = np.vstack(nstates).reshape(batch_size, self.nS) #Next States

        st_predict = self.model.predict(st, verbose=0)
        nst_predict = self.model.predict(nst, verbose=0)
        nst_predict_target = self.model_target.predict(nst, verbose=0)

        rows = np.arange(batch_size)
        #Double DQN: the online network selects the next action,
        #the target network supplies that action's value
        best_next_action = np.argmax(nst_predict, axis=1)
        next_value = nst_predict_target[rows, best_next_action]

        rewards = np.asarray(rewards, dtype=np.float64)
        terminal = np.asarray(dones, dtype=bool)
        target = np.where(terminal, rewards, rewards + self.gamma * next_value)

        #Only the taken action's output is overwritten with the target
        y = np.array(st_predict, copy=True)
        y[rows, np.asarray(actions, dtype=int)] = target

        hist = self.model.fit(st, y, epochs=1, verbose=0)
        self.loss.extend(hist.history['loss'])
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

In [None]:
#Agent: epsilon starts at 1, is multiplied by 0.995 after every replay step
#and stops decaying once it is no longer above 0.001
dqn = DoubleDeepQNetwork(nS, nA, learning_rate(), discount_rate(), 1, 0.001, 0.995)

#Training
rewards = [] #Total reward of every episode, kept for graphing
epsilons = [] #Epsilon at the end of every episode (explore/exploit balance)
TEST_Episodes = 0
for e in range(EPISODES):
    #Observations are kept as (1, nS) rows so they can be stored in memory
    #and passed straight to .predict
    state = np.reshape(envCartPole.reset(), [1, nS])
    tot_rewards = 0
    for time in range(200):
        action = dqn.action(state)
        nstate, reward, done, _ = envCartPole.step(action)
        nstate = np.reshape(nstate, [1, nS])
        tot_rewards += reward
        dqn.store(state, action, reward, nstate, done)
        state = nstate
        if not (done or time == 199):
            #Episode still running: once memory holds more than one batch,
            #train on a replayed minibatch, then take the next step
            if len(dqn.memory) > batch_size:
                dqn.experience_replay(batch_size)
            continue
        #done: CartPole fell.
        #time == 199: CartPole stayed upright
        rewards.append(tot_rewards)
        epsilons.append(dqn.epsilon)
        summary = "episode: {}/{}, \t score: {}, \t e: {}".format(
            e, EPISODES, tot_rewards, dqn.epsilon)
        print(summary)
        break
    #Sync the target network after each episode (this could also be done every x steps)
    dqn.update_target_from_model()
    #Disabled early stop: finish training once the last 5 episodes average
    #more than 195 and hand the remaining episodes to the testing cell
    # if len(rewards) > 5 and np.average(rewards[-5:]) > 195:
    #     #Set the rest of the EPISODES for testing
    #     TEST_Episodes = EPISODES - e
    #     TRAIN_END = e
    #     break

In [None]:
# #Testing
# print('Training complete. Testing started...')
# #TEST Time
# #   In this section we ALWAYS use exploit don't train any more
# for e_test in range(TEST_Episodes):
#     state = envCartPole.reset()
#     state = np.reshape(state, [1, nS])
#     tot_rewards = 0
#     for t_test in range(210):
#         action = dqn.test_action(state)
#         nstate, reward, done, _ = envCartPole.step(action)
#         nstate = np.reshape( nstate, [1, nS])
#         tot_rewards += reward
#         #DON'T STORE ANYTHING DURING TESTING
#         state = nstate
#         #done: CartPole fell.
#         #t_test == 209: CartPole stayed upright
#         if done or t_test == 209:
#             rewards.append(tot_rewards)
#             epsilons.append(0) #We are doing full exploit
#             print("episode: {}/{}, \t score: {}, \t e: {}"
#                   .format(e_test, TEST_Episodes, tot_rewards, 0))
#             break;

In [None]:
#Trailing mean over (up to) the last ROLLING_WINDOW episodes, one value per episode.
#np.convolve in its default 'full' mode always divided by the window size, so
#the first 99 points were under-reported, and it appended a decaying tail past
#the last episode. It also raised on an empty rewards list.
ROLLING_WINDOW = 100
rolling_average = np.array([
    np.mean(rewards[max(0, i - ROLLING_WINDOW + 1):i + 1])
    for i in range(len(rewards))
])

plt.figure(figsize=(10, 6))
plt.plot(rewards, label='Rewards')
plt.plot(rolling_average, color='black', label='Average Scores')
#Epsilon is scaled by 200 so it shares the reward axis (1.0 maps to 200)
eps_graph = [200*x for x in epsilons]
plt.plot(eps_graph, color='g', linestyle='-', label='Epsilon (0.001 - 1.0)')
plt.xlim((0, EPISODES))
plt.ylim((0, 220))
plt.legend()
plt.show()
