In [None]:
import sys
import gym
import pylab
import random
import numpy as np
from collections import deque
from keras.layers import Dense, LSTM
from keras.optimizers import Adam
from keras.models import Sequential

# Upper bound on the number of training episodes run by the main loop.
EPISODES = 500


# DRQN Agent for the Cartpole
# it uses Neural Network to approximate q function
# and replay memory & target q network
class DRQNAgent:
    """Recurrent Q-learning agent with replay memory and a target network.

    The Q function is approximated by an LSTM followed by a Dense layer.
    A second network with the same architecture (the target model) supplies
    the bootstrap values and is synchronised on demand via
    ``update_target_model``.
    """

    def __init__(self, state_size, action_size):
        """Set hyper parameters, the replay memory and both networks.

        Args:
            state_size: number of time steps fed to the LSTM (first axis of
                a single input sample).
            action_size: number of discrete actions (network outputs).
        """
        # if you want to see Cartpole learning, then change to True
        self.render = False
        self.load_model = False

        # get size of state and action
        self.state_size = state_size
        self.action_size = action_size

        # These are hyper parameters for the DRQN
        self.discount_factor = 0.99
        self.learning_rate = 0.001
        self.epsilon = 1.0
        self.epsilon_decay = 0.999
        self.epsilon_min = 0.01
        self.batch_size = 64
        self.train_start = 1000

        # create replay memory using deque
        self.memory = deque(maxlen=2000)

        # create main model and target model
        self.model = self.build_model()
        self.target_model = self.build_model()

        # initialize target model
        self.update_target_model()

        if self.load_model:
            self.model.load_weights("./save_model/cartpole_drqn.h5")

    def build_model(self):
        """Build and compile the Q network.

        The network is an LSTM with 32 units over inputs of shape
        ``(state_size, 2)`` followed by a Dense layer with one output per
        action. It is compiled with MSE loss and the Adam optimizer.

        Returns:
            The compiled Keras ``Sequential`` model.
        """
        model = Sequential()
        model.add(LSTM(32, input_shape=(self.state_size, 2), kernel_initializer='orthogonal', recurrent_initializer='zeros'))
        model.add(Dense(self.action_size))
        model.compile(loss='mse', optimizer=Adam(lr=self.learning_rate))
        return model

    def update_target_model(self):
        """Copy the weights of the main model into the target model."""
        self.target_model.set_weights(self.model.get_weights())

    def get_action(self, state):
        """Choose an action with an epsilon-greedy policy.

        Args:
            state: a batch containing one sample, as accepted by
                ``self.model.predict``.

        Returns:
            A random action index with probability ``epsilon``, otherwise
            the index of the largest predicted Q value.
        """
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        q_value = self.model.predict(state)
        return np.argmax(q_value[0])

    def append_sample(self, state, action, reward, next_state, done):
        """Store the transition <s, a, r, s', done> and decay epsilon.

        Epsilon is multiplied by ``epsilon_decay`` on every call until it
        reaches ``epsilon_min``; it is clamped so it never drops below that
        floor.
        """
        self.memory.append((state, action, reward, next_state, done))
        if self.epsilon > self.epsilon_min:
            self.epsilon = max(self.epsilon_min,
                               self.epsilon * self.epsilon_decay)

    def train_model(self):
        """Fit the main model on one random mini batch from replay memory.

        Does nothing until the memory holds at least ``train_start``
        samples. The batch is capped at the number of stored samples, and
        that same effective size is used for every subsequent step.
        """
        if len(self.memory) < self.train_start:
            return
        batch_size = min(self.batch_size, len(self.memory))
        mini_batch = random.sample(self.memory, batch_size)

        update_input = np.zeros((batch_size, self.state_size, 2))
        update_target = np.zeros((batch_size, self.state_size, 2))
        action, reward, done = [], [], []

        for i, sample in enumerate(mini_batch):
            update_input[i] = sample[0]
            action.append(sample[1])
            reward.append(sample[2])
            update_target[i] = sample[3]
            done.append(sample[4])

        target = self.model.predict(update_input)
        target_val = self.target_model.predict(update_target)

        # iterate over the effective batch size, not self.batch_size, so a
        # memory smaller than self.batch_size cannot index out of range
        for i in range(batch_size):
            # Q Learning: get maximum Q value at s' from target model
            if done[i]:
                target[i][action[i]] = reward[i]
            else:
                target[i][action[i]] = reward[i] + self.discount_factor * (
                    np.amax(target_val[i]))

        # and do the model fit!
        self.model.fit(update_input, target, batch_size=batch_size,
                       epochs=1, verbose=0)


if __name__ == "__main__":
    # Training script: runs up to EPISODES CartPole episodes with a
    # DRQNAgent, plotting the score after each one and saving weights
    # periodically. Training stops early once the mean score of the last
    # 10 episodes exceeds 490.
    import os

    # In case of CartPole-v1, maximum length of episode is 500
    env = gym.make('CartPole-v1')

    # Total number of states to use
    number_of_states = 8

    # get size of state and action from environment
    state_size = env.observation_space.shape[0]
    expanded_state_size = state_size * number_of_states
    action_size = env.action_space.n

    agent = DRQNAgent(expanded_state_size, action_size)

    # savefig / save_weights fail if their target directories are missing
    os.makedirs("./save_graph", exist_ok=True)
    os.makedirs("./save_model", exist_ok=True)

    # each observation component h owns number_of_states consecutive slots;
    # the last slot of each group holds the most recent value
    newest_slots = (np.arange(state_size) + 1) * number_of_states - 1

    scores, episodes = [], []
    solved = False

    for e in range(EPISODES):
        done = False
        score = 0
        state = env.reset()

        # expand the state with past states and initialize
        expanded_state = np.zeros(expanded_state_size)
        expanded_state[newest_slots] = state
        # start the next-state window from the current one shifted left by
        # one, so the first transition keeps the reset observation as history
        expanded_next_state = np.roll(expanded_state, -1)

        # reshape states for LSTM input without embedding layer:
        # shape (1, expanded_state_size, 2), the value duplicated on the last axis
        reshaped_state = np.repeat(
            expanded_state[np.newaxis, :, np.newaxis], 2, axis=2)

        while not done:
            if agent.render:
                env.render()

            # get action for the current state and go one step in environment
            action = agent.get_action(reshaped_state)
            next_state, reward, done, info = env.step(action)

            # update the expanded next state with next state values
            expanded_next_state[newest_slots] = next_state

            # reshape expanded next state for LSTM input without embedding layer
            reshaped_next_state = np.repeat(
                expanded_next_state[np.newaxis, :, np.newaxis], 2, axis=2)

            # if an action make the episode end, then gives penalty of -100
            reward = reward if not done or score == 499 else -100

            # save the sample <s, a, r, s'> to the replay memory
            agent.append_sample(reshaped_state, action, reward, reshaped_next_state, done)

            # every time step do the training
            agent.train_model()
            score += reward
            reshaped_state = reshaped_next_state

            # Shifting past state elements to the left by one; the slots that
            # receive wrapped/neighbouring values are the newest slots, which
            # are overwritten on the next step
            expanded_next_state = np.roll(expanded_next_state, -1)

            if done:
                # every episode update the target model to be same with model
                agent.update_target_model()

                # every episode, plot the play time
                score = score if score == 500 else score + 100
                scores.append(score)
                episodes.append(e)
                pylab.plot(episodes, scores, 'b')
                pylab.savefig("./save_graph/cartpole_drqn.png")
                print("episode:", e, "  score:", score, "  memory length:",
                      len(agent.memory), "  epsilon:", agent.epsilon)

                # if the mean of scores of last 10 episode is bigger than 490
                # stop training; checked only once an episode has finished so
                # scores is never empty here
                if np.mean(scores[-10:]) > 490:
                    solved = True

        # save the model periodically and when training is finished early
        if e % 50 == 0 or solved:
            agent.model.save_weights("./save_model/cartpole_drqn.h5")

        # leave the episode loop as well (exits cleanly in a Jupyter notebook)
        if solved:
            break

    env.close()

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


episode: 0   score: 19.0   memory length: 20   epsilon: 0.9801888648295347
episode: 1   score: 27.0   memory length: 48   epsilon: 0.9531108968798944
episode: 2   score: 34.0   memory length: 83   epsilon: 0.9203129279589385
episode: 3   score: 21.0   memory length: 105   epsilon: 0.9002772252562138
episode: 4   score: 21.0   memory length: 127   epsilon: 0.8806777104745716
episode: 5   score: 13.0   memory length: 141   epsilon: 0.8684280445126921
episode: 6   score: 12.0   memory length: 154   epsilon: 0.857205969570888
episode: 7   score: 19.0   memory length: 174   epsilon: 0.8402237462387894
episode: 8   score: 12.0   memory length: 187   epsilon: 0.8293661352855802
episode: 9   score: 13.0   memory length: 201   epsilon: 0.8178301806491574
episode: 10   score: 9.0   memory length: 211   epsilon: 0.8096885832327116
episode: 11   score: 8.0   memory length: 220   epsilon: 0.8024304668606914
episode: 12   score: 11.0   memory length: 232   epsilon: 0.7928540855310416
episode: 13   s

episode: 107   score: 11.0   memory length: 1539   epsilon: 0.21443041188095732
episode: 108   score: 13.0   memory length: 1553   epsilon: 0.21144782144365132
episode: 109   score: 10.0   memory length: 1564   epsilon: 0.20913349021874023
episode: 110   score: 8.0   memory length: 1573   epsilon: 0.20725880007153075
episode: 111   score: 9.0   memory length: 1583   epsilon: 0.20519551388923485
episode: 112   score: 10.0   memory length: 1594   epsilon: 0.2029496152000772
episode: 113   score: 9.0   memory length: 1604   epsilon: 0.20092922746937492
episode: 114   score: 13.0   memory length: 1618   epsilon: 0.1981344299069931
episode: 115   score: 9.0   memory length: 1628   epsilon: 0.19616197792269574
episode: 116   score: 8.0   memory length: 1637   epsilon: 0.19440356549968224
episode: 117   score: 9.0   memory length: 1647   epsilon: 0.19246825471748086
episode: 118   score: 8.0   memory length: 1656   epsilon: 0.19074295313908668
episode: 119   score: 7.0   memory length: 1664  

episode: 211   score: 9.0   memory length: 2000   epsilon: 0.07098243109022996
episode: 212   score: 8.0   memory length: 2000   epsilon: 0.07034613862434776
episode: 213   score: 8.0   memory length: 2000   epsilon: 0.06971554993749822
episode: 214   score: 9.0   memory length: 2000   epsilon: 0.06902152328662714
episode: 215   score: 9.0   memory length: 2000   epsilon: 0.06833440575420312
episode: 216   score: 8.0   memory length: 2000   epsilon: 0.06772185040953389
episode: 217   score: 8.0   memory length: 2000   epsilon: 0.06711478606235186
episode: 218   score: 10.0   memory length: 2000   epsilon: 0.06638020367707664
episode: 219   score: 19.0   memory length: 2000   epsilon: 0.06506513648938707
episode: 220   score: 8.0   memory length: 2000   epsilon: 0.06448188714861476
episode: 221   score: 10.0   memory length: 2000   epsilon: 0.06377612227551108
episode: 222   score: 9.0   memory length: 2000   epsilon: 0.06314122333850061
episode: 223   score: 9.0   memory length: 2000  

episode: 315   score: 9.0   memory length: 2000   epsilon: 0.02435890620074816
episode: 316   score: 8.0   memory length: 2000   epsilon: 0.024140550922482684
episode: 317   score: 8.0   memory length: 2000   epsilon: 0.023924152999245944
episode: 318   score: 8.0   memory length: 2000   epsilon: 0.02370969488514328
episode: 319   score: 8.0   memory length: 2000   epsilon: 0.02349715919156292
episode: 320   score: 7.0   memory length: 2000   epsilon: 0.02330983852429035
episode: 321   score: 15.0   memory length: 2000   epsilon: 0.02293966527733732
episode: 322   score: 15.0   memory length: 2000   epsilon: 0.022575370588170832
episode: 323   score: 11.0   memory length: 2000   epsilon: 0.022305951160147018
episode: 324   score: 9.0   memory length: 2000   epsilon: 0.022083892744312248
episode: 325   score: 9.0   memory length: 2000   epsilon: 0.021864044946607545
episode: 326   score: 8.0   memory length: 2000   epsilon: 0.021668053813878495
episode: 327   score: 8.0   memory length:

episode: 418   score: 8.0   memory length: 2000   epsilon: 0.009998671593271896
episode: 419   score: 9.0   memory length: 2000   epsilon: 0.009998671593271896
episode: 420   score: 10.0   memory length: 2000   epsilon: 0.009998671593271896
episode: 421   score: 8.0   memory length: 2000   epsilon: 0.009998671593271896
episode: 422   score: 8.0   memory length: 2000   epsilon: 0.009998671593271896
episode: 423   score: 8.0   memory length: 2000   epsilon: 0.009998671593271896
episode: 424   score: 9.0   memory length: 2000   epsilon: 0.009998671593271896
episode: 425   score: 8.0   memory length: 2000   epsilon: 0.009998671593271896
episode: 426   score: 8.0   memory length: 2000   epsilon: 0.009998671593271896
episode: 427   score: 8.0   memory length: 2000   epsilon: 0.009998671593271896
episode: 428   score: 7.0   memory length: 2000   epsilon: 0.009998671593271896
episode: 429   score: 8.0   memory length: 2000   epsilon: 0.009998671593271896
episode: 430   score: 9.0   memory leng

episode: 521   score: 8.0   memory length: 2000   epsilon: 0.009998671593271896
episode: 522   score: 8.0   memory length: 2000   epsilon: 0.009998671593271896
episode: 523   score: 10.0   memory length: 2000   epsilon: 0.009998671593271896
