In [1]:
"""
Created on 20190316

DQN practice

Ref: 
1. https://keon.io/deep-q-learning/
2. https://zhuanlan.zhihu.com/p/26985029
"""

'\nCreated on 20190316\n\nDQN practice\n\nRef: https://keon.io/deep-q-learning/\n'

![title](DQN.png)

# Import

In [1]:
import random
import gym
import numpy as np
from collections import deque
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam

Using TensorFlow backend.


# Build an Agent

In [2]:
class DQNAgent:
    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size
        # 滿了，最老的會被pop掉，補上新的@@
        self.memory = deque(maxlen=2000)  # like a list and shines at 'appendleft' and 'popleft'
        self.gamma = 0.95    # discount rate
        self.epsilon = 1.0  # exploration rate
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.999
        self.learning_rate = 0.001
        self.model = self._build_model()
        
    def _build_model(self):
        # Neural Net for Deep Q Learning
        # Sequential() creates the foundation of the layers.
        model = Sequential()
        # 'Dense' is the basic form of a neural network layer
        # Input Layer of state size(4) and Hidden Layer with 24 nodes
        model.add(Dense(24, input_dim=self.state_size, activation='relu'))
        # Hidden layer with 24 nodes
        model.add(Dense(24, activation='relu'))
        # Output Layer with # of actions: 2 nodes (left, right)
        model.add(Dense(self.action_size, activation='linear'))
        # Create the model based on the information above
        model.compile(loss='mse',
                  optimizer=Adam(lr=self.learning_rate))
        return model
    
    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))
    
    def act(self, state):
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        act_values = self.model.predict(state)
        return np.argmax(act_values[0])  # returns action
    
    def replay(self, batch_size):
        # Sample minibatch from the memory
        minibatch = random.sample(self.memory, batch_size)
        # Extract informations from each memory
        for state, action, reward, next_state, done in minibatch:
            # if done, make our target reward
            target_q = reward
            if not done:
              # predict the future discounted reward
              target_q = reward + self.gamma * \
                       np.amax(self.model.predict(next_state)[0])
            # make the agent to approximately map
            # the current state to future discounted reward
            # We'll call that current_q
            current_q = self.model.predict(state)
            current_q[0][action] = target_q
            # Train the Neural Net with the state and current_q
            self.model.fit(state, current_q, epochs=1, verbose=0)
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    def load(self, name):
        self.model.load_weights(name)

    def save(self, name):
        self.model.save_weights(name)

# Training

In [None]:
# Agent interacts with Env, so the class doesn't need to contain the class of env
env = gym.make("CartPole-v1")
state_size = env.observation_space.shape[0]
action_size = env.action_space.n
agent = DQNAgent(state_size, action_size)
done = False
batch_size = 32
EPISODES = 1000

for e in range(EPISODES):
    state = env.reset()  # initial state
    state = np.expand_dims(state, axis=0)
    for t in range(2000):
        # env.render() 還沒安裝相關pkg，不能秀
        action = agent.act(state)
        next_state, reward, done, _ = env.step(action)
        reward = reward if not done else -10  # done就是輸
        next_state = np.expand_dims(next_state, axis=0)
        agent.remember(state, action, reward, next_state, done)
        state = next_state
        if done:
            print("episode: {}/{}, score: {}, e: {:.2}"
                      .format(e, EPISODES, t, agent.epsilon))
            break
        # start experience replaying after batch_size
        if len(agent.memory) > batch_size:
            agent.replay(batch_size)

episode: 0/1000, score: 20, e: 1.0
episode: 1/1000, score: 20, e: 0.99
episode: 2/1000, score: 11, e: 0.98
episode: 3/1000, score: 20, e: 0.96
episode: 4/1000, score: 30, e: 0.93
episode: 5/1000, score: 25, e: 0.91
episode: 6/1000, score: 23, e: 0.89
episode: 7/1000, score: 11, e: 0.88
episode: 8/1000, score: 27, e: 0.86
episode: 9/1000, score: 10, e: 0.85
episode: 10/1000, score: 35, e: 0.82
episode: 11/1000, score: 33, e: 0.79
episode: 12/1000, score: 21, e: 0.77
episode: 13/1000, score: 19, e: 0.76
episode: 14/1000, score: 11, e: 0.75
episode: 15/1000, score: 13, e: 0.74
episode: 16/1000, score: 23, e: 0.73
episode: 17/1000, score: 38, e: 0.7
episode: 18/1000, score: 36, e: 0.67
episode: 19/1000, score: 15, e: 0.66
episode: 20/1000, score: 31, e: 0.64
episode: 21/1000, score: 33, e: 0.62
episode: 22/1000, score: 26, e: 0.61
episode: 23/1000, score: 83, e: 0.56
episode: 24/1000, score: 32, e: 0.54
episode: 25/1000, score: 109, e: 0.48
episode: 26/1000, score: 141, e: 0.42
episode: 27

episode: 217/1000, score: 94, e: 0.01
episode: 218/1000, score: 102, e: 0.01
episode: 219/1000, score: 88, e: 0.01
episode: 220/1000, score: 107, e: 0.01
episode: 221/1000, score: 112, e: 0.01
episode: 222/1000, score: 153, e: 0.01
episode: 223/1000, score: 8, e: 0.01
episode: 224/1000, score: 9, e: 0.01
episode: 225/1000, score: 10, e: 0.01
episode: 226/1000, score: 17, e: 0.01
episode: 227/1000, score: 9, e: 0.01
episode: 228/1000, score: 8, e: 0.01
episode: 229/1000, score: 47, e: 0.01
episode: 230/1000, score: 123, e: 0.01
episode: 231/1000, score: 120, e: 0.01
episode: 232/1000, score: 111, e: 0.01
episode: 233/1000, score: 85, e: 0.01
episode: 234/1000, score: 62, e: 0.01
episode: 235/1000, score: 114, e: 0.01
episode: 236/1000, score: 109, e: 0.01
episode: 237/1000, score: 25, e: 0.01
episode: 238/1000, score: 107, e: 0.01
episode: 239/1000, score: 100, e: 0.01
episode: 240/1000, score: 46, e: 0.01
episode: 241/1000, score: 131, e: 0.01
episode: 242/1000, score: 46, e: 0.01
epis

In [None]:
env.render()

# Environment

In [7]:
# initialize gym environment and the agent
env = gym.make('CartPole-v0')
state_size = env.observation_space.shape[0]
action_size = env.action_space.n
print(state_size)
print(action_size)
print(env.reward_range)

(4,)
2
(-inf, inf)


In [94]:
# env.reset()
next_state, reward, done, _ = env.step(1)
print(next_state, reward, done)
print(env.step(0))

[ 0.16507484  0.28030365 -0.39574315 -1.69611162] 0.0 True
(array([ 0.17068091,  0.09125663, -0.42966538, -1.5477934 ]), 0.0, True, {})
