In [1]:
# OpenGym CartPole-v0
# -------------------
# This code demonstrates the use of a full DQN implementation
# to solve the OpenAI Gym CartPole-v0 problem.
#
# Made as part of blog series Let's make a DQN, available at: 
# https://jaromiru.com/2016/09/27/lets-make-a-dqn-theory/
# 
# author: Jaromir Janisch, 2016

import random, numpy, math, gym, sys
from keras import backend as K
import tensorflow as tf  

#----------
# Absolute-error threshold at which huber_loss switches from its quadratic
# branch to its linear branch.
HUBER_LOSS_DELTA = 1.0
# Learning rate handed to the RMSprop optimizer in Brain._createModel.
LEARNING_RATE = 0.00025

#----------
def huber_loss(y_true, y_pred):
    """Mean Huber loss between targets and predictions.

    Element-wise, the loss is ``0.5 * err**2`` where ``|err|`` is below
    ``HUBER_LOSS_DELTA`` and ``HUBER_LOSS_DELTA * (|err| - 0.5 * HUBER_LOSS_DELTA)``
    elsewhere; the mean over all elements is returned.

    Args:
        y_true: tensor of target values.
        y_pred: tensor of predicted values, combined element-wise with y_true.

    Returns:
        The result of ``K.mean`` over the element-wise loss.
    """
    residual = y_true - y_pred
    magnitude = K.abs(residual)

    quadratic = 0.5 * K.square(residual)
    linear = HUBER_LOSS_DELTA * (magnitude - 0.5 * HUBER_LOSS_DELTA)

    # tf.where is called directly because the Keras backend does not expose
    # an equivalent element-wise select.
    per_element = tf.where(magnitude < HUBER_LOSS_DELTA, quadratic, linear)
    return K.mean(per_element)

Using TensorFlow backend.


STATE_COUNT = 4: 
* $x$ - Position
* $\dot{x}$ - Velocity
* $\theta$ - Pole Angle
* $\dot{\theta}$ - Ang. Velocity

ACTION_COUNT = 2: 
* `LEFT` 
* `RIGHT`

**Brain**
---

$\\\$

The **`Brain`** class encapsulates the neural network. Our problem is simple enough so we will use only one hidden layer of 64 neurons, with ReLU activation function. The final layer will consist of only two neurons, one for each available action. Their activation function will be linear. Remember that we are trying to approximate the Q function, which in essence can be of any real value. Therefore we can’t restrict the output from the network and the linear activation works well.

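Instead of simple gradient descent, we will use a more sophisticated algorithm, RMSprop. As the loss function we use the Huber loss (`huber_loss`, defined in the first cell) rather than plain Mean Squared Error (mse): it is quadratic for small errors and linear for large ones.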

In [2]:
#-------------------- BRAIN ---------------------------
from keras.models import Sequential
from keras.layers import *
from keras.optimizers import *

class Brain:
    """Pair of identically shaped Keras networks approximating the Q function.

    ``model`` is the network that ``train`` fits; ``model_`` is a second
    network with the same architecture whose weights only change when
    ``updateTargetModel`` copies them over from ``model``.

    Args:
        stateCnt: number of inputs of the network (size of one state vector).
        actionCnt: number of outputs of the network (one Q-value per action).
    """

    def __init__(self, stateCnt, actionCnt):
        self.stateCnt = stateCnt
        self.actionCnt = actionCnt

        self.model = self._createModel()
        self.model_ = self._createModel()

    def _createModel(self):
        """Build and compile one network: 64 ReLU units -> actionCnt linear units."""
        model = Sequential()

        # Use the dimensions given to this instance rather than module-level
        # globals, so the class works before (and independently of) any
        # script-level stateCnt/actionCnt variables being defined.
        model.add(Dense(units=64, activation='relu', input_dim=self.stateCnt))
        model.add(Dense(units=self.actionCnt, activation='linear'))

        opt = RMSprop(lr=LEARNING_RATE)
        model.compile(loss=huber_loss, optimizer=opt)

        return model

    def train(self, x, y, epochs=1, verbose=0):
        """Fit ``model`` on inputs ``x`` and targets ``y`` with a batch size of 64."""
        self.model.fit(x, y, batch_size=64, epochs=epochs, verbose=verbose)

    def predict(self, s, target=False):
        """Return predictions for the batch ``s``.

        Uses ``model_`` when ``target`` is true, otherwise ``model``.
        """
        if target:
            return self.model_.predict(s)
        else:
            return self.model.predict(s)

    def predictOne(self, s, target=False):
        """Predict for a single state ``s`` and return a flat array of outputs."""
        return self.predict(s.reshape(1, self.stateCnt), target=target).flatten()

    def updateTargetModel(self):
        """Copy the current weights of ``model`` into ``model_``."""
        self.model_.set_weights(self.model.get_weights())

**Memory**
---

$\\\$

The purpose of the **`Memory`** class is to store experience (i.e., different states, actions and rewards). Somewhat superfluous for the current problem, but we will implement it anyway. It is a good abstraction for the experience replay part and will allow us to easily upgrade it to more sophisticated algorithms later on.

The add(sample) method stores the experience into the internal array, making sure that it does not exceed its capacity. The other method sample(n) returns n random samples from the memory.

In [3]:
#-------------------- MEMORY --------------------------
class Memory:   # stored as ( s, a, r, s_ )
    """Bounded FIFO buffer of experience samples.

    Args:
        capacity: maximum number of samples retained; once exceeded, the
            oldest sample is dropped.

    Attributes:
        samples: list of stored samples, oldest first.
    """

    def __init__(self, capacity):
        self.capacity = capacity
        # Per-instance list: a class-level list would be shared by every
        # Memory object, so separate buffers would silently alias each other.
        self.samples = []

    def add(self, sample):
        """Append ``sample`` and evict the oldest entry if over capacity."""
        self.samples.append(sample)

        if len(self.samples) > self.capacity:
            self.samples.pop(0)

    def sample(self, n):
        """Return up to ``n`` distinct stored samples chosen at random."""
        n = min(n, len(self.samples))
        return random.sample(self.samples, n)

    def isFull(self):
        """Return True once the number of stored samples reaches ``capacity``."""
        return len(self.samples) >= self.capacity

**Agent**
---

$\\\$

Finally, the **`Agent`** class acts as a container for the agent-related properties and methods. In other words, it uses the **`Brain`** and **`Memory`** to replay past experience and to choose the actions that maximize the reward. 

The act(s) method implements the ε-greedy policy. With probability epsilon, it chooses a random action, otherwise it selects the best action the current ANN returns. We decrease the epsilon parameter with time, according to a formula:

$$\varepsilon = \varepsilon_{min} + (\varepsilon_{max} - \varepsilon_{min}) e^{-\lambda t}$$

The λ parameter controls the speed of decay. This way we start with a policy that explores greatly and behaves more and more greedily over time.

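The observe(sample) method adds a sample to the agent’s memory. In its simplest form it looks like this (the full implementation below additionally syncs the target network every `UPDATE_TARGET_FREQUENCY` steps, prints a debug Q-value and decays ε):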
```python
def observe(self, sample):  # in (s, a, r, s_) format
    self.memory.add(sample)   
```
The last method, replay(), is the most complicated part. Let’s recall what the update formula looks like:

$$Q(s, a) \xrightarrow{} r + \gamma \max_a Q(s', a)$$

This formula means that for a sample (s, a, r, s’) we will update the network’s weights so that its output is closer to the target. But when we recall our network architecture, we see that it has multiple outputs, one for each action.

In [4]:
#-------------------- AGENT ---------------------------
MEMORY_CAPACITY = 100000    # capacity handed to Memory()
BATCH_SIZE = 64             # number of samples requested per replay() call

GAMMA = 0.99                # discount applied to the next state's best Q-value

MAX_EPSILON = 1             # epsilon before any step has been observed
MIN_EPSILON = 0.01          # value epsilon decays towards
LAMBDA = 0.001      # speed of decay

UPDATE_TARGET_FREQUENCY = 1000  # observe() calls between target-network syncs

class Agent:
    """Epsilon-greedy DQN agent built from a Brain and a Memory.

    Args:
        stateCnt: length of one state vector.
        actionCnt: number of discrete actions.

    Attributes:
        steps: number of samples observed so far.
        epsilon: current probability of picking a random action in act().
        brain: the Brain holding the online and target networks.
        memory: the Memory holding (s, a, r, s_) samples.
    """

    # Class-level defaults; observe() rebinds both on the instance.
    steps = 0
    epsilon = MAX_EPSILON

    def __init__(self, stateCnt, actionCnt):
        self.stateCnt = stateCnt
        self.actionCnt = actionCnt

        self.brain = Brain(stateCnt, actionCnt)
        self.memory = Memory(MEMORY_CAPACITY)

    def act(self, s):
        """Pick an action for state ``s``: random with probability epsilon, else greedy."""
        if random.random() < self.epsilon:
            return random.randint(0, self.actionCnt-1)
        else:
            return numpy.argmax(self.brain.predictOne(s))

    def observe(self, sample):  # in (s, a, r, s_) format
        """Store ``sample``, periodically sync the target network, and decay epsilon."""
        self.memory.add(sample)

        if self.steps % UPDATE_TARGET_FREQUENCY == 0:
            self.brain.updateTargetModel()

        # debug the Q function in point S
        if self.steps % 100 == 0:
            S = numpy.array([-0.01335408, -0.04600273, -0.00677248, 0.01517507])
            # Query this agent's own brain, not a module-level `agent` variable.
            pred = self.brain.predictOne(S)
            print(pred[0])
            sys.stdout.flush()

        # slowly decrease epsilon based on our experience
        self.steps += 1
        self.epsilon = MIN_EPSILON + (MAX_EPSILON - MIN_EPSILON) * math.exp(-LAMBDA * self.steps)

    def replay(self):
        """Train the brain on one random batch drawn from memory.

        For each sample the target vector is the online network's current
        prediction, with the entry of the taken action replaced by ``r`` for
        terminal samples (``s_ is None``) or by
        ``r + GAMMA * max(target-network prediction for s_)`` otherwise.
        """
        batch = self.memory.sample(BATCH_SIZE)
        batchLen = len(batch)
        if batchLen == 0:
            # Nothing stored yet, so there is nothing to train on.
            return

        # Placeholder fed to the target network for terminal samples; its
        # prediction is never used because those samples take the `r` branch.
        no_state = numpy.zeros(self.stateCnt)

        states = numpy.array([ o[0] for o in batch ])
        states_ = numpy.array([ (no_state if o[3] is None else o[3]) for o in batch ])

        p = self.brain.predict(states)
        p_ = self.brain.predict(states_, target=True)

        x = numpy.zeros((batchLen, self.stateCnt))
        y = numpy.zeros((batchLen, self.actionCnt))

        for i, (s, a, r, s_) in enumerate(batch):
            t = p[i]
            if s_ is None:
                t[a] = r
            else:
                t[a] = r + GAMMA * numpy.amax(p_[i])

            x[i] = s
            y[i] = t

        self.brain.train(x, y)

In [5]:
class RandomAgent:
    """Agent that acts uniformly at random and only records what it sees.

    Args:
        actionCnt: number of discrete actions to choose from.

    Attributes:
        memory: Memory instance that observe() appends samples to.
    """

    def __init__(self, actionCnt):
        self.actionCnt = actionCnt
        # Created per instance so that separate RandomAgent objects do not
        # share one buffer and the class can be defined before Memory is used.
        self.memory = Memory(MEMORY_CAPACITY)

    def act(self, s):
        """Return a random action index in [0, actionCnt - 1]; ``s`` is ignored."""
        return random.randint(0, self.actionCnt-1)

    def observe(self, sample):  # in (s, a, r, s_) format
        """Store ``sample`` in memory."""
        self.memory.add(sample)

    def replay(self):
        """No-op: the random agent does not learn."""
        pass

The **`Environment`** class is our wrapper for OpenAI Gym. Its only method **`run()`** handles one episode of the problem.

In [6]:
#-------------------- ENVIRONMENT ---------------------
class Environment:
    """Thin wrapper around a Gym environment that runs single episodes.

    Args:
        problem: environment id passed to ``gym.make``.
    """

    def __init__(self, problem):
        self.problem = problem
        self.env = gym.make(problem)

    def run(self, agent):
        """Play one episode with ``agent`` and return the total reward.

        On every step the agent is asked for an action via ``act``, is shown
        the transition via ``observe((s, a, r, s_))`` and then gets a
        ``replay()`` call. When the environment reports ``done``, the next
        state in the observed sample is replaced by ``None``.

        The code relies on ``env.reset()`` returning the initial state and
        ``env.step(a)`` returning a 4-tuple ``(s_, r, done, info)``.

        Returns:
            The sum of the rewards received during the episode.
        """
        s = self.env.reset()
        R = 0

        while True:
            # self.env.render()
            a = agent.act(s)
            s_, r, done, info = self.env.step(a)
            if done: # terminal state
                s_ = None
            agent.observe( (s, a, r, s_) )
            agent.replay()
            s = s_
            R += r

            if done:
                break

        # Hand the episode return back instead of discarding it; callers that
        # ignore the result are unaffected.
        return R

In [None]:
#-------------------- MAIN ----------------------------
PROBLEM = 'CartPole-v0'
env = Environment(PROBLEM)

# Network dimensions are read from the environment's spaces.
stateCnt  = env.env.observation_space.shape[0]
actionCnt = env.env.action_space.n

agent = Agent(stateCnt, actionCnt)
randomAgent = RandomAgent(actionCnt)

try:
    # Pre-fill the replay memory with experience gathered by random play.
    while not randomAgent.memory.isFull():
        env.run(randomAgent)

    # Hand the collected samples to the learning agent and drop the random one.
    agent.memory.samples = randomAgent.memory.samples
    randomAgent = None

    # Train until the cell is interrupted; there is no other stop condition.
    while True:
        env.run(agent)
finally:
    # Runs on interruption or error as well, so the latest weights are kept.
    agent.brain.model.save("cartpole-dqn.h5")

[2017-09-19 05:57:03,494] Making new env: CartPole-v0


0.00289699
0.146952
0.320846
0.520238
0.72994
0.89885
0.990433
1.01279
1.00741
1.00054
1.00223
1.36171
1.70675
1.93112
1.99481
2.00397
2.02538
2.0143
2.01188
2.02313
2.02382
2.51325
2.91009
2.99918
3.02664
3.03962
3.05555
3.04044
3.07461
3.06479
3.04982
3.63391
3.95845
4.00072
4.02021
4.05129
4.04684
4.0516
4.04942
4.06363
4.05013
4.70412
4.98548
5.02828
5.04538
5.04682
5.06287
5.0628
5.04554
5.05778
5.06179
5.77562
5.98318
6.03525
6.03726
6.04973
6.05637
6.04604
6.03134
6.05525
6.02647
6.77821
6.96946
7.00719
7.00345
7.0069
7.00778
7.00314
7.00316
7.01888
7.02535
7.79735
7.94143
7.97455
7.97261
7.9644
7.98356
7.95961
7.98259
7.96198
7.97932
8.78109
8.9306
8.8924
8.9033
8.9432
8.917
8.92931
8.93102
8.91284
8.92731
9.76465
9.85597
9.86756
9.84658
9.87691
9.8781
9.88361
9.89099
9.86533
9.8827
10.7261
10.8095
10.8432
10.81
10.8652
10.818
10.8247
10.8239
10.8058
10.8072
11.6604
11.7147
11.7277
11.7351
11.7086
11.7243
11.7311
11.7137
11.728
11.7298
12.5826
12.6216
12.634
12.6473
12.6338
12.