In [1]:
import numpy as np
import gym

In [2]:
from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten
from tensorflow.keras.optimizers import Adam

from rl.agents.dqn import DQNAgent
from rl.policy import EpsGreedyQPolicy
from rl.memory import SequentialMemory

We use the keras-rl2 library to build and train the DQN agent, and we load the CartPole environment from OpenAI Gym.

In [3]:
# Gym registry id of the task; kept in one place so the environment is easy to swap.
ENV_NAME = 'CartPole-v0'

env = gym.make(ENV_NAME)
# Size of the discrete action space, i.e. how many actions are available in
# each state. For CartPole this is 2 (Left, Right).
actions = env.action_space.n

In [4]:
# Q-network: takes one observation and produces one value per available action.
n_observations = env.observation_space.shape[0]  # 4 for CartPole (pos, vel, angle, rotation)

model = Sequential()
# The leading 1 in the input shape is the observation window; it lines up with
# window_length=1 on the replay memory. Flatten collapses (1, n_observations)
# into a flat vector for the dense layers.
model.add(Flatten(input_shape=(1, n_observations)))
model.add(Dense(16, activation='relu'))
model.add(Dense(16, activation='relu'))
model.add(Dense(16, activation='relu'))
model.add(Dense(actions))
model.add(Activation('linear'))  # linear head: outputs are unbounded real values
# summary() prints the table itself and returns None, so wrapping it in print()
# would append a stray "None" line to the cell output.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 flatten (Flatten)           (None, 4)                 0         
                                                                 
 dense (Dense)               (None, 16)                80        
                                                                 
 dense_1 (Dense)             (None, 16)                272       
                                                                 
 dense_2 (Dense)             (None, 16)                272       
                                                                 
 dense_3 (Dense)             (None, 2)                 34        
                                                                 
 activation (Activation)     (None, 2)                 0         
                                                                 
Total params: 658
Trainable params: 658
Non-trainable pa

Our model is simple: an input layer, three hidden dense layers, and an output layer. The input layer takes the state as input, which consists of (Cart position, Cart velocity, Angle of pole, Rate of rotation of pole). The model then outputs one Q-value (the estimated future reward) for each of the two actions, Left and Right; the agent's policy uses these values to choose which way to push the cart.
Three dense layers provide enough capacity for this problem. 

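We now define our policy, the Epsilon Greedy Q Policy, which takes a random action with probability epsilon (to keep exploring) and otherwise picks the action with the highest estimated Q-value. 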
The Sequential Memory helps us store past experiences of our agent so that it may learn from them and use them in subsequent episodes. 
We then create our agent, compile it, and fit the DQN for 50,000 training steps (not episodes; each episode lasts at most 200 steps). 

In [5]:
# Hyperparameters, gathered here so they can be tuned without hunting through the calls.
MEMORY_LIMIT = 50000         # maximum number of stored transitions in the replay memory
WINDOW_LENGTH = 1            # observations per state; matches the leading 1 in the model's input_shape
WARMUP_STEPS = 50            # passed as nb_steps_warmup (the log reports no loss for the first 50 steps)
TARGET_MODEL_UPDATE = 0.01   # passed as target_model_update
LEARNING_RATE = 0.001
TRAINING_STEPS = 50000       # total environment steps (not episodes) to train for

policy = EpsGreedyQPolicy()
# Data structure that stores the agent's past experiences.
memory = SequentialMemory(limit=MEMORY_LIMIT, window_length=WINDOW_LENGTH)

dqn = DQNAgent(
    model=model,
    nb_actions=actions,
    memory=memory,
    nb_steps_warmup=WARMUP_STEPS,
    target_model_update=TARGET_MODEL_UPDATE,
    policy=policy,
)
dqn.compile(Adam(learning_rate=LEARNING_RATE), metrics=['mae'])

# Keep the returned History instead of letting the cell echo its repr: the
# object stays available for later inspection or plotting of the training run.
history = dqn.fit(env, nb_steps=TRAINING_STEPS, visualize=False, verbose=2)

Training for 50000 steps ...
    11/50000: episode: 1, duration: 0.056s, episode steps:  11, steps per second: 197, episode reward: 11.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.000 [0.000, 0.000],  loss: --, mae: --, mean_q: --
    21/50000: episode: 2, duration: 0.006s, episode steps:  10, steps per second: 1756, episode reward: 10.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.100 [0.000, 1.000],  loss: --, mae: --, mean_q: --
    31/50000: episode: 3, duration: 0.007s, episode steps:  10, steps per second: 1382, episode reward: 10.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.000 [0.000, 0.000],  loss: --, mae: --, mean_q: --
    40/50000: episode: 4, duration: 0.006s, episode steps:   9, steps per second: 1502, episode reward:  9.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.000 [0.000, 0.000],  loss: --, mae: --, mean_q: --
    50/50000: episode: 5, duration: 0.006s, episode steps:  10, steps per second: 1665, episode reward: 10.0

  updates=self.state_updates,
  updates=self.state_updates,


    59/50000: episode: 6, duration: 0.496s, episode steps:   9, steps per second:  18, episode reward:  9.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.000 [0.000, 0.000],  loss: 0.490008, mae: 0.537189, mean_q: 0.336151
    68/50000: episode: 7, duration: 0.058s, episode steps:   9, steps per second: 156, episode reward:  9.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.000 [0.000, 0.000],  loss: 0.383163, mae: 0.494420, mean_q: 0.471404
    79/50000: episode: 8, duration: 0.068s, episode steps:  11, steps per second: 163, episode reward: 11.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.000 [0.000, 0.000],  loss: 0.279492, mae: 0.470645, mean_q: 0.655427
    89/50000: episode: 9, duration: 0.061s, episode steps:  10, steps per second: 164, episode reward: 10.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.000 [0.000, 0.000],  loss: 0.199859, mae: 0.451024, mean_q: 0.861373
    98/50000: episode: 10, duration: 0.058s, episode steps:   9, ste

   409/50000: episode: 42, duration: 0.057s, episode steps:  10, steps per second: 177, episode reward: 10.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.300 [0.000, 1.000],  loss: 0.061579, mae: 1.579082, mean_q: 3.081443
   419/50000: episode: 43, duration: 0.058s, episode steps:  10, steps per second: 171, episode reward: 10.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.300 [0.000, 1.000],  loss: 0.070510, mae: 1.588602, mean_q: 3.014932
   430/50000: episode: 44, duration: 0.062s, episode steps:  11, steps per second: 178, episode reward: 11.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.273 [0.000, 1.000],  loss: 0.063069, mae: 1.619445, mean_q: 3.127813
   439/50000: episode: 45, duration: 0.051s, episode steps:   9, steps per second: 178, episode reward:  9.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.222 [0.000, 1.000],  loss: 0.055716, mae: 1.644322, mean_q: 3.196111
   450/50000: episode: 46, duration: 0.063s, episode steps:  11,

  2663/50000: episode: 77, duration: 1.027s, episode steps: 200, steps per second: 195, episode reward: 200.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.485 [0.000, 1.000],  loss: 1.491327, mae: 9.556177, mean_q: 19.047083
  2863/50000: episode: 78, duration: 1.031s, episode steps: 200, steps per second: 194, episode reward: 200.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.495 [0.000, 1.000],  loss: 1.174830, mae: 10.299348, mean_q: 20.609648
  3063/50000: episode: 79, duration: 1.028s, episode steps: 200, steps per second: 194, episode reward: 200.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.490 [0.000, 1.000],  loss: 1.320414, mae: 11.044594, mean_q: 22.161993
  3263/50000: episode: 80, duration: 1.029s, episode steps: 200, steps per second: 194, episode reward: 200.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.490 [0.000, 1.000],  loss: 1.644631, mae: 11.845074, mean_q: 23.712879
  3463/50000: episode: 81, duration: 1.026s, episode 

  9663/50000: episode: 112, duration: 0.969s, episode steps: 200, steps per second: 206, episode reward: 200.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.485 [0.000, 1.000],  loss: 3.423090, mae: 29.242886, mean_q: 58.498619
  9863/50000: episode: 113, duration: 0.967s, episode steps: 200, steps per second: 207, episode reward: 200.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.480 [0.000, 1.000],  loss: 4.987478, mae: 29.704840, mean_q: 59.382275
 10063/50000: episode: 114, duration: 0.982s, episode steps: 200, steps per second: 204, episode reward: 200.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.490 [0.000, 1.000],  loss: 3.576928, mae: 29.898775, mean_q: 59.929787
 10263/50000: episode: 115, duration: 0.976s, episode steps: 200, steps per second: 205, episode reward: 200.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.495 [0.000, 1.000],  loss: 5.108745, mae: 30.498447, mean_q: 61.002262
 10463/50000: episode: 116, duration: 0.974s, ep

 16646/50000: episode: 147, duration: 1.012s, episode steps: 200, steps per second: 198, episode reward: 200.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.480 [0.000, 1.000],  loss: 4.053677, mae: 35.493999, mean_q: 71.202522
 16846/50000: episode: 148, duration: 1.004s, episode steps: 200, steps per second: 199, episode reward: 200.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.495 [0.000, 1.000],  loss: 2.313276, mae: 35.730099, mean_q: 71.751579
 17046/50000: episode: 149, duration: 0.991s, episode steps: 200, steps per second: 202, episode reward: 200.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.485 [0.000, 1.000],  loss: 3.197711, mae: 35.917561, mean_q: 72.014847
 17246/50000: episode: 150, duration: 1.007s, episode steps: 200, steps per second: 199, episode reward: 200.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.480 [0.000, 1.000],  loss: 3.489936, mae: 35.709126, mean_q: 71.572601
 17446/50000: episode: 151, duration: 0.991s, ep

 23642/50000: episode: 182, duration: 1.146s, episode steps: 200, steps per second: 175, episode reward: 200.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.495 [0.000, 1.000],  loss: 4.265272, mae: 36.622791, mean_q: 73.061073
 23842/50000: episode: 183, duration: 1.146s, episode steps: 200, steps per second: 174, episode reward: 200.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.495 [0.000, 1.000],  loss: 3.096060, mae: 36.329411, mean_q: 72.531837
 24042/50000: episode: 184, duration: 1.171s, episode steps: 200, steps per second: 171, episode reward: 200.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.500 [0.000, 1.000],  loss: 4.201906, mae: 36.306339, mean_q: 72.475662
 24242/50000: episode: 185, duration: 1.198s, episode steps: 200, steps per second: 167, episode reward: 200.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.490 [0.000, 1.000],  loss: 3.098291, mae: 36.370148, mean_q: 72.640930
 24442/50000: episode: 186, duration: 1.144s, ep

 30614/50000: episode: 217, duration: 1.172s, episode steps: 200, steps per second: 171, episode reward: 200.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.515 [0.000, 1.000],  loss: 4.017107, mae: 36.477436, mean_q: 72.767021
 30814/50000: episode: 218, duration: 1.194s, episode steps: 200, steps per second: 167, episode reward: 200.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.490 [0.000, 1.000],  loss: 4.270726, mae: 36.429497, mean_q: 72.662621
 31014/50000: episode: 219, duration: 1.226s, episode steps: 200, steps per second: 163, episode reward: 200.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.525 [0.000, 1.000],  loss: 6.412231, mae: 36.631336, mean_q: 72.935959
 31214/50000: episode: 220, duration: 1.318s, episode steps: 200, steps per second: 152, episode reward: 200.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.480 [0.000, 1.000],  loss: 5.278707, mae: 36.386211, mean_q: 72.473625
 31414/50000: episode: 221, duration: 1.305s, ep

 36869/50000: episode: 252, duration: 1.183s, episode steps: 200, steps per second: 169, episode reward: 200.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.520 [0.000, 1.000],  loss: 5.616623, mae: 36.468143, mean_q: 72.853745
 37069/50000: episode: 253, duration: 1.200s, episode steps: 200, steps per second: 167, episode reward: 200.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.485 [0.000, 1.000],  loss: 5.747537, mae: 36.220020, mean_q: 72.289108
 37225/50000: episode: 254, duration: 1.042s, episode steps: 156, steps per second: 150, episode reward: 156.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.519 [0.000, 1.000],  loss: 7.285596, mae: 36.079193, mean_q: 72.075493
 37425/50000: episode: 255, duration: 1.214s, episode steps: 200, steps per second: 165, episode reward: 200.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.485 [0.000, 1.000],  loss: 6.399741, mae: 35.947330, mean_q: 71.739090
 37625/50000: episode: 256, duration: 1.268s, ep

 43265/50000: episode: 287, duration: 0.934s, episode steps: 154, steps per second: 165, episode reward: 154.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.526 [0.000, 1.000],  loss: 7.257774, mae: 36.390633, mean_q: 72.767372
 43465/50000: episode: 288, duration: 1.285s, episode steps: 200, steps per second: 156, episode reward: 200.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.500 [0.000, 1.000],  loss: 5.155315, mae: 36.490818, mean_q: 73.062813
 43632/50000: episode: 289, duration: 1.032s, episode steps: 167, steps per second: 162, episode reward: 167.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.527 [0.000, 1.000],  loss: 7.654600, mae: 36.549721, mean_q: 73.137062
 43775/50000: episode: 290, duration: 0.913s, episode steps: 143, steps per second: 157, episode reward: 143.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.531 [0.000, 1.000],  loss: 3.738839, mae: 36.164364, mean_q: 72.569046
 43975/50000: episode: 291, duration: 1.205s, ep

 48619/50000: episode: 322, duration: 0.781s, episode steps: 126, steps per second: 161, episode reward: 126.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.540 [0.000, 1.000],  loss: 6.516081, mae: 39.964394, mean_q: 80.030144
 48762/50000: episode: 323, duration: 0.898s, episode steps: 143, steps per second: 159, episode reward: 143.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.538 [0.000, 1.000],  loss: 5.480647, mae: 39.705761, mean_q: 79.496971
 48898/50000: episode: 324, duration: 0.841s, episode steps: 136, steps per second: 162, episode reward: 136.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.537 [0.000, 1.000],  loss: 9.340918, mae: 39.956284, mean_q: 79.848572
 49053/50000: episode: 325, duration: 0.986s, episode steps: 155, steps per second: 157, episode reward: 155.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.523 [0.000, 1.000],  loss: 5.108930, mae: 40.029346, mean_q: 80.231750
 49184/50000: episode: 326, duration: 0.793s, ep

<keras.callbacks.History at 0x25ec13be280>

We can now visualize our DQN agent and see the reward it gets. CartPole-v0 is considered "solved" 
when the average reward is at least 195 over 100 consecutive episodes.
We can also see from the training log that the DQN was reaching the maximum reward of 200 after about 2,500 steps (roughly 75 episodes). 

In [6]:
# Evaluate the trained agent over a few rendered episodes. The returned History
# is stored rather than echoed as a bare repr, so the per-episode rewards can be
# summarised and compared against the 195 "solved" threshold.
test_history = dqn.test(env, nb_episodes=5, visualize=True)
print('Mean test reward:', np.mean(test_history.history['episode_reward']))

Testing for 5 episodes ...
Episode 1: reward: 200.000, steps: 200
Episode 2: reward: 200.000, steps: 200
Episode 3: reward: 200.000, steps: 200
Episode 4: reward: 200.000, steps: 200
Episode 5: reward: 200.000, steps: 200


<keras.callbacks.History at 0x25ec13be640>

We run and visualize our model by running the code above and see that our agent has learned to balance the pole. 