In [3]:
import numpy as np
import gym

from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten
from keras.optimizers import Adam

from rl.agents.dqn import DQNAgent
from rl.policy import EpsGreedyQPolicy
from rl.memory import SequentialMemory


In [7]:
ENV_NAME = 'CartPole-v0'
SEED = 123  # single seed shared by NumPy and the environment below

# Create the environment, seed the random sources used here, and read the
# number of discrete actions the environment exposes.
env = gym.make(ENV_NAME)
np.random.seed(SEED)
env.seed(SEED)
nb_actions = env.action_space.n

# Q-network: flatten the input, pass it through one 16-unit ReLU hidden layer,
# and emit one linear output per action.
model = Sequential()
# The leading 1 in input_shape presumably corresponds to window_length=1 of the
# replay memory configured below -- keep the two in sync.
model.add(Flatten(input_shape=(1,) + env.observation_space.shape))
model.add(Dense(16))
model.add(Activation('relu'))
model.add(Dense(nb_actions))
model.add(Activation('linear'))
# summary() prints the layer table itself and returns None; wrapping it in
# print() only appends a stray "None" line to the cell output.
model.summary()

# Agent: epsilon-greedy action selection on top of a sequential replay memory.
policy = EpsGreedyQPolicy()
memory = SequentialMemory(limit=50000, window_length=1)
dqn = DQNAgent(model=model, nb_actions=nb_actions, memory=memory, nb_steps_warmup=10,
               target_model_update=1e-2, policy=policy)
dqn.compile(Adam(lr=1e-3), metrics=['mae'])

# Okay, now it's time to learn something! We visualize the training here for show,
# but this slows down training quite a lot.
dqn.fit(env, nb_steps=10000, visualize=True, verbose=2)



_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten_5 (Flatten)          (None, 4)                 0         
_________________________________________________________________
dense_9 (Dense)              (None, 16)                80        
_________________________________________________________________
activation_9 (Activation)    (None, 16)                0         
_________________________________________________________________
dense_10 (Dense)             (None, 2)                 34        
_________________________________________________________________
activation_10 (Activation)   (None, 2)                 0         
Total params: 114
Trainable params: 114
Non-trainable params: 0
_________________________________________________________________
None
Training for 10000 steps ...




   79/10000: episode: 1, duration: 0.807s, episode steps: 79, steps per second: 98, episode reward: 79.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.519 [0.000, 1.000], mean observation: 0.060 [-0.402, 0.722], loss: 0.427174, mean_absolute_error: 0.495048, mean_q: 0.053482
  113/10000: episode: 2, duration: 0.083s, episode steps: 34, steps per second: 412, episode reward: 34.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.529 [0.000, 1.000], mean observation: 0.151 [-0.159, 0.753], loss: 0.355554, mean_absolute_error: 0.448903, mean_q: 0.190881
  165/10000: episode: 3, duration: 0.128s, episode steps: 52, steps per second: 406, episode reward: 52.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.519 [0.000, 1.000], mean observation: 0.088 [-0.295, 0.673], loss: 0.317111, mean_absolute_error: 0.469502, mean_q: 0.321678
  199/10000: episode: 4, duration: 0.084s, episode steps: 34, steps per second: 404, episode reward: 34.000, mean reward: 1.000 [1.000, 1.000], mean 

  682/10000: episode: 30, duration: 0.059s, episode steps: 12, steps per second: 203, episode reward: 12.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.250 [0.000, 1.000], mean observation: 0.109 [-1.134, 1.958], loss: 0.370086, mean_absolute_error: 2.246638, mean_q: 4.363328
  693/10000: episode: 31, duration: 0.036s, episode steps: 11, steps per second: 308, episode reward: 11.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.182 [0.000, 1.000], mean observation: 0.118 [-1.403, 2.217], loss: 0.478574, mean_absolute_error: 2.354550, mean_q: 4.468948
  702/10000: episode: 32, duration: 0.022s, episode steps: 9, steps per second: 417, episode reward: 9.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.111 [0.000, 1.000], mean observation: 0.130 [-1.418, 2.303], loss: 0.364175, mean_absolute_error: 2.344522, mean_q: 4.572275
  713/10000: episode: 33, duration: 0.032s, episode steps: 11, steps per second: 339, episode reward: 11.000, mean reward: 1.000 [1.000, 1.000], me

 1061/10000: episode: 64, duration: 0.041s, episode steps: 16, steps per second: 393, episode reward: 16.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.438 [0.000, 1.000], mean observation: 0.093 [-0.739, 1.180], loss: 0.993735, mean_absolute_error: 3.706347, mean_q: 6.887917
 1078/10000: episode: 65, duration: 0.042s, episode steps: 17, steps per second: 401, episode reward: 17.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.412 [0.000, 1.000], mean observation: 0.068 [-0.807, 1.446], loss: 1.067996, mean_absolute_error: 3.756943, mean_q: 6.999496
 1092/10000: episode: 66, duration: 0.035s, episode steps: 14, steps per second: 396, episode reward: 14.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.429 [0.000, 1.000], mean observation: 0.094 [-0.768, 1.269], loss: 1.269140, mean_absolute_error: 3.808263, mean_q: 6.965366
 1107/10000: episode: 67, duration: 0.040s, episode steps: 15, steps per second: 376, episode reward: 15.000, mean reward: 1.000 [1.000, 1.000], 

 1940/10000: episode: 96, duration: 0.030s, episode steps: 11, steps per second: 368, episode reward: 11.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.909 [0.000, 1.000], mean observation: -0.103 [-2.790, 1.807], loss: 1.641430, mean_absolute_error: 6.028032, mean_q: 11.525229
 1948/10000: episode: 97, duration: 0.022s, episode steps: 8, steps per second: 358, episode reward: 8.000, mean reward: 1.000 [1.000, 1.000], mean action: 1.000 [1.000, 1.000], mean observation: -0.148 [-2.509, 1.552], loss: 2.242553, mean_absolute_error: 6.099572, mean_q: 11.630013
 1969/10000: episode: 98, duration: 0.051s, episode steps: 21, steps per second: 412, episode reward: 21.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.524 [0.000, 1.000], mean observation: -0.066 [-0.951, 0.617], loss: 2.497692, mean_absolute_error: 6.174944, mean_q: 11.618462
 1993/10000: episode: 99, duration: 0.055s, episode steps: 24, steps per second: 439, episode reward: 24.000, mean reward: 1.000 [1.000, 1.00

 2710/10000: episode: 125, duration: 0.135s, episode steps: 53, steps per second: 392, episode reward: 53.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.528 [0.000, 1.000], mean observation: 0.095 [-0.226, 0.874], loss: 2.874843, mean_absolute_error: 7.626862, mean_q: 14.511334
 2767/10000: episode: 126, duration: 0.134s, episode steps: 57, steps per second: 426, episode reward: 57.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.474 [0.000, 1.000], mean observation: -0.135 [-0.844, 0.397], loss: 3.303938, mean_absolute_error: 7.700473, mean_q: 14.555192
 2826/10000: episode: 127, duration: 0.135s, episode steps: 59, steps per second: 439, episode reward: 59.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.475 [0.000, 1.000], mean observation: -0.115 [-0.687, 0.242], loss: 3.444885, mean_absolute_error: 7.847892, mean_q: 14.850730
 2878/10000: episode: 128, duration: 0.132s, episode steps: 52, steps per second: 393, episode reward: 52.000, mean reward: 1.000 [1.000,

 4860/10000: episode: 155, duration: 0.467s, episode steps: 200, steps per second: 428, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.505 [0.000, 1.000], mean observation: 0.042 [-0.428, 0.540], loss: 4.020002, mean_absolute_error: 10.920676, mean_q: 21.175678
 4976/10000: episode: 156, duration: 0.285s, episode steps: 116, steps per second: 407, episode reward: 116.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.440 [0.000, 1.000], mean observation: -0.478 [-2.578, 0.456], loss: 4.533919, mean_absolute_error: 11.176677, mean_q: 21.643230
 5029/10000: episode: 157, duration: 0.122s, episode steps: 53, steps per second: 436, episode reward: 53.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.547 [0.000, 1.000], mean observation: 0.144 [-0.233, 0.894], loss: 3.741907, mean_absolute_error: 11.187013, mean_q: 21.764019
 5141/10000: episode: 158, duration: 0.267s, episode steps: 112, steps per second: 420, episode reward: 112.000, mean reward: 1.000

 9010/10000: episode: 184, duration: 0.440s, episode steps: 177, steps per second: 402, episode reward: 177.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.469 [0.000, 1.000], mean observation: -0.334 [-2.166, 0.511], loss: 5.134741, mean_absolute_error: 17.117964, mean_q: 33.973038
 9210/10000: episode: 185, duration: 0.483s, episode steps: 200, steps per second: 414, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.016 [-0.614, 0.548], loss: 6.052817, mean_absolute_error: 17.458164, mean_q: 34.608932
 9410/10000: episode: 186, duration: 0.513s, episode steps: 200, steps per second: 390, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.505 [0.000, 1.000], mean observation: 0.111 [-0.655, 0.734], loss: 5.100508, mean_absolute_error: 17.789614, mean_q: 35.441490
 9571/10000: episode: 187, duration: 0.392s, episode steps: 161, steps per second: 411, episode reward: 161.000, mean reward: 1.0

<keras.callbacks.History at 0x7f0dcd0ad8d0>