In [5]:
# Keras-RL

# Packages: pip install keras-rl tensorflow gym

import numpy as np
import gym

from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten
from keras.optimizers import Adam

from rl.agents.dqn import DQNAgent
from rl.policy import BoltzmannQPolicy
from rl.memory import SequentialMemory

In [None]:
# Environment init and extract the number of actions.
# ENV_NAME is defined here because the export cell formats it into the
# weights filename; SEED keeps the NumPy and environment seeds identical.
ENV_NAME = 'CartPole-v0'
SEED = 123

env = gym.make(ENV_NAME)
np.random.seed(SEED)
env.seed(SEED)
# Number of actions the agent can choose from; sizes the network's output layer.
nb_actions = env.action_space.n

In [7]:
# Agent
# Q-network: flatten the (1, observation) input, pass it through three
# 16-unit ReLU hidden layers, and emit one linear output per action.
model = Sequential([
    Flatten(input_shape=(1,) + env.observation_space.shape),
    Dense(16),
    Activation('relu'),
    Dense(16),
    Activation('relu'),
    Dense(16),
    Activation('relu'),
    Dense(nb_actions),
    Activation('linear'),
])
# summary() prints the table itself and returns None, so wrapping it in
# print() would add a stray "None" line to the cell output.
model.summary()

# window_length=1 matches the leading 1 in the network's input shape.
memory = SequentialMemory(limit=50000, window_length=1)
policy = BoltzmannQPolicy()
dqn = DQNAgent(model=model, nb_actions=nb_actions, memory=memory, nb_steps_warmup=10,
               target_model_update=1e-2, policy=policy)
# The 'mae' metric is what appears as mean_absolute_error in the training log.
dqn.compile(Adam(lr=1e-3), metrics=['mae'])

In [None]:
# Training
# Train the agent on `env` for 50000 steps with rendering enabled;
# verbose=2 produces one log line per finished episode.
dqn.fit(
    env,
    nb_steps=50000,
    visualize=True,
    verbose=2,
)
print("Training Done!")

Training for 50000 steps ...
Instructions for updating:
Use tf.cast instead.




    31/50000: episode: 1, duration: 3.851s, episode steps: 31, steps per second: 8, episode reward: 31.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.419 [0.000, 1.000], mean observation: 0.013 [-1.185, 1.776], loss: 0.461963, mean_absolute_error: 0.519561, mean_q: 0.093668
    44/50000: episode: 2, duration: 0.108s, episode steps: 13, steps per second: 120, episode reward: 13.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.692 [0.000, 1.000], mean observation: -0.069 [-1.867, 1.224], loss: 0.354955, mean_absolute_error: 0.539455, mean_q: 0.278580




    64/50000: episode: 3, duration: 0.179s, episode steps: 20, steps per second: 112, episode reward: 20.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.550 [0.000, 1.000], mean observation: -0.070 [-1.375, 0.833], loss: 0.232410, mean_absolute_error: 0.554170, mean_q: 0.486509
    83/50000: episode: 4, duration: 0.238s, episode steps: 19, steps per second: 80, episode reward: 19.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.421 [0.000, 1.000], mean observation: 0.074 [-0.817, 1.505], loss: 0.112918, mean_absolute_error: 0.605069, mean_q: 0.837883
    98/50000: episode: 5, duration: 0.101s, episode steps: 15, steps per second: 148, episode reward: 15.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.600 [0.000, 1.000], mean observation: -0.090 [-1.368, 0.825], loss: 0.064757, mean_absolute_error: 0.681580, mean_q: 1.132854
   114/50000: episode: 6, duration: 0.086s, episode steps: 16, steps per second: 187, episode reward: 16.000, mean reward: 1.000 [1.000, 1.000],

   707/50000: episode: 33, duration: 0.159s, episode steps: 25, steps per second: 157, episode reward: 25.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.400 [0.000, 1.000], mean observation: 0.055 [-1.151, 1.832], loss: 0.208872, mean_absolute_error: 3.057328, mean_q: 5.917589
   782/50000: episode: 34, duration: 0.472s, episode steps: 75, steps per second: 159, episode reward: 75.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.467 [0.000, 1.000], mean observation: -0.190 [-1.448, 0.992], loss: 0.278138, mean_absolute_error: 3.232235, mean_q: 6.206750
   802/50000: episode: 35, duration: 0.119s, episode steps: 20, steps per second: 168, episode reward: 20.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.083 [-0.614, 1.054], loss: 0.272746, mean_absolute_error: 3.462800, mean_q: 6.753606
   851/50000: episode: 36, duration: 0.273s, episode steps: 49, steps per second: 179, episode reward: 49.000, mean reward: 1.000 [1.000, 1.0

  3813/50000: episode: 62, duration: 0.849s, episode steps: 141, steps per second: 166, episode reward: 141.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.468 [0.000, 1.000], mean observation: -0.364 [-2.040, 0.995], loss: 1.980775, mean_absolute_error: 16.725084, mean_q: 33.954712
  3964/50000: episode: 63, duration: 0.855s, episode steps: 151, steps per second: 177, episode reward: 151.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.464 [0.000, 1.000], mean observation: -0.404 [-2.423, 0.769], loss: 2.305156, mean_absolute_error: 17.269373, mean_q: 35.032097
  4111/50000: episode: 64, duration: 0.764s, episode steps: 147, steps per second: 193, episode reward: 147.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.456 [0.000, 1.000], mean observation: -0.418 [-2.524, 0.842], loss: 1.507174, mean_absolute_error: 17.780977, mean_q: 36.104591
  4249/50000: episode: 65, duration: 0.946s, episode steps: 138, steps per second: 146, episode reward: 138.000, mean reward: 1

  9088/50000: episode: 91, duration: 1.212s, episode steps: 176, steps per second: 145, episode reward: 176.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.472 [0.000, 1.000], mean observation: -0.356 [-2.428, 0.832], loss: 3.781599, mean_absolute_error: 32.608410, mean_q: 65.797050
  9268/50000: episode: 92, duration: 1.022s, episode steps: 180, steps per second: 176, episode reward: 180.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.467 [0.000, 1.000], mean observation: -0.354 [-2.424, 0.842], loss: 3.746078, mean_absolute_error: 32.901924, mean_q: 66.537415
  9417/50000: episode: 93, duration: 0.832s, episode steps: 149, steps per second: 179, episode reward: 149.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.463 [0.000, 1.000], mean observation: -0.416 [-2.414, 0.819], loss: 4.270190, mean_absolute_error: 33.508572, mean_q: 67.677818
  9600/50000: episode: 94, duration: 1.031s, episode steps: 183, steps per second: 178, episode reward: 183.000, mean reward: 1

 14626/50000: episode: 120, duration: 1.304s, episode steps: 200, steps per second: 153, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.485 [0.000, 1.000], mean observation: -0.100 [-1.063, 1.073], loss: 9.878106, mean_absolute_error: 39.805923, mean_q: 79.991867
 14825/50000: episode: 121, duration: 1.097s, episode steps: 199, steps per second: 181, episode reward: 199.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.467 [0.000, 1.000], mean observation: -0.318 [-2.610, 1.111], loss: 11.446517, mean_absolute_error: 40.320412, mean_q: 80.980164
 15010/50000: episode: 122, duration: 0.972s, episode steps: 185, steps per second: 190, episode reward: 185.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.514 [0.000, 1.000], mean observation: 0.184 [-1.170, 1.859], loss: 6.463132, mean_absolute_error: 40.439072, mean_q: 81.409157
 15171/50000: episode: 123, duration: 0.846s, episode steps: 161, steps per second: 190, episode reward: 161.000, mean rewar

 19928/50000: episode: 149, duration: 1.121s, episode steps: 200, steps per second: 178, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.036 [-1.013, 1.185], loss: 7.538890, mean_absolute_error: 42.041386, mean_q: 84.279358
 20128/50000: episode: 150, duration: 1.085s, episode steps: 200, steps per second: 184, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.485 [0.000, 1.000], mean observation: -0.298 [-2.260, 1.476], loss: 8.116632, mean_absolute_error: 42.133175, mean_q: 84.499435
 20296/50000: episode: 151, duration: 0.897s, episode steps: 168, steps per second: 187, episode reward: 168.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.524 [0.000, 1.000], mean observation: 0.216 [-1.115, 1.658], loss: 9.977205, mean_absolute_error: 42.029480, mean_q: 84.230583
 20496/50000: episode: 152, duration: 1.097s, episode steps: 200, steps per second: 182, episode reward: 200.000, mean reward:

 25396/50000: episode: 178, duration: 0.838s, episode steps: 140, steps per second: 167, episode reward: 140.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.536 [0.000, 1.000], mean observation: 0.337 [-0.968, 1.846], loss: 3.366908, mean_absolute_error: 39.775383, mean_q: 79.896622
 25596/50000: episode: 179, duration: 1.263s, episode steps: 200, steps per second: 158, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.505 [0.000, 1.000], mean observation: 0.065 [-0.943, 1.302], loss: 6.021750, mean_absolute_error: 39.864349, mean_q: 79.877365
 25796/50000: episode: 180, duration: 1.164s, episode steps: 200, steps per second: 172, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: -0.097 [-1.037, 1.217], loss: 6.447720, mean_absolute_error: 40.016022, mean_q: 80.274689
 25971/50000: episode: 181, duration: 1.379s, episode steps: 175, steps per second: 127, episode reward: 175.000, mean reward:

In [None]:
# Evaluation
# Run the trained agent for 5 test episodes, rendering each one.
dqn.test(
    env,
    nb_episodes=5,
    visualize=True,
)
print("Testing Done!")

In [None]:
# Exporting
# Name the weights file after the environment's own registered id instead of a
# separate ENV_NAME constant, so this cell depends only on `env` and `dqn`.
# overwrite=True replaces an existing file of the same name without prompting.
dqn.save_weights('dqn_{}_weights.h5f'.format(env.spec.id), overwrite=True)
print("Saving Done!")