In [1]:
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Embedding, Reshape
from tensorflow.keras.optimizers import Adam

from rl.agents import DQNAgent
from rl.memory import SequentialMemory
from rl.policy import EpsGreedyQPolicy

import gym
# Gym environment id used by the cells below (environment creation and the
# weights file name).
ENV_NAME = "Taxi-v3"

In [2]:
# Create the environment, seed it, and record the sizes of its discrete
# observation and action spaces.
env = gym.make(ENV_NAME)
env.seed(123)

state_space = env.observation_space.n
action_space = env.action_space.n

# Show each space together with its size (observation space first, then actions).
for space in (env.observation_space, env.action_space):
    print(space, space.n)

Discrete(500) 500
Discrete(6) 6


In [3]:
# Build a minimal model: an Embedding table with `state_space` rows and
# `action_space` columns, followed by a Reshape that drops the length-1
# sequence axis so the output has shape (batch, action_space).
model = Sequential([
    Embedding(state_space, action_space, input_length=1),
    Reshape((action_space,)),
])

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 1, 6)              3000      
_________________________________________________________________
reshape (Reshape)            (None, 6)                 0         
Total params: 3,000
Trainable params: 3,000
Non-trainable params: 0
_________________________________________________________________


In [4]:
# Configure and compile the agent. Any built-in tensorflow.keras optimizer (and
# Keras metrics) can be passed to compile().
memory = SequentialMemory(limit=50000, window_length=1)
# Epsilon-greedy action selection; BoltzmannQPolicy() was noted as an alternative
# (it is not imported in this notebook).
policy = EpsGreedyQPolicy()
agent = DQNAgent(model=model, policy=policy, memory=memory, nb_actions=action_space,
                 nb_steps_warmup=500, target_model_update=1e-2)
# Pass the step size as `learning_rate`: the `lr` keyword is a deprecated alias in
# tf.keras optimizers and is rejected by newer Keras releases.
agent.compile(Adam(learning_rate=1e-3), metrics=['mae'])

In [5]:
# Train the agent. Rendering is switched off here (visualize=False); the run can
# be aborted early with Ctrl + C.
# NOTE(review): nb_steps equals the nb_steps_warmup=500 given to DQNAgent, so this
# run may never get past warm-up -- confirm whether it is only meant as a smoke test.
agent.fit(
    env,
    nb_steps=500,
    nb_max_episode_steps=100,
    log_interval=500,
    visualize=False,
    verbose=1,
)

Training for 500 steps ...
Interval 1 (0 steps performed)
Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
done, took 0.942 seconds


<tensorflow.python.keras.callbacks.History at 0x7f4d2851a670>

In [6]:
# Persist the final weights once training has finished, replacing any existing file.
# NOTE(review): the file name is prefixed 'ddpg' although the agent in this notebook
# is a DQNAgent -- consider renaming if nothing else relies on this path.
weights_path = 'ddpg_{}_weights.h5f'.format(ENV_NAME)
agent.save_weights(weights_path, overwrite=True)

In [7]:
# Evaluate the agent for 5 episodes, each capped at 100 steps, without rendering.
agent.test(
    env,
    nb_episodes=5,
    nb_max_episode_steps=100,
    visualize=False,
)

Testing for 5 episodes ...
Episode 1: reward: -100.000, steps: 100
Episode 2: reward: -100.000, steps: 100
Episode 3: reward: -100.000, steps: 100
Episode 4: reward: -1000.000, steps: 100
Episode 5: reward: -100.000, steps: 100


<tensorflow.python.keras.callbacks.History at 0x7f4d2858b520>