In [1]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Reshape
from tensorflow.keras.optimizers import Adam

from rl.agents.dqn import DQNAgent
from rl.policy import EpsGreedyQPolicy
from rl.memory import SequentialMemory

import gym

In [2]:
env = gym.make('Taxi-v3')
env.render()

+---------+
|[34;1mR[0m: | : :G|
|[43m [0m: | : : |
| : : : : |
| | : | : |
|[35mY[0m| : |B: |
+---------+



In [3]:
print(env.action_space)
print(env.observation_space)
print(env.action_space.n)
print(env.observation_space.n)

Discrete(6)
Discrete(500)
6
500


In [4]:
state = env.reset()
next_state, reward, done, etc = env.step(1)
print(state)
print(next_state)
print(reward)
print(done)
print(etc)

253
153
-1
False
{'prob': 1.0}


In [5]:
env.P[1]

{0: [(1.0, 101, -1, False)],
 1: [(1.0, 1, -1, False)],
 2: [(1.0, 21, -1, False)],
 3: [(1.0, 1, -1, False)],
 4: [(1.0, 17, -1, False)],
 5: [(1.0, 1, -10, False)]}

In [6]:
model = Sequential()
model.add(Embedding(500, 6, input_length=1))
model.add(Reshape((6,)))

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 1, 6)              3000      
_________________________________________________________________
reshape (Reshape)            (None, 6)                 0         
Total params: 3,000
Trainable params: 3,000
Non-trainable params: 0
_________________________________________________________________


In [7]:
policy = EpsGreedyQPolicy()
memory = SequentialMemory(limit=5000, window_length=1)
dqn = DQNAgent(model=model, memory=memory, policy=policy, nb_actions=env.action_space.n, nb_steps_warmup=500, target_model_update=1e-2)
dqn.compile(Adam(lr=1e-3), metrics=['mse'])

In [11]:
nb_steps     = 5000
log_interval = 5000
dqn.fit(env, nb_steps=nb_steps, verbose=1, nb_max_episode_steps=99, log_interval=log_interval)

Training for 5000 steps ...
Interval 1 (0 steps performed)
done, took 20.433 seconds


<tensorflow.python.keras.callbacks.History at 0x7f74c464ba60>

In [13]:
dqn.test(env, nb_episodes=5, nb_max_episode_steps=99, visualize=False)

Testing for 5 episodes ...
Episode 1: reward: -99.000, steps: 99
Episode 2: reward: -99.000, steps: 99
Episode 3: reward: -99.000, steps: 99
Episode 4: reward: -99.000, steps: 99
Episode 5: reward: -990.000, steps: 99


<tensorflow.python.keras.callbacks.History at 0x7f7494da4760>