In [32]:
import numpy as np
import gym
import random

In [33]:
# Identifier of the Gym environment this notebook trains on.
ENV_NAME = "Taxi-v3"
# Create the environment, reset it to a starting state, and render that state
# (the cell output below shows the rendered text grid).
env = gym.make(ENV_NAME)
env.reset()
env.render()

+---------+
|[35mR[0m: | : :G|
| : | : : |
| : : : : |
| | : | :[43m [0m|
|Y| : |[34;1mB[0m: |
+---------+



In [34]:
# Report the size of the discrete action and observation spaces.
for template, space in (
    ("Number of actions: %d", env.action_space),
    ("Number of states: %d", env.observation_space),
):
    print(template % space.n)

Number of actions: 6
Number of states: 500


In [35]:
# Sizes of the discrete action and observation spaces; reused when building
# the network and the agent in later cells.
action_size = env.action_space.n
state_size = env.observation_space.n

# Seed every random source this notebook draws from, using one constant, so
# that a fresh "Restart & Run All" is as repeatable as possible.
SEED = 123
random.seed(SEED)            # stdlib RNG (imported at the top, previously never seeded)
np.random.seed(SEED)         # NumPy's global RNG
env.action_space.seed(SEED)  # the action space has its own RNG, used by action_space.sample()
# NOTE(review): the Keras/TensorFlow weight-initialisation RNG is not seeded
# anywhere visible in this notebook — consider seeding it too.
# Kept as the last expression so the cell still displays the seed list ([123]).
env.seed(SEED)

[123]

In [36]:
from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten, Embedding, Reshape
from keras.optimizer_v1 import Adam
# Attach a `_name` attribute to the Adam class; the value is only a placeholder.
# NOTE(review): presumably a compatibility workaround so that code reading
# `optimizer._name` (likely keras-rl during `compile`) does not raise with the
# v1 optimizer — confirm, and remove once it is no longer required.
Adam._name = 'hey'

In [37]:
# Sanity check: restart the episode, take a single randomly sampled action,
# and display the first element of the step result (the new state id).
env.reset()
random_action = env.action_space.sample()
step_result = env.step(random_action)
step_result[0]

241

In [38]:
import keras
# Q-network: embeds a single discrete state id, then passes it through three
# hidden layers to produce one linear output per action.
EMBEDDING_DIM = 10  # width of the learned per-state vector

model = keras.Sequential([
    # One embedding row per environment state; sized from the environment
    # (state_size) rather than a hard-coded 500 so the two cannot drift apart.
    Embedding(state_size, EMBEDDING_DIM, input_length=1),
    # Drop the length-1 sequence axis: (batch, 1, dim) -> (batch, dim).
    Reshape((EMBEDDING_DIM,)),
    Dense(50, activation='relu'),
    Dense(50, activation='relu'),
    Dense(50, activation='relu'),
    Dense(action_size, activation='linear'),
])

# summary() prints the table itself and returns None, so it must not be wrapped
# in print() (that would emit a stray "None" after the table).
model.summary()

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_5 (Embedding)     (None, 1, 10)             5000      
                                                                 
 reshape_5 (Reshape)         (None, 10)                0         
                                                                 
 dense_12 (Dense)            (None, 50)                550       
                                                                 
 dense_13 (Dense)            (None, 50)                2550      
                                                                 
 dense_14 (Dense)            (None, 50)                2550      
                                                                 
 dense_15 (Dense)            (None, 6)                 306       
                                                                 
Total params: 10,956
Trainable params: 10,956
Non-trai

In [39]:
from rl.agents.dqn import DQNAgent
from rl.policy import EpsGreedyQPolicy
from rl.memory import SequentialMemory

# Experience replay buffer holding up to 50,000 entries, window length 1.
memory = SequentialMemory(window_length=1, limit=50000)

# Epsilon-greedy exploration with the library's default settings.
policy = EpsGreedyQPolicy()

# Assemble the DQN agent around the Q-network defined above.
dqn = DQNAgent(
    model=model,
    policy=policy,
    memory=memory,
    nb_actions=action_size,
    nb_steps_warmup=500,
    target_model_update=1e-2,
)

# Compile with Adam and track mean absolute error alongside the loss.
optimizer = Adam(lr=1e-3)
dqn.compile(optimizer, metrics=['mae'])

# Train without rendering; progress is logged every 100,000 steps.
# Left as the final expression so the returned History object is displayed.
dqn.fit(
    env,
    nb_steps=1000000,
    nb_max_episode_steps=99,
    log_interval=100000,
    visualize=False,
    verbose=1,
)

Training for 1000000 steps ...
Interval 1 (0 steps performed)
1148 episodes - episode_reward: -113.655 [-855.000, 15.000] - loss: 3.821 - mae: 24.304 - mean_q: -23.987 - prob: 1.000

Interval 2 (100000 steps performed)
2460 episodes - episode_reward: -37.005 [-486.000, 15.000] - loss: 2.979 - mae: 18.985 - mean_q: -12.820 - prob: 1.000

Interval 3 (200000 steps performed)
6764 episodes - episode_reward: 2.223 [-135.000, 15.000] - loss: 0.104 - mae: 7.167 - mean_q: 12.262 - prob: 1.000

Interval 4 (300000 steps performed)
6757 episodes - episode_reward: 2.269 [-100.000, 15.000] - loss: 0.002 - mae: 7.364 - mean_q: 12.694 - prob: 1.000

Interval 5 (400000 steps performed)
6771 episodes - episode_reward: 2.291 [-126.000, 15.000] - loss: 0.003 - mae: 7.357 - mean_q: 12.684 - prob: 1.000

Interval 6 (500000 steps performed)
6794 episodes - episode_reward: 2.398 [-50.000, 15.000] - loss: 0.001 - mae: 7.365 - mean_q: 12.714 - prob: 1.000

Interval 7 (600000 steps performed)
6818 episodes - ep

<keras.callbacks.History at 0x7ff6afbc75e0>

In [40]:
# Evaluate the trained agent for five rendered episodes, using the same
# per-episode step cap (99) that was passed to fit().
dqn.test(
    env,
    nb_episodes=5,
    nb_max_episode_steps=99,
    visualize=True,
)

Testing for 5 episodes ...
+---------+
|R: | : :[34;1mG[0m|
| : | : : |
| : : : : |
| | :[43m [0m| : |
|[35mY[0m| : |B: |
+---------+
  (North)
+---------+
|R: | : :[34;1mG[0m|
| : | : : |
| : :[43m [0m: : |
| | : | : |
|[35mY[0m| : |B: |
+---------+
  (North)
+---------+
|R: | : :[34;1mG[0m|
| : |[43m [0m: : |
| : : : : |
| | : | : |
|[35mY[0m| : |B: |
+---------+
  (North)
+---------+
|R: |[43m [0m: :[34;1mG[0m|
| : | : : |
| : : : : |
| | : | : |
|[35mY[0m| : |B: |
+---------+
  (North)
+---------+
|R: | :[43m [0m:[34;1mG[0m|
| : | : : |
| : : : : |
| | : | : |
|[35mY[0m| : |B: |
+---------+
  (East)
+---------+
|R: | : :[34;1m[43mG[0m[0m|
| : | : : |
| : : : : |
| | : | : |
|[35mY[0m| : |B: |
+---------+
  (East)
+---------+
|R: | : :[42mG[0m|
| : | : : |
| : : : : |
| | : | : |
|[35mY[0m| : |B: |
+---------+
  (Pickup)
+---------+
|R: | : :G|
| : | : :[42m_[0m|
| : : : : |
| | : | : |
|[35mY[0m| : |B: |
+---------+
  (South)
+---------+


<keras.callbacks.History at 0x7ff6cc1c83d0>

In [41]:
# Persist the trained weights. The filename is derived from ENV_NAME (set when
# the environment was created) instead of repeating the literal environment
# id, so changing the environment in one place keeps the filename in sync.
# overwrite=True replaces any existing file of the same name.
dqn.save_weights('dqn_{}_weights.h5f'.format(ENV_NAME), overwrite=True)