In [1]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Reshape, Conv2D, Dense, Flatten, BatchNormalization, Dropout, MaxPooling2D
from tensorflow.keras.optimizers import Adam

from rl.agents import DQNAgent
from rl.policy import EpsGreedyQPolicy
from rl.memory import SequentialMemory

import gym

In [2]:
env = gym.make('CarRacing-v0')
print(env.observation_space)
print(env.observation_space.shape)
print(env.action_space)
print(env.action_space.shape)

Box(0, 255, (96, 96, 3), uint8)
(96, 96, 3)
Box(-1.0, 1.0, (3,), float32)
(3,)


In [3]:
import numpy as np

class CarRacingDiscrit:

    def __init__(self):
        self.env = gym.make('CarRacing-v0')
        self.action_space = 10*10*10
        self.observation_space = 96*96*3

    def step(self, action):
        v1 = int(     action        ) % 10
        v2 = int( int(action) / 10  ) % 10
        v3 = int( int(action) / 100 ) % 10
        v1 = ( v1 - 5 ) / 5
        v2 = ( v2     ) / 10
        v3 = ( v3     ) / 10
        state, reward, done, info = self.env.step([v1, v2, v3])
        return state, reward, done, info
 
    def seed(self, s):
        return env.seed(s)

    def reset(self):
        return self.env.reset()
    def render(self):
        return self.env.render()

    def close(self):
        return self.env.close()


In [4]:
# Get the environment and extract the number of actions.
env = CarRacingDiscrit()
#env.seed(123)
nb_actions = 10*10*10
print(env.observation_space)
print(env.action_space)

27648
1000


In [6]:
# Next, we build a very simple model.
model = Sequential()
model.add(Reshape((96, 96, 3), input_shape=(1, 96, 96, 3)))
model.add(BatchNormalization())
model.add(Conv2D(filters=32, kernel_size=(3, 3), activation="relu"))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(BatchNormalization())
model.add(Conv2D(filters=64, kernel_size=(3, 3), activation="relu"))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Flatten())
model.add(Dense(8192, activation="relu"))
model.add(Dropout(0.5))
model.add(Dense(1000, activation="relu"))
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
reshape (Reshape)            (None, 96, 96, 3)         0         
_________________________________________________________________
batch_normalization (BatchNo (None, 96, 96, 3)         12        
_________________________________________________________________
conv2d (Conv2D)              (None, 94, 94, 32)        896       
_________________________________________________________________
max_pooling2d (MaxPooling2D) (None, 47, 47, 32)        0         
_________________________________________________________________
batch_normalization_1 (Batch (None, 47, 47, 32)        128       
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 45, 45, 64)        18496     
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 22, 22, 64)        0

In [7]:
# Finally, we configure and compile our agent. You can use every built-in tensorflow.keras optimizer and
# even the metrics!
policy = EpsGreedyQPolicy()
memory = SequentialMemory(limit=5000, window_length=1)
agent = DQNAgent(model=model, memory=memory, policy=policy, nb_actions=nb_actions,
                 nb_steps_warmup=500, target_model_update=1e-2)
agent.compile(Adam(lr=1e-3), metrics=['mse'])

In [8]:
# Okay, now it's time to learn something! We visualize the training here for show, but this
# slows down training quite a lot. You can always safely abort the training prematurely using
# Ctrl + C.
agent.fit(env, nb_steps=50000, visualize=False, verbose=1, nb_max_episode_steps=2000)

Training for 50000 steps ...
Track generation: 1186..1486 -> 300-tiles track
Interval 1 (0 steps performed)
Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
 1000/10000 [==>...........................] - ETA: 51:59 - reward: -0.0799Track generation: 1277..1601 -> 324-tiles track
 2000/10000 [=====>........................] - ETA: 1:05:38 - reward: -0.0760Track generation: 1192..1495 -> 303-tiles track
done, took 1001.120 seconds


<tensorflow.python.keras.callbacks.History at 0x7f3c0fac4ee0>

In [None]:
# After training is done, we save the final weights.
agent.save_weights('ddpg_{}_weights.h5f'.format(ENV_NAME), overwrite=True)

In [8]:
# Finally, evaluate our algorithm for 5 episodes.
agent.test(env, nb_episodes=5, nb_max_episode_steps=2000, visualize=False)

Testing for 5 episodes ...
Track generation: 1140..1429 -> 289-tiles track
Episode 1: reward: 7.461, steps: 99
Track generation: 1194..1497 -> 303-tiles track
Episode 2: reward: 6.656, steps: 99
Track generation: 954..1202 -> 248-tiles track
Episode 3: reward: 10.343, steps: 99
Track generation: 1093..1378 -> 285-tiles track
Episode 4: reward: 7.706, steps: 99
Track generation: 1077..1350 -> 273-tiles track
Episode 5: reward: 8.482, steps: 99


<tensorflow.python.keras.callbacks.History at 0x7f8ba805e820>