In [1]:
import numpy as np
import gym

from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Activation, Flatten, Input, Concatenate
from tensorflow.keras.optimizers import Adam

from rl.agents import DDPGAgent
from rl.memory import SequentialMemory
from rl.random import OrnsteinUhlenbeckProcess

# Gym environment id: passed to gym.make() and embedded in the saved weights filename.
ENV_NAME = 'CarRacing-v0'

In [2]:
# Get the environment and extract the number of actions.
# One seed is shared by every RNG seeded here so a fresh-kernel run is repeatable.
SEED = 123
env = gym.make(ENV_NAME)
# env.seed() only covers the environment itself; seed NumPy's global RNG as well,
# as the upstream keras-rl DDPG example does.
# NOTE(review): TensorFlow weight initialisation is not seeded here.
np.random.seed(SEED)
env.seed(SEED)
# The action count is read from shape[0], which is only meaningful for a flat
# (1-D) continuous action space. Fail early otherwise.
assert len(env.action_space.shape) == 1, 'expected a 1-D continuous action space'
nb_actions = env.action_space.shape[0]
print(env.observation_space, env.observation_space.shape)
print(env.action_space, env.action_space.shape)

Box(0, 255, (96, 96, 3), uint8) (96, 96, 3)
Box(-1.0, 1.0, (3,), float32) (3,)


In [3]:
# Actor network: maps one observation window to an action vector.
# The leading (1,) on the input shape is the observation window dimension; the
# window is flattened before the dense stack (1 * 96 * 96 * 3 = 27648 features
# for the observation shape printed earlier, so this is not a small model).
actor = Sequential()
actor.add(Flatten(input_shape=(1,) + env.observation_space.shape))
# Three fully connected ReLU stages of decreasing width.
for units in (8192, 4096, 1024):
    actor.add(Dense(units))
    actor.add(Activation('relu'))
# Linear head with one output per action dimension.
actor.add(Dense(nb_actions))
actor.add(Activation('linear'))
actor.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten (Flatten)            (None, 27648)             0         
_________________________________________________________________
dense (Dense)                (None, 16)                442384    
_________________________________________________________________
activation (Activation)      (None, 16)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 16)                272       
_________________________________________________________________
activation_1 (Activation)    (None, 16)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 16)                272       
_________________________________________________________________
activation_2 (Activation)    (None, 16)                0

In [4]:
# Critic network: takes an (action, observation window) pair and outputs one scalar.
action_input = Input(shape=(nb_actions,), name='action_input')
observation_input = Input(shape=(1,) + env.observation_space.shape, name='observation_input')
# Flatten the observation window and join it with the action into one feature vector.
flattened_observation = Flatten()(observation_input)
x = Concatenate()([action_input, flattened_observation])
# Same hidden stack as the actor: three ReLU stages of decreasing width.
for units in (8192, 4096, 1024):
    x = Dense(units)(x)
    x = Activation('relu')(x)
# Single linear output unit.
x = Dense(1)(x)
x = Activation('linear')(x)
critic = Model(inputs=[action_input, observation_input], outputs=x)
critic.summary()

Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
observation_input (InputLayer)  [(None, 1, 96, 96, 3 0                                            
__________________________________________________________________________________________________
action_input (InputLayer)       [(None, 3)]          0                                            
__________________________________________________________________________________________________
flatten_1 (Flatten)             (None, 27648)        0           observation_input[0][0]          
__________________________________________________________________________________________________
concatenate (Concatenate)       (None, 27651)        0           action_input[0][0]               
                                                                 flatten_1[0][0]       

In [5]:
# Finally, we configure and compile our agent. You can use every built-in tensorflow.keras optimizer and
# even the metrics!
# Replay memory; window_length=1 corresponds to the leading (1,) dimension on the model inputs.
memory = SequentialMemory(limit=100000, window_length=1)
# Ornstein-Uhlenbeck process with one component per action dimension, handed to the agent below.
random_process = OrnsteinUhlenbeckProcess(size=nb_actions, theta=.15, mu=0., sigma=.3)
agent = DDPGAgent(nb_actions=nb_actions, actor=actor, critic=critic, critic_action_input=action_input,
                  memory=memory, nb_steps_warmup_critic=100, nb_steps_warmup_actor=100,
                  random_process=random_process, gamma=.99, target_model_update=1e-3)
# Use `learning_rate`: the `lr` keyword is a deprecated alias in tf.keras 2.x and is
# rejected by newer Keras releases.
agent.compile(Adam(learning_rate=.001, clipnorm=1.), metrics=['mae'])

Adam


In [6]:
# Okay, now it's time to learn something! Rendering is switched off (visualize=False)
# because drawing every frame slows down training quite a lot. Each episode is capped
# at 200 steps. You can abort the training prematurely by interrupting the kernel
# (Ctrl + C when run as a script).
agent.fit(
    env,
    nb_steps=50000,
    visualize=False,
    verbose=1,
    nb_max_episode_steps=200,
)

Training for 50000 steps ...
Track generation: 1234..1551 -> 317-tiles track
retry to generate track (normal if there are not manyinstances of this message)
Track generation: 1225..1535 -> 310-tiles track
Interval 1 (0 steps performed)
Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
  199/10000 [..............................] - ETA: 3:29 - reward: -0.0349Track generation: 1172..1469 -> 297-tiles track
  399/10000 [>.............................] - ETA: 4:08 - reward: -0.0252Track generation: 1063..1336 -> 273-tiles track
retry to generate track (normal if there are not manyinstances of this message)
Track generation: 1151..1443 -> 292-tiles track
  599/10000 [>.............................] - ETA: 4:16 - reward: -0.0215Track generation: 1365..1709 -> 344-tiles track
  800/10000 [=>............................] - ETA: 4:18 - reward: -0.0339Track generation: 1177..1485 -> 308-tiles track
  999/10000 [=>................

In [None]:
# Once training has finished, persist the final weights.
# The filename embeds the environment id; an existing file is overwritten.
weights_path = 'ddpg_{}_weights.h5f'.format(ENV_NAME)
agent.save_weights(weights_path, overwrite=True)

In [9]:
# Evaluate the trained agent for 5 episodes without rendering, with the same
# 200-step cap per episode that was used for training.
agent.test(
    env,
    nb_episodes=5,
    visualize=False,
    nb_max_episode_steps=200,
)

Testing for 5 episodes ...
Episode 1: reward: -122.348, steps: 200
Episode 2: reward: -357.962, steps: 200
Episode 3: reward: -120.219, steps: 200
Episode 4: reward: -116.302, steps: 200
Episode 5: reward: -127.116, steps: 200


<tensorflow.python.keras.callbacks.History at 0x7fec504a7580>