In [1]:
import base64
import functools
import os

import acme
import acme.agents.tf.r2d2 as r2d2
import acme.tf.networks as networks
import dm_env
import gym
import imageio
import IPython
import numpy as np

In [2]:
# Gym id of the Atari game used throughout the notebook
level = 'TimePilot-v4'
# number of episodes given to the training loop
num_episodes = 10_000
# upper bound on the number of steps in one episode
max_episode_len = 10_000

In [3]:
def make_environment(level, max_episode_len: int = None, evaluation: bool = False) -> dm_env.Environment:
    """Create the Gym game `level` and wrap it for use with Acme.

    Args:
        level: Environment id passed to ``gym.make``.
        max_episode_len: Episode-length cap forwarded to ``AtariWrapper``.
            When ``None`` it becomes 108000 if ``evaluation`` is true, else 50000.
        evaluation: Only consulted to pick the default episode cap.

    Returns:
        The Gym environment after the Acme wrapper stack below has been applied.
    """
    # Plain Gym environment with the reduced (game-specific) action set
    gym_env = gym.make(level, full_action_space=False)

    if max_episode_len is None:
        max_episode_len = 50000 if not evaluation else 108000

    atari_wrapper = functools.partial(
        acme.wrappers.AtariWrapper,
        to_float=True,
        max_episode_len=max_episode_len,
        zero_discount_on_life_loss=True,
    )
    # Wrappers are listed in the order they are handed to wrap_all
    wrapper_stack = [
        acme.wrappers.GymAtariAdapter,
        atari_wrapper,
        acme.wrappers.SinglePrecisionWrapper,
        # Adds the previous action and reward to the new observation
        acme.wrappers.ObservationActionRewardWrapper,
    ]
    return acme.wrappers.wrap_all(gym_env, wrapper_stack)

In [4]:
def render(env) -> np.ndarray:
    """Return the current frame of the wrapped environment as an RGB image.

    Args:
        env: Object that exposes the wrapped environment as ``env.environment``.

    Returns:
        The value produced by the inner environment's
        ``render(mode='rgb_array')`` call — expected to be an image array
        (TODO confirm shape/dtype against the Gym environment in use).
    """
    return env.environment.render(mode='rgb_array')

In [5]:
def display_video(frames, filename=None, fps=10):
    """Write `frames` to an MP4 under ``videos/`` and return an inline HTML player.

    Args:
        frames: Iterable of image frames; each one is passed to the imageio
            writer's ``append_data``.
        filename: Base name of the output file, without extension. ``None``
            falls back to ``'video'`` rather than producing ``videos/None.mp4``.
        fps: Frame rate handed to ``imageio.get_writer`` (defaults to the
            previously hard-coded 10).

    Returns:
        An ``IPython.display.HTML`` object whose ``<video>`` element embeds the
        encoded file as a base64 data URI.
    """
    if filename is None:
        filename = 'video'
    filename = r'videos/{0}.mp4'.format(filename)
    # Make sure the target directory exists before the writer opens the file
    os.makedirs(os.path.dirname(filename), exist_ok=True)

    with imageio.get_writer(filename, fps=fps) as video:
        for frame in frames:
            video.append_data(frame)

    # Read the encoded file back through a context manager so the handle is closed
    with open(filename, 'rb') as video_file:
        b64_video = base64.b64encode(video_file.read())

    # The element is closed explicitly so later notebook output is not swallowed by it
    video_tag = ('<video  width="640" height="480" controls alt="test" '
                 'src="data:video/mp4;base64,{0}"></video>').format(b64_video.decode())
    return IPython.display.HTML(video_tag)

In [6]:
# Build the wrapped environment for the configured game
env = make_environment(level, max_episode_len=max_episode_len)
# Get information about the environment (the sizes of the observation, action and reward spaces, ...)
env_spec = acme.make_environment_spec(env)

  _RESOLVED_ROMS = _resolve_roms()


In [7]:
# Load the ready-made R2D2 Atari network architecture from Acme,
# passing it the number of discrete actions from the environment spec
network = networks.R2D2AtariNetwork(env_spec.actions.num_values)

# R2D2 agent arguments:
#   env_spec       - environment information
#   network        - neural network used as the agent policy
#   burn_in_length - number of frames on which the hidden LSTM state is initialized
#   trace_length   - number of frames on which the LSTM learns
# The length of the whole stored trajectory = burn_in_length + trace_length.
# NOTE(review): replay_period is presumably the step interval between stored
# sequences — confirm against the Acme R2D2 agent documentation.
agent = r2d2.R2D2(
    env_spec,
    network,
    burn_in_length=40,
    trace_length=40,
    replay_period=1,
)

In [8]:
# Couple the environment with the agent and start the learning loop
loop = acme.EnvironmentLoop(environment=env, actor=agent)
loop.run(num_episodes=num_episodes)

INFO:tensorflow:Assets written to: /home/akozhevnikov/acme/394911ee-5454-11ec-9912-87ea4df561ef/snapshots/network/assets


INFO:tensorflow:Assets written to: /home/akozhevnikov/acme/394911ee-5454-11ec-9912-87ea4df561ef/snapshots/network/assets


In [9]:
frames = []
num_steps = max_episode_len
# Reset the environment to its initial state
timestep = env.reset()

for _ in range(num_steps):
    # Save the current frame (the terminal frame is recorded too, before the loop exits)
    frames.append(render(env))
    # If the game is over - stop
    if timestep.last():
        break
    # The agent chooses a new action from the latest observation
    action = agent.select_action(timestep.observation)
    # Get a new state from the environment depending on the agent's action
    timestep = env.step(action)

In [10]:
# Encode the recorded frames; the returned HTML player is this cell's output
display_video(frames=frames, filename='test')

