In [1]:
#!pip install flatland-rl
#!pip install tensorforce

In [2]:
from tensorforce import Agent, Environment
from obs_utils import normalize_observation

In [100]:
from flatland.envs.rail_env import RailEnv, RailEnvActions
import numpy as np


class OurEnv(RailEnv):
    def reset(self, *args, render=True, **kwargs):
        observation, info_dict = super().reset(*args, **kwargs)
       #self.env_renderer = RenderTool(env)
        if render:
            self.step({0: RailEnvActions.MOVE_FORWARD})
            render_env(self)
        return observation

class TensorforceEnv(Environment):
    def __init__(self, rail_env, num_agents, state_shape):
        self._rail_env = rail_env
        self.num_agents = num_agents
        state, _ = self._rail_env.reset()
        self.state_shape = state_shape
        super().__init__()
    
    def process_state(self, state):
        return state #np.array(state[0][1]).flatten()

    def process_reward(self, reward):
        reward_sum = 0
        for _, train_reward in reward.items():
            reward_sum += train_reward
        return reward_sum

    def states(self):
        #state = self._rail_env._get_observations()
        #return dict(type='float', shape=(width,height,16))
        #return self._state
        return dict(type="float", min_value=-1000.0, max_value=1000.0, shape=self.state_shape)

    def actions(self):
        return dict(type='int', num_values=len(RailEnvActions), shape=(1,))

    # Optional additional steps to close environment
    def close(self):
        # Maybe render?
        super().close()

    def reset(self):
        state, info = self._rail_env.reset()
        self._state = state
        #state = np.random.random(size=(8,))
        return state

    def execute(self, actions):
        #actions = {index: RailEnvActions(value) for index, value in enumerate(actions)}
        
        state, reward, done, info  = self._rail_env.step(actions)
        
        if not done["__all__"]:
            try:
                state = self.process_state(state)
            except:
                print(state)
            self._state = state
        reward = self.process_reward(reward)
        
        return self._state, done, reward 

In [49]:
from flatland.utils.rendertools import RenderTool
import matplotlib.pyplot as plt

def render_env(env, figsize=(8, 8)):
  """Show the environment using matplotlib"""
  env_renderer = RenderTool(env, gl="PILSVG")
  # img is a numpy array
  img = env_renderer.render_env(show=True, return_image=True)

  plt.figure(figsize=figsize)
  plt.imshow(img)
  plt.show()


In [68]:
from flatland.envs.rail_generators import sparse_rail_generator
from flatland.envs.schedule_generators import sparse_schedule_generator
from flatland.envs.rail_env import RailEnv, RailEnvActions
from flatland.envs.rail_generators import complex_rail_generator
from flatland.envs.observations import GlobalObsForRailEnv, TreeObsForRailEnv

seed = 69 #nice 


width = 10 # @param{type: "integer"}
height = 10 # @param{type: "integer"}
num_agents =  2  # @param{type: "integer"}
tree_depth = 2 # @param{type: "integer"}
radius_observation = 10
WINDOW_LENGTH =   22# @param{type: "integer"}


random_rail_generator = complex_rail_generator(
    nr_start_goal=10, # @param{type:"integer"} number of start and end goals 
                      # connections, the higher the easier it should be for 
                      # the trains
    nr_extra=10, # @param{type:"integer"} extra connections 
                 # (useful for alternite paths), the higher the easier
    min_dist=10,
    max_dist=99999,
    seed=seed
)


env = RailEnv(
    width=width,
    height=height,
    rail_generator=random_rail_generator,
    obs_builder_object=TreeObsForRailEnv(tree_depth),
    number_of_agents=num_agents
)

obs, info = env.reset()

state_shape = normalize_observation(obs[0], tree_depth, radius_observation).shape

environment = TensorforceEnv(env, num_agents, state_shape)

In [7]:
obs, info = env.reset(regenerate_rail=True, regenerate_schedule=True)
#normalize_observation(obs[0], tree_depth, 1000).shape
obs

{0: Node(dist_own_target_encountered=0, dist_other_target_encountered=0, dist_other_agent_encountered=0, dist_potential_conflict=0, dist_unusable_switch=0, dist_to_next_branch=0, dist_min_to_target=41.0, num_agents_same_direction=0, num_agents_opposite_direction=0, num_agents_malfunctioning=0, speed_min_fractional=1.0, num_agents_ready_to_depart=0, childs={'L': -inf, 'F': Node(dist_own_target_encountered=inf, dist_other_target_encountered=inf, dist_other_agent_encountered=inf, dist_potential_conflict=inf, dist_unusable_switch=inf, dist_to_next_branch=6, dist_min_to_target=35.0, num_agents_same_direction=0, num_agents_opposite_direction=0, num_agents_malfunctioning=0, speed_min_fractional=1.0, num_agents_ready_to_depart=0, childs={'L': -inf, 'F': Node(dist_own_target_encountered=inf, dist_other_target_encountered=inf, dist_other_agent_encountered=inf, dist_potential_conflict=inf, dist_unusable_switch=14, dist_to_next_branch=15, dist_min_to_target=26.0, num_agents_same_direction=0, num_a

In [69]:
from tensorforce.agents import DeepQNetwork

agent = DeepQNetwork.create(
    agent='tensorforce',
    environment=environment,  # alternatively: states, actions, (max_episode_timesteps)
    memory=10000,
    update=dict(unit='timesteps', batch_size=64),
    optimizer=dict(type='adam', learning_rate=3e-4),
    policy=dict(network='auto'),
    objective='policy_gradient',
    reward_estimation=dict(horizon=20)
)

In [9]:
'''         # Reset environment
        obs, info = env.reset(True, True)
        env_renderer.reset()
        # Build agent specific observations
        for a in range(env.get_num_agents()):
            if obs[a]:
                agent_obs[a] = normalize_observation(obs[a], tree_depth, observation_radius=10)
                agent_obs_buffer[a] = agent_obs[a].copy()

        # Reset score and done
        score = 0
        env_done = 0



for agent in range(env.get_num_agents()):
                # Only update the values when we are done or when an action was taken and thus relevant information is present
                if update_values or done[agent]:
                    policy.step(agent_prev_obs[agent], agent_prev_action[agent], all_rewards[agent], agent_obs[agent], done[agent])

                    agent_prev_obs[agent] = agent_obs[agent].copy()
                    agent_prev_action[agent] = action_dict[agent]

                if next_obs[agent]:
                    agent_obs[agent] = normalize_observation(next_obs[agent], observation_tree_depth, observation_radius=10)

                score += all_rewards[agent]

            if done['__all__']:
                break
 '''

"         # Reset environment\n        obs, info = env.reset(True, True)\n        env_renderer.reset()\n        # Build agent specific observations\n        for a in range(env.get_num_agents()):\n            if obs[a]:\n                agent_obs[a] = normalize_observation(obs[a], tree_depth, observation_radius=10)\n                agent_obs_buffer[a] = agent_obs[a].copy()\n\n        # Reset score and done\n        score = 0\n        env_done = 0\n\n\n\nfor agent in range(env.get_num_agents()):\n                # Only update the values when we are done or when an action was taken and thus relevant information is present\n                if update_values or done[agent]:\n                    policy.step(agent_prev_obs[agent], agent_prev_action[agent], all_rewards[agent], agent_obs[agent], done[agent])\n\n                    agent_prev_obs[agent] = agent_obs[agent].copy()\n                    agent_prev_action[agent] = action_dict[agent]\n\n                if next_obs[agent]:\n            

In [101]:
# Train for 300 episodes
for episode in range(300):

    episode_states = list()
    episode_internals = list()
    episode_actions = list()
    episode_terminal = list()
    episode_reward = list()

    # Initialize episode
    states = environment.reset()
    internals = agent.initial_internals()

    terminal = {i: False for i in range(0, num_agents)}
    terminal["__all__"]= False
    #num_updates = 0
    sum_rewards = 0.0
    

    while not terminal["__all__"]:
        actions = {}
        agents_obs = {}
        
        episode_states.append(states)
        episode_internals.append(internals)

        for i in range(0, num_agents):
            if not terminal[i]:
                agents_obs[i] = normalize_observation(states[i], tree_depth, radius_observation)
                actions[i], internals = agent.act(agents_obs[i], internals=internals, independent=True)
                
        episode_actions.append(actions)
        states, terminal, reward = environment.execute(actions=actions)
        
        episode_terminal.append(terminal["__all__"])
        episode_reward.append(reward)
        sum_rewards += reward

    print(episode_terminal)
    print('Episode {}: {}'.format(episode, sum_rewards))
    
    agent.experience(
        states=episode_states, internals=episode_internals, actions=episode_actions,
        terminal=episode_terminal, reward=episode_reward
    )

    # Perform update
    agent.update()
        
agent.close()
environment.close()

[False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False

TensorforceError: Calling agent.experience is not possible mid-episode.

In [None]:
from tensorforce.execution import Runner

runner = Runner(agent=agent, environment=environment, max_episode_timesteps=500)

runner.run(num_episodes=5)

runner.run(num_episodes=2, evaluation=True)

runner.close()

Episodes:   0%|          | 0/5 [00:00, reward=0.00, ts/ep=0, sec/ep=0.00, ms/ts=0.0, agent=0.0%]

Episodes:   0%|          | 0/2 [00:00, reward=0.00, ts/ep=0, sec/ep=0.00, ms/ts=0.0, agent=0.0%]

In [None]:
if not all(len(buffer) == 0 for buffer in self.terminal_buffer):
            raise TensorforceError(message="Calling agent.experience is not possible mid-episode.")

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=c8b2a743-4403-48d9-b1f8-a1215902878c' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>