In [None]:
from __future__ import annotations

import numpy as np
from matplotlib import pyplot as plt

%env SPDLOG_LEVEL=trace
import mcerl
from mcerl.env import Env


In [None]:
# --- Map -------------------------------------------------------------------
# 100x100 uint8 grid, 255 everywhere except two overlapping rectangles of 0.
# NOTE(review): presumably 255 = free space and 0 = obstacle; confirm against mcerl.
test_map = np.full((100, 100), 255, dtype=np.uint8)
test_map[20:70, 20:30] = 0
test_map[30:40, 10:50] = 0
test_grid_map = mcerl.GridMap(test_map)

# --- Agents ----------------------------------------------------------------
num_agents = 2
agent_poses = [(10, 10), (60, 60)]
velocity = 1

# --- Sensor ----------------------------------------------------------------
num_rays = 32
ray_range = 20

# --- Episode limits ----------------------------------------------------------
max_steps = 200
max_steps_per_agent = 40

# --- Frontier size bounds (passed as *_frontier_pixel) -----------------------
min_frontier_size = 5
max_frontier_size = 20

env_config = {
    "num_agents": num_agents,
    "max_steps": max_steps,
    "max_steps_per_agent": max_steps_per_agent,
    "velocity": velocity,
    "sensor_range": ray_range,
    "num_rays": num_rays,
    "min_frontier_pixel": min_frontier_size,
    "max_frontier_pixel": max_frontier_size,
}
env = Env(**env_config)

In [None]:
def policy(observation, rng=None):
    """Pick a frontier index uniformly at random.

    Parameters
    ----------
    observation : mapping
        Must provide ``"frontier_points"``; only its length is used.
    rng : numpy.random.Generator, optional
        Generator to draw from. When omitted, a fresh unseeded generator is
        created for the call (the original behaviour). Pass a seeded
        generator to make rollouts reproducible.

    Returns
    -------
    int
        An index in ``[0, len(frontier_points))``, or ``0`` when there are
        no frontier points.
    """
    action_space = len(observation["frontier_points"])
    if action_space <= 0:
        # Nothing to choose from: fall back to index 0.
        return 0
    if rng is None:
        rng = np.random.default_rng()
    # int(...) turns the NumPy integer into a plain Python int.
    return int(rng.integers(action_space))

In [None]:
# Roll out one episode, recording every frame returned by the environment.
trajectories = []
frame_data = env.reset(test_map, agent_poses)
trajectories.append(frame_data)
while True:
    # The current frame carries the id that is passed back to env.step.
    agent_id = frame_data["info"]["agent_id"]
    action_index = policy(frame_data["observation"])
    # Store the action on the frame it was computed from; that dict is already
    # in `trajectories`, so the recorded frame gains the "action" key.
    frame_data["action"] = action_index
    frame_data = env.step(agent_id, action_index)
    trajectories.append(frame_data)
    # Plain truthiness rather than `is True`: an identity check never fires for
    # a truthy non-bool (e.g. numpy.bool_), which would loop forever.
    if env.done():
        break

In [None]:
# Display every frame collected during the rollout (the output can be long).
trajectories

In [None]:
def split_trajectories(trajectories, n_agents=None):
    """Split a mixed trajectory into one trajectory per agent.

    Parameters
    ----------
    trajectories : iterable of mapping
        Frames in rollout order; each must provide ``frame["info"]["agent_id"]``.
        It is iterated once per agent, so pass a list, not a one-shot iterator.
    n_agents : int, optional
        Number of agents to split into. Defaults to the notebook-level
        ``num_agents`` so existing calls keep working.

    Returns
    -------
    list of list
        ``result[k]`` holds the frames whose agent id equals ``k``, in their
        original order. Frames with any other id are dropped. The frame
        objects themselves are shared with the input, not copied.
    """
    if n_agents is None:
        # Backward-compatible fallback to the notebook configuration.
        n_agents = num_agents
    return [
        [
            frame_data
            for frame_data in trajectories
            if frame_data["info"]["agent_id"] == agent_id
        ]
        for agent_id in range(n_agents)
    ]

In [None]:
def pad_trajectory(trajectory) -> list:
    """
    Cut a per-agent trajectory at its first done frame and pad that frame.

    In this environment, we won't get an observation when done is True.
    However, every frame needs one so the trajectories can be stacked, so the
    done frame borrows the observation of the frame before it (T-1 pads T).
    That is fine because this padded state is never used (normally).
    Frames after the first done frame, recorded while waiting for the
    remaining agents to finish, are dropped.

    The input frames are left untouched: the done frame is shallow-copied
    before its observation is replaced. If the very first frame is already
    done there is no earlier observation to borrow, so it is kept as is.

    Returns a new list in every case.
    """
    trajectory_out = []
    for frame in trajectory:
        if frame["done"]:
            if trajectory_out:
                # Shallow copy so the caller's frame (shared with the raw
                # rollout) is not modified.
                frame = dict(frame)
                frame["observation"] = trajectory_out[-1]["observation"]
            trajectory_out.append(frame)
            break
        trajectory_out.append(frame)
    return trajectory_out



In [None]:
def refine_trajectory(trajectory):
    """Turn consecutive frames into transitions.

    Input frames carry (O_k, I_k, D_k, A_k, R_k-1). Each output element pairs
    frame k with its successor:

        (O_k, I_k, D_k, A_k, next(R_k, O_k+1, I_k+1, D_k+1))

    The last frame only ever appears as a "next", so the result has one
    element fewer than the input (and is empty for inputs shorter than 2).
    """
    return [
        {
            "observation": current["observation"],
            "info": current["info"],
            "done": current["done"],
            "action": current["action"],
            "next": {
                "reward": following["reward"],
                "observation": following["observation"],
                "info": following["info"],
                "done": following["done"],
            },
        }
        for current, following in zip(trajectory, trajectory[1:])
    ]

In [None]:
import tensordict
from tensordict import LazyStackedTensorDict


def stack_trajectory(trajectory):
    """
    Wrap every frame in a TensorDict, then stack them with
    LazyStackedTensorDict.maybe_dense_stack.
    """
    frames_as_td = []
    for frame_data in trajectory:
        frames_as_td.append(tensordict.TensorDict(frame_data))
    return LazyStackedTensorDict.maybe_dense_stack(frames_as_td)

In [None]:
# Per agent: split the mixed rollout, cut/pad at the first done frame, then
# convert consecutive frames into transitions.
rollouts = [
    refine_trajectory(pad_trajectory(agent_frames))
    for agent_frames in split_trajectories(trajectories)
]

In [None]:
# One stacked tensordict per agent rollout.
stacked_rollouts = list(map(stack_trajectory, rollouts))

In [None]:
# Spot-check: the "next" observation of the first transition in agent 0's rollout.
stacked_rollouts[0][0]["next"]["observation"]