# Findgoal Environment 

> Basic scenario where agents need to find and reach a goal in the grid world. 

In [None]:
#| default_exp envs.marl_grid.findgoal

In [None]:
#| hide
from nbdev.showdoc import *

In [None]:
#| export
from fastcore import *
from fastcore.utils import *

In [None]:
#| export
import numpy as np

from mawm.envs.marl_grid.base import MultiGridEnv, MultiGrid
from mawm.envs.marl_grid.objects import Goal, Wall


def dis_func(x, y, k=1):
    """Return the Euclidean distance between `x` and `y`, divided by `k`.

    Args:
        x: First position; any array-like (tuple, list or ndarray).
        y: Second position, broadcast-compatible with `x`.
        k: Divisor applied to the distance (e.g. a maximum distance, to
            normalise the result). Defaults to 1 (no scaling).

    Returns:
        The scaled distance as a float.
    """
    # Coerce to arrays so plain tuples/lists work too; `tuple - tuple`
    # would otherwise raise a TypeError.
    return np.linalg.norm(np.asarray(x) - np.asarray(y)) / k



In [None]:
#| export
import gymnasium as gym
from typing import Optional 

class FindGoalMultiGrid(MultiGridEnv):
    """
    A single cluttered room with a green goal at random position.
    Each agent obtains a reward when reaching the goal.
    All agents must reach the goal to obtain a team reward.

    Config keys read directly by this class (the whole ``config`` is also
    forwarded to ``MultiGridEnv.__init__``):

    * ``n_clutter`` / ``clutter_density`` -- exactly one of the two must be
      given; it fixes how many ``Wall`` objects are scattered in the room.
    * ``randomize_goal`` -- truthy: the goal is placed at a random cell;
      otherwise it sits at ``(width - 2, height - 2)``.
    * ``spawn_without_goal_view`` (default ``True``) -- reject agent spawn
      cells that are too close to the goal.
    * ``min_goal_spawn_distance`` (default ``7``) -- minimum Manhattan
      distance between a spawn cell and the goal when the option above is on.
    """
    mission = 'get to the green square'
    metadata = {}

    def __init__(self, config):
        """Read the FindGoal-specific options and initialise the base env.

        Raises:
            ValueError: if both or neither of ``n_clutter`` and
                ``clutter_density`` are present in ``config``.
        """
        n_clutter = config.get('n_clutter')
        clutter_density = config.get('clutter_density')
        randomize_goal = config.get('randomize_goal')
        # NOTE(review): these two are assigned before super().__init__(),
        # presumably because the parent constructor may already trigger
        # reset(), which reads them -- confirm against MultiGridEnv.
        self.spawn_without_goal_view = config.get('spawn_without_goal_view', True)
        self.min_goal_spawn_distance = config.get('min_goal_spawn_distance', 7)

        # Exactly one of the two clutter options has to be specified.
        if (n_clutter is None) == (clutter_density is None):
            raise ValueError('Must provide n_clutter or clutter_density.')

        super().__init__(config)

        if clutter_density is not None:
            # Density is relative to the interior of the room
            # (the outer ring of cells is wall).
            self.n_clutter = int(
                clutter_density * (self.width - 2) * (self.height - 2))
        else:
            self.n_clutter = n_clutter

        self.randomize_goal = randomize_goal

    def _gen_grid(self, width, height):
        """Build a walled room, place the goal and the clutter walls.

        Returns:
            The goal position (whatever ``place_obj`` returns when the goal
            is randomised, otherwise a length-2 array).
        """
        self.grid = MultiGrid((width, height))
        self.grid.wall_rect(0, 0, width, height)

        # `randomize_goal` / `n_clutter` are only assigned after
        # super().__init__() returns, hence the getattr() fallbacks for any
        # grid generated before that point.
        if getattr(self, 'randomize_goal', True):
            goal_pos = self.place_obj(Goal(color='green', reward=1),
                                      max_tries=100)
        else:
            goal_pos = np.asarray([width - 2, height - 2])
            self.put_obj(Goal(color='green', reward=1), width - 2, height - 2)

        for _ in range(getattr(self, 'n_clutter', 0)):
            self.place_obj(Wall(), max_tries=100)

        return goal_pos

    def gen_global_obs(self, agent_done=None):
        """Assemble the global (privileged) observation dictionary.

        Side effect: refreshes ``self.sees_goal``.

        Args:
            agent_done: per-agent done information; defaults to a float
                array of zeros with one entry per agent.

        Returns:
            dict with keys ``adv_indices``, ``agent_done``, ``goal_pos``,
            ``sees_goal``, ``pos``, ``comm_act`` and ``env_act``.
        """
        if agent_done is None:
            # float array of zeros, one entry per agent
            agent_done = np.zeros((len(self.agents),), dtype=float)
        # `* 1` turns the boolean visibility flags into integers.
        self.sees_goal = np.array([self.agents[i].in_view(
                self.goal_pos[0], self.goal_pos[1]) for i in range(
                self.num_agents)]) * 1

        obs = {
            'adv_indices': self.adv_indices,
            'agent_done': agent_done,  # (N,)
            'goal_pos': self.goal_pos,  # (2,)
            'sees_goal': self.sees_goal,  # (N,)
            'pos': np.stack([self.get_agent_pos(a) for a in self.agents],
                            axis=0),  # (N, 2)
            'comm_act': np.stack([a.comm for a in self.agents],
                                 axis=0),  # (N, comm_len)
            'env_act': np.stack([a.env_act for a in self.agents],
                                axis=0),  # (N, 1)
        }
        return obs

    def reset(self, seed: Optional[int] = None, options: Optional[dict] = None):
        """
        Override reset to add goal visibility constraint during agent placement.

        This replicates MultiGridEnv.reset() but adds a rejection function
        for agent placement.

        Args:
            seed: forwarded to ``gym.Env.reset`` for seeding.
            options: accepted for signature compatibility; not used here.

        Returns:
            dict mapping ``'agent_{i}'`` to each agent's observation, plus a
            ``'global'`` entry produced by ``gen_global_obs``.
        """
        # Only call gym.Env.reset() for proper seeding (not MultiGridEnv.reset())
        gym.Env.reset(self, seed=seed)

        # Now do MultiGridEnv's reset logic with our modifications
        for agent in self.agents:
            agent.agents = []
            agent.reset(new_episode=True)

        # Generate grid and goal
        self.goal_pos = self._gen_grid(self.width, self.height)

        # Define rejection function if enabled
        if self.spawn_without_goal_view:
            def reject_spawn_fn(pos):
                """Reject positions too close to goal (where goal might be visible)"""
                # Manhattan distance between the candidate cell and the goal.
                dist = abs(pos[0] - self.goal_pos[0]) + abs(pos[1] - self.goal_pos[1])
                return dist < self.min_goal_spawn_distance
        else:
            reject_spawn_fn = None

        # Place agents with our custom rejection function; agents with a
        # non-zero spawn delay are not placed here.
        for agent in self.agents:
            if agent.spawn_delay == 0:
                self.place_obj(
                    agent,
                    reject_fn=reject_spawn_fn,
                    max_tries=1000,  # placement is constrained, allow more tries
                    **self.agent_spawn_kwargs
                )
                agent.activate()

        self.step_count = 0
        obs = self.gen_obs()
        obs_dict = {f'agent_{i}': obs[i] for i in range(len(obs))}

        # FindGoalMultiGrid-specific: handle adversaries
        if self.num_adversaries < 0:
            # Count the agents already flagged as adversaries.
            # NOTE(review): this makes num_adversaries non-negative, so every
            # later reset takes the randomising branch below -- confirm that
            # reshuffling after the first episode is intended.
            self.adv_indices = set()
            for i, agent in enumerate(self.agents):
                if agent.is_adversary:
                    self.adv_indices.add(i)
            self.num_adversaries = len(self.adv_indices)
        else:
            # Randomize adversary indices each episode.
            # NOTE(review): this draws from the global NumPy RNG, so `seed`
            # does not make the adversary assignment reproducible.
            adv_indices = np.random.choice(
                [i for i in range(self.num_agents)],
                self.num_adversaries,
                replace=False
            )
            for i, agent in enumerate(self.agents):
                if i in adv_indices:
                    agent.is_adversary = True
                else:
                    agent.is_adversary = False
            self.adv_indices = adv_indices

        obs_dict['global'] = self.gen_global_obs()
        return obs_dict

    def _get_reward(self, rwd, agent_no):
        """Distribute an environment reward `rwd` earned by agent `agent_no`.

        Returns:
            ``(env_rewards, step_rewards)`` -- two float arrays with one entry
            per agent. Both are all zeros when `agent_no` is an adversary.
        """
        step_rewards = np.zeros((len(self.agents),), dtype=float)
        env_rewards = np.zeros((len(self.agents),), dtype=float)
        if agent_no in self.adv_indices:
            # agent can only receive rewards if it is non-adversarial
            return env_rewards, step_rewards

        env_rewards[agent_no] += rwd
        if self.team_reward_type == 'share':
            # assign zero-sum rewards to both teams: every non-adversary
            # gets +rwd, every adversary gets -rwd
            for agent_id in range(self.num_agents):
                if agent_id not in self.adv_indices:
                    step_rewards[agent_id] += rwd
                    self.agents[agent_id].reward(rwd)
                else:
                    step_rewards[agent_id] -= rwd
                    self.agents[agent_id].reward(-rwd)
        else:
            # only the agent that earned the reward receives it
            step_rewards[agent_no] += rwd
            self.agents[agent_no].reward(rwd)
        return env_rewards, step_rewards

    def update_reward(self, step_rewards):
        """Add team / adversary terms to `step_rewards` (modified in place).

        Returns:
            ``(timeout, nonadv_done, step_rewards, ndis_to_goal)`` where
            `timeout` tells whether the step limit was reached, `nonadv_done`
            whether every non-adversarial agent is done, and `ndis_to_goal`
            is a list with each agent's distance to the goal divided by
            ``self.max_dis``.
        """
        nonadv_done_n = []
        adv_rew = 0.0
        for i, agent in enumerate(self.agents):
            if i not in self.adv_indices:
                # zero-sum reward between adversaries and non-adversaries
                nonadv_done_n.append(agent.done)
                adv_rew -= step_rewards[i]
        nonadv_done = all(nonadv_done_n)

        timeout = (self.step_count >= self.max_steps)

        # distance-to-goal scaled by max_dis (intended range: [0, 1])
        ndis_to_goal = [dis_func(agent.pos, self.goal_pos, k=self.max_dis)
                        for agent in self.agents]

        if self.team_reward_type == 'const':
            # give constant team reward to non-adversaries
            if nonadv_done:
                team_rwd = self.team_reward_multiplier
                for i, a in enumerate(self.agents):
                    if i not in self.adv_indices:
                        a.reward(team_rwd)
                        step_rewards[i] += team_rwd

                        # keep zero-sum reward between
                        # adversaries and non-adversaries
                        adv_rew -= team_rwd
        else:
            # no team reward
            pass

        if len(self.adv_indices) > 0:
            # split the (negative) total evenly across the adversaries
            adv_rew /= len(self.adv_indices)
            for i in self.adv_indices:
                step_rewards[i] += adv_rew
        return timeout, nonadv_done, step_rewards, ndis_to_goal

    def step(self, action_dict):
        """Advance the environment by one step.

        Returns:
            ``(obs_dict, rew_dict, done_dict, info_dict)``:
            `obs_dict` carries the per-agent observations plus ``'global'``;
            `rew_dict` maps ``'agent_{i}'`` to the step reward (including the
            communication reward); `done_dict` is ``{'__all__': done}``;
            `info_dict` holds per-agent diagnostics and a ``'rew_by_act'``
            reward breakdown.
        """
        obs_dict, rew_dict, _, info_dict = MultiGridEnv.step(self, action_dict)
        if self.active_after_done:
            done_n = [agent.at_pos(self.goal_pos) for agent in self.agents]
        else:
            done_n = [agent.done for agent in self.agents]

        step_rewards = rew_dict['step_rewards']
        env_rewards = rew_dict['env_rewards']
        comm_rewards = rew_dict['comm_rewards']
        comm_strs = info_dict['comm_strs']

        timeout, nonadv_done, step_rewards, ndis_to_goal = self.update_reward(
            step_rewards)

        # The episode overall is done if ALL non-adversarial agents are done,
        # or if it exceeds the step limit.
        done = timeout or nonadv_done
        if self.debug:
            done = any(done_n)

        step_rewards += comm_rewards

        # Build the global observation *before* the info dict: it refreshes
        # self.sees_goal, which would otherwise still hold the values from
        # the previous step/reset and disagree with obs_dict['global'].
        global_obs = self.gen_global_obs()

        rew_dict = {f'agent_{i}': step_rewards[i] for i in range(
            len(step_rewards))}
        done_dict = {'__all__': done}
        info_dict = {f'agent_{i}': {
            'done': done_n[i],
            'comm': self.agents[i].comm,
            'nonadv_done': nonadv_done,
            'posd': np.array([self.agents[i].pos[0], self.agents[i].pos[1],
                              done_n[i]]),
            'sees_goal': self.sees_goal[i],
            'comm_str': comm_strs[i],
        } for i in range(len(done_n))}

        info_dict['rew_by_act'] = {
            # env reward
            0: {f'agent_{i}': env_rewards[i] for i in range(len(env_rewards))},

            # designed comm reward
            'comm': {f'agent_{i}': comm_rewards[i] for i in range(len(
                comm_rewards))},
        }

        # team reward
        if self.separate_rew_more:
            # report only the part of the step reward beyond the env reward
            info_dict['rew_by_act'][1] = {f'agent_{i}': (
                    step_rewards[i] - env_rewards[i]) for i in range(
                len(step_rewards))}
        else:
            info_dict['rew_by_act'][1] = {f'agent_{i}': (
                step_rewards[i]) for i in range(len(step_rewards))}

        obs_dict['global'] = global_obs
        return obs_dict, rew_dict, done_dict, info_dict


In [None]:
#| export
@patch
def get_goal(self:FindGoalMultiGrid, agent, goal_pos, direction=None):
    """
    Generate the observation the agent would see one step BEFORE reaching the goal.

    The agent is temporarily moved to a cell adjacent to the goal, facing it,
    the observation is rendered, and the agent and grid are then put back
    exactly as they were (also when rendering raises).

    Args:
        agent: The agent object
        goal_pos: np.array [gx, gy] of goal position
        direction: Optional specific direction to approach from (0=right, 1=down, 2=left, 3=up)
                   If None, tries all 4 directions and returns the first valid one

    Returns:
        obs: The observation image, or None if goal is unreachable
        approach_dir: The direction used to approach the goal (None if unreachable)
    """
    # Unwrap environment if needed
    env = self.env if hasattr(self, 'env') else self

    gx, gy = goal_pos[0], goal_pos[1]

    # Save original agent state
    old_pos = agent.pos.copy() if agent.pos is not None else None
    old_dir = agent.dir

    # Direction vectors: 0=right, 1=down, 2=left, 3=up
    dir_vecs = {
        0: np.array([1, 0]),   # right
        1: np.array([0, 1]),   # down
        2: np.array([-1, 0]),  # left
        3: np.array([0, -1]),  # up
    }

    # If direction specified, try only that one; otherwise try all 4
    directions_to_try = [direction] if direction is not None else [0, 1, 2, 3]

    for try_dir in directions_to_try:
        # Cell from which one forward move in `try_dir` lands on the goal
        goal_pre_pos = np.array([gx, gy]) - dir_vecs[try_dir]

        # Skip positions outside the grid
        if (goal_pre_pos[0] < 0 or goal_pre_pos[0] >= env.width or
            goal_pre_pos[1] < 0 or goal_pre_pos[1] >= env.height):
            continue

        # Skip cells the agent cannot stand on (wall or blocking object)
        cell_at_pre_pos = env.grid.get(*goal_pre_pos)
        if cell_at_pre_pos is not None and not cell_at_pre_pos.can_overlap():
            continue

        # Bookkeeping of every temporary change, so that `finally` can undo
        # precisely what was done and nothing more.
        taken_off_grid = False     # agent was the grid cell at old_pos
        taken_from_cell = None     # cell whose `.agents` list held the agent
        placed_in_agents = False   # agent appended to cell_at_pre_pos.agents
        placed_on_grid = False     # agent written directly into the grid
        try:
            # Remove agent from old position (if it was placed)
            if old_pos is not None:
                old_cell = env.grid.get(*old_pos)
                if old_cell == agent:
                    env.grid.set(*old_pos, None)
                    taken_off_grid = True
                elif old_cell is not None and hasattr(old_cell, 'agents'):
                    if agent in old_cell.agents:
                        old_cell.agents.remove(agent)
                        taken_from_cell = old_cell

            # Place agent at pre-goal position facing the goal
            agent.pos = goal_pre_pos
            agent.dir = try_dir

            if cell_at_pre_pos is not None and cell_at_pre_pos.can_overlap():
                # Overlappable object already there: register the agent in
                # that cell's agent list instead of replacing the object
                if not hasattr(cell_at_pre_pos, 'agents'):
                    cell_at_pre_pos.agents = []
                cell_at_pre_pos.agents.append(agent)
                placed_in_agents = True
            else:
                # Empty cell: place agent directly
                env.grid.set(*goal_pre_pos, agent)
                placed_on_grid = True

            obs = env.gen_agent_obs(agent, image_only=True)
            return obs, try_dir

        finally:
            # Undo the temporary placement next to the goal
            if placed_in_agents:
                cell_at_pre_pos.agents.remove(agent)
            elif placed_on_grid:
                env.grid.set(*goal_pre_pos, None)

            # Put the agent back the same way it was taken out. Blindly
            # writing it into the grid would overwrite an overlappable
            # object it was standing on, or insert an agent that was never
            # on the grid at all.
            if taken_off_grid:
                env.grid.set(*old_pos, agent)
            elif taken_from_cell is not None:
                taken_from_cell.agents.append(agent)

            agent.pos = old_pos
            agent.dir = old_dir

    # No valid approach direction found
    print(f"Warning: Goal at ({gx}, {gy}) is unreachable - surrounded by walls")
    return None, None



In [None]:
#| export
import cv2
@patch
def get_layout(self: FindGoalMultiGrid, video_scale = 8, render_kwargs=None):
    """Render the grid layout as an RGB image with all agents deactivated.

    Every agent's `active` flag is set to False for the duration of the
    render and restored to its previous value afterwards, even if rendering
    raises.

    Args:
        video_scale: Scale factor applied to both image axes with
            `cv2.resize`; 1 leaves the rendered frame untouched.
        render_kwargs: Optional extra keyword arguments forwarded to
            `self.render`. `None` (default) means no extra arguments.

    Returns:
        The rendered (and possibly rescaled) layout image.
    """
    # Avoid a shared mutable default argument.
    extra_kwargs = {} if render_kwargs is None else render_kwargs

    # Remember each agent's state: unconditionally re-activating everyone
    # afterwards would switch on agents that were inactive before the call.
    previous_active = [agent.active for agent in self.agents]
    for agent in self.agents:
        agent.active = False
    try:
        layout = self.render(mode="rgb_array", show_more=True,
                             show_agent_views=False, **extra_kwargs)
    finally:
        for agent, was_active in zip(self.agents, previous_active):
            agent.active = was_active

    # The renderer may hand back several frames; keep the first one.
    if isinstance(layout, list) or len(layout.shape) > 3:
        layout = layout[0]

    if video_scale != 1:
        layout = cv2.resize(layout, None,
                            fx=video_scale,
                            fy=video_scale,
                            interpolation=cv2.INTER_AREA)

    return layout

In [None]:
#| hide
import nbdev; nbdev.nbdev_export()