Der Code entspricht dem Beispiel der Gymnasium-Dokumentation für das Erstellen einer Gridworld: https://gymnasium.farama.org/tutorials/gymnasium_basics/environment_creation/
Mehr Erläuterungen in "RenderingGymnasiumTutorial Notes".
TODO: render function is only considering human rendering; unclear what RGB-rendering does

In [1]:
import numpy as np
import pygame

import gymnasium as gym
from gymnasium import spaces


class GridWorldEnv(gym.Env):
    """Square grid world in which an agent moves one cell per step.

    The observation is a dict holding the integer ``(x, y)`` cell of the agent
    and of the target. An episode terminates when the agent's cell equals the
    target's cell; that step yields reward 1, every other step reward 0.

    Rendering is done with pygame. In ``"human"`` mode a window is opened
    lazily and refreshed on every ``reset``/``step``; in ``"rgb_array"`` mode
    ``render()`` returns the current frame as a NumPy array and no window is
    created.
    """

    metadata = {"render_modes": ["human", "rgb_array"], "render_fps": 4}

    def __init__(self, render_mode=None, size=8):
        """Create the spaces, the action table and the rendering state.

        Args:
            render_mode: ``None``, ``"human"`` or ``"rgb_array"``.
            size: Side length of the square grid, in cells.

        Raises:
            AssertionError: If ``render_mode`` is neither ``None`` nor listed
                in ``metadata["render_modes"]``.
        """
        self.size = size  # The size of the square grid
        self.window_size = 512  # The size of the PyGame window

        # Observations are dictionaries with the agent's and the target's
        # location. Each coordinate lies in {0, ..., size - 1}.
        self.observation_space = spaces.Dict(
            {
                "agent": spaces.Box(0, size - 1, shape=(2,), dtype=int),
                "target": spaces.Box(0, size - 1, shape=(2,), dtype=int),
            }
        )

        # Four discrete actions, indexed 0..3.
        self.action_space = spaces.Discrete(4)

        # Maps an abstract action from `self.action_space` to the (dx, dy)
        # offset applied to the agent: 0 -> (+1, 0), 1 -> (0, +1),
        # 2 -> (-1, 0), 3 -> (0, -1).
        self._action_to_direction = {
            0: np.array([1, 0]),
            1: np.array([0, 1]),
            2: np.array([-1, 0]),
            3: np.array([0, -1]),
        }

        assert render_mode is None or render_mode in self.metadata["render_modes"]
        self.render_mode = render_mode

        # `self.window` is the pygame display surface and `self.clock` the
        # clock used to pace human-mode rendering. Both stay `None` until
        # human-mode rendering happens for the first time.
        self.window = None
        self.clock = None

    def _get_obs(self):
        """Return the current observation dict (agent and target cells)."""
        return {"agent": self._agent_location, "target": self._target_location}

    def _get_info(self):
        """Return the auxiliary info dict with the L1 agent-target distance."""
        return {
            "distance": np.linalg.norm(
                self._agent_location - self._target_location, ord=1
            )
        }

    def reset(self, seed=None, options=None):
        """Start a new episode with random agent and target cells.

        Args:
            seed: Forwarded to ``gym.Env.reset`` to seed ``self.np_random``.
            options: Accepted for API compatibility; not used here.

        Returns:
            The tuple ``(observation, info)``.
        """
        # We need the following line to seed self.np_random
        super().reset(seed=seed)

        # Choose the agent's location uniformly at random
        self._agent_location = self.np_random.integers(0, self.size, size=2, dtype=int)

        # Re-sample the target's location until it differs from the agent's.
        self._target_location = self._agent_location
        while np.array_equal(self._target_location, self._agent_location):
            self._target_location = self.np_random.integers(
                0, self.size, size=2, dtype=int
            )

        observation = self._get_obs()
        info = self._get_info()

        if self.render_mode == "human":
            self._render_frame()

        return observation, info

    def step(self, action):
        """Move the agent by one cell and report the outcome.

        Args:
            action: Key of ``self._action_to_direction`` (0, 1, 2 or 3).

        Returns:
            The tuple ``(observation, reward, terminated, info)``.

        NOTE(review): this is a 4-tuple, while the current Gymnasium
        ``Env.step`` contract is ``(obs, reward, terminated, truncated,
        info)``. It is left unchanged because existing callers unpack four
        values -- confirm which API version is targeted before changing it.
        """
        # Map the action (element of {0,1,2,3}) to the direction we walk in
        direction = self._action_to_direction[action]
        # We use `np.clip` to make sure we don't leave the grid
        self._agent_location = np.clip(
            self._agent_location + direction, 0, self.size - 1
        )
        # An episode is done iff the agent has reached the target
        terminated = np.array_equal(self._agent_location, self._target_location)
        reward = 1 if terminated else 0  # Binary sparse rewards
        observation = self._get_obs()
        info = self._get_info()

        if self.render_mode == "human":
            self._render_frame()

        return observation, reward, terminated, info

    def render(self, mode=None):
        """Return the current frame when the env is in ``"rgb_array"`` mode.

        Args:
            mode: Ignored; kept so that legacy ``render(mode=...)`` calls do
                not fail. The mode chosen at construction time is used.

        Returns:
            An ``(height, width, 3)`` array in ``"rgb_array"`` mode,
            otherwise ``None`` (human-mode frames are drawn by
            ``reset``/``step``).
        """
        if self.render_mode == "rgb_array":
            return self._render_frame()

    def _render_frame(self):
        """Draw the scene; show it (human) or return it as an array.

        A window and clock are only created in ``"human"`` mode, so that
        ``"rgb_array"`` rendering works without opening a display.

        Returns:
            ``None`` in ``"human"`` mode, otherwise the frame as an
            ``(height, width, 3)`` NumPy array.
        """
        if self.window is None and self.render_mode == "human":
            pygame.init()
            pygame.display.init()
            self.window = pygame.display.set_mode(
                (self.window_size, self.window_size)
            )
        if self.clock is None and self.render_mode == "human":
            self.clock = pygame.time.Clock()

        canvas = pygame.Surface((self.window_size, self.window_size))
        canvas.fill((0, 0, 255))
        pix_square_size = (
            self.window_size / self.size
        )  # The size of a single grid square in pixels

        # Grey band covering the first grid row (y = 0), full width.
        pygame.draw.rect(
            canvas,
            (128, 128, 128),
            pygame.Rect(
                (0, 0),
                (self.window_size, pix_square_size),
            ),
        )

        # Grey band covering the last grid row, full width.
        pygame.draw.rect(
            canvas,
            (128, 128, 128),
            pygame.Rect(
                (0, self.window_size - pix_square_size),
                (self.window_size, pix_square_size),
            ),
        )

        # Brown strip, two cells wide and horizontally centred, spanning the
        # rows between the two grey bands.
        pygame.draw.rect(
            canvas,
            (139, 69, 19),
            pygame.Rect(
                ((self.size / 2) * pix_square_size - pix_square_size, pix_square_size),
                (pix_square_size * 2, self.window_size - 2 * pix_square_size),
            ),
        )

        # NOTE(review): the target cell itself is not drawn anywhere in this
        # method -- confirm whether that is intentional.

        # Now we draw the agent
        pygame.draw.circle(
            canvas,
            (255, 255, 255),
            (self._agent_location + 0.5) * pix_square_size,
            pix_square_size / 3,
        )

        # Finally, add some gridlines
        for x in range(self.size + 1):
            pygame.draw.line(
                canvas,
                0,
                (0, pix_square_size * x),
                (self.window_size, pix_square_size * x),
                width=3,
            )
            pygame.draw.line(
                canvas,
                0,
                (pix_square_size * x, 0),
                (pix_square_size * x, self.window_size),
                width=3,
            )

        if self.render_mode == "human":
            # Copy our drawings from `canvas` to the visible window
            self.window.blit(canvas, canvas.get_rect())
            pygame.event.pump()
            pygame.display.update()

            # Keep human rendering at the predefined framerate; `tick` adds
            # the delay needed to hold it stable.
            self.clock.tick(self.metadata["render_fps"])
            return None

        # rgb_array: pygame stores pixels as (width, height, 3); swap the
        # first two axes to get the usual (height, width, 3) image layout.
        return np.transpose(
            np.array(pygame.surfarray.pixels3d(canvas)), axes=(1, 0, 2)
        )

    def close(self):
        """Shut down pygame if a window was opened; safe to call repeatedly."""
        if self.window is not None:
            pygame.display.quit()
            pygame.quit()
            # Drop the stale handles so a second `close()` is a no-op and a
            # later human-mode render re-creates the window.
            self.window = None
            self.clock = None

pygame 2.1.3 (SDL 2.0.22, Python 3.10.12)
Hello from the pygame community. https://www.pygame.org/contribute.html


Zum Umgang mit Spaces und dem Update der Agent- und Target-Position im Observation Space (impliziert vollständige Beobachtbarkeit des Environments):

In [2]:
# NOTE(review): if this cell runs in the same session as a cell that did
# `import gymnasium as gym`, the import below rebinds the name `gym`.
import gym

size = 5  # Replace with the desired size

# One 2-component integer Box (coordinates 0 .. size - 1) per entity,
# bundled into a single Dict space.
observation_space = gym.spaces.Dict(
    {
        entity: gym.spaces.Box(0, size - 1, shape=(2,), dtype=int)
        for entity in ("agent", "target")
    }
)

# Look up the sub-space stored under the "agent" key and display it.
agent_space = observation_space["agent"]
print(agent_space)

# Overwrite the "agent" entry with a freshly drawn integer position.
# NOTE: after this assignment the entry holds an ndarray, not a space.
observation_space["agent"] = np.random.default_rng().integers(
    0, size, size=2, dtype=int
)
print(type(observation_space["agent"]))

Box(0, 4, (2,), int64)
<class 'numpy.ndarray'>


In this case, the entry_point is set to a lambda function that creates an instance of your CustomEnv class. This approach is necessary because the Jupyter environment does not directly correspond to a Python module, and the traditional module-based registration may not work as expected.

In [3]:
# Register the environment under the id 'grid_world-v0' so that it can be
# built with `gym.make`. A callable is passed as entry point instead of a
# "module:Class" string.
gym.register(
    id='grid_world-v0',
    entry_point=lambda render_mode=None: GridWorldEnv(render_mode=render_mode),  # forwards only `render_mode`; `size` keeps the class default
    max_episode_steps=300,
    kwargs={'render_mode': None}
)


In [4]:
# Build the registered environment with human rendering requested.
print("start")
env = gym.make('grid_world-v0', render_mode='human')

start


In [5]:
# Draw one random element of the observation space to inspect its structure.
env.observation_space.sample()

OrderedDict([('agent', array([0, 6])), ('target', array([0, 3]))])

In [6]:
# Roll out `episodes` episodes with uniformly random actions, printing every
# action and the observation that follows it.
episodes = 10
# NOTE(review): GridWorldEnv.render ignores `mode` and only returns a frame in
# "rgb_array" mode, so this call presumably has no visible effect -- confirm.
env.render(mode="human")
for episode in range(1, episodes+1):
    # NOTE(review): GridWorldEnv.reset returns (observation, info); what ends
    # up in `state` depends on the wrappers added by `gym.make` -- verify.
    state = env.reset()
    done = False
    
    while not done:
        action = env.action_space.sample()
        print(action)
        # Four-value unpacking matches the 4-tuple returned by GridWorldEnv.step.
        state, reward, done, info = env.step(action)
        print(state)

2
{'agent': array([6, 3]), 'target': array([0, 3])}
3
{'agent': array([6, 2]), 'target': array([0, 3])}
3
{'agent': array([6, 1]), 'target': array([0, 3])}
1
{'agent': array([6, 2]), 'target': array([0, 3])}
1
{'agent': array([6, 3]), 'target': array([0, 3])}
3
{'agent': array([6, 2]), 'target': array([0, 3])}
0
{'agent': array([7, 2]), 'target': array([0, 3])}
1
{'agent': array([7, 3]), 'target': array([0, 3])}
0
{'agent': array([7, 3]), 'target': array([0, 3])}
0
{'agent': array([7, 3]), 'target': array([0, 3])}
2
{'agent': array([6, 3]), 'target': array([0, 3])}
2
{'agent': array([5, 3]), 'target': array([0, 3])}
3
{'agent': array([5, 2]), 'target': array([0, 3])}
3
{'agent': array([5, 1]), 'target': array([0, 3])}
1
{'agent': array([5, 2]), 'target': array([0, 3])}
0
{'agent': array([6, 2]), 'target': array([0, 3])}
1
{'agent': array([6, 3]), 'target': array([0, 3])}
2
{'agent': array([5, 3]), 'target': array([0, 3])}
0
{'agent': array([6, 3]), 'target': array([0, 3])}
2
{'agent': 

KeyboardInterrupt: 

Computes the Manhattan distance between the agent and the target: