# Assignment 5

This is the starter code for Assignment 5. In this assignment, you will solve increasingly challenging tasks from the [Minigrid benchmark](https://minigrid.farama.org/). You can use any RL algorithm to solve these tasks.

* Problems 1 and 2 are mandatory for all students.
* Problem 3 is mandatory for COMP 552 but optional for COMP 442.
* Problem 4 is optional for all.

In [1]:
!pip install torch
!pip install gymnasium
!pip install minigrid

!pip install "ray[rllib]" tensorflow torch
!pip install stable-baselines3

Collecting gymnasium
  Downloading gymnasium-0.29.1-py3-none-any.whl (953 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m953.9/953.9 kB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
Collecting farama-notifications>=0.0.1 (from gymnasium)
  Downloading Farama_Notifications-0.0.4-py3-none-any.whl (2.5 kB)
Installing collected packages: farama-notifications, gymnasium
Successfully installed farama-notifications-0.0.4 gymnasium-0.29.1
Collecting minigrid
  Downloading minigrid-2.3.1-py3-none-any.whl (103 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m103.8/103.8 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: minigrid
Successfully installed minigrid-2.3.1
Collecting ray[rllib]
  Downloading ray-2.8.1-cp310-cp310-manylinux2014_x86_64.whl (62.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.6/62.6 MB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
Collecting tensorboardX>=1.9 (from ray[rllib])
  Downloadi

In [2]:
import gymnasium as gym
import minigrid
import numpy as np
from minigrid.wrappers import *
from stable_baselines3 import PPO

# Display NumPy floats with two decimal places whenever arrays are printed.
np.set_printoptions(formatter={'float': lambda x: "{0:0.2f}".format(x)})

pygame 2.5.2 (SDL 2.28.2, Python 3.10.12)
Hello from the pygame community. https://www.pygame.org/contribute.html


In [3]:
def compute_score(task, policy):
  """Evaluate `policy` on `task` over ten fixed-seed episodes.

  Each episode resets `task` with one seed from a fixed list, then repeatedly
  feeds the current observation to `policy` and steps the task with the
  returned action until the step reports `terminated` or `truncated`. The
  rewards collected during an episode are summed into that episode's score.

  Prints the best score and a (mean, std) tuple, each rounded to 3 decimals.

  Args:
    task: object providing `reset(seed=...)` returning `(observation, info)`
      and `step(action)` returning
      `(observation, reward, terminated, truncated, info)`.
    policy: callable mapping an observation to an action.

  Returns:
    NumPy array with the cumulative reward of each episode, in seed order.
  """
  episode_seeds = [42, 34, 50, 1, 9, 7, 43, 56, 90, 11]
  episode_returns = np.zeros(len(episode_seeds))

  for episode_index, episode_seed in enumerate(episode_seeds):
    observation, info = task.reset(seed=episode_seed)
    episode_return = 0

    # Roll out until the task signals the end of the episode.
    while True:
      action = policy(observation)
      observation, reward, terminated, truncated, info = task.step(action)
      episode_return += reward
      if terminated or truncated:
        break

    episode_returns[episode_index] = episode_return

  best = round(episode_returns.max(), 3)
  mean = round(episode_returns.mean(), 3)
  spread = round(episode_returns.std(), 3)

  print(f"Best score: {best}")
  print(f"Average score: {mean, spread}")

  return episode_returns

  and should_run_async(code)


## Problem 1
Solve the [Minigrid Unlock](https://minigrid.farama.org/environments/minigrid/UnlockEnv/) task.

This problem is worth 5 points.

![](https://minigrid.farama.org/_images/UnlockEnv.gif)

In [4]:
# Environment for Problem 1 (Minigrid Unlock), created without any observation wrapper.
first_task = gym.make("MiniGrid-Unlock-v0")

In [5]:
from stable_baselines3.common.torch_layers import BaseFeaturesExtractor
import torch
import torch.nn as nn

In [6]:
class MinigridFeaturesExtractor(BaseFeaturesExtractor):
    """Small CNN feature extractor for image observations.

    Three 2x2 convolutions (16, 32, then 64 output channels), each followed by
    a ReLU, are flattened and projected to `features_dim` by a linear layer
    with a ReLU.

    Args:
        observation_space: space of the observations; `shape[0]` is used as the
            number of input channels (assumes a channel-first layout; TODO confirm
            against the env wrappers in use).
        features_dim: size of the feature vector returned by `forward`.
        normalized_image: accepted in the signature but not used by this class.
    """

    def __init__(self, observation_space: gym.Space, features_dim: int = 512, normalized_image: bool = False) -> None:
        super().__init__(observation_space, features_dim)

        # Channel widths of the conv stack; the first entry is the input channel count.
        channel_widths = [observation_space.shape[0], 16, 32, 64]
        conv_stack = []
        for in_channels, out_channels in zip(channel_widths[:-1], channel_widths[1:]):
            conv_stack.append(nn.Conv2d(in_channels, out_channels, (2, 2)))
            conv_stack.append(nn.ReLU())
        self.cnn = nn.Sequential(*conv_stack, nn.Flatten())

        # Push one sampled observation through the conv stack to find the
        # flattened size the linear layer must accept.
        with torch.no_grad():
            sample_batch = torch.as_tensor(observation_space.sample()[None]).float()
            n_flatten = self.cnn(sample_batch).shape[1]

        self.linear = nn.Sequential(nn.Linear(n_flatten, features_dim), nn.ReLU())

    def forward(self, observations: torch.Tensor) -> torch.Tensor:
        """Run observations through the conv stack, then the linear projection."""
        conv_features = self.cnn(observations)
        return self.linear(conv_features)

In [7]:
######## PUT YOUR CODE HERE ########
# Train an agent to solve the task

# Use MinigridFeaturesExtractor as the policy's feature extractor, with a 128-dim output.
policy_kwargs = dict(
  features_extractor_class=MinigridFeaturesExtractor,
  features_extractor_kwargs=dict(features_dim=128),
)

# Training env: ImgObsWrapper is applied on top of the Unlock env, presumably so
# the policy receives the image array rather than the env's dict observation.
env = gym.make("MiniGrid-Unlock-v0", render_mode="rgb_array")
env = ImgObsWrapper(env)

model = PPO("CnnPolicy", env, policy_kwargs=policy_kwargs, verbose=1)
# total_timesteps is documented as an int, so pass 200_000 rather than the float 2e5.
model.learn(total_timesteps=200_000)
######## PUT YOUR CODE HERE ########

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Wrapping the env in a VecTransposeImage.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 288      |
|    ep_rew_mean     | 0        |
| time/              |          |
|    fps             | 646      |
|    iterations      | 1        |
|    time_elapsed    | 3        |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 288         |
|    ep_rew_mean          | 0           |
| time/                   |             |
|    fps                  | 318         |
|    iterations           | 2           |
|    time_elapsed         | 12          |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.015403754 |
|    clip_fraction        | 0.057       |
|    clip_range      

<stable_baselines3.ppo.ppo.PPO at 0x7d57de893ac0>

In [8]:
def first_policy(observation):
  """Return the Problem 1 model's action for a single observation.

  Calls `model.predict` on the observation, takes the first element of the
  result, and converts it to a plain Python scalar with `.item()`.
  """
  ######## PUT YOUR CODE HERE ########
  prediction = model.predict(observation)
  chosen_action = prediction[0].item()
  ######## PUT YOUR CODE HERE ########
  return chosen_action

  and should_run_async(code)


In [9]:
# Save the trained Problem 1 model under the name 'rlhw5p1model'.
model.save('rlhw5p1model')

In [None]:
# Scratch check: take one random action in the wrapped training env and display the step result.
env.step(env.action_space.sample())

In [90]:
# Scratch check: take one random action in the unwrapped task and display its raw step result.
first_task.step(first_task.action_space.sample())

({'image': array([[[0, 0, 0],
          [0, 0, 0],
          [0, 0, 0],
          [0, 0, 0],
          [0, 0, 0],
          [0, 0, 0],
          [0, 0, 0]],
  
         [[0, 0, 0],
          [0, 0, 0],
          [0, 0, 0],
          [0, 0, 0],
          [0, 0, 0],
          [0, 0, 0],
          [0, 0, 0]],
  
         [[2, 5, 0],
          [2, 5, 0],
          [2, 5, 0],
          [2, 5, 0],
          [2, 5, 0],
          [2, 5, 0],
          [2, 5, 0]],
  
         [[2, 5, 0],
          [1, 0, 0],
          [1, 0, 0],
          [1, 0, 0],
          [1, 0, 0],
          [4, 2, 0],
          [5, 2, 0]],
  
         [[2, 5, 0],
          [1, 0, 0],
          [1, 0, 0],
          [1, 0, 0],
          [1, 0, 0],
          [2, 5, 0],
          [1, 0, 0]],
  
         [[2, 5, 0],
          [1, 0, 0],
          [1, 0, 0],
          [1, 0, 0],
          [1, 0, 0],
          [2, 5, 0],
          [1, 0, 0]],
  
         [[2, 5, 0],
          [1, 0, 0],
          [1, 0, 0],
          [1, 0, 0],
 

In [None]:
# model.predict(env.observation_space.sample())[0].item()

In [None]:
# env.action_space.sample()

In [98]:
# Build first_task again (same env id as when it was first created) before scoring.
first_task = gym.make("MiniGrid-Unlock-v0")

In [11]:
# Score the trained policy; the task is wrapped in ImgObsWrapper, matching the env the model was trained on.
compute_score(task=ImgObsWrapper(first_task), policy=first_policy)

Best score: 0.984
Average score: (0.966, 0.011)


array([0.97, 0.95, 0.96, 0.95, 0.97, 0.98, 0.97, 0.96, 0.97, 0.97])

## Problem 2
Solve the [Minigrid Unlock and Pickup](https://minigrid.farama.org/environments/minigrid/UnlockPickupEnv/) task.

* This problem is worth 10 points for COMP 442 students.
* This problem is worth 5 points for COMP 552 students.

![](https://minigrid.farama.org/_images/UnlockPickupEnv.gif)

In [12]:
# Environment for Problem 2 (Minigrid Unlock and Pickup), created without any observation wrapper.
second_task = gym.make("MiniGrid-UnlockPickup-v0")

In [14]:
class DoorReward(gym.Wrapper):
    """Reward-shaping wrapper that adds a bonus for reaching an open door.

    Every reward returned by `step` is the (possibly replaced) reward divided
    by `self.divider`:

    * The first time in an episode that the cell `obs[3][5]` has 4 in its
      first channel and 0 in its third channel (the author's debug message
      called this "reached unlocked door"), the reward is replaced by a
      time-discounted bonus.
    * When the wrapped env reports `terminated`, "box found" is printed, the
      divider is set to 1, and the reward becomes twice the time discount.
    * On all other steps the wrapped env's reward is divided by 2000.

    Assumes `obs` is the image array (i.e. the env is already wrapped so that
    observations are not dicts) — TODO confirm against the caller.
    """

    def __init__(self, env):
        super().__init__(env)
        self._restart_episode_tracking()

    def _restart_episode_tracking(self):
        """Put the per-episode bookkeeping back to its initial values."""
        self.unlocked = 0    # becomes 1 once the open-door bonus has been paid
        self.time = 0        # number of steps taken in the current episode
        self.divider = 2000  # scale applied to every reward returned by step()

    def step(self, action):
        """Step the wrapped env and return its result with the shaped reward."""
        obs, reward, terminated, truncated, info = self.env.step(action)
        self.time += 1

        # Decays linearly with elapsed steps; 288 is presumably the episode
        # step limit of the wrapped task (TODO confirm).
        time_discount = 1-.9*(self.time/288)

        front_cell = obs[3][5]
        facing_open_door = np.equal(front_cell[0], 4).any() and np.equal(front_cell[2], 0).any()
        if facing_open_door and not self.unlocked:
            # Pre-multiplied by the divider so the division below cancels it.
            reward = time_discount*self.divider
            self.unlocked = 1

        if terminated:
            print("box found")
            self.divider = 1
            reward = time_discount*2*self.divider

        return obs, reward/self.divider, terminated, truncated, info

    def reset(self, **kwargs):
        """Reset the wrapped env, then the per-episode bookkeeping."""
        reset_result = super().reset(**kwargs)
        self._restart_episode_tracking()
        return reset_result

In [16]:
######## PUT YOUR CODE HERE ########
# Train an agent to solve the task

# Image observations of the Problem 2 task, with the DoorReward reward shaping on top.
env2 = ImgObsWrapper(second_task)
env2 = DoorReward(env2)

policy_kwargs2 = dict(
  features_extractor_class=MinigridFeaturesExtractor,
  features_extractor_kwargs=dict(features_dim=128),
)

model2 = PPO("CnnPolicy", env2, policy_kwargs=policy_kwargs2, verbose=1)
# Start from the Problem 1 model's parameters instead of a fresh initialization.
model2.set_parameters(model.get_parameters())

# total_timesteps is documented as an int, so pass 200_000 rather than the float 2e5.
model2.learn(total_timesteps=200_000)
######## PUT YOUR CODE HERE ########

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Wrapping the env in a VecTransposeImage.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 288      |
|    ep_rew_mean     | 0.963    |
| time/              |          |
|    fps             | 482      |
|    iterations      | 1        |
|    time_elapsed    | 4        |
|    total_timesteps | 2048     |
---------------------------------
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 288        |
|    ep_rew_mean          | 0.96       |
| time/                   |            |
|    fps                  | 333        |
|    iterations           | 2          |
|    time_elapsed         | 12         |
|    total_timesteps      | 4096       |
| train/                  |            |
|    approx_kl            | 0.07784966 |
|    clip_fraction        | 0.367      |
|    clip_range           | 0.2  

<stable_baselines3.ppo.ppo.PPO at 0x7d57dd643010>

In [20]:
# Train model2 for a further 200_000 steps; an int is passed because
# total_timesteps is documented as an int (the original passed the float 2e5).
model2.learn(total_timesteps=200_000)

box found
box found
box found
box found
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 203      |
|    ep_rew_mean     | 1.58     |
| time/              |          |
|    fps             | 629      |
|    iterations      | 1        |
|    time_elapsed    | 3        |
|    total_timesteps | 2048     |
---------------------------------
box found
box found
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 225         |
|    ep_rew_mean          | 1.41        |
| time/                   |             |
|    fps                  | 353         |
|    iterations           | 2           |
|    time_elapsed         | 11          |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.022056118 |
|    clip_fraction        | 0.192       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.674      |
|    explain

<stable_baselines3.ppo.ppo.PPO at 0x7d57dd643010>

In [22]:
# Save the trained Problem 2 model under the name 'rlhw5p2model'.
model2.save('rlhw5p2model')

In [17]:
def second_policy(observation):
  """Return the Problem 2 model's action for a single observation.

  Calls `model2.predict` on the observation, takes the first element of the
  result, and converts it to a plain Python scalar with `.item()`.
  """
  ######## PUT YOUR CODE HERE ########
  prediction = model2.predict(observation)
  chosen_action = prediction[0].item()
  ######## PUT YOUR CODE HERE ########
  return chosen_action

In [None]:
# compute_score(task=env2, policy=second_policy)

In [21]:
# Score on the task wrapped only in ImgObsWrapper (no DoorReward), so the
# reported scores are the environment's own rewards, not the shaped ones.
compute_score(task=ImgObsWrapper(second_task), policy=second_policy)

Best score: 0.931
Average score: (0.769, 0.288)


array([0.88, 0.86, 0.90, 0.92, 0.90, 0.93, 0.92, 0.47, 0.00, 0.91])

## Problem 3
Solve the [Minigrid Blocked, Unlock and Pickup](https://minigrid.farama.org/environments/minigrid/BlockedUnlockPickupEnv/) task.

* This problem is optional for COMP 442 students.
* This problem is mandatory for COMP 552 students.

This problem is worth 5 points for COMP 552 students.

![](https://minigrid.farama.org/_images/BlockedUnlockPickupEnv.gif)

In [None]:
# Environment for Problem 3 (Minigrid Blocked, Unlock and Pickup), created without any observation wrapper.
third_task = gym.make("MiniGrid-BlockedUnlockPickup-v0")

In [None]:
######## PUT YOUR CODE HERE ########
# Train an agent to solve the task

######## PUT YOUR CODE HERE ########

In [None]:
def third_policy(observation):
  """Placeholder policy for Problem 3.

  Ignores `observation` and returns an action sampled from
  `third_task.action_space`.
  """
  ######## PUT YOUR CODE HERE ########
  random_action = third_task.action_space.sample()
  ######## PUT YOUR CODE HERE ########
  return random_action

In [None]:
# Score third_policy on third_task, which is passed here without any observation wrapper.
compute_score(task=third_task, policy=third_policy)

## Problem 4
This is a bonus problem and optional for all.

In this problem, the goal is to train a single, unified agent that solves all three tasks shown above.

In [None]:
######## PUT YOUR CODE HERE ########
# Train an agent to solve the task

######## PUT YOUR CODE HERE ########

In [None]:
def unified_policy(observation):
  """Placeholder policy for Problem 4.

  Ignores `observation` and returns an action sampled from
  `first_task.action_space`.
  """
  ######## PUT YOUR CODE HERE ########
  random_action = first_task.action_space.sample()
  ######## PUT YOUR CODE HERE ########
  return random_action

In [None]:
# Per-episode scores of the unified policy on the first task (passed without an observation wrapper).
performance_on_first_task = compute_score(task=first_task, policy=unified_policy)

In [None]:
# Per-episode scores of the unified policy on the second task (passed without an observation wrapper).
performance_on_second_task = compute_score(task=second_task, policy=unified_policy)

In [None]:
# Per-episode scores of the unified policy on the third task (passed without an observation wrapper).
performance_on_third_task = compute_score(task=third_task, policy=unified_policy)

In [None]:
# Join the three per-episode score arrays into one flat array (axis=None flattens the inputs before joining).
total_performance = np.concatenate((performance_on_first_task, performance_on_second_task, performance_on_third_task), axis=None)

In [None]:
# Mean score over all episodes of all three tasks, rounded to 3 decimals.
print(f"Average score: {round(total_performance.mean(),3)}")