In [1]:
import gymnasium as gym
import numpy as np
import matplotlib.pyplot as plt
import cv2
from tqdm import tqdm

## Prepare the environment

In [2]:
class FrozenLake:
    def __init__(
        self, 
        max_steps:int=16, 
        is_slippery:bool=True, 
        render:bool=True, 
        custom_reward: bool=True
    ):
        self.max_steps = max_steps
        self.render = render
        self.custom_reward = custom_reward
        self.frozen_lake = gym.make(
            'FrozenLake-v1',
            desc=None,
            map_name="4x4",
            is_slippery=is_slippery,
            render_mode='rgb_array' if render else None
        )

    def generate_episode(
        self, 
        policy: np.ndarray, 
        exploration:bool=True
    ):
        state, _ = self.frozen_lake.reset()
        reward = 0.0
        trajectory = []
        # Render RGB trajectory to observe the states visually
        if self.render:
            render = [self.frozen_lake.render()]
            
        terminated = False
        while not terminated:
            if exploration:
                action = np.random.choice([0, 1, 2, 3], p=policy[state])
            else:
                action = np.argmax(policy[state])
            
            new_state, new_reward, terminated, _, _ = self.frozen_lake.step(action)
            if self.custom_reward:
                x = int(new_state / 4)
                y = new_state % 4
                if (x != 3 or y != 3) and terminated:
                    new_reward = -25
                else:
                    new_reward = -np.sqrt((x-3)**2 + (y-3)**2)

            trajectory.append({'reward':reward, 'state':state, 'action':action})

            # Render RGB trajectory to observe the states visually
            if self.render:
                render.append(self.frozen_lake.render())
                
            reward = new_reward
            state = new_state

            if self.max_steps <= len(trajectory) - 1:
                break

        trajectory.append({'reward':reward})
        if self.render:
            return trajectory, render
        return trajectory

    def generate_video(
        self,
        policy:np.ndarray,
        output_name:str='output.mp4',
        fps:float=1.5
    ):
        _, frames = self.generate_episode(policy=policy, exploration=False)
        fourcc = cv2.VideoWriter_fourcc(*'mp4v')
        video_writer = cv2.VideoWriter(output_name, fourcc, fps, (256, 256))
        for frame in frames:
            frame = frame[:, :, ::-1]
            video_writer.write(frame)
        video_writer.release()

## On-policy Monte Carlo Control

In [4]:
epsilon = 0.1
num_actions = 4
state_set = 16
gamma = 0.9
render_every_episodes = 1000
episodes = 5000
policy = np.ones([state_set, num_actions], dtype=np.float32) * epsilon / num_actions
policy[:, 0] = 1 - epsilon + epsilon / num_actions 
Q = np.zeros([state_set, num_actions], dtype=np.float32)
N = np.zeros([state_set, num_actions], dtype=np.float32)
mean_G = 0.0
frozen_lake = FrozenLake(is_slippery=False, custom_reward=True)

with tqdm(range(episodes)) as prog:
    for episode in prog:
        trajectory, frames = frozen_lake.generate_episode(policy=policy, exploration=True)
        G = 0.0

        for i in reversed(range(len(trajectory) - 1)):
            reward = trajectory[i + 1]['reward']
            state = trajectory[i]['state']
            action = trajectory[i]['action']
            G = gamma * G + reward

            is_visited = any([t['state'] == state and t['action'] == action for t in trajectory[:i]])
            if not is_visited:
                state, action = int(state), int(action)
                N[state, action] += 1
                Q[state, action] = Q[state, action] + (1 / N[state, action]) * (G - Q[state, action])
                greedy_action = np.argmax(Q[state])
                for a in range(num_actions):
                    if a == greedy_action:
                        policy[state, a] = 1 - epsilon + epsilon / num_actions
                    else:
                        policy[state, a] = epsilon / num_actions

        mean_G = mean_G + (1 / (episode + 1)) * (G - mean_G)
        if episode % render_every_episodes == 0:
            frozen_lake.generate_video(policy, fps=2, output_name=f'output_{episode}.mp4')
            print(f'mean return is {mean_G:>3.2f}')
            mean_G = 0.0

        prog.set_postfix({'G': f'{G:>2.2f}'})

  1%|▎                             | 54/5000 [00:00<00:17, 285.37it/s, G=-26.11]

mean return is -35.35


 21%|█████▉                       | 1027/5000 [00:04<00:17, 226.27it/s, G=-9.95]

mean return is -17.38


 40%|███████████▋                 | 2023/5000 [00:08<00:13, 214.72it/s, G=-9.95]

mean return is -6.36


 61%|█████████████████▋           | 3044/5000 [00:13<00:09, 211.57it/s, G=-9.95]

mean return is -4.12


 80%|███████████████████████▎     | 4023/5000 [00:18<00:04, 211.59it/s, G=-9.95]

mean return is -3.20


100%|████████████████████████████| 5000/5000 [00:22<00:00, 222.91it/s, G=-13.08]
