In [74]:
import gymnasium as gym
from gymnasium import spaces
from collections import defaultdict
import numpy as np

class NegativeHoleRewardWrapper(gym.RewardWrapper):
    """Reward shaping that penalises terminal transitions carrying reward 0.

    Every ``step`` result of the wrapped environment is forwarded unchanged,
    except that a transition with ``terminated`` set and a reward of 0 has its
    reward replaced by -1. For FrozenLake (goal reward 1, all other rewards 0,
    termination on goal or hole) this turns "fell into a hole" into -1.

    ``step`` is overridden directly because the shaping needs the
    ``terminated`` flag in addition to the raw reward.
    """

    def step(self, action):
        """Step the wrapped env and return its 5-tuple with the shaped reward."""
        obs, base_reward, terminated, truncated, info = self.env.step(action)

        # A terminal transition that paid nothing is treated as a hole.
        fell_in_hole = terminated and base_reward == 0
        shaped_reward = -1 if fell_in_hole else base_reward

        return obs, shaped_reward, terminated, truncated, info

# Custom 4x4 map with slipping disabled, so transitions are deterministic.
env = gym.make(
    'FrozenLake-v1',
    desc=["SFFF", "FHFH", "FFFH", "HFFG"],
    is_slippery=False,
)

# Same underlying environment, viewed through the -1-on-hole reward shaping.
wrapped_env = NegativeHoleRewardWrapper(env)
observation, info = wrapped_env.reset(seed=42)

In [107]:
# Running count of episodes whose final reward was 1. generate_episode
# increments it through a `global` declaration, so it must be defined (and
# reset to 0 here) before any rollout is generated.
successes = 0

def greedy(Q_func, S, epsilon=None):
    """Return the index of the highest-valued action for state ``S``.

    Parameters
    ----------
    Q_func : mapping
        Maps a state to a sequence of action values.
    S : hashable
        State whose action values are looked up.
    epsilon : float, optional
        Ignored. Accepted only so that ``greedy`` can be used wherever a
        policy is called as ``policy(Q_func, state, epsilon)`` (as
        ``generate_episode`` does); the original two-argument form raised
        ``TypeError`` in that situation.

    Returns
    -------
    The result of ``np.argmax(Q_func[S])``; ties resolve to the lowest index.
    """
    return np.argmax(Q_func[S])
    
def epsilon_greedy(Q_func, S, epsilon):
    """Pick an action for state ``S`` with epsilon-greedy exploration.

    One uniform sample is drawn from ``np.random.random()``. If it is strictly
    greater than ``epsilon`` the arg-max action of ``Q_func[S]`` is returned;
    otherwise an action is drawn uniformly from ``[0, 1, 2, 3]``.
    ``Q_func[S]`` is only read on the greedy branch.
    """
    draw = np.random.random()
    explore = not (draw > epsilon)

    if explore:
        return np.random.choice([0, 1, 2, 3])
    return np.argmax(Q_func[S])
        
def generate_episode(policy, _env, Q_func, N_s, N_0=100):
    """Roll out one episode in ``_env`` and return its trajectory.

    Parameters
    ----------
    policy : callable
        Called as ``policy(Q_func, observation, epsilon)``; must return the
        action to take.
    _env : environment
        Must provide ``reset()`` returning ``(observation, info)`` and
        ``step(action)`` returning
        ``(observation, reward, terminated, truncated, info)``.
    Q_func : mapping
        Passed through to ``policy`` untouched.
    N_s : mutable mapping
        Per-state visit counter. It is incremented in place for every state
        the agent acts from, so the schedule persists across episodes when
        the same mapping is reused.
    N_0 : number, optional
        Constant of the exploration schedule
        ``epsilon = N_0 / (N_0 + N_s[state])``. Larger values keep epsilon
        high for longer. Defaults to 100, the previously hard-coded value.

    Returns
    -------
    (states, actions, rewards) : tuple of lists
        ``states[t]`` is the observation the agent acted from, ``actions[t]``
        the action chosen there and ``rewards[t]`` the reward received for
        it. The final observation of the episode is not recorded.

    Side effects
    ------------
    Increments the module-level ``successes`` counter when the episode's last
    reward equals 1. That global must already exist, otherwise the increment
    raises ``NameError``.
    """
    global successes

    states, actions, rewards = [], [], []
    observation, info = _env.reset()

    while True:
        # Count the visit first so epsilon is strictly below 1 for N_0 > 0.
        N_s[observation] += 1
        epsilon = N_0 / (N_0 + N_s[observation])

        action = policy(Q_func, observation, epsilon)
        states.append(observation)
        actions.append(action)

        observation, reward, terminated, truncated, info = _env.step(action)
        rewards.append(reward)

        if terminated or truncated:
            if reward == 1:
                successes += 1
            break

    return states, actions, rewards

def FVMC_Control(policy, _env, n_ep, gamma=1.0):
    """First-visit Monte Carlo control with incremental-mean Q updates.

    Parameters
    ----------
    policy : callable
        Behaviour policy, forwarded to ``generate_episode`` (which calls it as
        ``policy(Q_func, state, epsilon)``).
    _env : environment
        Environment forwarded to ``generate_episode``.
    n_ep : int
        Number of episodes to sample.
    gamma : float, optional
        Discount factor applied when accumulating returns backwards through
        an episode. The default 1.0 gives undiscounted returns, as before.

    Returns
    -------
    defaultdict
        Maps each state to a list of 4 action-value estimates (the table is
        hard-wired to 4 actions). States never acted from are created lazily
        with all-zero entries on first lookup.

    Notes
    -----
    A visit is a (state, action) pair: the return following the *first*
    occurrence of each pair in an episode is averaged into
    ``Q_func[state][action]``. The earlier implementation tested only whether
    the state had appeared before, so an action tried on a later visit to the
    same state was never credited in that episode; it also re-sliced the
    state list at every step, making each episode quadratic in its length.
    """
    Q_func = defaultdict(lambda: [0] * 4)
    N = defaultdict(lambda: [0] * 4)   # per (state, action) update counts
    N_s = defaultdict(int)             # per-state visit counts driving epsilon

    for _ in range(n_ep):
        states, actions, rewards = generate_episode(policy, _env, Q_func, N_s)

        # Time step at which each (state, action) pair first occurs.
        first_visit = {}
        for t, pair in enumerate(zip(states, actions)):
            first_visit.setdefault(pair, t)

        returns = 0.0
        # Walk backwards so `returns` is always the return from step t onward.
        for t in range(len(states) - 1, -1, -1):
            S, A = states[t], actions[t]
            returns = gamma * returns + rewards[t]

            if first_visit[(S, A)] == t:
                N[S][A] += 1
                step_size = 1 / N[S][A]
                # Incremental mean of all first-visit returns seen so far.
                Q_func[S][A] += (returns - Q_func[S][A]) * step_size

    return Q_func

# Train on the unwrapped environment, i.e. with the original sparse rewards.
# Exploration is required here: the Q table starts at all zeros, so a purely
# arg-max policy would pick action 0 forever and never reach the goal.
Q_func = FVMC_Control(epsilon_greedy, env, 1000)
print("With sparse rewards:", successes)

env.close()

With sparse rewards: 330


In [114]:
# Visualise the learned policy in a rendering copy of the same map.
env_1 = gym.make('FrozenLake-v1', desc=["SFFF", "FHFH", "FFFH", "HFFG"], is_slippery=False, render_mode='human')


def _act_greedily(Q, S, _epsilon):
    """Adapter with the three-argument policy signature; ignores epsilon."""
    return greedy(Q, S)


try:
    # Reset the environment that is actually rendered (not the closed training env).
    observation, info = env_1.reset(seed=42)

    # Follow the arg-max action of the learned Q table. The previous
    # epsilon_greedy call with a fresh visit counter gave epsilon = 100/101 on
    # each first visit to a state, i.e. an almost uniformly random walk.
    # Note: generate_episode still bumps the global `successes` on every win.
    for _ in range(10):
        generate_episode(_act_greedily, env_1, Q_func, defaultdict(int))
finally:
    # Always release the render window, even if an episode raises.
    env_1.close()

In [113]:
# Display the learned table: state -> list of 4 action-value estimates.
Q_func

defaultdict(<function __main__.FVMC_Control.<locals>.<lambda>()>,
            {6: [0.0, 0.4285714285714286, 0.0, 0.0],
             2: [0.0, 0.0, 0.0, 0.0],
             1: [0.11764705882352941, 0.0, 0.0, 0.0],
             0: [0.08571428571428576,
              0.36509758897818606,
              0.08571428571428572,
              0.125],
             4: [0.08680555555555562, 0.7466666666666669, 0.0, 0.0],
             3: [0.0, 0.0, 0.0, 0.0],
             14: [0.7692307692307692,
              0.8636363636363635,
              1.0,
              0.6944444444444444],
             10: [0.5625, 0.864406779661017, 0.0, 0.17647058823529416],
             9: [0.5531914893617023,
              0.8364197530864198,
              0.5689655172413793,
              0.0],
             8: [0.4893617021276596,
              0.0,
              0.7248157248157256,
              0.4137931034482759],
             13: [0.0,
              0.7096774193548387,
              0.95539033457249,
              0