In [74]:
import gymnasium as gym
from gymnasium import spaces
from collections import defaultdict
import numpy as np

class NegativeHoleRewardWrapper(gym.RewardWrapper):
    def __init__(self, env):
        super().__init__(env)

    def step(self, action):
        observation, reward, terminated, truncated, info = self.env.step(action)
        if reward == 0 and terminated:
            reward = -1
        
        return observation, reward, terminated, truncated, info

env = gym.make('FrozenLake-v1', desc=["SFFF", "FHFH", "FFFH", "HFFG"], is_slippery=False)
wrapped_env = NegativeHoleRewardWrapper(env)
observation, info = wrapped_env.reset(seed=42)

In [107]:
successes = 0
    
def epsilon_greedy(Q_func, S, epsilon):
    r = np.random.random()
    if r > epsilon:
        return np.argmax(Q_func[S])
    else:
        return np.random.choice([0, 1, 2, 3])
        
def generate_episode(policy, _env, Q_func, N_s):
    N_0 = 100
    states, actions, rewards = [], [], []
    observation, info = _env.reset()
    global successes

    while True:
        N_s[observation] += 1
        epsilon = N_0/(N_0 + N_s[observation])
        
        action = policy(Q_func, observation, epsilon)
        states.append(observation)
        actions.append(action)
        
        observation, reward, terminated, truncated, info = _env.step(action)
        rewards.append(reward)
        
        if terminated or truncated:
            if rewards[-1] == 1:
                successes += 1
            break

    return states, actions, rewards

def FVMC_Control(policy, _env, n_ep):
    Q_func = defaultdict(lambda: [0] * 4)
    N = defaultdict(lambda: [0] * 4)
    N_s = defaultdict(int)

    for _ in range(n_ep):
        states, actions, rewards = generate_episode(policy, _env, Q_func, N_s)
        returns = 0

        for i in range(len(states) - 1, -1, -1):
            S = states[i]
            R = rewards[i]
            A = actions[i]
            returns += R

            if S not in states[:i]:
                N[S][A] += 1
                step_size = 1 / (N[S][A])
                Q_func[S][A] += (returns - Q_func[S][A]) * step_size

    return Q_func

# literally the best way to train here is to test 1000 episodes and gamble for a high success rate
Q_func = FVMC_Control(epsilon_greedy, env, 1000) 
print("With sparse rewards:", successes)

env.close()

With sparse rewards: 330


In [118]:
def greedy(Q_func, S, epsilon):
    return np.argmax(Q_func[S])

env_1 = gym.make('FrozenLake-v1', desc=["SFFF", "FHFH", "FFFH", "HFFG"], is_slippery=False, render_mode='human')
observation, info = wrapped_env.reset(seed=42)

for _ in range(10):
    generate_episode(greedy, env_1, Q_func, defaultdict(int))
env_1.close()