In [1]:
import gymnasium as gym
from gymnasium import spaces
from collections import defaultdict
import numpy as np

class NegativeHoleRewardWrapper(gym.RewardWrapper):
    def __init__(self, env):
        super().__init__(env)

    def step(self, action):
        observation, reward, terminated, truncated, info = self.env.step(action)
        if reward == 0 and terminated:
            reward = -1
        
        return observation, reward, terminated, truncated, info

env = gym.make('FrozenLake-v1', desc=["SFFF", "FHFH", "FFFH", "HFFG"])
wrapped_env = NegativeHoleRewardWrapper(env)
observation, info = wrapped_env.reset(seed=42)

In [2]:
def epsilon_greedy(Q_func, S, epsilon):
    r = np.random.random()
    if r > epsilon:
        return np.argmax(Q_func[S])
    else:
        return np.random.choice([0, 1, 2, 3])

def greedy(Q_func, S):
    return np.argmax(Q_func[S])

def Q_learning(_env, n_ep):
    Q_func = defaultdict(lambda: [np.random.random() * 0.05] * 4)
    decay_min = 0.01
    decay_const = 0.01
    
    successes = 0
    _gamma = 0.99
    _alpha = 0.5

    observation, info = _env.reset()
    behaviour_policy = epsilon_greedy
    target_policy = greedy

    for i in range(n_ep):
        observation, info = _env.reset()
        S = observation

        epsilon = decay_min + (1 - decay_min) * np.exp(-decay_const * i)
        
        # if agent is slow kill and replace
        if (i % 200) == 0 and successes < 5:
            Q_func = defaultdict(lambda: [np.random.random() * 0.05] * 4)
            
        while True:
            A = behaviour_policy(Q_func, S, epsilon)
            S_prime, R, terminated, truncated, info = _env.step(A)
    
            Q_func[S][A] += _alpha * (R + _gamma * np.max(Q_func[S_prime]) - Q_func[S][A])
            
            S = S_prime
            
            if terminated or truncated:
                if R == 1:
                    successes += 1
                break
                
    print("Number of successes:", successes)
    return Q_func

In [3]:
Q_func = Q_learning(env, 5000) 
# gets it a bit more than half the time
env.close()

Number of successes: 2929


In [4]:
def greedy(Q_func, S):
    return np.argmax(Q_func[S])

def generate_episode(policy, _env, Q_func):
    observation, info = _env.reset()

    while True:
        action = policy(Q_func, observation)
        observation, reward, terminated, truncated, info = _env.step(action)
        
        if terminated or truncated:
            break

env_1 = gym.make('FrozenLake-v1', desc=["SFFF", "FHFH", "FFFH", "HFFG"], render_mode='human')
observation, info = wrapped_env.reset(seed=42)

for _ in range(10):
    generate_episode(greedy, env_1, Q_func)
    
env_1.close()