In [1]:
import gym, random
import numpy as np
from collections import Counter

In [2]:
# Seed every source of randomness up front so that Restart Kernel -> Run All
# reproduces the same exploration sequence and therefore the same outputs.
SEED = 42
random.seed(SEED)  # select_action draws its exploration moves from the stdlib `random` module

# Deterministic (non-slippery) FrozenLake: each action moves exactly as requested.
env = gym.make("FrozenLake-v1", is_slippery=False)
state, _ = env.reset(seed=SEED)  # reset() returns (observation, info); info is not needed here

# Sizes of the discrete state and action spaces; these define the Q-table's keys.
n_states = env.observation_space.n
n_actions = env.action_space.n
print(n_states)
print(n_actions)

16
4


In [3]:
def initialize_q_table(n_state, n_actions):
    """Build a Q-table with an explicit zero entry for every (state, action) pair.

    Args:
        n_state: Number of discrete states; states are ``0 .. n_state - 1``.
        n_actions: Number of discrete actions; actions are ``0 .. n_actions - 1``.

    Returns:
        A ``Counter`` keyed by ``(state, action)`` tuples, every value ``0``.
        Keys are inserted state-major (all actions of state 0, then state 1, ...).
    """
    # Counter built from a mapping keeps the zero-valued entries as real keys.
    return Counter({
        (s, a): 0
        for s in range(n_state)
        for a in range(n_actions)
    })
    

In [4]:
# All-zero Q-table covering every (state, action) pair of the environment.
q_table = initialize_q_table(n_state=n_states, n_actions=n_actions)

In [5]:
def select_action(state, q_table, n_actions, epsilon=0.5):
    """Choose an action for ``state`` with an epsilon-greedy policy.

    Args:
        state: Current state; used as the first element of the Q-table key.
        q_table: Mapping from ``(state, action)`` to a Q-value.
        n_actions: Number of discrete actions; actions are ``0 .. n_actions - 1``.
        epsilon: Probability of exploring (picking a uniformly random action).

    Returns:
        The chosen action index. On the greedy branch ties go to the lowest
        action index, and ``None`` is returned if no action has a Q-value
        strictly greater than ``-inf`` (e.g. when ``n_actions`` is 0).
    """
    # Explore: one uniform draw decides the branch, a second picks the action.
    if random.random() < epsilon:
        return random.randrange(start=0, stop=n_actions)

    # Exploit: linear scan for the highest Q-value; strict '>' keeps the first maximum.
    best_action, best_value = None, float('-inf')
    for candidate in range(n_actions):
        value = q_table[(state, candidate)]
        if value > best_value:
            best_action, best_value = candidate, value
    return best_action

In [6]:
# Quick check on the start state; the default epsilon=0.5 means the result may be a random action.
select_action(state, q_table, n_actions=n_actions)

0

In [7]:
def update_table(env, q_table, state, action, n_actions, alpha=0.1, gamma=0.99):
    """Take one environment step and apply the Q-learning update in place.

    Update rule:
        Q(s, a) <- Q(s, a) + alpha * (r + gamma * max_a' Q(s', a') - Q(s, a))
    where the bootstrap term ``max_a' Q(s', a')`` is taken as 0 when the
    transition is *terminated*: a terminal state has no future return, so
    whatever values the table holds for it must not leak into the target.
    A *truncated* transition (time limit) still bootstraps, because the
    next state is not terminal.

    Args:
        env: Environment whose ``step(action=...)`` returns the 5-tuple
            ``(next_state, reward, terminated, truncated, info)``.
        q_table: Mapping from ``(state, action)`` to Q-value; mutated in place.
        state: State the action is taken from.
        action: Action to execute.
        n_actions: Number of discrete actions; actions are ``0 .. n_actions - 1``.
        alpha: Learning rate.
        gamma: Discount factor.

    Returns:
        ``(next_state, reward, done)`` where ``done`` is
        ``terminated or truncated``.
    """
    nxt_st, reward, terminated, truncated, _ = env.step(action=action)
    q_val = q_table[(state, action)]

    if terminated:
        # No future reward is obtainable from a terminal state.
        max_next_q = 0.0
    else:
        max_next_q = max(q_table[(nxt_st, n)] for n in range(n_actions))

    q_table[(state, action)] = q_val + alpha * (reward + (gamma * max_next_q) - q_val)
    return nxt_st, reward, terminated or truncated

In [8]:
def training_loop(env, q_table, n_actions, episodes=1000, epsilon=1.0, decay_rate=0.01, max_steps=200,
                  epsilon_decay=0.995, min_epsilon=0.01, log_every=100):
    """Train ``q_table`` in place with epsilon-greedy Q-learning.

    Every episode resets ``env`` and then repeatedly picks an action with
    ``select_action`` and applies it with ``update_table`` until the episode
    is reported done or ``max_steps`` steps have been taken. After each
    episode epsilon is multiplied by ``epsilon_decay`` and clamped from below
    at ``min_epsilon``.

    Args:
        env: Environment whose ``reset()`` returns ``(state, info)``; it is
            also handed to ``update_table`` for stepping.
        q_table: Mapping from ``(state, action)`` to Q-value; mutated in place.
        n_actions: Number of discrete actions.
        episodes: Number of episodes to run.
        epsilon: Initial exploration probability.
        decay_rate: Unused. Retained only so existing positional/keyword
            callers keep working; the decay is controlled by ``epsilon_decay``.
        max_steps: Per-episode step cap (at least one step is always taken).
        epsilon_decay: Multiplicative factor applied to epsilon after every episode.
        min_epsilon: Lower bound for epsilon.
        log_every: Print a progress line whenever ``ep % log_every == 0``;
            a falsy value (``0`` / ``None``) disables logging.

    Returns:
        List with the total reward collected in each episode.
    """
    rewards_per_episode = []

    for ep in range(episodes):
        state, _ = env.reset()
        total_reward = 0
        step_count = 0

        while True:
            action = select_action(state, q_table, n_actions, epsilon=epsilon)
            state, reward, is_done = update_table(env, q_table, state, action, n_actions)
            total_reward += reward
            step_count += 1

            # Stop when the environment ends the episode or the step budget is spent.
            if is_done or step_count >= max_steps:
                break

        rewards_per_episode.append(total_reward)

        # Decay epsilon AFTER each episode, never dropping below the floor.
        epsilon = max(min_epsilon, epsilon * epsilon_decay)

        if log_every and ep % log_every == 0:
            # Slicing already copes with fewer than 100 recorded episodes.
            avg_reward = np.mean(rewards_per_episode[-100:])
            print(f"Episode {ep}: Total reward = {total_reward}, Steps = {step_count}, Average reward (last 100) = {avg_reward:.3f}, Epsilon = {epsilon:.3f}")

    return rewards_per_episode

In [9]:
# Start from a fresh all-zero table so re-running this cell does not continue earlier training.
q_table = initialize_q_table(n_state=n_states, n_actions=n_actions)

# Run Q-learning for 30000 episodes and keep the per-episode total rewards.
print("Starting training...")
rewards = training_loop(env=env, q_table=q_table, n_actions=n_actions, episodes=30000)

Starting training...
Episode 0: Total reward = 0.0, Steps = 3, Average reward (last 100) = 0.000, Epsilon = 0.995
Episode 100: Total reward = 1.0, Steps = 12, Average reward (last 100) = 0.090, Epsilon = 0.603
Episode 200: Total reward = 0.0, Steps = 7, Average reward (last 100) = 0.390, Epsilon = 0.365
Episode 300: Total reward = 0.0, Steps = 4, Average reward (last 100) = 0.690, Epsilon = 0.221
Episode 400: Total reward = 1.0, Steps = 6, Average reward (last 100) = 0.850, Epsilon = 0.134
Episode 500: Total reward = 0.0, Steps = 3, Average reward (last 100) = 0.900, Epsilon = 0.081
Episode 600: Total reward = 1.0, Steps = 6, Average reward (last 100) = 0.900, Epsilon = 0.049
Episode 700: Total reward = 1.0, Steps = 6, Average reward (last 100) = 0.980, Epsilon = 0.030
Episode 800: Total reward = 1.0, Steps = 6, Average reward (last 100) = 0.980, Epsilon = 0.018
Episode 900: Total reward = 1.0, Steps = 6, Average reward (last 100) = 0.990, Epsilon = 0.011
Episode 1000: Total reward = 1

  if not isinstance(terminated, (bool, np.bool8)):


Episode 1600: Total reward = 1.0, Steps = 6, Average reward (last 100) = 0.970, Epsilon = 0.010
Episode 1700: Total reward = 1.0, Steps = 6, Average reward (last 100) = 1.000, Epsilon = 0.010
Episode 1800: Total reward = 1.0, Steps = 6, Average reward (last 100) = 0.980, Epsilon = 0.010
Episode 1900: Total reward = 1.0, Steps = 6, Average reward (last 100) = 0.990, Epsilon = 0.010
Episode 2000: Total reward = 1.0, Steps = 8, Average reward (last 100) = 0.990, Epsilon = 0.010
Episode 2100: Total reward = 1.0, Steps = 6, Average reward (last 100) = 0.980, Epsilon = 0.010
Episode 2200: Total reward = 1.0, Steps = 6, Average reward (last 100) = 1.000, Epsilon = 0.010
Episode 2300: Total reward = 1.0, Steps = 6, Average reward (last 100) = 1.000, Epsilon = 0.010
Episode 2400: Total reward = 1.0, Steps = 6, Average reward (last 100) = 0.990, Epsilon = 0.010
Episode 2500: Total reward = 1.0, Steps = 6, Average reward (last 100) = 1.000, Epsilon = 0.010
Episode 2600: Total reward = 1.0, Steps 