Tabular Q learning on Frozen Lake environment.
algorithm : Start with an empty table \
Obtain (s,a,r,s') from the environment \
Make a bellman update: \
$Q(s,a) \leftarrow (1-\alpha)\, Q(s,a) + \alpha\, \bigl(r + \gamma \max_{a'} Q(s',a')\bigr)$ \
Check the convergence condition; if it is not met, repeat from step 2 (obtain a new transition from the environment).

In [1]:
!pip install tensorboardX

Collecting tensorboardX
  Downloading tensorboardX-2.6.2.2-py2.py3-none-any.whl.metadata (5.8 kB)
Downloading tensorboardX-2.6.2.2-py2.py3-none-any.whl (101 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m101.7/101.7 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tensorboardX
Successfully installed tensorboardX-2.6.2.2


In [5]:
# import libraries
import gymnasium as gym
import collections
from tensorboardX import SummaryWriter

# Gymnasium environment id passed to gym.make().
ENV_NAME = "FrozenLake-v1"
# Discount factor applied to the best next-state value in the Bellman update.
GAMMA = 0.9
# Learning rate: weight given to the new estimate when it is blended into the stored Q-value.
ALPHA = 0.2
# Number of greedy evaluation episodes whose rewards are averaged after every update step.
TEST_EPISODES = 20




In [13]:
class Agent:
    """Tabular Q-learning agent.

    The agent owns a private environment that it explores with uniformly
    random actions (``sample_env``) and a Q-table ``values`` keyed by
    ``(state, action)`` that is refined with Bellman updates
    (``value_update``). Unseen ``(state, action)`` pairs default to ``0.0``.
    """

    def __init__(self):
        """Create the exploration environment and an empty Q-table."""
        self.env = gym.make(ENV_NAME)
        # reset() returns (observation, info); only the observation is kept.
        self.state = self.env.reset()[0]
        self.values = collections.defaultdict(float)

    def sample_env(self):
        """Take one random step in the agent's own environment.

        Returns:
            Tuple ``(old_state, action, reward, new_state)`` describing the
            transition. ``new_state`` is the state actually reached, even when
            that step ended the episode.
        """
        action = self.env.action_space.sample()
        old_state = self.state
        new_state, reward, terminated, truncated, _ = self.env.step(action)
        # An episode is over when it terminates OR when it is truncated;
        # in both cases the environment must be reset before stepping again.
        episode_over = terminated or truncated
        self.state = self.env.reset()[0] if episode_over else new_state
        return (old_state, action, reward, new_state)

    def best_value_and_action(self, state):
        """Return the greedy ``(value, action)`` pair for ``state``.

        Ties are resolved in favour of the lowest action index because a later
        action only wins with a strictly larger value. Returns ``(None, None)``
        if the action space is empty.
        """
        best_value, best_action = None, None
        for action in range(self.env.action_space.n):
            action_value = self.values[(state, action)]
            if best_value is None or best_value < action_value:
                best_value = action_value
                best_action = action
        return best_value, best_action

    def value_update(self, s, a, r, next_s):
        """Blend the Bellman target for ``(s, a)`` into the Q-table.

        Q(s, a) <- (1 - ALPHA) * Q(s, a) + ALPHA * (r + GAMMA * max_a' Q(next_s, a'))
        """
        best_v, _ = self.best_value_and_action(next_s)
        new_val = r + GAMMA * best_v
        old_val = self.values[(s, a)]
        self.values[(s, a)] = old_val * (1 - ALPHA) + new_val * ALPHA

    def play_episode(self, env):
        """Run one greedy episode on ``env`` and return its total reward.

        ``env`` is a separate evaluation environment, so the agent's own
        exploration state is not disturbed. The number of actions is still
        read from the agent's own environment (via ``best_value_and_action``).
        """
        total_reward = 0.0
        state = env.reset()[0]
        while True:
            _, action = self.best_value_and_action(state)
            state, reward, terminated, truncated, _ = env.step(action)
            total_reward += reward
            # Stop on truncation as well as termination so a finished episode
            # is never stepped again.
            if terminated or truncated:
                return total_reward









In [14]:
# test environment
# Training driver: alternate a single random-exploration Q-update with a
# greedy evaluation, and stop once the mean evaluation reward exceeds 0.80.
test_env = gym.make(ENV_NAME)  # separate env so evaluation does not disturb exploration
agent = Agent()
iter_no = 0
best_reward = 0.0
while True:
    iter_no += 1

    # Learn from one randomly sampled transition.
    s, a, r, s_dash = agent.sample_env()
    agent.value_update(s, a, r, s_dash)

    # Evaluate the current greedy policy: mean reward over TEST_EPISODES runs.
    reward = 0.0
    for _episode in range(TEST_EPISODES):
        reward = reward + agent.play_episode(test_env)
    reward = reward / TEST_EPISODES

    # Report every new high score.
    if best_reward < reward:
        best_reward = reward
        print(f"the best reward is {best_reward:.2f}")

    # Stopping criterion.
    if reward > 0.80:
        print("solved")
        break




the best reward is 0.25
the best reward is 0.35
the best reward is 0.40
the best reward is 0.45
the best reward is 0.80
the best reward is 0.85
solved
