In [124]:
from random import random
from random import seed

import gymnasium as gym
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
from tqdm import tqdm

In [125]:
def make_new_Q(env):
    """Build a zero-initialised tabular Q-function for a discrete environment.

    Args:
        env: object exposing ``observation_space`` and ``action_space``, each
            with integer ``start`` and ``n`` attributes.

    Returns:
        A ``pd.DataFrame`` filled with ``0.0``: one row per state, one column
        per action, labelled with the actual state/action values.
    """
    def _members(space):
        # A space with `n` elements beginning at `start` covers
        # start, ..., start + n - 1, so the exclusive upper bound is
        # start + n (not n, which is only right when start == 0).
        first = int(space.start)
        return range(first, first + int(space.n))

    return pd.DataFrame(
        0.0,
        index=_members(env.observation_space),
        columns=_members(env.action_space),
    )


def policy(s, Q, eps, env):
    """Choose an action for state ``s`` epsilon-greedily.

    Args:
        s: row label of the current state in ``Q``.
        Q: DataFrame of action values (rows: states, columns: actions).
        eps: threshold compared against a fresh ``random()`` draw; when the
            draw is below it, an action is sampled from ``env.action_space``.
        env: object whose ``action_space.sample()`` supplies exploratory actions.

    Returns:
        The sampled action, or otherwise the column label with the largest
        value in row ``s`` (``idxmax`` picks the first one on ties).
    """
    explore = random() < eps
    if not explore:
        return Q.loc[s].idxmax()
    return env.action_space.sample()

In [126]:
# Environment and hyperparameters for the Q-learning run.
SEED = 0  # fixed so that Restart & Run All reproduces the same training run

env = gym.make('CliffWalking-v0')

# Seed every source of randomness the training loop draws from: the stdlib RNG
# behind the epsilon test in policy(), the action-space sampler used for
# exploratory actions, and the environment's own RNG.
seed(SEED)
env.action_space.seed(SEED)
env.reset(seed=SEED)

gamma = 1.0  # discount factor (1.0 = undiscounted)
eps = 0.01   # probability of taking a random exploratory action
lr = 0.2     # learning rate (step size) of the Q update

max_steps = 10_000  # total number of environment steps to train for

In [173]:
# Tabular Q-learning over one continuous stream of `max_steps` environment
# steps; whenever an episode ends the environment is reset in-line.
expected_episode_returns = []  # max_a Q(s_initial, a), logged after every step
realized_episode_returns = []  # summed reward of each finished episode

Q = make_new_Q(env)

s, _ = env.reset()
s_initial = s # for CliffWalking, the initial state is deterministic
episode_r = 0.0
for _ in tqdm(range(max_steps)):

    # step
    a = policy(s, Q, eps, env)
    s2, r, terminated, truncated, _ = env.step(a)
    r = float(r)
    episode_r += r

    # update Q
    # A terminated transition has no future return, so the target must not
    # bootstrap from Q[s2]; a merely truncated episode still bootstraps.
    future_value = 0.0 if terminated else Q.loc[s2].max()
    update_target = r + gamma * future_value
    Q.at[s, a] = (1 - lr) * Q.at[s, a] + lr * update_target

    # logging
    expected_episode_returns.append(Q.loc[s_initial].max())

    # next step
    if terminated or truncated:
        realized_episode_returns.append(episode_r)
        episode_r = 0.0  # keep the accumulator a float, like its initial value
        s, _ = env.reset()
    else:
        s = s2


100%|██████████| 10000/10000 [00:00<00:00, 17543.49it/s]


Environment rewards: -1 on every step, except -100 when the agent steps into one of the cliff states.

Moving optimally, it takes 13 steps to get from the start to the goal, so with gamma = 1 the best achievable episode return is -13. The horizontal lines in the plots below are drawn at y = -13.

In [177]:
# Start-state value estimate, one point per logged training step, with a
# horizontal reference line at y = -13.
(
    px.line(expected_episode_returns)
    .update_layout(
        title='V(s0) throughout training',
        xaxis_title='timesteps',
        yaxis_title='V(s0)',
        width=500,
        height=400,
        showlegend=False,
    )
    .add_hline(y=-13, line_width=1)
    .show()
)

In [179]:
# Summed reward of each finished episode, one point per episode, with a
# horizontal reference line at y = -13.
(
    px.line(realized_episode_returns)
    .update_layout(
        title='realized episode reward',
        xaxis_title='episodes',
        yaxis_title='episode reward',
        width=500,
        height=400,
        showlegend=False,
    )
    .add_hline(y=-13, line_width=1)
    .show()
)

In [180]:
# Per-state greedy value max_a Q(s, a), reshaped to 4 rows x 12 columns
# (presumably the CliffWalking grid layout - confirm against the env).
# NOTE: despite its name, `learned_actions` holds values, not action indices.
learned_actions = Q.max(axis=1).to_numpy().reshape(4, 12)
(
    px.imshow(learned_actions, color_continuous_scale='Viridis')
    .update_layout(title='max_a Q(s, a)', width=500, height=400)
)