## Imports

In [3]:
import gym
import numpy as np

In [25]:
"""Task 0. Monte Carlo"""


def monte_carlo(env, V, policy, episodes=5000, max_steps=100, alpha=0.1, gamma=0.99):
    """Performs Monte Carlo evaluation for the given environment.
    Args:
        env: The environment to evaluate.
        V: The value function to update.
        policy: The policy to evaluate.
        episodes: The number of episodes to sample.
        max_steps: The maximum number of steps per episode.
        alpha: The learning rate.
        gamma: The discount factor.
    Returns:
        The updated value function.
    """
    for ep in range(episodes):
        state, _ = env.reset(seed=0)
        episode = []
        for _ in range(max_steps):
            action = policy(state)
            next_state, reward, done, _, _ = env.step(action)
            episode.append((state, reward))
            if done:
                break
            state = next_state
        episode = np.array(episode, dtype=int)
        G = 0
        for s, r in reversed(episode):
            G = gamma * G + r
            if s not in episode[:ep, 0]:
                V[s] += alpha * (G - V[s])
    return V


In [26]:
np.random.seed(0)

env = gym.make('FrozenLake8x8-v1')
LEFT, DOWN, RIGHT, UP = 0, 1, 2, 3

def policy(s):
    p = np.random.uniform()
    if p > 0.5:
        if s % 8 != 7 and env.desc[s // 8, s % 8 + 1] != b'H':
            return RIGHT
        elif s // 8 != 7 and env.desc[s // 8 + 1, s % 8] != b'H':
            return DOWN
        elif s // 8 != 0 and env.desc[s // 8 - 1, s % 8] != b'H':
            return UP
        else:
            return LEFT
    else:
        if s // 8 != 7 and env.desc[s // 8 + 1, s % 8] != b'H':
            return DOWN
        elif s % 8 != 7 and env.desc[s // 8, s % 8 + 1] != b'H':
            return RIGHT
        elif s % 8 != 0 and env.desc[s // 8, s % 8 - 1] != b'H':
            return LEFT
        else:
            return UP

V = np.where(env.desc == b'H', -1, 1).reshape(64).astype('float64') 
np.set_printoptions(precision=4)
# env.seed(0)
print(monte_carlo(env, V, policy).reshape((8, 8)))

  if not isinstance(terminated, (bool, np.bool8)):


[[ 0.9     0.6561  0.6561  0.729   0.5905  0.81    1.      1.    ]
 [ 0.729   0.3874  0.5314  0.5314  0.5905  0.729   0.729   1.    ]
 [ 0.4305  0.3138  0.2059 -1.      1.      0.9     0.729   0.81  ]
 [ 0.81    0.4305  0.2288  0.6561  0.81   -1.      0.729   0.5314]
 [ 1.      0.5905  0.4305 -1.      1.      1.      0.9     0.81  ]
 [ 1.     -1.     -1.      1.      1.      1.     -1.      0.9   ]
 [ 1.     -1.      1.      1.     -1.      1.     -1.      1.    ]
 [ 1.      1.      1.     -1.      1.      1.      1.      1.    ]]


In [39]:
"""Task 2: TD(λ)"""


def td_lambtha(env, V, policy, lambtha, episodes=5000, max_steps=100, alpha=0.1, gamma=0.99):
    """Performs TD(λ) evaluation for the given environment.
    Args:
        env: The environment to evaluate.
        V: The value function to update.
        policy: The policy to evaluate.
        lambtha: The eligibility trace decay rate.
        episodes: The number of episodes to sample.
        max_steps: The maximum number of steps per episode.
        alpha: The learning rate.
        gamma: The discount factor.
    Returns:
        The updated value function.
    """

    E = np.zeros((64,))
    for ep in range(episodes):
        state, _ = env.reset(seed=0)
        for _ in range(max_steps):
            action = policy(state)
            next_state, reward, done, _, _ = env.step(action)
            E *= gamma * lambtha
            E[state] += 1
            delta = reward + gamma * V[next_state] - V[state]
            V += alpha * delta * E
            if done:
                break
            state = next_state
    return V

In [40]:
np.random.seed(0)

env = gym.make('FrozenLake8x8-v1')
LEFT, DOWN, RIGHT, UP = 0, 1, 2, 3

def policy(s):
    p = np.random.uniform()
    if p > 0.5:
        if s % 8 != 7 and env.desc[s // 8, s % 8 + 1] != b'H':
            return RIGHT
        elif s // 8 != 7 and env.desc[s // 8 + 1, s % 8] != b'H':
            return DOWN
        elif s // 8 != 0 and env.desc[s // 8 - 1, s % 8] != b'H':
            return UP
        else:
            return LEFT
    else:
        if s // 8 != 7 and env.desc[s // 8 + 1, s % 8] != b'H':
            return DOWN
        elif s % 8 != 7 and env.desc[s // 8, s % 8 + 1] != b'H':
            return RIGHT
        elif s % 8 != 0 and env.desc[s // 8, s % 8 - 1] != b'H':
            return LEFT
        else:
            return UP

V = np.where(env.desc == b'H', -1, 1).reshape(64).astype('float64') 
np.set_printoptions(precision=4)
print(td_lambtha(env, V, policy, 0.9).reshape((8, 8)))

[[-0.8059 -0.8203 -0.8286 -0.8209 -0.8315 -0.7777 -0.7871 -0.6169]
 [-0.8083 -0.8615 -0.8464 -0.8357 -0.8036 -0.7733 -0.6496 -0.5982]
 [-0.8721 -0.9016 -0.951  -1.     -0.8918 -0.8751 -0.7047 -0.5271]
 [-0.9075 -0.9154 -0.951  -0.9654 -0.9606 -1.     -0.547  -0.3582]
 [-0.9112 -0.9247 -0.932  -1.     -0.8593 -0.7466 -0.6928 -0.3524]
 [-0.8534 -1.     -1.      1.     -0.147   0.1954 -1.      0.3366]
 [-0.2465 -1.      1.      1.     -1.      0.0771 -1.      1.208 ]
 [ 1.      1.      1.     -1.      1.      0.6478  1.      1.    ]]


In [43]:
"""Task 2: SARSA(λ)"""


def sarsa_lambtha(env, Q, lambtha, episodes=5000, max_steps=100, alpha=0.1,
                  gamma=0.99, epsilon=1, min_epsilon=0.1, epsilon_decay=0.05):
    """Perfroms SARSA(λ) for the given environment.
    Args:
        env: The environment to evaluate.
        Q: numpy.ndarray of shape (s,a) containing Q table.
        lambtha: The eligibility trace decay rate.
        episodes: The number of episodes to sample.
        max_steps: The maximum number of steps per episode.
        alpha: The learning rate.
        gamma: The discount factor.
        epsilon: The initial threshold for epsilon greedy.
        min_epsilon: The minimum exploration rate.
        epsilon_decay: The decay rate for exploration.
    Returns:
        The updated Q table.
    """

    for ep in range(episodes):
        state, _ = env.reset()
        action = epsilon_greedy(Q, state, epsilon)
        E = np.zeros((Q.shape))
        for _ in range(max_steps):
            next_state, reward, done, _, _ = env.step(action)
            next_action = epsilon_greedy(Q, next_state, epsilon)
            delta = reward + gamma * Q[next_state, next_action] - Q[state, action]
            E[state, action] += 1
            Q += alpha * delta * E
            E *= gamma * lambtha
            if done:
                break
            state = next_state
            action = next_action
        epsilon = max(min_epsilon, epsilon * (1 - epsilon_decay))
    return Q

def epsilon_greedy(Q, state, epsilon):
    """Calculates epsilon greedy action"""
    p = np.random.uniform()
    if p > epsilon:
        return np.argmax(Q[state])
    else:
        return np.random.randint(Q.shape[1])

In [44]:
np.random.seed(0)
env = gym.make('FrozenLake8x8-v1')
Q = np.random.uniform(size=(64, 4))
np.set_printoptions(precision=4)
print(sarsa_lambtha(env, Q, 0.9))

  if not isinstance(terminated, (bool, np.bool8)):


[[0.5038 0.5197 0.5231 0.5496]
 [0.5202 0.5395 0.5328 0.5705]
 [0.5045 0.5311 0.5992 0.5286]
 [0.5101 0.6131 0.4998 0.5283]
 [0.5777 0.5914 0.6389 0.5344]
 [0.6382 0.6429 0.7349 0.6463]
 [0.6589 0.6668 0.7645 0.6655]
 [0.7644 0.6905 0.652  0.6673]
 [0.5331 0.4984 0.6042 0.4867]
 [0.6848 0.5349 0.5134 0.5431]
 [0.6316 0.6052 0.5155 0.541 ]
 [0.455  0.467  0.4459 0.6472]
 [0.6435 0.5577 0.5632 0.4503]
 [0.6397 0.6155 0.7814 0.6292]
 [0.797  0.665  0.6829 0.6859]
 [0.7622 0.6463 0.6706 0.6228]
 [0.5266 0.5316 0.6804 0.5203]
 [0.5558 0.7862 0.5938 0.5646]
 [0.728  0.4675 0.5143 0.5146]
 [0.2828 0.1202 0.2961 0.1187]
 [0.4978 0.6453 0.449  0.5209]
 [0.653  0.6892 0.8251 0.7279]
 [0.7479 0.796  0.7288 0.6958]
 [0.7782 0.7459 0.6878 0.6599]
 [0.4435 0.4586 0.6811 0.5292]
 [0.5959 0.6256 0.832  0.6278]
 [0.6254 0.818  0.6004 0.6068]
 [0.6067 0.8102 0.5932 0.5548]
 [0.6609 0.6136 0.8124 0.6239]
 [0.8811 0.5813 0.8817 0.6925]
 [0.7378 0.8357 0.7684 0.7789]
 [0.7943 0.7378 0.6252 0.6248]
 [0.4851