## Task 0: Load the Environment

In [135]:
"""Task 0. Load the Environment"""
import gym


def load_frozen_lake(desc=None, map_name=None, is_slippery=False):
    """Create a FrozenLake-v1 environment through gym.

    Args:
        desc: optional custom map description, forwarded unchanged
        map_name: optional name of a predefined map, forwarded unchanged
        is_slippery: slipperiness flag, forwarded unchanged
    Returns:
        the environment returned by gym.make, created with
        render_mode "ansi"
    """
    make_kwargs = {
        "desc": desc,
        "map_name": map_name,
        "is_slippery": is_slippery,
        "render_mode": "ansi",
    }
    return gym.make("FrozenLake-v1", **make_kwargs)

In [136]:
import numpy as np

# Demo for Task 0: build the environment with several argument combinations
# and print its map (plus one transition entry for the first two cases).
# Seed NumPy's global RNG first so the cell is repeatable.
np.random.seed(0)
# All defaults: no custom map, no map name, is_slippery=False.
env = load_frozen_lake()
print(env.desc)
# Presumably the transition list for state 0 / action 0 -- the recorded
# output shows a single entry with probability 1.0 here.
print(env.P[0][0])
# Same call but slippery; the recorded output shows three entries with
# probability 1/3 each.
env = load_frozen_lake(is_slippery=True)
print(env.desc)
print(env.P[0][0])
# Custom 3x3 map given as a list of rows.
desc = [['S', 'F', 'F'], ['F', 'H', 'H'], ['F', 'F', 'G']]
env = load_frozen_lake(desc=desc)
print(env.desc)
# Predefined map selected by name.
env = load_frozen_lake(map_name='4x4')
print(env.desc)

[[b'S' b'F' b'F' b'F' b'F' b'F' b'F' b'H']
 [b'H' b'F' b'F' b'F' b'F' b'H' b'F' b'F']
 [b'F' b'H' b'F' b'H' b'H' b'F' b'F' b'F']
 [b'F' b'F' b'F' b'H' b'F' b'F' b'F' b'F']
 [b'F' b'F' b'F' b'F' b'F' b'F' b'H' b'F']
 [b'F' b'F' b'F' b'F' b'F' b'F' b'F' b'F']
 [b'F' b'F' b'F' b'F' b'H' b'F' b'F' b'F']
 [b'F' b'F' b'F' b'F' b'F' b'F' b'F' b'G']]
[(1.0, 0, 0.0, False)]
[[b'S' b'F' b'H' b'F' b'H' b'F' b'H' b'F']
 [b'H' b'F' b'F' b'F' b'F' b'F' b'F' b'F']
 [b'F' b'F' b'F' b'F' b'F' b'F' b'F' b'F']
 [b'F' b'H' b'F' b'F' b'F' b'F' b'F' b'F']
 [b'F' b'F' b'H' b'F' b'F' b'F' b'F' b'H']
 [b'F' b'F' b'F' b'F' b'F' b'H' b'F' b'H']
 [b'F' b'F' b'H' b'F' b'H' b'F' b'H' b'F']
 [b'F' b'F' b'H' b'F' b'F' b'F' b'F' b'G']]
[(0.3333333333333333, 0, 0.0, False), (0.3333333333333333, 0, 0.0, False), (0.3333333333333333, 8, 0.0, True)]
[[b'S' b'F' b'F']
 [b'F' b'H' b'H']
 [b'F' b'F' b'G']]
[[b'S' b'F' b'F' b'F']
 [b'F' b'H' b'F' b'H']
 [b'F' b'F' b'F' b'H']
 [b'H' b'F' b'F' b'G']]


In [137]:
"""Task 1. Initialize the Q-table"""
import numpy as np


def q_init(env):
    """Create an all-zero Q-table sized for the given environment.

    Args:
        env: environment exposing observation_space.n and action_space.n
    Returns:
        numpy.ndarray of zeros with shape (number of states,
        number of actions)
    """
    table_shape = (env.observation_space.n, env.action_space.n)
    return np.zeros(table_shape)

In [138]:
# Demo for Task 1: build a Q-table for each environment configuration and
# print its shape. The configurations are tried in a fixed order: defaults,
# slippery, custom 3x3 map, predefined 4x4 map.
desc = [['S', 'F', 'F'], ['F', 'H', 'H'], ['F', 'F', 'G']]
for env_kwargs in ({}, {'is_slippery': True}, {'desc': desc},
                   {'map_name': '4x4'}):
    env = load_frozen_lake(**env_kwargs)
    Q = q_init(env)
    print(Q.shape)

(64, 4)
(64, 4)
(9, 4)
(16, 4)


In [139]:
"""Task 2. Epsilon Greedy"""
import numpy as np


def epsilon_greedy(Q, state, epsilon):
    """Uses epsilon-greedy to determine the next action.

    Args:
        Q: numpy.ndarray of shape (states, actions) containing the Q-table
        state: index of the current state (row of Q)
        epsilon: probability threshold for exploring; a uniform draw in
            [0, 1) below this value triggers a random action
    Returns:
        the next action index: a uniformly random column index of Q when
        exploring, otherwise the argmax of Q[state]
    """
    # Take the number of actions from the table itself rather than
    # hard-coding 4, so the function works for any action-space size.
    n_actions = Q.shape[1]

    # One uniform draw decides explore vs. exploit; the random action is
    # only drawn when exploring, so RNG consumption matches the 4-action
    # behaviour exactly.
    if np.random.uniform() < epsilon:
        return np.random.randint(n_actions)
    return np.argmax(Q[state])


In [140]:
# Demo for Task 2: give state 7 known action values, then query
# epsilon_greedy once under each of two RNG seeds.
desc = [['S', 'F', 'F'], ['F', 'H', 'H'], ['F', 'F', 'G']]
env = load_frozen_lake(desc=desc)
Q = q_init(env)
Q[7] = np.array([0.5, 0.7, 1, -1])
for seed in (0, 1):
    # Re-seed before every call so each result is independently repeatable.
    np.random.seed(seed)
    print(epsilon_greedy(Q, 7, 0.5))

2
0


In [141]:
"""Task 3. Q-learning"""


def train(env, Q, episodes=5000, max_steps=100, alpha=0.1, gamma=0.99,
          epsilon=1, min_epsilon=0.1, epsilon_decay=0.05):
    """Performs Q-learning.

    Args:
        env: the FrozenLakeEnv instance
        Q: numpy.ndarray containing the Q-table (updated in place)
        episodes: total number of episodes to train over
        max_steps: maximum number of steps per episode
        alpha: learning rate
        gamma: discount rate
        epsilon: initial threshold for epsilon greedy
        min_epsilon: minimum value that epsilon decays towards
        epsilon_decay: exponential decay rate applied to epsilon between
            episodes
    Returns:
        Q: the updated Q-table (the same array object that was passed in)
        total_rewards: list containing the rewards per episode; an episode
            scores +1 when it terminates with reward 1, -1 when it
            terminates with reward 0, and 0 when it does not terminate
    """
    # Remember the starting value so the decay schedule interpolates between
    # the caller's initial epsilon and min_epsilon (not a hard-coded 1).
    initial_epsilon = epsilon

    total_rewards = []
    for episode in range(episodes):
        state, _ = env.reset()
        rewards_current_episode = 0

        for _ in range(max_steps):
            action = epsilon_greedy(Q, state, epsilon)
            new_state, reward, terminated, truncated, _ = env.step(action)

            # Only a real termination is scored. Terminating with reward 0
            # is treated as a failure and reshaped to -1 so the agent learns
            # to avoid it; a truncation is not a failure and keeps reward 0.
            if terminated:
                if reward == 1:
                    rewards_current_episode += reward
                elif reward == 0:
                    rewards_current_episode -= 1
                    reward = -1

            # Standard Q-learning update (bootstraps from the best action
            # value of the next state).
            Q[state, action] = (1 - alpha) * Q[state, action] + \
                alpha * (reward + gamma * np.max(Q[new_state]))
            state = new_state

            # Stop on termination and also on truncation: stepping an
            # environment after either signal is outside its contract.
            if terminated or truncated:
                break

        # Exponential decay from the initial epsilon towards min_epsilon,
        # indexed by the episode that just finished.
        epsilon = (initial_epsilon - min_epsilon) * \
            np.exp(-epsilon_decay * episode) + min_epsilon
        total_rewards.append(rewards_current_episode)

    return Q, total_rewards

In [142]:
# Demo for Task 3: train on the custom 3x3 map with default hyperparameters,
# then report the learned table and the average reward per block of episodes.
# Seed NumPy's global RNG first so the run is repeatable.
np.random.seed(0)
desc = [['S', 'F', 'F'], ['F', 'H', 'H'], ['F', 'F', 'G']]
env = load_frozen_lake(desc=desc)
Q = q_init(env)

Q, total_rewards  = train(env, Q)
print(Q)
# Split the per-episode rewards into 10 equal chunks; with the default
# 5000 episodes each chunk holds 500, which is what the label below assumes.
split_rewards = np.split(np.array(total_rewards), 10)
for i, rewards in enumerate(split_rewards):
    print((i+1) * 500, ':', np.mean(rewards))

[[ 0.96059593  0.970299    0.95098488  0.96059396]
 [ 0.96059557 -0.77123208  0.0094072   0.37627228]
 [ 0.18061285 -0.1         0.          0.        ]
 [ 0.97029877  0.9801     -0.99999988  0.96059583]
 [ 0.          0.          0.          0.        ]
 [ 0.          0.          0.          0.        ]
 [ 0.98009763  0.98009933  0.99        0.9702983 ]
 [ 0.98009922  0.98999782  1.         -0.99999952]
 [ 0.          0.          0.          0.        ]]
500 : 0.812
1000 : 0.88
1500 : 0.9
2000 : 0.9
2500 : 0.88
3000 : 0.844
3500 : 0.892
4000 : 0.896
4500 : 0.852
5000 : 0.928


In [143]:
def play(env, Q, max_steps=100):
    """Plays an episode using the trained Q-table.

    The environment is rendered (and the rendering printed) once after the
    reset and once after every step.

    Args:
        env: FrozenLakeEnv instance
        Q: numpy.ndarray shape (state, action) containing the trained Q-table
        max_steps: Maximum number of steps in the episode
    Returns:
        total_rewards: the total rewards for the episode
    """
    total_rewards = 0
    state, _ = env.reset()
    print(env.render())

    for _ in range(max_steps):
        # Always exploit: pick the highest-valued action for this state.
        action = np.argmax(Q[state])
        state, reward, terminated, truncated, _ = env.step(action)
        total_rewards += reward
        print(env.render())
        # Stop on termination and also on truncation, so the environment is
        # never stepped after it has signalled the end of the episode.
        if terminated or truncated:
            break
    return total_rewards

In [144]:
# Demo for play(): retrain from a fresh Q-table on the custom 3x3 map so this
# cell does not depend on the Q left over from an earlier cell, then play one
# greedy episode and print the reward it returns.
# Seed NumPy's global RNG first so the run is repeatable.
np.random.seed(0)
desc = [['S', 'F', 'F'], ['F', 'H', 'H'], ['F', 'F', 'G']]
env = load_frozen_lake(desc=desc)
Q = q_init(env)

Q, total_rewards  = train(env, Q)
print(play(env, Q))


[41mS[0mFF
FHH
FFG

  (Down)
SFF
[41mF[0mHH
FFG

  (Down)
SFF
FHH
[41mF[0mFG

  (Right)
SFF
FHH
F[41mF[0mG

  (Right)
SFF
FHH
FF[41mG[0m

1.0
