# Setting Up The Environment & Testing It

In [1]:
import time
import numpy as np
import gymnasium as gym

In [2]:
from gymnasium.envs.registration import register
from gymnasium.envs import registry

# Register a deterministic (non-slippery) 4x4 FrozenLake variant under a custom id.
custom_env_id = 'FrozenLakeNotSlippery-v0'  # make sure this is a custom name!

register(
    id=custom_env_id,
    entry_point='gymnasium.envs.toy_text.frozen_lake:FrozenLakeEnv',
    kwargs=dict(map_name='4x4', is_slippery=False),
    # how many steps the agent may take in one episode before it is cut off
    max_episode_steps=100,
    # this is more applicable for continuous rewards
    reward_threshold=.8196,
)

# Sanity check: the new id should now be present in the registry
was_registered = custom_env_id in registry.keys()
print(f"Check if env was registered: {was_registered}")

Check if env was registered: True


In [3]:
import matplotlib.pyplot as plt
from IPython.display import clear_output

# Smoke test: drive the environment with random actions and render each step.
#
# You can view this game multiple ways:
# human (graphical game window)
# rgb_array (pixel data)
# ansi (text based)
render_mode = "ansi"
env = gym.make('FrozenLakeNotSlippery-v0', render_mode=render_mode)
env.reset()

for _ in range(10):

    # render environment accordingly
    if render_mode == "human":
        # the return value is not needed in this mode
        env.render()
    else:
        img = env.render()
        if render_mode == "ansi":
            # text frame: just print it
            print(img)
        else:
            # pixel frame: draw it with matplotlib
            plt.imshow(img)
            plt.axis('off')
            plt.show()

    # select a random action
    action = env.action_space.sample()

    # go forward with this action; step() reports termination and
    # truncation (e.g. the step limit) as two separate flags
    observation, reward, terminated, truncated, info = env.step(action)

    # pause so each frame is visible, then wipe the cell output for the next one
    time.sleep(0.5)
    clear_output(wait=True)

    # an episode is over when it terminated OR was truncated; either way
    # the environment must be reset before stepping again
    if terminated or truncated:
        env.reset()

env.close()

  (Up)
[41mS[0mFFF
FHFH
FFFH
HFFG



# Hyperparameter Initialization

In [8]:
# Q-table layout: one row per state, one column per action
state_size = env.observation_space.n
print(f'state size = {state_size}')

action_size = env.action_space.n
print(f'action size = {action_size}')

# Every Q(s, a) estimate starts at zero
q_table = np.zeros((state_size, action_size))
print(f'q-table = {q_table}')

state size = 16
action size = 4
q-table = [[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]


In [9]:
# Number of EPISODES (also referred to as EPOCHS): how many full games the
# agent plays during training. It usually takes a few thousand games before
# the agent starts showing results.
EPISODES = 20_000

# The LEARNING RATE: the fraction of each new estimate that replaces the old
# Q value. Too low and learning is slow; too high and every update overwrites
# most of what was previously learned.
ALPHA = 0.8

# The DISCOUNT RATE, applied to 'future' rewards, so that rewards received
# sooner are worth more than rewards received later
GAMMA = 0.95

In [10]:
# Parameters of the epsilon-greedy exploration schedule.
# Epsilon begins at max_epsilon and shrinks towards (but never below)
# min_epsilon, following an exponential decay whose speed is set by decay_rate.
max_epsilon = 1.0
min_epsilon = 0.01
decay_rate = 0.001

# Current exploration probability; starts at the maximum
epsilon = max_epsilon

# Update Functions

In [12]:
def epsilon_greedy_action_selection(epsilon, q_table, discrete_state):
    """
    Pick an action for `discrete_state` using the epsilon-greedy rule.

    A number is drawn uniformly from [0, 1). If it is greater than `epsilon`
    the agent exploits: it returns the index of the largest Q value in the
    state's row of `q_table`. Otherwise it explores by sampling a random
    action from the global `env`'s action space.

    With epsilon at 1.0 the draw can never exceed it, so the agent always
    explores; as epsilon shrinks, exploitation becomes more and more likely.

    Args:
        epsilon: current exploration threshold.
        q_table: 2-D array indexed as [state, action].
        discrete_state: row index of the current state.

    Returns:
        The chosen action index.
    """
    roll = np.random.random()

    # EXPLOITATION: column indices map 1:1 to actions, so argmax over the
    # state's row is directly the greedy action
    if roll > epsilon:
        return np.argmax(q_table[discrete_state, :])

    # EXPLORATION: take a random action
    return env.action_space.sample()

In [17]:
def compute_next_q_value(old_q_value, reward, next_optimal_q_value):
    """
    Apply the Q-learning update equation:

        Q(s, a) <- Q(s, a) + ALPHA * (reward + GAMMA * max_a' Q(s', a') - Q(s, a))

    ALPHA and GAMMA are read from the module-level hyperparameters.

    Args:
        old_q_value: current estimate Q(s, a).
        reward: reward received for the transition.
        next_optimal_q_value: the largest Q value available in the next state.

    Returns:
        The updated estimate for Q(s, a).
    """
    # Temporal-difference error: discounted target minus the current estimate
    td_error = reward + GAMMA*next_optimal_q_value - old_q_value
    return old_q_value + ALPHA*td_error

In [16]:
def reduce_epsilon(epsilon, episode):
    """
    Exponentially decay the exploration rate as a function of the episode index.

    The result starts near max_epsilon and approaches min_epsilon as `episode`
    grows; min_epsilon, max_epsilon and decay_rate are module-level settings.

    Args:
        epsilon: the current epsilon. Not used in the computation (the new
            value depends only on `episode`); kept in the signature.
        episode: number of episodes played so far.

    Returns:
        The epsilon to use for the next episode.
    """
    epsilon_span = max_epsilon - min_epsilon
    decay_factor = np.exp(-decay_rate*episode)
    return min_epsilon + epsilon_span*decay_factor

# Agent Training

In [20]:
# Train the agent with tabular Q-learning.
#
# `rewards` keeps the total reward of every finished episode (for logging and
# plotting); progress is printed every `log_interval` episodes.
rewards = []
log_interval = 1000

# NOTE(review): `env` is the same object the rendering demo cell created and
# already called close() on; consider building a dedicated training
# environment here -- confirm this reuse is intended.

# Episodes are counted from 1 so that the epsilon decay and the logging below
# see the number of games completed so far.
for episode in range(1, EPISODES + 1):

    # For every episode, reset environment and total rewards
    state, info = env.reset()  # env.reset() returns (observation, info)
    done = False
    total_rewards = 0

    # Agent plays the game
    while not done:
        # 3 ways to be done:
        # Winning the game                      -> terminated
        # Losing the game (falling in a hole)   -> terminated
        # Hitting the max_episode_steps given in the
        # environment declaration              -> truncated

        # Get ACTION
        action = epsilon_greedy_action_selection(epsilon, q_table, state)

        # Perform ACTION
        new_state, reward, terminated, truncated, info = env.step(action)

        # The episode is over on termination OR truncation. Checking only
        # the termination flag would let an episode run past the step limit.
        done = terminated or truncated

        # OLD (current) Q Value Q(st, at)
        old_q_value = q_table[state, action]

        # Best Q value reachable from the next state: max over a of Q(st+1, a)
        next_optimal_q_value = np.max(q_table[new_state, :])

        # Compute the new Q value and write it back into the Q table
        q_table[state, action] = compute_next_q_value(old_q_value, reward, next_optimal_q_value)

        # Track the rewards
        total_rewards += reward

        # new_state is now the current state
        state = new_state

    # Agent finished a game.
    # We want to reduce epsilon after each game, not DURING the game
    epsilon = reduce_epsilon(epsilon, episode)

    # For plotting purposes, keep track of rewards
    rewards.append(total_rewards)

    # To make sure it's working
    if episode % log_interval == 0:
        print(f'Total sum of our rewards: {np.sum(rewards)}')

env.close()

Total sum of our rewards: 289.0
Total sum of our rewards: 984.0
Total sum of our rewards: 1879.0
Total sum of our rewards: 2835.0
Total sum of our rewards: 3806.0
Total sum of our rewards: 4795.0
Total sum of our rewards: 5784.0
Total sum of our rewards: 6776.0
Total sum of our rewards: 7767.0
Total sum of our rewards: 8757.0
Total sum of our rewards: 9740.0
Total sum of our rewards: 10731.0
Total sum of our rewards: 11722.0
Total sum of our rewards: 12716.0
Total sum of our rewards: 13705.0
Total sum of our rewards: 14697.0
Total sum of our rewards: 15688.0
Total sum of our rewards: 16680.0
Total sum of our rewards: 17673.0
Total sum of our rewards: 18665.0
