In [4]:
# Importing requirements
import numpy as np
import gymnasium as gym
import random, time
from IPython.display import clear_output

In [6]:
# We are going to use the Frozen Lake environment.
# Create it by its registered id through Gymnasium (imported as `gym`);
# the resulting `env` object is what the rest of the notebook interacts with.
env = gym.make('FrozenLake-v1')

In [11]:
# The Q-table holds one value per (state, action) pair, so its dimensions come
# straight from the environment's two discrete spaces:
#   rows    = number of states  (size of the observation space)
#   columns = number of actions (size of the action space)
state_space_size, action_space_size = (
    env.observation_space.n,
    env.action_space.n,
)
print('State space size: ', state_space_size)
print('Action space size: ', action_space_size)

State space size:  16
Action space size:  4


In [15]:
# Allocate the Q-table: one row per state, one column per action, all zeros.
q_table = np.zeros(shape=(state_space_size, action_space_size))
print('Q-table size: ', q_table.shape)

Q-table size:  (16, 4)


In [17]:
# Training hyperparameters.

# How many episodes to run, and the cap on time steps inside each one.
num_episodes, max_step_per_episode = 1000, 100

# Step size of each Q-value update and the discount applied to future reward.
learning_rate, discount_rate = 0.1, 0.99

# Exploration vs. exploitation trade-off: the current rate, its upper and
# lower bounds, and the decay rate.
exploration_rate = 1
max_exploration_rate, min_exploration_rate = 1, 0.01
exploration_decay_rate = 0.001

In [24]:
# Container for per-episode rewards.
# NOTE(review): nothing in the visible part of the loop appends to this list or
# decays exploration_rate — confirm both happen further down in the episode loop.
rewards_all_episode = []

# Q-learning training loop.
# Everything inside the outer loop happens within a single episode.
for episode in range(num_episodes):
    # Reset the environment to its starting state.
    # Gymnasium's env.reset() returns a tuple (observation, info); only the
    # integer observation is a valid Q-table row index, so unpack it here
    # instead of carrying the whole tuple around.
    state, info = env.reset()
    # Tracks whether the episode has reached a terminal state.
    done = False
    # Reward accumulated so far in this episode.
    rewards_current_episode = 0

    # Everything inside the inner loop happens within a single time step.
    for step in range(max_step_per_episode):
        # Draw a random number to decide between exploration and exploitation.
        exploration_rate_threshold = random.uniform(0, 1)
        if exploration_rate_threshold > exploration_rate:
            # Exploitation: pick the action with the highest Q-value for the
            # current state.
            action = np.argmax(q_table[state, :])
        else:
            # Exploration: sample an action uniformly at random.
            action = env.action_space.sample()

        # Apply the action. env.step() returns the new state, the reward, a
        # flag for reaching a terminal state, a flag for the episode being
        # truncated (e.g. by a time limit), and an info dict.
        new_state, reward, done, truncated, info = env.step(action=action)

        # Q-learning update for q(state, action): blend the old estimate with
        # the observed reward plus the discounted best Q-value of the new state.
        q_table[state, action] = ((1 - learning_rate) * q_table[state, action]) + (
            learning_rate * (reward + (discount_rate * np.max(q_table[new_state, :])))
        )

        # Move to the new state and account for the reward just received.
        state = new_state
        rewards_current_episode += reward

        # Stop stepping once the episode is over, whether it terminated or was
        # truncated; stepping a finished environment is not valid.
        if done or truncated:
            break
