In [1]:
# Importing requirements
import numpy as np
import gymnasium as gym
import random, time
from IPython.display import clear_output

In [2]:
# Environment used throughout this notebook: FrozenLake-v1
# Build the environment instance that every later cell interacts with
env = gym.make(id='FrozenLake-v1')

In [3]:
# The Q-table dimensions are read from the environment itself:
#   rows    -> one per state  (size of the observation space)
#   columns -> one per action (size of the action space)
state_space_size, action_space_size = env.observation_space.n, env.action_space.n
print('State space size: ', state_space_size)
print('Action space size: ', action_space_size)

State space size:  16
Action space size:  4


In [4]:
# Allocate the Q-table: one row per state, one column per action,
# with every state-action value starting at zero
q_table = np.zeros(shape=(state_space_size, action_space_size))
print('Q-table size: ', q_table.shape)
print(q_table)

Q-table size:  (16, 4)
[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]


In [5]:
# Training schedule: number of episodes and the step cap applied inside each episode
num_episodes, max_step_per_episode = 10000, 1000

# Q-value update parameters
learning_rate, discount_rate = 0.1, 0.99

# Exploration/exploitation trade-off:
# the exploration rate starts at its maximum and is later decayed towards the minimum
max_exploration_rate = 1
min_exploration_rate = 0.01
exploration_decay_rate = 0.01
exploration_rate = max_exploration_rate

In [26]:
# Total reward collected in each episode (one entry per episode)
rewards_all_episode = []

# Start every run of this cell from a clean slate so that re-running it does not
# silently continue training on the Q-table / exploration rate left by a previous run
q_table = np.zeros((state_space_size, action_space_size))
exploration_rate = max_exploration_rate

# Let's implement the Q-learning algorithm
# Everything that happens in a single episode
for episode in range(num_episodes):
    # Reset the environment to its starting state
    # env.reset() returns two values: 1) the initial state number 2) an info value (unused here)
    state, _ = env.reset()
    # Keeps track of whether the current episode has ended
    done = False
    # No reward has been collected yet in this episode
    rewards_current_episode = 0

    # Everything that happens in a single time step of the episode
    for step in range(max_step_per_episode):
        # Draw a random number to decide between exploration and exploitation
        exploration_rate_threshold = random.uniform(0, 1)
        if exploration_rate_threshold > exploration_rate:
            # Exploitation: take the action with the highest Q-value for the current state
            action = np.argmax(q_table[state, :])
        else:
            # Exploration: sample an action at random
            action = env.action_space.sample()

        # Take the action and move to the next state
        # env.step() returns: the new state, the reward for the action, whether the action
        # ended the episode, whether the episode was truncated, and extra environment info
        new_state, reward, done, truncated, info = env.step(action=action)

        # Q-learning update for q(state, action):
        # blend the old estimate with the newly observed target
        # (reward + discounted best Q-value of the new state)
        td_target = reward + discount_rate * np.max(q_table[new_state, :])
        q_table[state, action] = (1 - learning_rate) * q_table[state, action] + learning_rate * td_target

        # Move to the new state and accumulate the reward received in this step
        state = new_state
        rewards_current_episode += reward

        # Stop stepping as soon as the episode is over, either because the action ended it
        # (done) or because the environment truncated it. Without the truncated check the
        # loop would keep stepping an episode the environment has already cut off.
        if done or truncated:
            break

    # After each episode, decay the exploration rate exponentially with the episode index,
    # from max_exploration_rate towards min_exploration_rate
    exploration_rate = min_exploration_rate + (max_exploration_rate - min_exploration_rate) * np.exp(-exploration_decay_rate * episode)
    rewards_all_episode.append(rewards_current_episode)

# Average reward per chunk of 1000 episodes.
# The rewards are sliced explicitly instead of calling np.split with a float section count,
# so the summary also works when the number of episodes is not a multiple of 1000
# (the last chunk is then shorter and is averaged over its own length).
episodes_per_chunk = 1000
rewards_array = np.array(rewards_all_episode)
rewards_per_thousand_episode = [
    rewards_array[start:start + episodes_per_chunk]
    for start in range(0, len(rewards_array), episodes_per_chunk)
]
# Running index of the last episode covered by the chunk being reported
count = 0
print('--Average reward per thousand episode--')
for r in rewards_per_thousand_episode:
    count += len(r)
    print('Episode: ' + str(count) + ' = ' + str(r.mean()))

-Average reward per thousand episode-
Episode: 1000 = 0.044
Episode: 2000 = 0.071
Episode: 3000 = 0.156
Episode: 4000 = 0.161
Episode: 5000 = 0.149
Episode: 6000 = 0.198
Episode: 7000 = 0.456
Episode: 8000 = 0.658
Episode: 9000 = 0.759
Episode: 10000 = 0.739


In [28]:
# Let's check the updated Q-table
# (as the last expression of the cell, its value is shown as the cell's output)
q_table

array([[0.45657599, 0.44572912, 0.44681273, 0.44828036],
       [0.2089147 , 0.23220831, 0.28227633, 0.42670779],
       [0.29469577, 0.39713293, 0.30276831, 0.41075973],
       [0.20382094, 0.22200752, 0.34252044, 0.40463151],
       [0.48753399, 0.40822328, 0.36777172, 0.35859465],
       [0.        , 0.        , 0.        , 0.        ],
       [0.05026515, 0.13332285, 0.19038296, 0.04315909],
       [0.        , 0.        , 0.        , 0.        ],
       [0.28820412, 0.46259925, 0.43957614, 0.52092617],
       [0.2532728 , 0.57704676, 0.43433175, 0.38046599],
       [0.53251035, 0.38199154, 0.33237429, 0.28683216],
       [0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        ],
       [0.43236974, 0.26070073, 0.65853183, 0.50693572],
       [0.70353309, 0.81352105, 0.77120683, 0.72006364],
       [0.        , 0.        , 0.        , 0.        ]])