In [9]:
# Importing requirements
import numpy as np
import gymnasium as gym
import random, time
from IPython.display import clear_output

In [10]:
# Build the FrozenLake environment that the agent will be trained on.
# No render mode is requested here, so training runs without a display.
env = gym.make("FrozenLake-v1")

In [11]:
# The Q-table holds one value per (state, action) pair, so its dimensions
# are read straight from the environment:
#   rows    -> number of discrete states
#   columns -> number of discrete actions
state_space_size, action_space_size = env.observation_space.n, env.action_space.n

print('State space size: ', state_space_size)
print('Action space size: ', action_space_size)

State space size:  16
Action space size:  4


In [12]:
# Allocate the Q-table with every state-action value starting at zero.
q_table = np.zeros(shape=(state_space_size, action_space_size))

print('Q-table size: ', q_table.shape)
print(q_table)

Q-table size:  (16, 4)
[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]


In [13]:
# --- Training schedule ---
num_episodes = 10_000
max_step_per_episode = 1_000

# --- Q-learning update coefficients ---
learning_rate = 0.1    # weight given to the newly observed target
discount_rate = 0.99   # weight given to future rewards

# --- Exploration / exploitation trade-off (epsilon-greedy) ---
# The exploration rate starts at its maximum and is decayed towards the
# minimum after every episode using `exploration_decay_rate`.
max_exploration_rate = 1
min_exploration_rate = 0.01
exploration_decay_rate = 0.01
exploration_rate = max_exploration_rate

In [14]:
# Total reward collected in each training episode (filled in by the loop below)
rewards_all_episode = []

# Tabular Q-learning with an epsilon-greedy behaviour policy.
# Outer loop: one iteration per episode.
for episode in range(num_episodes):
    # env.reset() returns (initial state, info dict); the info dict is not used here
    state, _ = env.reset()
    # Tracks whether the current episode has finished
    done = False
    # Reward accumulated during this episode
    rewards_current_episode = 0

    # Inner loop: one iteration per time step of the episode
    for step in range(max_step_per_episode):
        # Draw a number in [0, 1] to decide between exploration and exploitation
        exploration_rate_threshold = random.uniform(0, 1)
        if exploration_rate_threshold > exploration_rate:
            # Exploitation: take the action with the highest Q-value in this state
            action = np.argmax(q_table[state, :])
        else:
            # Exploration: sample a random action
            action = env.action_space.sample()

        # env.step() returns: new state, reward, terminated (the environment ended
        # the episode itself), truncated (the episode was cut off, e.g. by a step
        # limit) and an info dict
        new_state, reward, terminated, truncated, info = env.step(action)

        # Q-learning update:
        #   Q(s, a) <- (1 - lr) * Q(s, a) + lr * (reward + gamma * max_a' Q(s', a'))
        # Rows of states the agent never acts from are never written, so they
        # stay at zero and contribute nothing to the bootstrap term.
        q_table[state, action] = ((1 - learning_rate) * q_table[state, action]) + (
            learning_rate * (reward + discount_rate * np.max(q_table[new_state, :]))
        )

        # Move to the new state and accumulate the reward of this step
        state = new_state
        rewards_current_episode += reward

        # The episode is over when it terminated OR was truncated. Checking only
        # `terminated` would keep stepping an episode the environment has
        # already cut off.
        done = terminated or truncated
        if done:
            break

    # Exponentially decay the exploration rate after every episode
    exploration_rate = min_exploration_rate + (max_exploration_rate - min_exploration_rate) * np.exp(-exploration_decay_rate * episode)
    rewards_all_episode.append(rewards_current_episode)

# Report the average reward for each consecutive chunk of episodes.
# np.split needs an integer number of sections, hence the floor division.
episodes_per_chunk = 1000
rewards_per_thousand_episode = np.split(np.array(rewards_all_episode), num_episodes // episodes_per_chunk)
count = episodes_per_chunk
print('--Average reward per thousand episode--')
for r in rewards_per_thousand_episode:
    print('Episode: ' + str(count) + ' = ' + str(r.mean()))
    count = count + episodes_per_chunk

--Average reward per thousand episode--
Episode: 1000 = 0.002
Episode: 2000 = 0.027
Episode: 3000 = 0.167
Episode: 4000 = 0.167
Episode: 5000 = 0.467
Episode: 6000 = 0.54
Episode: 7000 = 0.654
Episode: 8000 = 0.71
Episode: 9000 = 0.649
Episode: 10000 = 0.684


In [15]:
# Display the Q-table as it stands after training.
# Each row is a state and each column an action (see the np.zeros allocation above).
q_table

array([[0.53018123, 0.39706074, 0.45323669, 0.44705501],
       [0.21885202, 0.04451896, 0.03021897, 0.04063984],
       [0.2236835 , 0.05672989, 0.06676505, 0.03329265],
       [0.0893088 , 0.        , 0.        , 0.        ],
       [0.54998005, 0.31335808, 0.3462287 , 0.34691291],
       [0.        , 0.        , 0.        , 0.        ],
       [0.2411331 , 0.10174902, 0.07393345, 0.02633441],
       [0.        , 0.        , 0.        , 0.        ],
       [0.48459984, 0.3383393 , 0.34092106, 0.5664316 ],
       [0.35480684, 0.6115162 , 0.42708561, 0.41552159],
       [0.56086448, 0.32805231, 0.44763924, 0.24076294],
       [0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        ],
       [0.3098029 , 0.45763026, 0.68644098, 0.38285131],
       [0.696587  , 0.80845331, 0.75327981, 0.77386604],
       [0.        , 0.        , 0.        , 0.        ]])

In [17]:
# Watch the trained agent play three episodes, always acting greedily
# with respect to the learned Q-table.
# A new environment is created so that it can be rendered in a window.
env = gym.make('FrozenLake-v1', render_mode='human')

try:
    for episode in range(3):
        # Start every episode from the initial state
        state, _ = env.reset()
        done = False
        # Announce the episode and pause so the viewer can see which one is starting
        print('-- Episode : ', episode + 1, '\n\n')
        time.sleep(1)

        for step in range(max_step_per_episode):
            # Clear the previous cell output once new output arrives
            clear_output(wait=True)
            # Render the environment
            env.render()
            # Pause so the agent's current position can be observed
            time.sleep(0.3)

            # Greedy policy: take the action with the highest Q-value in this state
            action = np.argmax(q_table[state, :])
            new_state, reward, terminated, truncated, info = env.step(action)

            # Stop when the episode terminated OR was truncated; checking only
            # `terminated` would keep stepping an episode the environment has
            # already cut off.
            done = terminated or truncated
            if done:
                clear_output(wait=True)
                env.render()
                if reward == 1:
                    print('-- We reached the goal --')
                else:
                    print('-- Fail --')
                time.sleep(3)
                clear_output(wait=True)
                # The episode is over (success or not): go to the next episode
                break

            state = new_state
finally:
    # Always release the render window, even if the cell is interrupted or fails
    env.close()

-- We reached the goal --
