In [2]:
import numpy as np
import gym
import random
import time
from IPython.display import clear_output

In [3]:
# Build the FrozenLake-v1 environment, requesting the 'ansi' render mode.
env = gym.make(
    'FrozenLake-v1',
    render_mode='ansi',
)

In [4]:
# Size the Q-table from the environment: one row per state, one column per action.
state_space_size = env.observation_space.n
action_space_size = env.action_space.n

# Every action value starts at zero.
q_table = np.zeros(shape=(state_space_size, action_space_size))

In [5]:
# Show the Q-table as created by np.zeros: one row per state, one column per action.
print(q_table)

[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]


In [7]:
# Hyperparameters

# Training budget
num_episodes = 10_000        # total number of episodes the agent plays during training
max_steps_per_episode = 100  # upper bound on steps taken within a single episode

# Q-learning update
learning_rate = 0.1   # alpha
discount_rate = 0.99  # gamma

# Exploration-exploitation trade-off -> epsilon-greedy strategy
max_exploration_rate = 1
min_exploration_rate = 0.01
exploration_rate = 1            # current epsilon; starts at the maximum
exploration_decay_rate = 0.001  # rate used in the exponential epsilon decay

In [8]:
# Per-episode total rewards; the training loop appends one entry per episode.
rewards_all_episodes = list()

In [9]:
# Q-learning algorithm
#
# Tabular Q-learning with an epsilon-greedy policy. Each episode starts from
# env.reset(), runs for at most max_steps_per_episode steps, and updates
# q_table in place after every transition. The total reward of each episode is
# appended to rewards_all_episodes, and exploration_rate is decayed once per
# episode.
#
# NOTE: this cell mutates q_table, exploration_rate and rewards_all_episodes;
# re-running it without re-running the cells that initialise them continues
# from the previous state instead of starting fresh.
for episode in range(num_episodes):

    # Initialize the new episode. Element 0 of reset()'s return value is used
    # as the starting state.
    state = env.reset()[0]
    done = False                 # whether or not the episode is finished
    rewards_current_episode = 0  # reward accumulated in this episode

    for step in range(max_steps_per_episode):

        # Exploration-exploitation trade-off: draw a number in [0, 1] and
        # exploit only when it exceeds the current exploration rate.
        exploration_rate_threshold = random.uniform(0, 1)
        if exploration_rate_threshold > exploration_rate:
            action = np.argmax(q_table[state, :])  # exploitation -> best known action for this state
        else:
            action = env.action_space.sample()     # exploration -> random action

        # Take the action. step() yields separate `terminated` and `truncated`
        # flags; both are honoured below so the loop never keeps stepping an
        # environment that has already signalled the end of the episode.
        new_state, reward, terminated, truncated, info = env.step(action)

        # Update Q-table: q_new = (1 - alpha) * q_old + alpha * learned value,
        # where learned value = reward + gamma * max_a' Q(new_state, a').
        # Rows of states that end an episode are never written (the loop
        # breaks before any update starting from them), so they stay zero and
        # contribute nothing to the bootstrap term.
        q_table[state, action] = q_table[state, action] * (1 - learning_rate) + \
            learning_rate * (reward + discount_rate * np.max(q_table[new_state, :]))

        # Set new state
        state = new_state
        # Add new reward
        rewards_current_episode += reward

        # The episode is over on either termination or truncation.
        done = terminated or truncated
        if done:
            break

    # Exploration rate decay: exponential in the episode index, bounded below
    # by min_exploration_rate.
    exploration_rate = min_exploration_rate + \
        (max_exploration_rate - min_exploration_rate) * np.exp(-exploration_decay_rate*episode)

    # Add current episode reward to total rewards list
    rewards_all_episodes.append(rewards_current_episode)

In [10]:
 # Calculate and print the average reward per thousand episodes
 
# Split the per-episode rewards into equal consecutive chunks, one per
# thousand episodes. Integer division keeps the section count an int, which is
# what np.split documents; np.split raises ValueError if the rewards cannot be
# divided into that many equal chunks.
rewards_per_thousand_episodes = np.split(np.array(rewards_all_episodes), num_episodes // 1000)
count = 1000

print("********Average reward per thousand episodes********\n")
for r in rewards_per_thousand_episodes:
    # r.mean() divides once by the chunk length instead of summing 1000
    # pre-divided terms, avoiding accumulated rounding noise in the printout.
    print(count, ": ", r.mean())
    count += 1000

********Average reward per thousand episodes********

1000 :  0.046000000000000034
2000 :  0.20300000000000015
3000 :  0.4100000000000003
4000 :  0.5320000000000004
5000 :  0.6170000000000004
6000 :  0.6420000000000005
7000 :  0.6820000000000005
8000 :  0.6610000000000005
9000 :  0.6520000000000005
10000 :  0.6930000000000005


In [11]:
# Display the Q-table under a banner; a single print call emits the banner
# and the table on separate lines.
print("\n\n********Q-table********\n", q_table, sep="\n")



********Q-table********

[[0.53025584 0.49614817 0.49067116 0.49843155]
 [0.38095481 0.34005618 0.26894342 0.47611165]
 [0.39301398 0.42192569 0.40816736 0.45021877]
 [0.29774538 0.2830519  0.35710602 0.43659814]
 [0.55060848 0.32934277 0.43744586 0.3995253 ]
 [0.         0.         0.         0.        ]
 [0.15870434 0.17054341 0.33555668 0.13510031]
 [0.         0.         0.         0.        ]
 [0.41749037 0.36947641 0.46786723 0.57080193]
 [0.35441462 0.60813461 0.51165789 0.35041632]
 [0.476437   0.40718055 0.27469305 0.36828375]
 [0.         0.         0.         0.        ]
 [0.         0.         0.         0.        ]
 [0.54338761 0.62833597 0.72569041 0.52833204]
 [0.72819721 0.8909082  0.76951668 0.76689547]
 [0.         0.         0.         0.        ]]
