# Frozen Lake

https://www.gymlibrary.dev/environments/toy_text/frozen_lake/

In [127]:
import random
import time
import numpy as np
import matplotlib.pyplot as plt
import gym
from env_video_recorder import EnvVideoRecorder

In [128]:
np.set_printoptions(precision=3)

In [129]:
SLIPPERY = False

In [130]:
env = gym.make('FrozenLake-v1', render_mode='ansi', is_slippery=SLIPPERY)

In [131]:
rec = EnvVideoRecorder(env)

In [132]:
action_space_size = env.action_space.n
state_space_size = env.observation_space.n

q_table = np.zeros((state_space_size, action_space_size))
print(q_table)

[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]


In [133]:
state_counts = [0] * state_space_size
state_counts

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

In [134]:
exploration_count = 0
exploitation_count = 0
last_reward_sum = 0 # this is for step episode by eposode obs

In [135]:
num_episodes = 10_000
max_steps_per_episode = 100

learning_rate = 0.01
discount_rate = 0.98

exploration_rate = 1
max_exploration_rate = 0.9
min_exploration_rate = 0.05
exploration_decay_rate = 0.0005

In [136]:
rewards_all_episodes = []

# Q-learning algorithm

for episode in range(num_episodes):
    state, _ = env.reset()
    
    done = False
    rewards_current_episode = 0
    
    for step in range(max_steps_per_episode):
        
        # Exmploration-exploitation trade-off
        exploration_rate_threshold = random.uniform(0, 1)
        if exploration_rate_threshold > exploration_rate:
            exploitation_count += 1
            action = np.argmax(q_table[state,:])
        else:
            exploration_count += 1
            action = env.action_space.sample()
            
        new_state, reward, done, truncated, info = env.step(action)
                
        # Update Q-table for Q(s,a)
        q_table[state, action] = q_table[state, action] * (1 - learning_rate) + \
            learning_rate * (reward + discount_rate * np.max(q_table[new_state,:]))
        
        state = new_state
        state_counts[state] += 1
        rewards_current_episode += reward
        
        if done:
            break
            
    # Exploration rate decay
    exploration_rate = min_exploration_rate + \
        (max_exploration_rate - min_exploration_rate) * np.exp(-exploration_decay_rate * episode)
        
    rewards_all_episodes.append(rewards_current_episode)

_ = """    
    if np.sum(rewards_all_episodes) > last_reward_sum:
        last_reward_sum = np.sum(rewards_all_episodes)
        print(episode, state, action, last_reward_sum)
        print(q_table)
        input('press enter to continue')
"""

In [137]:
# Calculate and print the average reward per thousand episodes

rewards_per_thousand_episodes = np.split(np.array(rewards_all_episodes), num_episodes / 1_000)
count = 1_000

print('*** Average reward per thousand episodes ***')
for r in rewards_per_thousand_episodes:
    print(count, ':', str(sum(r/1_000)))
    count += 1_000
    
print('\n\n\n*** Q-table ***')
print(q_table)

print('\n\n\n*** State counts ***')
print(state_counts)

print('\n\n\n*** Exploration/Exploitation count ***')
print(exploration_count, exploitation_count)

*** Average reward per thousand episodes ***
1000 : 0.08400000000000006
2000 : 0.19300000000000014
3000 : 0.34000000000000025
4000 : 0.4280000000000003
5000 : 0.5130000000000003
6000 : 0.6330000000000005
7000 : 0.6610000000000005
8000 : 0.7660000000000006
9000 : 0.7790000000000006
10000 : 0.8010000000000006



*** Q-table ***
[[0.885 0.881 0.904 0.885]
 [0.885 0.    0.922 0.903]
 [0.903 0.941 0.897 0.922]
 [0.921 0.    0.412 0.47 ]
 [0.414 0.914 0.    0.642]
 [0.    0.    0.    0.   ]
 [0.    0.96  0.    0.92 ]
 [0.    0.    0.    0.   ]
 [0.51  0.    0.94  0.415]
 [0.582 0.591 0.96  0.   ]
 [0.936 0.98  0.    0.938]
 [0.    0.    0.    0.   ]
 [0.    0.    0.    0.   ]
 [0.    0.429 0.975 0.417]
 [0.933 0.974 1.    0.955]
 [0.    0.    0.    0.   ]]



*** State counts ***
[5319, 12401, 11294, 1594, 2498, 2691, 8518, 992, 1873, 1911, 7960, 749, 370, 916, 6930, 5198]



*** Exploration/Exploitation count ***
30460 40754


`q_table`: 0=left, 1=down, 2=right, 3=up
why is the second-last row (the space left of the goal) favoring down?
Because it doesn't land on the goal enough to up the score?
Has to do with the exploration/exploitation bit? Why is that not working?

In [138]:
env.close()

In [139]:
env = gym.make('FrozenLake-v1', render_mode='rgb_array', is_slippery=SLIPPERY)
rec = EnvVideoRecorder(env)
rec.reset()
rec.render()

array([[[180, 200, 230],
        [180, 200, 230],
        [180, 200, 230],
        ...,
        [180, 200, 230],
        [180, 200, 230],
        [180, 200, 230]],

       [[180, 200, 230],
        [204, 230, 255],
        [204, 230, 255],
        ...,
        [204, 230, 255],
        [204, 230, 255],
        [180, 200, 230]],

       [[180, 200, 230],
        [235, 245, 249],
        [204, 230, 255],
        ...,
        [204, 230, 255],
        [204, 230, 255],
        [180, 200, 230]],

       ...,

       [[180, 200, 230],
        [235, 245, 249],
        [235, 245, 249],
        ...,
        [204, 230, 255],
        [235, 245, 249],
        [180, 200, 230]],

       [[180, 200, 230],
        [235, 245, 249],
        [235, 245, 249],
        ...,
        [204, 230, 255],
        [204, 230, 255],
        [180, 200, 230]],

       [[180, 200, 230],
        [180, 200, 230],
        [180, 200, 230],
        ...,
        [180, 200, 230],
        [180, 200, 230],
        [180, 200, 230]]

In [140]:
for episode in range(10):
    state, _ = rec.reset()
    done = False
    
    print(f'*** EPISODE {episode} ***')
    time.sleep(1)
    
    for step in range(max_steps_per_episode):
        rec.render()
        
        #time.sleep(0.1)
        
        action = np.argmax(q_table[state,:])
        print(f'{action}', end='')
        
        new_state, reward, done, truncated, info = rec.step(action)
        
        if done:
            rec.render()
            if reward == 1:
                print('\nYEEAAHH!!')
            else:
                print('\nBUUUUH!!!')
            #time.sleep(1)
            break
            
        state = new_state

rec.save('video/frozen-lake.mp4')

*** EPISODE 0 ***
221112
YEEAAHH!!
*** EPISODE 1 ***
221112
YEEAAHH!!
*** EPISODE 2 ***
221112
YEEAAHH!!
*** EPISODE 3 ***
221112
YEEAAHH!!
*** EPISODE 4 ***
221112
YEEAAHH!!
*** EPISODE 5 ***
221112
YEEAAHH!!
*** EPISODE 6 ***
221112
YEEAAHH!!
*** EPISODE 7 ***
221112
YEEAAHH!!
*** EPISODE 8 ***
221112
YEEAAHH!!
*** EPISODE 9 ***
221112
YEEAAHH!!
