# Frozen Lake environment

*Winter is here. You and your friends were tossing around a frisbee at the park when you made a wild throw that left the frisbee out in the middle of the lake. The water is mostly frozen, but there are a few holes where the ice has melted. If you step into one of those holes, you'll fall into the freezing water. At this time, there's an international frisbee shortage, so it's absolutely imperative that you navigate across the lake and retrieve the disc. However, the ice is slippery, so you won't always move in the direction you intend.*

The surface is described using a grid like the following:

**SFFF**       (S: starting point, safe)

**FHFH**       (F: frozen surface, safe)

**FFFH**      (H: hole, fall to your doom)

**HFFG**       (G: goal, where the frisbee is located)





In [0]:
import numpy as np
import gym
import random
import time
from IPython.display import clear_output

In [36]:
# Create the FrozenLake environment and read the sizes of its discrete
# observation (state) and action spaces.
env = gym.make("FrozenLake-v0")

state_space_size = env.observation_space.n
action_space_size = env.action_space.n

# Q-table: one row per state, one column per action, every entry starts at 0.
q_table = np.zeros(shape=(state_space_size, action_space_size))

print(state_space_size, action_space_size)
print(q_table)


16 4
[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]


**Without training**

In [15]:
# Baseline: watch a few episodes played with uniformly random actions, so the
# untrained behaviour can be compared with the learned policy further down.
for episode in range(3):
    state = env.reset()
    reward = 0
    print("*****EPISODE ", episode+1, "*****\n\n\n\n")
    time.sleep(1)

    for step in range(100):
        # Redraw the board in place to animate the walk.
        clear_output(wait=True)
        env.render()
        time.sleep(0.3)

        action = env.action_space.sample()
        state, reward, done, info = env.step(action)
        if done:
            break

    # Show the final position and report how the episode really ended.
    clear_output(wait=True)
    env.render()
    # A finished episode with zero reward is not necessarily a fall: `done` is
    # also set when the episode hits its step limit (gym registers
    # FrozenLake-v0 with one), and the loop above can simply run out of steps.
    # So inspect the tile the agent is standing on instead of guessing.
    if reward == 1:
        print("****You reached the goal!****")
    elif env.unwrapped.desc.flat[state] == b"H":
        print("****You fell through a hole!****")
    else:
        print("****You ran out of steps!****")
    time.sleep(3)
    clear_output(wait=True)

  (Up)
SFFF
F[41mH[0mFH
FFFH
HFFG
****You fell through a hole!****


**Training**

In [0]:
# Q-learning hyperparameters and training bookkeeping.

# How long to train: number of episodes and the step cap for each of them.
num_episodes = 10000
max_steps_per_episode = 100

# Q-update coefficients: step size and discount applied to future value.
learning_rate = 0.1
discount_rate = 0.99

# Epsilon-greedy exploration schedule: the rate starts at the maximum and is
# decayed exponentially towards the minimum after every training episode.
max_exploration_rate = 1
min_exploration_rate = 0.01
exploration_decay_rate = 0.001
exploration_rate = max_exploration_rate

# Total reward collected in each training episode, in episode order.
rewards_all_episodes = []

In [0]:
# Train the Q-table with tabular Q-learning and an epsilon-greedy policy.

# Start from a clean slate so that re-running this cell retrains from scratch.
# Without this, a second run would keep updating the already-trained table,
# begin with the fully decayed exploration rate, and append another
# `num_episodes` entries to the reward log, skewing the per-window report.
q_table = np.zeros((env.observation_space.n, env.action_space.n))
exploration_rate = max_exploration_rate
rewards_all_episodes = []

for episode in range(num_episodes):
    state = env.reset()
    rewards_current_episode = 0

    for step in range(max_steps_per_episode):
        # Exploration-exploitation trade-off: exploit the current estimates
        # with probability (1 - exploration_rate), otherwise act randomly.
        if random.uniform(0, 1) > exploration_rate:
            action = np.argmax(q_table[state, :])
        else:
            action = env.action_space.sample()

        new_state, reward, done, info = env.step(action)

        # Update Q-table for Q(s,a): blend the old estimate with the
        # one-step target  reward + discount * max_a' Q(s', a').
        q_table[state, action] = q_table[state, action] * (1 - learning_rate) + \
            learning_rate * (reward + discount_rate * np.max(q_table[new_state, :]))

        state = new_state
        rewards_current_episode += reward

        if done:
            break

    # Exploration rate decay: exponential in the episode index, bounded below
    # by min_exploration_rate.
    exploration_rate = min_exploration_rate + \
        (max_exploration_rate - min_exploration_rate) * np.exp(-exploration_decay_rate * episode)

    rewards_all_episodes.append(rewards_current_episode)

In [39]:
# Report the mean episode reward over consecutive windows of training episodes.
# The windows are cut from the rewards that were actually recorded (not from
# `num_episodes`), so the labels and averages stay correct even if the log
# length is not what the config says; a shorter final window is reported too.
episodes_per_window = 1000
rewards_array = np.array(rewards_all_episodes)
rewards_per_thousand_episodes = [
    rewards_array[start:start + episodes_per_window]
    for start in range(0, len(rewards_array), episodes_per_window)
]
# The original, misspelled name is kept as an alias in case another cell uses it.
rewards_per_thosand_episodes = rewards_per_thousand_episodes

print("********Average reward per thousand episodes********\n")
count = 0
for r in rewards_per_thousand_episodes:
    count += len(r)
    # mean() divides once by the real window length, avoiding both the
    # hard-coded 1000 and the rounding noise of summing pre-divided values.
    print(count, ": ", str(r.mean()))

# check learned q_table
print(q_table)

********Average reward per thousand episodes********

1000 :  0.04100000000000003
2000 :  0.21400000000000016
3000 :  0.3870000000000003
4000 :  0.5940000000000004
5000 :  0.6600000000000005
6000 :  0.6790000000000005
7000 :  0.6970000000000005
8000 :  0.6480000000000005
9000 :  0.6820000000000005
10000 :  0.7070000000000005
[[0.60516995 0.5386453  0.52408858 0.51957365]
 [0.36906431 0.42897573 0.28802314 0.54637556]
 [0.42749444 0.4300228  0.42792291 0.49349829]
 [0.22872645 0.27780515 0.31199003 0.4568145 ]
 [0.62821896 0.34605081 0.40819195 0.37774986]
 [0.         0.         0.         0.        ]
 [0.15883225 0.11373592 0.48072785 0.15307569]
 [0.         0.         0.         0.        ]
 [0.46820115 0.4226666  0.44646938 0.68243833]
 [0.42833862 0.76187934 0.50419697 0.37672503]
 [0.7694025  0.38931128 0.42103453 0.33145846]
 [0.         0.         0.         0.        ]
 [0.         0.         0.         0.        ]
 [0.39430926 0.60968332 0.81070886 0.39377397]
 [0.75984402 0.

**Play**

In [40]:
# Watch a few episodes played greedily with the learned Q-table: in every
# state the agent takes the action with the highest Q-value.
for episode in range(3):
    state = env.reset()
    reward = 0
    print("*****EPISODE ", episode+1, "*****\n\n\n\n")
    time.sleep(1)

    for step in range(max_steps_per_episode):
        # Redraw the board in place to animate the walk.
        clear_output(wait=True)
        env.render()
        time.sleep(0.3)

        action = np.argmax(q_table[state, :])
        state, reward, done, info = env.step(action)
        if done:
            break

    # Show the final position and report how the episode really ended.
    clear_output(wait=True)
    env.render()
    # A finished episode with zero reward is not necessarily a fall: `done` is
    # also set when the episode hits its step limit (gym registers
    # FrozenLake-v0 with one), and the loop above can simply run out of steps.
    # So inspect the tile the agent is standing on instead of guessing.
    if reward == 1:
        print("****You reached the goal!****")
    elif env.unwrapped.desc.flat[state] == b"H":
        print("****You fell through a hole!****")
    else:
        print("****You ran out of steps!****")
    time.sleep(3)
    clear_output(wait=True)

  (Down)
SFFF
FHFH
FFFH
HFF[41mG[0m
****You reached the goal!****
