In [63]:
import gym
import numpy as np
import pandas as pd
import random
from matplotlib import pyplot as plt
import time
from IPython.display import clear_output

# Module-level FrozenLake instance plus the sizes of its discrete observation
# and action spaces. frozen_lake() and the rollout cells build their own env,
# so these globals are only a convenience for interactive inspection.
env = gym.make("FrozenLake-v0")
state_space_size = env.observation_space.n
action_space_size = env.action_space.n


In [64]:
# Training schedule: how many episodes to run and the step cap for each one.
num_episodes = 10000
max_steps_per_episode = 100

# Learning rate: weight given to the new estimate in each Q-table update.
lr = 0.1
gamma = 0.99 # discount factor: quantifies how much importance we give to future rewards

## Epsilon-decay schedule. When decay is enabled, epsilon is recomputed after every
## episode as min_epsilon + (max_epsilon - min_epsilon) * exp(-epsilon_decay * episode).
## In the runs recorded in this notebook, decay makes learning markedly faster.
min_epsilon = 1e-3
max_epsilon = 0.9
epsilon_decay = 1e-3

# Exploration rate: probability of sampling a random action instead of the greedy one.
# Training starts at max_epsilon, i.e. a 90% chance of exploring on each step.
epsilon = max_epsilon

#Q = np.zeros((state_space_size, action_space_size))


In [65]:
def frozen_lake(decay=False, epsilon=epsilon, num_episodes=num_episodes, split=1000):
    """Train a tabular Q-learning agent on FrozenLake-v0 with epsilon-greedy actions.

    The module-level hyperparameters ``lr``, ``gamma``, ``max_steps_per_episode``,
    ``min_epsilon``, ``max_epsilon`` and ``epsilon_decay`` are read at call time.

    Parameters
    ----------
    decay : bool
        When truthy, epsilon is recomputed after every episode as
        ``min_epsilon + (max_epsilon - min_epsilon) * exp(-epsilon_decay * episode)``.
        When falsy, epsilon keeps its initial value for the whole run.
    epsilon : float
        Initial exploration rate: the probability of sampling a random action
        instead of taking the arg-max of the current Q row.
    num_episodes : int
        Number of training episodes.
    split : int
        Size of the reporting window. After training, the mean episode reward of
        each consecutive window of ``split`` episodes is printed. A final shorter
        window is reported as well when ``num_episodes`` is not a multiple of
        ``split``.

    Returns
    -------
    rewards_all_episodes : list
        Total reward collected in each episode.
    Q : numpy.ndarray
        Learned Q-table, one row per state and one column per action.
    epsilon_all_episodes : list
        The initial epsilon, followed (only when ``decay`` is on) by the epsilon
        computed after each episode.

    Raises
    ------
    ValueError
        If ``split`` is not positive. Checked before training so that a long run
        is never thrown away at the reporting stage.
    """
    window = int(split)
    if window <= 0:
        raise ValueError("split must be a positive number of episodes")

    env = gym.make("FrozenLake-v0")
    try:
        action_space_size = env.action_space.n
        state_space_size = env.observation_space.n

        Q = np.zeros((state_space_size, action_space_size))
        rewards_all_episodes = []
        epsilon_all_episodes = [epsilon]

        for episode in range(num_episodes):
            state = env.reset()
            rewards_current_episode = 0       # reward accumulated over this episode's steps

            for step in range(max_steps_per_episode):
                if random.uniform(0, 1) > epsilon:
                    # Exploitation: pick the action with the highest Q value for this state
                    action = np.argmax(Q[state, :])
                else:
                    # Exploration: sample an action at random so unvisited states get tried
                    action = env.action_space.sample()

                # Author's note: transitions are stochastic, so the chosen action does
                # not always move the agent in the requested direction.
                new_state, reward, done, _ = env.step(action)

                # Q-learning update: blend the old estimate with the bootstrapped target
                Q[state, action] = (1 - lr) * Q[state, action] + lr * (reward + gamma * np.max(Q[new_state, :]))

                state = new_state
                rewards_current_episode += reward

                if done:
                    break

            if decay:
                # As episodes grow, epsilon shrinks: less exploration, more exploitation
                epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-epsilon_decay * episode)
                epsilon_all_episodes.append(epsilon)

            rewards_all_episodes.append(rewards_current_episode)
    finally:
        # Release the environment even if training is interrupted
        env.close()

    # Report the mean reward per window; the label is the index of the last episode in it
    rewards = np.asarray(rewards_all_episodes, dtype=float)
    for start in range(0, len(rewards), window):
        chunk = rewards[start:start + window]
        print(start + len(chunk), ":", str(chunk.mean()))

    return rewards_all_episodes, Q, epsilon_all_episodes


In [66]:
# Baseline run: with decay=False, epsilon keeps its initial value for every episode.
rewards_all_episodes, Q, epsilon_all_episodes = frozen_lake(decay=False)


1000 : 0.024000000000000014
2000 : 0.02000000000000001
3000 : 0.015000000000000006
4000 : 0.014000000000000005
5000 : 0.017000000000000008
6000 : 0.017000000000000008
7000 : 0.015000000000000006
8000 : 0.02100000000000001
9000 : 0.015000000000000006
10000 : 0.013000000000000005


In [68]:
# Same training with epsilon decay enabled. The results are bound to the same global
# names, and the rollout cells read the global Q, so they use whichever run executed last.
rewards_all_episodes, Q, epsilon_all_episodes = frozen_lake(decay=True)


1000 : 0.058000000000000045
2000 : 0.21600000000000016
3000 : 0.43200000000000033
4000 : 0.6230000000000004
5000 : 0.6650000000000005
6000 : 0.6870000000000005
7000 : 0.7000000000000005
8000 : 0.7480000000000006
9000 : 0.7330000000000005
10000 : 0.7390000000000005


In [61]:
%matplotlib inline
from matplotlib import animation
import matplotlib.pyplot as plt
from IPython.display import display

def display_frames_as_gif(frames):
    """
    Displays a list of frames as a looping animation, with playback controls.

    Parameters
    ----------
    frames : sequence of image arrays
        Images accepted by ``plt.imshow``. The first frame's ``shape[0]`` and
        ``shape[1]`` set the figure size at 72 dpi.

    Raises
    ------
    ValueError
        If ``frames`` is empty.
    """
    # Imported locally so this function does not depend on another cell's imports
    from IPython.display import HTML

    if len(frames) == 0:
        raise ValueError("frames must contain at least one image")

    fig = plt.figure(figsize=(frames[0].shape[1] / 72.0, frames[0].shape[0] / 72.0), dpi = 72)
    patch = plt.imshow(frames[0])
    plt.axis('off')

    def animate(i):
        patch.set_data(frames[i])

    anim = animation.FuncAnimation(fig, animate, frames = len(frames), interval=50)
    # The previous display_animation(...) call referenced a name that is not imported
    # or defined in this notebook; matplotlib's own JS/HTML player is used instead.
    display(HTML(anim.to_jshtml(default_mode='loop')))
    # Close the figure so the inline backend does not also emit a static first frame
    plt.close(fig)

In [71]:
# Watch 10 episodes rendered in place, at most 50 steps each.
env = gym.make("FrozenLake-v0")
EXPLOIT = False   # False: sample actions at random; True: act greedily on the global Q

for episode in range(10):
    state = env.reset()
    done = False
    print("*******EPISODDE ", episode+1, "*******\n\n\n\n")
    time.sleep(1)

    for step in range(50):
        # Redraw the grid for the current state, replacing the previous output
        clear_output(wait=True)
        env.render(mode='human')
        time.sleep(0.3)

        action = np.argmax(Q[state, :]) if EXPLOIT else env.action_space.sample()
        new_state, reward, done, info = env.step(action)

        if not done:
            state = new_state
            continue

        # Episode over: show the final grid and the outcome, then move on
        clear_output(wait=True)
        env.render()
        print("******You reached the goal!******" if reward == 1
              else "******You fell through a hole!******")
        time.sleep(3)
        clear_output(wait=True)
        break
env.close()

  (Up)
SFFF
F[41mH[0mFH
FFFH
HFFG
******You fell through a hole!******


In [70]:
# Watch 10 episodes rendered in place, at most 50 steps each.
env = gym.make("FrozenLake-v0")
EXPLOIT = True   # True: act greedily on the global Q; False: sample actions at random

for episode in range(10):
    state = env.reset()
    done = False
    print("*******EPISODDE ", episode+1, "*******\n\n\n\n")
    time.sleep(1)

    for step in range(50):
        # Redraw the grid for the current state, replacing the previous output
        clear_output(wait=True)
        env.render(mode='human')
        time.sleep(0.3)

        action = np.argmax(Q[state, :]) if EXPLOIT else env.action_space.sample()
        new_state, reward, done, info = env.step(action)

        if not done:
            state = new_state
            continue

        # Episode over: show the final grid and the outcome, then move on
        clear_output(wait=True)
        env.render()
        print("******You reached the goal!******" if reward == 1
              else "******You fell through a hole!******")
        time.sleep(3)
        clear_output(wait=True)
        break
env.close()

  (Down)
SFFF
FHFH
FF[41mF[0mH
HFFG
