In [1]:
import gym
import pygame
import numpy as np
import custom_frozen_lake
import time

import random
import matplotlib.pyplot as plt
from pathlib import Path
import pandas as pd
from datetime import datetime

pygame.init(); 

In [2]:
def test_map_generation(manual: bool=False):
    for i in range(1):
        env = gym.make(id='CustomFrozenLake', desc = None, map_name = None, map_size = 6, frozen_p = 0.5)
        print(env.reset())
        env.render()
        while(manual):
            event = pygame.event.wait()
            if event.type == pygame.KEYDOWN:
                break
        if not manual: time.sleep(1)
    env.close()

In [3]:
def test_actions(manual: bool=False):
    env = gym.make(id='CustomFrozenLake', desc = None, map_name = None, map_size = 6, frozen_p = 0.6)
    env.reset()
    for i in range(20):
        env.render()
        while(manual):
            event = pygame.event.wait()
            if event.type == pygame.KEYDOWN:
                break
        if not manual: time.sleep(1)
        
        action = env.action_space.sample()
        obs, reward, done,_ = env.step(action)
        print(obs, reward, action)
        if done:
            break
    env.close()

In [4]:
# test_map_generation()
# test_actions(1)

In [5]:
def set_up():
    b_slip = True
    map_size = 6
    frozen_p = 0.7

    env = gym.make(id='CustomFrozenLake', is_slippery=b_slip, desc = None, map_name = None, map_size = map_size, frozen_p = frozen_p)
    action_size = env.action_space.n
    state_size = env.observation_space.n
    # state_size = (env.observation_space[0].n, env.observation_space[1].n)
    #print(action_size, state_size)
    # qtable = np.zeros(state_size + (action_size,))
    qtable = np.zeros((state_size, action_size))
    return (env, qtable)

In [6]:
def train_model(env: gym.Env, qtable: np.ndarray, manual: bool = False,
                total_episodes: int=20000, learning_rate: float=0.6, max_steps: int=200, gamma: float=0.6, 
                epsilon: float=1, max_epsilon: float=1, min_epsilon: float=0, decay_rate: float=0.00005) -> list:
    ep_reward = []
    rewards_1000 = []
    render_interval = total_episodes // 10
    win = 0
    for episode in range(total_episodes):
        # Reset the environment
        state = env.reset()
        step = 0
        done = False
        total_rewards = 0
        
        for step in range(max_steps):
            # Choose an action a in the current world state (s)
            ## First we randomize a number
            exp_exp_tradeoff = random.uniform(0, 1)
            
            ## If this number > greater than epsilon --> exploitation (taking the biggest Q value for this state)
            if exp_exp_tradeoff > epsilon:
                action = np.argmax(qtable[state,:])

            # Else doing a random choice --> exploration
            else:
                action = env.action_space.sample()

            # Take the action (a) and observe the outcome state(s') and reward (r)
            new_state, reward, done, info = env.step(action)

            if episode % render_interval == 0:
                #env.render()
                while(manual):
                    event = pygame.event.wait()
                    if event.type == pygame.KEYDOWN:
                        break
                if not manual: time.sleep(0.5)

            # Update Q(s,a):= Q(s,a) + lr [R(s,a) + gamma * max Q(s',a') - Q(s,a)]
            # qtable[new_state,:] : all the actions we can take from new state
            # print('QTABLE', qtable[state + (action,)])
            # print('QTABLE2', np.max(qtable[new_state, :]))
            # print('QTABLE3', qtable[state + (action,)])

            qtable[state, action] = qtable[state, action] + learning_rate * (reward + gamma * np.max(qtable[new_state, :]) - qtable[state, action])
            # qtable[(state,) + (action,)] = qtable[(state,) + (action,)] + learning_rate * (reward + gamma * np.max(qtable[new_state,:]) - qtable[(state,) + (action,)])
            # currentq = qtable[state + (action,)]
            # maxfutureq = np.max(qtable[new_state])
            # newq = (1-learning_rate)*currentq+learning_rate*(reward+gamma*maxfutureq)
            # qtable[state+(action,)] = newq


            total_rewards += reward
            if reward > 0: win += 1
            
            # Update state
            state = new_state

            # Finish episode if agent reaches reward or hole
            if done == True: 
                break
            
        # Reduce epsilon (because we need less and less exploration)
        epsilon = min_epsilon + (max_epsilon - min_epsilon)*np.exp(-decay_rate*episode) 
        ep_reward.append(total_rewards)

        env, _ = set_up()

    rewards_1000 = np.add.reduceat(ep_reward, np.arange(0, len(ep_reward), 1000))
    #print(rewards_1000[-1])
    #print(epsilon)
    #write_files(qtable, rewards_1000, total_episodes, learning_rate, gamma, min_epsilon, decay_rate)
    env.close()
    print(win)
    return qtable, rewards_1000

In [7]:
def test_model(env: gym.Env, qtable: np.ndarray, manual: bool=True, max_steps: int=100):
    total_reward = 0
    win = 0
    total_episodes = 100
    render_interval = total_episodes // 5
    
    for episode in range(total_episodes):
        state = env.reset()
        step = 0
        done = False
        print("****************************************************")
        print("EPISODE ", episode)

        for step in range(max_steps):
            # Take the action (index) that have the maximum expected future reward given that state
            action = np.argmax(qtable[state,:])
            
            new_state, reward, done, info = env.step(action)
            if episode % render_interval == 0:
                env.render()
                while(manual):
                    event = pygame.event.wait()
                    if event.type == pygame.KEYDOWN:
                        break
                if not manual: time.sleep(1)
            if done:
                # Here, we decide to only print the last state (to see if our agent is on the goal or fall into an hole)

                # We print the number of step it took.
                print("Number of steps", step + 1)
                print("Reward:", reward)
                total_reward += reward
                if reward > 0: win += 1
                break
            state = new_state
        print("Timed out", step + 1, "steps")
        env, _ = set_up()
    print('Total rewards:', total_reward)
    print('Success rate:', win,'/',total_episodes)
    env.close()

In [8]:
env, qtable = set_up()
total_episodes = 20000
min_epsilon, max_epsilon = 0.0, 1.0

# calculate decay_rate needed to achieve 90% exploit chance at the final episode
gamma = -(np.log((0.1 - min_epsilon) / (max_epsilon - min_epsilon))) / total_episodes

qtable, rewards_1000 = train_model(env, qtable, False, total_episodes, gamma=gamma)

env, _ = set_up()
print(qtable)
test_model(env, qtable, False)

3048
[[-1.21192841e-09 -1.77092629e-05 -2.54753025e-10 -3.95255950e-07]
 [-9.38618505e-01 -2.41071916e-01  2.91882073e-10 -6.19150131e-04]
 [-1.14378758e-01 -3.51815792e-01 -7.02134527e-01  6.79117593e-08]
 [-8.59455672e-01 -1.58927068e-01 -6.18131526e-03 -6.01558764e-01]
 [-5.42440172e-10 -4.46915587e-02 -3.53845746e-05 -4.18880877e-02]
 [-2.57995878e-01 -7.56332877e-01 -7.37260300e-01 -9.99835251e-01]
 [-8.43018864e-01 -9.98951070e-01 -9.84548509e-01 -7.41619195e-01]
 [-9.99988982e-01 -9.99976675e-01 -9.99992114e-01 -9.99993523e-01]
 [-1.58244859e-02  1.03150046e-07 -3.36220406e-01 -6.00229180e-01]
 [-8.62127070e-01 -8.78415859e-01 -6.07261878e-01 -9.99014299e-01]
 [-7.59575057e-01 -8.41082058e-01 -9.84213105e-01 -7.34494471e-01]
 [-9.99999996e-01 -9.04004764e-01 -9.61109470e-01 -9.98985813e-01]
 [-6.56201189e-01 -2.56367838e-01 -9.77909721e-01 -7.15458327e-01]
 [-9.99998268e-01 -6.55355921e-01 -1.00000411e+00 -9.99999975e-01]
 [ 5.37009120e-01 -3.96533734e-01 -9.99997192e-01 -9.8460

In [9]:
env, qtable = set_up()