In [1]:
import gym
import pygame
import numpy as np
import custom_frozen_lake
import time

import random
import matplotlib.pyplot as plt
from pathlib import Path
import pandas as pd
from datetime import datetime

pygame.init(); 

DIR_STATE_FLAG = True

In [2]:
def test_map_generation(manual: bool=False):
    for i in range(1):
        env = gym.make(id='CustomFrozenLake', desc = None, map_name = None, map_size = 6, frozen_p = 0.5)
        print(env.reset())
        env.render()
        while(manual):
            event = pygame.event.wait()
            if event.type == pygame.KEYDOWN:
                break
        if not manual: time.sleep(1)
    env.close()

In [3]:
def test_actions(manual: bool=False):
    env = gym.make(id='CustomFrozenLake', desc = None, map_name = None, map_size = 6, frozen_p = 0.6)
    env.reset()
    for i in range(20):
        env.render()
        while(manual):
            event = pygame.event.wait()
            if event.type == pygame.KEYDOWN:
                break
        if not manual: time.sleep(1)
        
        action = env.action_space.sample()
        obs, reward, done,_ = env.step(action)
        print(obs, reward, action)
        if done:
            break
    env.close()

In [4]:
# test_map_generation()
# test_actions(1)

In [10]:
def set_up():
    b_slip = True
    map_size = 4
    frozen_p = 0.8

    env = gym.make(id='CustomFrozenLake', is_slippery=b_slip, desc = None, map_name = None, map_size = map_size, frozen_p = frozen_p)
    action_size = env.action_space.n

    if DIR_STATE_FLAG: state_size = (env.observation_space[0].n, env.observation_space[1].n) 
    else: state_size = env.observation_space.n

    if DIR_STATE_FLAG: qtable = np.zeros(state_size + (action_size,))
    else: qtable = np.zeros((state_size, action_size))
    #print(action_size, state_size)
    return (env, qtable)

In [6]:
def train_model(env: gym.Env, qtable: np.ndarray, manual: bool = False,
                total_episodes: int=20000, learning_rate: float=0.6, max_steps: int=200, gamma: float=0.6, 
                epsilon: float=1, max_epsilon: float=1, min_epsilon: float=0, decay_rate: float=0.00005) -> list:
    ep_reward = []
    rewards_1000 = []
    render_interval = total_episodes // 10
    win = 0
    for episode in range(total_episodes):
        # Reset the environment
        state = env.reset()
        step = 0
        done = False
        total_rewards = 0
        
        for step in range(max_steps):
            # Choose an action a in the current world state (s)
            ## First we randomize a number
            exp_exp_tradeoff = random.uniform(0, 1)
            
            ## If this number > greater than epsilon --> exploitation (taking the biggest Q value for this state)
            if exp_exp_tradeoff > epsilon:
                action = np.argmax(qtable[state])
                # print(episode, step, state, action)
            # Else doing a random choice --> exploration
            else:
                action = env.action_space.sample()

            # Take the action (a) and observe the outcome state(s') and reward (r)
            new_state, reward, done, info = env.step(action)

            if episode % render_interval == 0:
                #env.render()
                while(manual):
                    event = pygame.event.wait()
                    if event.type == pygame.KEYDOWN:
                        break
                if not manual: time.sleep(0.5)

            # Update Q(s,a):= Q(s,a) + lr [R(s,a) + gamma * max Q(s',a') - Q(s,a)]
            # qtable[new_state,:] : all the actions we can take from new state

            # print(episode, step, state, action)
            if DIR_STATE_FLAG: qtable[state + (action,)] = qtable[state + (action,)] + learning_rate * (reward + gamma * np.max(qtable[new_state]) - qtable[state + (action,)])
            else: qtable[state, action] = qtable[state, action] + learning_rate * (reward + gamma * np.max(qtable[new_state, :]) - qtable[state, action])

            total_rewards += reward
            if reward > 0: win += 1
            
            # Update state
            state = new_state

            # Finish episode if agent reaches reward or hole
            if done == True: 
                break
            
        # Reduce epsilon (because we need less and less exploration)
        epsilon = min_epsilon + (max_epsilon - min_epsilon)*np.exp(-decay_rate*episode) 
        ep_reward.append(total_rewards)

        env, _ = set_up()

    rewards_1000 = np.add.reduceat(ep_reward, np.arange(0, len(ep_reward), 1000))
    #print(rewards_1000[-1])
    #print(epsilon)
    #write_files(qtable, rewards_1000, total_episodes, learning_rate, gamma, min_epsilon, decay_rate)
    env.close()
    print(win)
    return qtable, rewards_1000

In [7]:
def test_model(env: gym.Env, qtable: np.ndarray, manual: bool=True, max_steps: int=100):
    total_reward = 0
    win = 0
    total_episodes = 100
    render_interval = total_episodes // 5
    
    for episode in range(total_episodes):
        state = env.reset()
        step = 0
        done = False
        print("****************************************************")
        print("EPISODE ", episode)

        for step in range(max_steps):
            # Take the action (index) that have the maximum expected future reward given that state
            action = np.argmax(qtable[state])
            # print(episode, step, state, action)
            
            new_state, reward, done, info = env.step(action)
            if episode % render_interval == 0:
                env.render()
                while(manual):
                    event = pygame.event.wait()
                    if event.type == pygame.KEYDOWN:
                        break
                if not manual: time.sleep(.3)
            if done:
                # Here, we decide to only print the last state (to see if our agent is on the goal or fall into an hole)

                # We print the number of step it took.
                print("Number of steps", step + 1)
                print("Reward:", reward)
                total_reward += reward
                if reward > 0: win += 1
                break
            state = new_state
        if not done: print("Timed out", step + 1, "steps")
        env, _ = set_up()
    print('Total rewards:', total_reward)
    print('Success rate:', win,'/',total_episodes)
    env.close()

In [8]:
env, qtable = set_up()
total_episodes = 50000
min_epsilon, max_epsilon = 0.0, 1.0

# calculate decay_rate needed to achieve 90% exploit chance at the final episode
gamma = -(np.log((0.1 - min_epsilon) / (max_epsilon - min_epsilon))) / total_episodes

qtable, rewards_1000 = train_model(env, qtable, False, total_episodes, gamma=gamma)
print(rewards_1000)
env, _ = set_up()
print(qtable)
test_model(env, qtable, False)

14177
[ 58.5  79.3  85.9  92.5  86.   94.7 103.5  88.1 130.1 133.2 135.4 137.7
 130.  145.4 179.4 138.7 166.4 176.4 167.4 167.5 197.1 211.4 177.9 196.2
 228.  223.7 221.7 221.6 224.9 236.9 248.7 247.5 249.9 243.  283.5 254.8
 274.5 322.6 298.6 296.3 297.6 300.1 340.5 293.3 316.8 350.8 315.3 355.8
 348.7 361.7]
[[[-3.71243466e-07 -8.25321914e-07 -6.03826665e-08  6.00002653e-01]
  [ 2.72571419e-07 -1.84732956e-06  4.58113318e-10 -2.32033065e-06]
  [-4.17672432e-11 -4.35259289e-08  2.70199079e-08  1.00655204e-05]
  [-1.21342679e-09 -1.49893384e-09 -2.49675303e-06 -3.26422093e-07]]

 [[-4.19982001e-03 -2.43528075e-02  6.64230467e-06 -9.65356939e-03]
  [-5.41367406e-02 -6.06783151e-02  2.82265986e-06 -4.49646431e-03]
  [-2.58225008e-02 -2.78421115e-02 -2.96413155e-06 -3.93288654e-02]
  [-8.58821187e-02 -3.37469099e-02  1.69810890e-07 -4.61404324e-03]]

 [[-1.57530405e-03 -2.79397237e-02 -3.60224498e-04  1.65793096e-05]
  [-2.55360299e-02 -9.94490945e-03 -7.13107603e-04  3.90215130e-08]
  [-

In [11]:
env, _ = set_up()
print(qtable)
test_model(env, qtable, False)

[[[-3.71243466e-07 -8.25321914e-07 -6.03826665e-08  6.00002653e-01]
  [ 2.72571419e-07 -1.84732956e-06  4.58113318e-10 -2.32033065e-06]
  [-4.17672432e-11 -4.35259289e-08  2.70199079e-08  1.00655204e-05]
  [-1.21342679e-09 -1.49893384e-09 -2.49675303e-06 -3.26422093e-07]]

 [[-4.19982001e-03 -2.43528075e-02  6.64230467e-06 -9.65356939e-03]
  [-5.41367406e-02 -6.06783151e-02  2.82265986e-06 -4.49646431e-03]
  [-2.58225008e-02 -2.78421115e-02 -2.96413155e-06 -3.93288654e-02]
  [-8.58821187e-02 -3.37469099e-02  1.69810890e-07 -4.61404324e-03]]

 [[-1.57530405e-03 -2.79397237e-02 -3.60224498e-04  1.65793096e-05]
  [-2.55360299e-02 -9.94490945e-03 -7.13107603e-04  3.90215130e-08]
  [-8.58872593e-02 -7.13746964e-04 -1.40578858e-02  6.64587089e-06]
  [-1.51776396e-04 -8.43275068e-02 -1.23306325e-04  3.40368263e-12]]

 [[-9.52601567e-02 -9.60522107e-02 -8.55608362e-02 -8.60217025e-04]
  [-9.84245281e-02 -9.58944601e-02 -9.02606441e-02 -8.48602299e-02]
  [-9.98617326e-02 -9.61599774e-02 -8.4270

In [12]:
np.save('custom_test.npy' ,qtable)

In [14]:
qload = np.load('custom_test.npy')