## Libraries

In [1]:
import gym
import random
import numpy as np
import time

## Arguments

In [2]:
expl = 1; min_expl = 0.01; max_expl = 1
learning_rate = 0.8
gamma = 0.95
expl_decay_rate = 0.005
max_t = 99; max_eps = 1000; max_epochs = 15

## Initialization

In [3]:
render = False; LOG = False
env = gym.make("FrozenLake-v1")
q = np.zeros([env.observation_space.n, env.action_space.n]) # Q-value matrix

## Training

In [4]:
rewards = []

for epoch in range(max_epochs):
    
    total_epoch_rewards = 0
    
    for episode in range(max_eps):
        
        obs = env.reset()
        done = False
        t = 0
        
        while True:
            t += 1
            if render:
                env.render()
            # Randomly choose whether to explore or exploit based on probability expl
            if random.uniform(0,1) > expl:
                action = q[obs].argmax() # Exploit
            else:
                action = env.action_space.sample() # Explore

            new_obs, reward, done, info = env.step(action)
            # New Q value of state S and action A is equal to some of the old Q value
            # and the rest of the next Q value times gamma plus reward
            q[obs, action] = (1-learning_rate) * q[obs,action] + learning_rate * (reward + gamma*np.max(q[new_obs]))
            obs = new_obs

            if LOG: print(f"Action: {action},   Observation: {obs},   Reward: {reward}")
            if done:
                if LOG: 
                    print(f"! ---------------- GAME OVER ---------------- !")
                    if reward == 1: print("! ---------------- YOU WON ---------------- !")
                break
                
        total_epoch_rewards += reward
        expl = min_expl + (max_expl - min_expl)*np.exp(-expl_decay_rate*(episode+1)*(epoch+1)) 
#         expl -= expl_decay_rate; expl = max(min(expl, max_expl), min_expl)

    rewards.append(total_epoch_rewards)
    # Improvement every epoch
    print ("-------------- Score over time --------------")
    print (f"Score for this epoch: {rewards[epoch]*100/max_eps}% success rate")

    # The percentage success rate is expected be worse during training due to exploration i.e. random action

-------------- Score over time --------------
Score for this epoch: 17.9% success rate
-------------- Score over time --------------
Score for this epoch: 31.2% success rate
-------------- Score over time --------------
Score for this epoch: 36.7% success rate
-------------- Score over time --------------
Score for this epoch: 44.2% success rate
-------------- Score over time --------------
Score for this epoch: 38.9% success rate
-------------- Score over time --------------
Score for this epoch: 48.7% success rate
-------------- Score over time --------------
Score for this epoch: 43.4% success rate
-------------- Score over time --------------
Score for this epoch: 52.0% success rate
-------------- Score over time --------------
Score for this epoch: 44.0% success rate
-------------- Score over time --------------
Score for this epoch: 45.8% success rate
-------------- Score over time --------------
Score for this epoch: 44.0% success rate
-------------- Score over time ------------

In [5]:
# Resulted weights (Q-values)
print(f"------------ NEW MATRIX ------------ \n {q}")
time.sleep(5)

------------ NEW MATRIX ------------ 
 [[2.55262413e-01 1.02148042e-01 2.30091199e-01 9.02824539e-02]
 [1.36348608e-02 8.88763786e-03 2.55470230e-03 3.05152470e-01]
 [6.86141886e-03 4.97175663e-02 1.39986979e-02 2.62598746e-01]
 [7.18517817e-04 1.20253883e-03 7.97635689e-04 1.18684992e-01]
 [3.74958048e-01 1.16276348e-02 1.54936395e-03 2.13141095e-03]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [1.24474492e-05 4.16912426e-06 2.17537696e-01 2.86595148e-06]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [1.33985885e-02 6.30755948e-04 1.26496439e-02 3.49547613e-01]
 [2.53406695e-04 2.46218561e-01 7.63963578e-04 1.63145019e-02]
 [7.76579616e-01 2.65439681e-04 1.23402476e-03 2.22147848e-03]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [1.30270596e-01 6.20280730e-01 8.57183468e-01 9.26525249e-02]
 [1.60044519e-01 9.84626561e-01 2.72438867e-01 2.28228929e-01]
 [0.00000000e+00

### Test

In [6]:
env.reset()

total_rewards = 0

for episode in range(1000):
    obs = env.reset()
    done = False

    while True:

        action = q[obs].argmax()

        new_obs, reward, done, info = env.step(action)

        obs = new_obs

        if done:
            break

    total_rewards += reward

# Improvement every epoch
print ("          -------------- Score --------------")
print (f"Percentage score: {total_rewards/1000*100}%")

          -------------- Score --------------
Percentage score: 72.6%


## Simulate

In [9]:
obs = env.reset()
t = 0

while True:
    t += 1
    env.render()
    action = q[obs].argmax() 
    obs, reward, done, info = env.step(action)
    print(f"Action: {action},   Observation: {obs},   Reward: {reward}")
    time.sleep(1)
    if done:
        print(f"! ---------------- GAME OVER ---------------- !")
        if reward == 1: print("! ----------------- YOU WON ----------------- !")
        break


[41mS[0mFFF
FHFH
FFFH
HFFG
Action: 0,   Observation: 0,   Reward: 0.0
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
Action: 0,   Observation: 0,   Reward: 0.0
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
Action: 0,   Observation: 4,   Reward: 0.0
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
Action: 0,   Observation: 0,   Reward: 0.0
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
Action: 0,   Observation: 0,   Reward: 0.0
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
Action: 0,   Observation: 0,   Reward: 0.0
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
Action: 0,   Observation: 4,   Reward: 0.0
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
Action: 0,   Observation: 4,   Reward: 0.0
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
Action: 0,   Observation: 0,   Reward: 0.0
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
Action: 0,   Observation: 0,   Reward: 0.0
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
Action: 0,   Observation: 4,   Reward: 0.0
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
Action: 0,   Observation: 0,   Reward: 0.0
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFF