# FrozenLake

In [1]:
import gym
import numpy as np
import random

In [116]:
from gym.envs.registration import register
from gym.error import Error as GymError

# Register a deterministic (non-slippery) variant of the 4x4 FrozenLake map
# under its own id so it can be created with gym.make().
#
# Some gym releases refuse to register an id twice, which makes this cell fail
# whenever it is re-run in a live kernel. Tolerate exactly that case so the
# cell is idempotent; any other registration error is re-raised unchanged.
try:
    register(
        id='FrozenLakeNotSlippery-v0',
        entry_point='gym.envs.toy_text:FrozenLakeEnv',
        kwargs={'map_name': '4x4', 'is_slippery': False},
        max_episode_steps=100,
        # NOTE(review): the previous comment called 0.8196 the optimum, but the
        # average score reported later in this notebook (0.9399) exceeds it, so
        # that figure presumably belongs to the slippery variant - confirm.
        # The author observed that changing this value has no visible effect.
        reward_threshold=0.8196,
    )
except GymError as exc:
    if 're-register' not in str(exc):
        raise

## 1. Create environment from gym

In [147]:
# Build the deterministic FrozenLake variant from its registered id.
env = gym.make(id="FrozenLakeNotSlippery-v0")

In [148]:
# Alternative environment to experiment with: env = gym.make("Taxi-v2")

In [149]:
# Display the observation space; for this environment it is Discrete(16).
env.observation_space

Discrete(16)

In [150]:
# Display the action space; for this environment it is Discrete(4).
env.action_space

Discrete(4)

## 2. Create Q-table

In [151]:
# Q-table dimensions: number of discrete states and number of discrete actions.
observation_size, action_size = env.observation_space.n, env.action_space.n

In [152]:
# One row per state, one column per action; every value starts at zero.
q_table = np.zeros(shape=(observation_size, action_size), dtype=float)

In [153]:
# Sanity check: the table should be (number of states, number of actions).
q_table.shape

(16, 4)

## 3. Initialize hyperparameters

Value-based reinforcement learning parameters

In [154]:
# Settings for the Q-learning update and the length of training.
alpha, gamma = 0.8, 0.95  # learning rate, discount factor
total_episodes = 20000    # number of training episodes
max_steps = 99            # upper bound on steps within a single episode

Exploration parameters

In [155]:
# Epsilon-greedy schedule: the exploration probability decays exponentially
# from max_epsilon towards min_epsilon as training progresses.
max_epsilon = 1.0    # exploration probability at the start of training
min_epsilon = 0.001  # floor that the decayed probability approaches
decay_rate = 0.001   # exponential decay constant, applied per episode
epsilon = 1.0        # current exploration probability (updated while training)

## 4. Implement the Q-learning algorithm

In [156]:
# Train a tabular Q-learning agent with an epsilon-greedy behaviour policy.
#
# Everything the loop mutates is reset here so that re-running this cell trains
# from scratch. Without the reset, a re-run would start from an already-trained
# table and play its first episode with the fully decayed exploration rate,
# while reward_record (and hence the reported score) started over.
q_table = np.zeros((observation_size, action_size))
epsilon = max_epsilon
reward_record = []  # total reward collected in each episode

for episode in range(total_episodes):
    state = env.reset()
    done = False
    current_reward = 0

    for step in range(max_steps):
        # Explore with probability epsilon, otherwise exploit the action that
        # currently has the highest estimated value in this state.
        if random.uniform(0, 1) < epsilon:
            action = env.action_space.sample()
        else:
            action = np.argmax(q_table[state, :])

        # Apply the action and observe the transition.
        new_state, reward, done, info = env.step(action)

        # Q-learning update: move Q(s, a) towards the bootstrapped target
        # r + gamma * max_a' Q(s', a'). Rows of states in which an episode ends
        # are never updated by this loop, so they stay at zero and contribute
        # nothing to the target on the final transition.
        td_target = reward + gamma * np.max(q_table[new_state, :])
        q_table[state, action] += alpha * (td_target - q_table[state, action])

        current_reward += reward
        state = new_state

        if done:
            break

    # Exponentially decay the exploration rate towards min_epsilon.
    epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-decay_rate * episode)

    reward_record.append(current_reward)
    


In [157]:
# Training summary: average reward per episode, the learned Q-table, and the
# exploration rate left after the final episode.
print("Score over time: ", np.sum(reward_record) / total_episodes, sep="")
print(q_table)
print(epsilon)

Score over time: 0.9399
[[0.73509189 0.77378094 0.77378094 0.73509189]
 [0.73509189 0.         0.81450625 0.77378094]
 [0.77378094 0.857375   0.77378094 0.81450625]
 [0.81450625 0.         0.77378092 0.77378094]
 [0.77378094 0.81450625 0.         0.73509189]
 [0.         0.         0.         0.        ]
 [0.         0.9025     0.         0.81450625]
 [0.         0.         0.         0.        ]
 [0.81450625 0.         0.857375   0.77378094]
 [0.81450625 0.9025     0.9025     0.        ]
 [0.857375   0.95       0.         0.857375  ]
 [0.         0.         0.         0.        ]
 [0.         0.         0.         0.        ]
 [0.         0.9025     0.95       0.857375  ]
 [0.9025     0.95       1.         0.9025    ]
 [0.         0.         0.         0.        ]]
0.0010000020611525913


In [158]:
# Reset to the start tile and draw the map, then print the greedy action for
# each tile laid out on the 4x4 grid. Tiles whose Q-table row is all zeros
# show action 0 because argmax returns the first index on ties.
# NOTE(review): presumably 0=left, 1=down, 2=right, 3=up - confirm against the
# FrozenLake documentation.
env.reset()
env.render()
print(q_table.argmax(axis=1).reshape((4, 4)))


[41mS[0mFFF
FHFH
FFFH
HFFG
[[1 2 1 0]
 [1 0 1 0]
 [2 1 1 0]
 [0 2 2 0]]
