# Q-learning with epsilon-greedy exploration
On Frozen lake; https://gym.openai.com/envs/#toy_text

In [1]:
import gym
from gym.envs.toy_text import frozen_lake, discrete
from gym.envs.registration import register

register(
    id='Deterministic-8x8-FrozenLake-v0',
    entry_point='gym.envs.toy_text.frozen_lake:FrozenLakeEnv',
    kwargs={'map_name': '8x8',
            'is_slippery': False})

register(
    id='Stochastic-8x8-FrozenLake-v0',
    entry_point='gym.envs.toy_text.frozen_lake:FrozenLakeEnv',
    kwargs={'map_name': '8x8',
            'is_slippery': True})

In [2]:
import numpy as np

In [3]:
# Set policy to be epsilon-greedy with regards to Q
def sample_policy(Q, policy, state, epsilon = 0.05):
    if np.random.random_sample() > epsilon:
        # select best action
        return policy[state]
    else:
        # return random action from Q
        return np.random.choice(Q[state].shape[0])

In [4]:
def run_episode(Q, policy, state=0, alpha = 0.05, gamma = 0.9, epsilon=0.05):
    observation = env.reset()
    max_steps=100
    rewardTot = 0

    # Loop
    for t in range(max_steps):
        #env.render()

        # Sample action a_t from pi(s_t)
        action = sample_policy(Q, policy, state, epsilon)

        # Observe (r_t, s_t+1)
        observation, reward, done, info = env.step(action)

        # Update Q given (s_t, a_t, r_T, s_t+1)
        # Q(s_t, a_t) <- Q(s_t, a_t) + alpha*(r_t + gamma*max_a'(Q(s_t+1, a')) - Q(s_t, a_t))
        Q[state][action] += alpha*(reward + gamma*max(Q[observation]) - Q[state, action])
        
        if done:
            # Hack to update actions leading to negative terminal states
            Q[state][action] += alpha*(reward - Q[state, action])
            

        # Perform policy improvement
        # pi(s_t) = argmax_a(q(s_t, a))
        policy[state] = np.argmax(Q[state])

        state = observation
        rewardTot += reward

        if done:
            #print("Episode finished after {} timesteps".format(t+1))
            break
    
    return Q, policy, rewardTot

In [5]:
def exploit(Q, policy, state=0):
    observation = env.reset()
    max_steps=100
    rewardTot = 0

    # Loop
    for t in range(max_steps):
#        env.render()

        # Sample action a_t from pi(s_t)
        action = policy[state]

        # Observe (r_t, s_t+1)
        observation, reward, done, info = env.step(action)

    #    t = t+1
        state = observation
        
        rewardTot += reward

        if done:
#            print("Episode finished after {} timesteps".format(t+1))
            break
    
    return Q, policy, rewardTot

# Deterministic Environment

In [6]:
env = gym.make('Deterministic-8x8-FrozenLake-v0')
stateActionSpace = (env.nS, env.nA)
Q = np.ones(stateActionSpace)/10
policy = np.zeros(env.nS, dtype=int)

# Training
count_success = 0
episodes = 1000
for t in range(episodes):
    Q, policy, rewardTot = run_episode(Q, policy, epsilon=0.05)
    if rewardTot > 0:
        count_success += 1
print(count_success)

# Exploit
count_success = 0
episodes = 100
for t in range(episodes):
    Q, policy, rewardTot = exploit(Q, policy)
    if rewardTot > 0:
        count_success += 1
print(count_success)

  result = entry_point.load(False)


805
100


In [7]:
policy

array([2, 2, 2, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 1, 2, 1, 0, 1, 1, 0, 2, 2,
       2, 1, 2, 2, 2, 2, 1, 0, 2, 1, 3, 0, 3, 0, 2, 2, 2, 1, 1, 0, 0, 3,
       3, 3, 0, 1, 2, 0, 2, 0, 0, 1, 0, 1, 3, 1, 1, 0, 1, 2, 2, 0])

In [8]:
Q

array([[0.09902432, 0.05312387, 0.26533161, 0.11286854],
       [0.11888549, 0.08139651, 0.29481296, 0.0868545 ],
       [0.16976234, 0.0961745 , 0.32756998, 0.10861084],
       [0.13070887, 0.36396665, 0.09280044, 0.15315077],
       [0.07290482, 0.24779959, 0.07322396, 0.07330038],
       [0.07750351, 0.1779038 , 0.07810284, 0.07837858],
       [0.08204794, 0.08210392, 0.08194849, 0.08183201],
       [0.08433242, 0.08866192, 0.08433051, 0.08433891],
       [0.05707376, 0.05711487, 0.07078875, 0.05690351],
       [0.06102885, 0.06099156, 0.17033293, 0.06105352],
       [0.06512938, 0.06549011, 0.31218165, 0.0653082 ],
       [0.10701106, 0.05184222, 0.40440739, 0.12978284],
       [0.17024282, 0.11312254, 0.44934154, 0.10163195],
       [0.1801841 , 0.49926838, 0.09252147, 0.08872239],
       [0.08438961, 0.08487626, 0.20246972, 0.08476423],
       [0.08711056, 0.46071645, 0.08692452, 0.08669729],
       [0.06149643, 0.06139858, 0.06146909, 0.0610522 ],
       [0.06432794, 0.0648082 ,

# Stochastic Next-State

In [9]:
env = gym.make("Stochastic-8x8-FrozenLake-v0")
stateActionSpace = (env.nS, env.nA)
Q = np.ones(stateActionSpace)/10
policy = np.zeros(env.nS, dtype=int)

# Training
count_success = 0
episodes = 100000
for t in range(episodes):
    Q, policy, rewardTot = run_episode(Q, policy, epsilon=0.05)
    if rewardTot > 0:
        count_success += 1
print(count_success)

# Exploit
count_success = 0
episodes = 1000
for t in range(episodes):
    Q, policy, rewardTot = exploit(Q, policy)
    if rewardTot > 0:
        count_success += 1
print(count_success)

6192
94


In [10]:
policy

array([1, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 1, 2, 2, 1, 1, 1, 2, 0, 1, 2,
       1, 1, 1, 2, 2, 2, 1, 0, 2, 2, 1, 1, 1, 0, 1, 1, 3, 2, 1, 0, 0, 3,
       2, 0, 0, 2, 3, 0, 3, 0, 0, 0, 0, 2, 2, 2, 2, 0, 2, 2, 1, 0])

In [11]:
Q

array([[0.01651028, 0.01753517, 0.01680936, 0.01672343],
       [0.01829638, 0.01933268, 0.02095914, 0.01927081],
       [0.02330581, 0.02303497, 0.02628005, 0.02237178],
       [0.0261381 , 0.02770425, 0.03138157, 0.02555673],
       [0.02983019, 0.02976232, 0.03264633, 0.02979662],
       [0.03395294, 0.03452448, 0.03709067, 0.0346706 ],
       [0.04047912, 0.04011374, 0.04541233, 0.03887344],
       [0.04466856, 0.04372812, 0.04975323, 0.04319855],
       [0.01838354, 0.01951367, 0.01885608, 0.01770295],
       [0.02068726, 0.02156359, 0.022474  , 0.02132186],
       [0.026187  , 0.02727449, 0.02950282, 0.02678904],
       [0.0340947 , 0.03211652, 0.03978601, 0.02924798],
       [0.03362064, 0.03722421, 0.03506622, 0.03434472],
       [0.03825183, 0.03844505, 0.04265316, 0.03801734],
       [0.04938037, 0.048696  , 0.05645839, 0.0468888 ],
       [0.05778968, 0.06196513, 0.05541034, 0.0536842 ],
       [0.02147761, 0.02312187, 0.02154992, 0.02143408],
       [0.02338347, 0.0282825 ,