In [1]:
# Import OpenAI gym 
import gym

In [2]:
# Import Numpy
import numpy as np

In [3]:
# Make an FrozenLake-v0 game envirnoment which is not slippery
env = gym.make('FrozenLake-v0', is_slippery = False)

In [4]:
# Reset the environment 
env.reset() 

0

In [5]:
# Render the environment 
env.render()


[41mS[0mFFF
FHFH
FFFH
HFFG


In [6]:
# Show the size of states and the size of actions 
print(env.observation_space.n)
print(env.action_space.n)

16
4


In [7]:
# Create and initialize the matrix of Q-values, which is named Q.
Q = np.array(np.zeros([env.observation_space.n,env.action_space.n]))

In [8]:
# Print Q
print(Q)

[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]


In [9]:
# Set the parameters alpha, gamma and the number of episodes (iterations) for Q-learning to be 0.6, 0.75 and 1000, respectively
gamma = 0.75
alpha = 0.6
num_episodes= 1000

In [10]:
# Training mode. Update the matrix of Q-values using the Bellman equation. 
import random
for episode in range(num_episodes):
    state = env.reset()
    done = False
    reward,total_rewards = 1,0
    while done != True:
        action = np.random.choice(env.action_space.n)
        new_state, reward, done, info = env.step(action)
        Q[state, action] = Q[state, action] +(reward + gamma * np.max(Q[new_state, :]) - Q[state, action])
        total_rewards += reward
        state = new_state

In [11]:
# Print the Q table
print(Q)   
print(Q[14, 2])

[[0.17797852 0.23730469 0.23730469 0.17797852]
 [0.17797852 0.         0.31640625 0.23730469]
 [0.23730469 0.421875   0.23730469 0.31640625]
 [0.31640625 0.         0.23730469 0.23730469]
 [0.23730469 0.31640625 0.         0.17797852]
 [0.         0.         0.         0.        ]
 [0.         0.5625     0.         0.31640625]
 [0.         0.         0.         0.        ]
 [0.31640625 0.         0.421875   0.23730469]
 [0.31640625 0.5625     0.5625     0.        ]
 [0.421875   0.75       0.         0.421875  ]
 [0.         0.         0.         0.        ]
 [0.         0.         0.         0.        ]
 [0.         0.5625     0.75       0.421875  ]
 [0.5625     0.75       1.         0.5625    ]
 [0.         0.         0.         0.        ]]
1.0


In [12]:
""" Inference Mode 
Utilize the updated matrix of Q-values above to find a route to move from S to G. 
The variable “route” is a list holding the states (in order) in the route. Initially, it holds just the state 0 as it starts at S. 
The variable “actions” is a list holding the actions done (in order) in moving along the route. Initially, it is empty.
"""
route = [0]
actions = [ ]
state = env.reset()
step = 0
done = False
while done != True:
    state = env.reset()
    step = 0
    for step in range(1000):
        action = np.argmax(Q[state,:])
        new_state, reward, done, info = env.step(action)
        route.append(new_state)
        actions.append(action)
        if new_state == 15:
          break
        state = new_state

In [13]:
# Print the route and the actions to take 
print(route)
print(actions)

[0, 4, 8, 9, 13, 14, 15]
[1, 1, 2, 1, 2, 2]


In [14]:
# Display the states of the environment and the actions done, one after another, in moving along the route, starting from S to G.
env.reset()
env.render()
for i in actions:
  env.step(i)
  env.render()


[41mS[0mFFF
FHFH
FFFH
HFFG
  (Down)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Down)
SFFF
FHFH
[41mF[0mFFH
HFFG
  (Right)
SFFF
FHFH
F[41mF[0mFH
HFFG
  (Down)
SFFF
FHFH
FFFH
H[41mF[0mFG
  (Right)
SFFF
FHFH
FFFH
HF[41mF[0mG
  (Right)
SFFF
FHFH
FFFH
HFF[41mG[0m
