In [1]:
import numpy as np

# Set Up Env.
grid_rows = 2
grid_cols = 3
num_actions = 4

# Define the reward matrix
rewards = np.array([
    [-10, 1, 0],
    [0, -10, 10]
]) # Here we have initialized the start state to be -10 as per the instructions given in the pdf

# Define the Q-Table
q_table = np.zeros((grid_rows, grid_cols, num_actions)) # Here we define a numpy 3d array that contains zeros throughout the array

# Set hyper-parameters
alpha = 0.1  # learning rate
gamma = 0.9  # discount factor
epsilon = 0.1  # exploration rate

In [2]:
# Defining a function to help us move in the grid
def move(action, current_state):
    next_state = current_state
    if action == 0:  # In this step, we are moving upwards in the column
        if current_state[0] > 0: # Here, we check to make sure we are not currently in the top row else we cannot move upwards
            next_state = (current_state[0] - 1, current_state[1]) # If we get to this point, there is space to move upwards in the column so we update the index by reducing 1 to 'move upwards'
    elif action == 1:  # In this step, we are moving downwards in the column
        if current_state[0] < grid_rows - 1: # Here, we check to make sure we are not currently in the bottom row else we cannot move downwards
            next_state = (current_state[0] + 1, current_state[1]) # If we get to this point, there is space to move downwards in the column so we update the index by adding 1 to 'move downwards'
    elif action == 2:  # In this step, we are moving left in the row
        if current_state[1] > 0: # Here, we check to make sure we are not currently in the left-most row else we cannot move left
            next_state = (current_state[0], current_state[1] - 1) # If we get to this point, there is space to move left in the row so we update the index by reducing 1 to 'move left'
    elif action == 3:  # In this step, we are moving right in the row
        if current_state[1] < grid_cols - 1: # Here, we check to make sure we are not currently in the right-most row else we cannot move right
            next_state = (current_state[0], current_state[1] + 1) # If we get to this point, there is space to move right in the row so we update the index by adding 1 to 'move right'

    return next_state

In [3]:
def q_value(current_state, next_state, action, reward):
  if next_state is not None:
    q_table[current_state][action] += alpha * (reward + gamma * np.max(q_table[next_state]) - q_table[current_state][action]) # This is the formula as shown in class to determine the q value

In [4]:
# Defining the function for the Q learning algorithm
def q_learn(episodes):
  for episode in range(episodes):
    current_state = (0, 0)  # This is the start state and will also hold the current state.
    completed = False # This will loop the program and will be updated when the goal is achieved or isn't reached in less than 4 moves.
    number_of_moves = 0 # This keeps track of the number of moves made so far and is updated with every move.

    while not completed:
      randomNo = np.random.uniform() # This generates a random number between 0 and 1 to be used in exploration vs exploitation

      # Exploration versus exploitation trade-off
      if randomNo < epsilon:
        action = np.random.randint(num_actions)  # To explore, a random action will be selected amongst up, down, left, and right
      else:
        action = np.argmax(q_table[current_state])  # To exploit, we select the action with the highest Q value to get the best chance at getting rewarded

      next_state = current_state  # next_state is initialized to the current state. If there are no valid movements based on the action selected, the next state will remain the same as the current state.
      reward = None # The reward for the current action has not yet been determined but will be updated based on the next state transition

      next_state = move(action,current_state) # We call the `move` function and set its parameters

      if next_state is not None: # This section will be run if a valid movement was made
        reward = rewards[next_state] # We are rewarded based on the reward matrix defined for each state
        if(reward == 10):
          completed = True # We have reached the goal state
      else:
        reward = -1 # We haven't reached the goal state so we are given a -1 reward
        number_of_moves += 1 # we have moved once so we add 1 to the number of moves made

      q_value(current_state, next_state, action, reward) # We call the `q_value` function and set its parameters

      current_state = next_state

      if number_of_moves >= 4: # We stop the episode if the number of moves exceeds 3
        completed = True

In [5]:
q_learn(20_000) # We have set the number of episodes to 20000 as per instruction

In [6]:
print(q_table)

[[[-1.  0. -1. 10.]
  [10. -1. -1.  9.]
  [ 9. 10. 10.  9.]]

 [[-1.  0.  0. -1.]
  [10. -1.  0. 10.]
  [ 0.  0.  0.  0.]]]
