In [1]:
import numpy as np

In [2]:
# Environment configuration for a small grid world.
# Cells are addressed as (row, column).
env_rows = 3
env_columns = 3

start_state = (2, 0)
goal_state = (0, 2)

# Seed NumPy's global RNG with a fixed constant so that the random draws made
# during training are the same on every "Restart & Run All". (Seeding with a
# value that is itself drawn from the unseeded RNG gives no reproducibility.)
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# Define actions
# Numeric action codes: 0 = up, 1 = right, 2 = down, 3 = left
actions = ['up', 'right', 'down', 'left']

# 3D array holding the current Q-value of every (row, column, action)
# triple, Q(s, a). The last axis is sized from `actions` so the two cannot
# drift apart.
q_values = np.zeros((env_rows, env_columns, len(actions)))

# Reward received on entering each cell. Every cell is 0 except the goal (+1)
# and the penalty cell at (1, 1) (-10).
rewards = np.full((env_rows, env_columns), 0)
rewards[goal_state] = 1
rewards[1, 1] = -10
print(rewards)

[[  0   0   1]
 [  0 -10   0]
 [  0   0   0]]


In [3]:
def is_terminal_state(current_row_index, current_column_index):
    """Report whether the given grid cell is a terminal state.

    A cell is treated as terminal exactly when its entry in the module-level
    `rewards` array is nonzero.

    Args:
        current_row_index: Row index into `rewards`.
        current_column_index: Column index into `rewards`.

    Returns:
        The result of comparing the cell's reward against zero (truthy for a
        terminal cell, falsy otherwise).
    """
    cell_reward = rewards[current_row_index, current_column_index]
    return cell_reward != 0.

def get_next_action(start_state_x, start_state_y, epsilon): #epsilon-greedy
    """Pick an action index for a cell using an epsilon-greedy policy.

    Args:
        start_state_x: Row index of the cell (first axis of `q_values`).
        start_state_y: Column index of the cell (second axis of `q_values`).
        epsilon: Exploration threshold. A uniform draw from [0, 1) is compared
            against it, so 0 means always greedy and 1 means always random.

    Returns:
        An integer action index along the last axis of `q_values`.
    """
    if np.random.random() < epsilon:
        # Explore: pick uniformly among all actions. The count is read from
        # the Q-table's last axis instead of a hard-coded literal so the two
        # cannot disagree.
        return np.random.randint(q_values.shape[-1])
    # Exploit: take the action with the highest current Q-value for this cell
    # (np.argmax returns the first maximum when several actions tie).
    return np.argmax(q_values[start_state_x, start_state_y])
    

def get_next_location(current_row_index, current_column_index, action_index):
    """Return the cell reached by applying an action from the given cell.

    The action name is looked up in the module-level `actions` list. A move
    that would leave the grid (as bounded by `env_rows` / `env_columns`) is
    ignored, and the position is returned unchanged.

    Args:
        current_row_index: Row index of the current cell.
        current_column_index: Column index of the current cell.
        action_index: Index into `actions` selecting the move.

    Returns:
        A `(row, column)` tuple for the resulting cell.
    """
    move = actions[action_index]
    row, col = current_row_index, current_column_index

    if move == 'up':
        if row > 0:
            row -= 1
    elif move == 'down':
        if row < env_rows - 1:
            row += 1
    elif move == 'left':
        if col > 0:
            col -= 1
    elif move == 'right':
        if col < env_columns - 1:
            col += 1

    return row, col

def get_shortest_path(start_row_index, start_column_index):
    """Follow the greedy policy of `q_values` from a start cell to a terminal cell.

    At every step the action with the highest Q-value for the current cell is
    taken. No exploration is used, so the result depends only on the current
    contents of `q_values` and not on any exploration rate or random draw.

    Args:
        start_row_index: Row index of the starting cell.
        start_column_index: Column index of the starting cell.

    Returns:
        A list of `[row, column]` pairs beginning with the start cell and
        ending with the first terminal cell reached. If the start cell is
        itself terminal, the list contains only the start cell.

    Raises:
        RuntimeError: If the greedy policy returns to a cell it has already
            visited. A deterministic policy would then repeat forever, which
            usually means the Q-table has not been trained enough.
    """
    current_row_index, current_column_index = start_row_index, start_column_index
    shortest_path = [[current_row_index, current_column_index]]
    # Cells already on the path; revisiting one means the greedy walk cycles.
    visited = {(int(current_row_index), int(current_column_index))}

    while not is_terminal_state(current_row_index, current_column_index):
        # Purely greedy choice: the path must not contain exploratory moves.
        action_index = np.argmax(q_values[current_row_index, current_column_index])
        current_row_index, current_column_index = get_next_location(
            current_row_index, current_column_index, action_index)

        cell = (int(current_row_index), int(current_column_index))
        if cell in visited:
            raise RuntimeError(
                'Greedy policy revisits cell %s without reaching a terminal '
                'state; the Q-values may need more training.' % (cell,))
        visited.add(cell)
        shortest_path.append([current_row_index, current_column_index])

    return shortest_path



In [4]:
# Q-learning hyperparameters
epsilon = 0.1          # threshold passed to get_next_action for exploration
discount_factor = 0.9  # weight applied to the best Q-value of the next cell
learning_rate = 0.5    # fraction of the temporal difference applied per update
num_episodes = 1000


# Run `num_episodes` training episodes, each starting from `start_state`.
for episode in range(num_episodes):

    row_index, column_index = start_state
    # Keep acting until the agent stands on a terminal cell. The loop
    # condition alone ends the episode, so no explicit break is needed.
    while not is_terminal_state(row_index, column_index):
        action_index = get_next_action(row_index, column_index, epsilon)
        old_row_index, old_column_index = row_index, column_index
        row_index, column_index = get_next_location(row_index, column_index, action_index)

        # Q-learning update:
        #   Q(s, a) <- Q(s, a) + lr * (r + gamma * max_a' Q(s', a') - Q(s, a))
        reward = rewards[row_index, column_index]
        old_q_value = q_values[old_row_index, old_column_index, action_index]
        temporal_difference = reward + (discount_factor * np.max(q_values[row_index, column_index])) - old_q_value

        new_q_value = old_q_value + (learning_rate * temporal_difference)
        q_values[old_row_index, old_column_index, action_index] = new_q_value

print('Training complete!')
# Report the path from the configured start cell rather than a hard-coded one.
print('Shortest Path:', get_shortest_path(*start_state))


Training complete!
Shortest Path: [[2, 0], [1, 0], [0, 0], [0, 1], [0, 2]]
