# In [None]:
import numpy as np

# Q-learning model of a 12-location maze (locations A-L)


# Hyper-parameters of the temporal-difference (TD) update
gamma = 0.75  # discount factor applied to the best Q-value of the next state
alpha = 0.9  # learning rate: fraction of the TD error folded into Q


# Every location of the maze grid is a state. The ids are 0-based because
# they are used directly as row/column indices into Q and R below
# (e.g. the 1000 reward at R[6, 6] belongs to 'G').
states = {
    'A': 0,
    'B': 1,
    'C': 2,
    'D': 3,
    'E': 4,
    'F': 5,
    'G': 6,
    'H': 7,
    'I': 8,
    'J': 9,
    'K': 10,
    'L': 11,
}

# An action is "move to location j", so there is one action per state.
actions = list(range(len(states)))


# Q[s, a] accumulates the learned value of taking action a in state s;
# it starts at zero and is filled in by training.
Q = np.zeros((len(states), len(states)))

# R[s, a] is the reward for moving from state s (row) to state a (column).
# Only entries > 0 are treated as playable moves; the single 1000 entry sits
# at (G, G).
R = np.array([[0,1,0,0,0,0,0,0,0,0,0,0],
              [1,0,1,0,0,1,0,0,0,0,0,0],
              [0,1,0,0,0,0,1,0,0,0,0,0],
              [0,0,0,0,0,0,0,1,0,0,0,0],
              [0,0,0,0,0,0,0,0,1,0,0,0],
              [0,1,0,0,0,0,0,0,0,1,0,0],
              [0,0,1,0,0,0,1000,1,0,0,0,0],
              [0,0,0,1,0,0,1,0,0,0,0,1],
              [0,0,0,0,1,0,0,0,0,1,0,0],
              [0,0,0,0,0,1,0,0,1,0,1,0],
              [0,0,0,0,0,0,0,0,0,1,0,1],
              [0,0,0,0,0,0,0,1,0,0,1,0]])



# Learn the Q-values from 1000 randomly sampled one-step transitions
for i in range(1000):
    # every step starts from a uniformly random state
    current_state = np.random.randint(0, 12)
    # the moves available from here are the columns of R with a positive reward
    playable_actions = [j for j in range(12) if R[current_state, j] > 0]
    # explore: pick one of the available moves at random
    next_state = np.random.choice(playable_actions)

    # TD error = reward + discounted best Q-value of the next state
    #            - current estimate for this (state, action) pair
    best_next_q = Q[next_state].max()
    TD = R[current_state, next_state] + gamma * best_next_q - Q[current_state, next_state]
    # move the estimate towards the target by the learning rate
    Q[current_state, next_state] += alpha * TD

print("Q-values:")
print(Q.astype(int))




# In [None]:
# inference mode

# Reverse lookup of `states`: state id -> location name
locations = dict(zip(states.values(), states.keys()))


def optimal_path(start_location, end_location, *, q_table=None, state_ids=None):
    """Follow the greedy policy in the Q-table from one location to another.

    Starting at ``start_location``, repeatedly move to the location whose
    Q-value is largest in the current state's row until ``end_location`` is
    reached.

    Args:
        start_location: Name of the location to start from (a key of the
            state mapping).
        end_location: Name of the location to reach.
        q_table: Square array of Q-values indexed ``[state, action]``.
            Defaults to the module-level ``Q``.
        state_ids: Mapping of location name to integer state id. Defaults to
            the module-level ``states``.

    Returns:
        The names of the locations visited after ``start_location``, in
        order, ending with ``end_location``. Empty when both are the same.

    Raises:
        KeyError: If either location is not a key of the state mapping.
        ValueError: If the greedy policy revisits a location before reaching
            ``end_location`` (following it further would loop forever).
    """
    if q_table is None:
        q_table = Q
    if state_ids is None:
        state_ids = states

    # Rows/columns of the Q-table are 0-based. Normalise the state ids by
    # their smallest value so the lookup is right whether the mapping is
    # numbered from 0 or from 1.
    offset = min(state_ids.values(), default=0)
    index_of = {name: state_id - offset for name, state_id in state_ids.items()}
    name_of = {index: name for name, index in index_of.items()}

    current = index_of[start_location]
    target = index_of[end_location]

    path = []
    visited = {current}
    while current != target:
        # greedy step: take the action with the highest learned value
        best = int(np.argmax(q_table[current]))
        if best in visited:
            # A greedy policy is deterministic, so a repeated state means the
            # walk can never reach the target.
            raise ValueError(
                f"no greedy path from {start_location!r} to {end_location!r}: "
                f"policy loops back to {name_of[best]!r}"
            )
        visited.add(best)
        # append (not +=) so multi-character names stay one list element
        path.append(name_of[best])
        current = best

    return path
    


# Ask for the route from E to G and show the result
path = optimal_path(start_location="E", end_location="G")
print(path)
    
    