In [5]:
import numpy as np
import random
import matplotlib.pyplot as plt

# Environment parameters
num_states = 6
num_actions = 6
gamma = 0.85  # Discount factor
alpha = 0.1   # Learning rate
epsilon = 0.1 # Exploration rate
num_episodes = 1000

# Initialize Q-table with zeros
Q = np.zeros((num_states, num_actions))

# Define the reward matrix (based on the provided matrix)
R = np.array([
    [-1, -1, -1, -1, 0, -1],
    [-1, -1, -1, 0, -1, 100],
    [-1, -1, -1, 0, -1, -1],
    [-1, 0, 0, -1, 0, -1],
    [0, -1, -1, 0, -1, 100],
    [-1, 0, -1, -1, 0, 100]
])

# Define the transition probabilities (uniform in this case)
def get_possible_actions(state):
    return [a for a in range(num_actions) if R[state, a] != -1]

# Q-Learning algorithm
for episode in range(num_episodes):
    state = random.randint(0, num_states - 1)  # Start from a random state
    done = False

    while not done:
        if random.uniform(0, 1) < epsilon:  # Exploration
            action = random.choice(get_possible_actions(state))
        else:  # Exploitation
            action = np.argmax(Q[state])

        # Take the action and observe the reward
        next_state = action  # In a simple grid world, the action can be assumed to be the next state
        reward = R[state, action]

        # Update Q-table
        best_next_action = np.argmax(Q[next_state])
        Q[state, action] = Q[state, action] + alpha * (reward + gamma * Q[next_state, best_next_action] - Q[state, action])

        # Transition to the next state
        state = next_state

        # End episode condition (in a simple case, you might want to define a condition to end the episode)
        done = True

print("Trained Q-table:")
print(Q)
# Policy derived from Q-table



Trained Q-table:
[[-1.00000000e-01 -1.00000000e-01 -1.00000000e-01 -1.00000000e-01
   4.85875141e+02  0.00000000e+00]
 [-1.00000000e-01 -1.00000000e-01 -1.00000000e-01  1.61402261e+02
   0.00000000e+00  5.91725940e+02]
 [-1.00000000e-01 -1.00000000e-01 -1.00000000e-01  4.05057820e+02
   0.00000000e+00  0.00000000e+00]
 [-1.00000000e-01  4.89655023e+02  1.57826291e+02  0.00000000e+00
   8.37340502e+01  0.00000000e+00]
 [ 1.85416361e+02  0.00000000e+00  0.00000000e+00  1.55864507e+02
   0.00000000e+00  5.89142641e+02]
 [-1.00000000e-01  1.39092825e+02  0.00000000e+00  0.00000000e+00
   1.12403051e+02  5.89895365e+02]]


In [6]:
def find_path(start_state, end_state):
    state = start_state
    path = [state]
    while state != end_state:
        action = np.argmax(Q[state])
        path.append(action)
        state = action
        if state == end_state:
            break
    return path

# Example: Find the path from state 0 to state 5
start_state = 2
end_state = 5
path = find_path(start_state, end_state)
print(f"Path from state {start_state} to state {end_state}: {path}")

Path from state 2 to state 5: [2, 3, 1, 5]
