In [5]:
import numpy as np
import random
import matplotlib.pyplot as plt

# Maze setup: 0 -> Free space, 1 -> Wall, 2 -> Goal
maze = np.array([
    [0, 1, 0, 0, 0],
    [0, 1, 0, 1, 0],
    [0, 0, 0, 1, 0],
    [0, 1, 1, 1, 0],
    [0, 0, 0, 0, 2]
])

In [6]:
# Hyperparameters
alpha = 0.1   # Learning rate
gamma = 0.9   # Discount factor
epsilon = 0.2 # Exploration factor
episodes = 1000

# Define actions
actions = ['up', 'down', 'left', 'right']

In [10]:

# Initialize Q-table with zeros
q_table = np.zeros((maze.shape[0], maze.shape[1], len(actions)))

# Helper functions
def get_next_position(row, col, action):
    if action == 'up':
        return max(row - 1, 0), col
    elif action == 'down':
        return min(row + 1, maze.shape[0] - 1), col
    elif action == 'left':
        return row, max(col - 1, 0)
    elif action == 'right':
        return row, min(col + 1, maze.shape[1] - 1)

def get_reward(next_row, next_col):
    if maze[next_row, next_col] == 2:  # Goal state
        return 100
    elif maze[next_row, next_col] == 1:  # Wall state
        return -100
    else:
        return -1  # Default step reward

def is_terminal_state(row, col):
    return maze[row, col] == 2

# Training
for episode in range(episodes):
    # Start at random position in the maze
    row, col = np.random.randint(maze.shape[0]), np.random.randint(maze.shape[1])

    # Continue exploring until the goal is reached
    while not is_terminal_state(row, col):
        # Choose action (epsilon-greedy)
        if random.uniform(0, 1) < epsilon:
            action_idx = np.random.randint(len(actions))
        else:
            action_idx = np.argmax(q_table[row, col])

        # Take action
        next_row, next_col = get_next_position(row, col, actions[action_idx])
        reward = get_reward(next_row, next_col)

        # Update Q-value
        best_next_action = np.argmax(q_table[next_row, next_col])
        q_table[row, col, action_idx] = q_table[row, col, action_idx] + \
            alpha * (reward + gamma * q_table[next_row, next_col, best_next_action] - q_table[row, col, action_idx])

        # Move to the next state
        row, col = next_row, next_col

# Print learned Q-table (optimal policy)
print("\nLearned Q-table:")
print(q_table)



Learned Q-table:
[[[ 13.84880063  41.75258214   8.67707306 -53.96356829]
  [-36.17881376 -19.009       -1.13619118  44.48653923]
  [ 12.15208261   5.90891542 -37.48966213  54.95021328]
  [ 21.51287747 -38.05800601  13.62446824  62.17095131]
  [ 33.47125577  70.18999986  24.0792001   44.59931717]]

 [[ 13.55907429  48.45814784  14.495431   -61.76163484]
  [-42.05063479   7.41408031  41.5022142   -0.80607188]
  [ 46.74331728   5.44735915 -40.20417662 -29.18061124]
  [ 19.47893093 -23.66596488   3.33948319  70.14262825]
  [ 54.87520373  79.1        -39.23806164  58.9968373 ]]

 [[ 34.93952342  54.95389997  31.65569687  18.22309538]
  [-38.12886902 -31.43063988  48.45261737   5.00990684]
  [ -1.8303419  -33.42738238  41.80923725 -33.78926647]
  [-21.69174785 -25.99546715   7.85602674  78.89405143]
  [ 64.98535457  89.         -38.9471248   70.87201919]]

 [[ 34.42619902  62.171       33.75718202 -36.94469715]
  [ 20.93625255  70.10973819  19.26088255 -19.93893536]
  [  9.56236658  79.0055

In [12]:

# Visualization of the optimal policy
policy = np.chararray(maze.shape, unicode=True)
policy[:] = ' '



In [13]:
for i in range(maze.shape[0]):
    for j in range(maze.shape[1]):
        if maze[i, j] == 1:
            policy[i, j] = '#'
        elif maze[i, j] == 2:
            policy[i, j] = 'G'
        else:
            best_action = np.argmax(q_table[i, j])
            policy[i, j] = actions[best_action][0].upper()

print("\nLearned Policy:")
print(policy)


Learned Policy:
[['D' '#' 'R' 'R' 'D']
 ['D' '#' 'U' '#' 'D']
 ['D' 'L' 'L' '#' 'D']
 ['D' '#' '#' '#' 'D']
 ['R' 'R' 'R' 'R' 'G']]
