# In [1]:
import numpy as np
import random

class MazeEnvironment:
    """Rectangular grid world with a fixed start and goal cell.

    The agent starts in the top-left cell ``(0, 0)`` and the goal is the
    bottom-right cell. Movement is only restricted by the grid borders; the
    ``maze`` array is allocated but not consulted by any method of this class.

    Attributes set by the constructor:
        maze_size: the ``(rows, cols)`` value passed in.
        maze: zero-filled array of shape ``maze_size``.
        start / goal: ``(row, col)`` tuples.
        current_position: the agent's current ``(row, col)``.
        actions: list of ``(d_row, d_col)`` moves.
        Q_values: zero-filled array of shape ``(rows, cols, len(actions))``.
    """

    def __init__(self, maze_size):
        n_rows = maze_size[0]
        n_cols = maze_size[1]

        self.maze_size = maze_size
        self.maze = np.zeros(maze_size)

        self.start = (0, 0)
        self.goal = (n_rows - 1, n_cols - 1)
        self.current_position = self.start

        # Moves are (row delta, column delta): Down, Up, Right, Left
        self.actions = [(1, 0), (-1, 0), (0, 1), (0, -1)]
        self.Q_values = np.zeros((n_rows, n_cols, len(self.actions)))

    def take_action(self, action):
        """Apply ``action`` to the current position if it stays on the grid.

        A move that would leave the grid is ignored and the position is left
        unchanged. Nothing is returned.
        """
        row = self.current_position[0] + action[0]
        col = self.current_position[1] + action[1]

        if not 0 <= row < self.maze_size[0]:
            return
        if not 0 <= col < self.maze_size[1]:
            return
        self.current_position = (row, col)

    def get_reward(self):
        """Return 10 when standing on the goal cell, otherwise -1."""
        return 10 if self.current_position == self.goal else -1

    def is_terminal(self):
        """Return True when the current position is the goal cell."""
        at_goal = self.current_position == self.goal
        return at_goal

    def reset(self):
        """Move the agent back to the start cell."""
        self.current_position = self.start

def q_learning(env, num_episodes=1000, alpha=0.1, gamma=0.9, epsilon=0.1):
    """Train ``env.Q_values`` in place with tabular epsilon-greedy Q-learning.

    Args:
        env: environment exposing ``reset()``, ``is_terminal()``,
            ``take_action(action)``, ``get_reward()``, ``current_position``,
            ``actions`` and a ``Q_values`` array indexed ``[row, col, action]``.
        num_episodes: number of episodes; each runs until the environment
            reports a terminal state.
        alpha: learning rate used for non-terminal transitions.
        gamma: discount applied to the best next-state value.
        epsilon: probability of choosing a random action instead of the
            greedy one.

    Returns:
        ``env.Q_values`` (the same array that was updated in place).
    """
    for _ in range(num_episodes):
        env.reset()
        while not env.is_terminal():
            state = env.current_position
            # View onto this state's action values; writes go to env.Q_values.
            state_q = env.Q_values[state[0], state[1]]

            explore = random.uniform(0, 1) < epsilon
            if explore:
                action = random.choice(env.actions)
            else:
                action = env.actions[np.argmax(state_q)]
            action_idx = env.actions.index(action)

            env.take_action(action)
            reward = env.get_reward()
            next_state = env.current_position

            if env.is_terminal():
                # Transition into the terminal state: the value is set to the
                # reward directly, without blending by alpha.
                state_q[action_idx] = reward
                continue

            best_next = np.max(env.Q_values[next_state[0], next_state[1]])
            old_q = state_q[action_idx]
            state_q[action_idx] = (1 - alpha) * old_q + alpha * (reward + gamma * best_next)
    return env.Q_values

# Define the maze size as (rows, columns)
maze_size = (5, 5)

# Create a maze environment
maze_env = MazeEnvironment(maze_size)

# Train the Q-learning agent (updates maze_env.Q_values in place)
trained_Q_values = q_learning(maze_env)

# Test the trained agent by following the greedy policy from the start cell.
# Both the policy (argmax) and the environment are deterministic, so a walk
# that needs more moves than there are cells must be revisiting a state and
# would never reach the goal. Cap the rollout there instead of looping forever.
max_steps = maze_size[0] * maze_size[1]
path = []
maze_env.reset()
while not maze_env.is_terminal() and len(path) < max_steps:
    current_state = maze_env.current_position
    action = maze_env.actions[np.argmax(trained_Q_values[current_state[0], current_state[1]])]
    maze_env.take_action(action)
    # The path records positions after each move, so the start cell is omitted.
    path.append(maze_env.current_position)

if not maze_env.is_terminal():
    print("Warning: greedy policy did not reach the goal within", max_steps, "steps")
print("Path taken by the agent:", path)


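# Sample output (training uses unseeded random exploration, so the exact path may differ between runs):
# Path taken by the agent: [(0, 1), (1, 1), (1, 2), (2, 2), (2, 3), (2, 4), (3, 4), (4, 4)]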
