In [2]:
import numpy as np
import random
import matplotlib.pyplot as plt

class MazeEnv:
    """Rectangular grid world with a fixed set of blocked cells.

    Positions are ``(x, y)`` pairs. Actions are encoded as
    0 = up (y - 1), 1 = right (x + 1), 2 = down (y + 1), 3 = left (x - 1).
    """

    def __init__(self, width=10, height=10, start=(0, 0), goal=(9, 9), obstacles=None):
        """Store the grid geometry and place the agent on ``start``.

        Args:
            width: Number of columns.
            height: Number of rows.
            start: ``(x, y)`` cell the agent is placed on by ``reset``.
            goal: ``(x, y)`` cell that ends the episode.
            obstacles: Collection of blocked ``(x, y)`` cells; ``None`` means
                no obstacles.
        """
        self.width, self.height = width, height
        self.start = self.current_position = start
        self.goal = goal
        self.obstacles = [] if obstacles is None else obstacles

    def reset(self):
        """Move the agent back to the start cell and return that cell."""
        self.current_position = self.start
        return self.current_position

    def step(self, action):
        """Apply ``action`` and return ``(position, reward, done)``.

        A move that would leave the grid or enter an obstacle leaves the
        agent where it is. The reward is 100 when the agent stands on the
        goal after the move and -1 otherwise.
        """
        x, y = self.current_position
        # Offsets indexed by action code: up, right, down, left.
        offsets = ((0, -1), (1, 0), (0, 1), (-1, 0))
        for code, (dx, dy) in enumerate(offsets):
            if action == code:
                x, y = x + dx, y + dy
                break

        target = (x, y)
        on_grid = 0 <= x < self.width and 0 <= y < self.height
        if on_grid and target not in self.obstacles:
            self.current_position = target

        if self.current_position == self.goal:
            return self.current_position, 100, True
        return self.current_position, -1, False

    def render(self):
        """Show the grid with matplotlib.

        Obstacles are drawn as -1, the goal as 0.5 and the agent as 1;
        every other cell is 0.
        """
        grid = np.zeros((self.height, self.width))
        for cell in self.obstacles:
            grid[cell[1], cell[0]] = -1
        grid[self.goal[1], self.goal[0]] = 0.5
        grid[self.current_position[1], self.current_position[0]] = 1
        plt.imshow(grid)
        plt.show()

def q_learning(env, episodes=500, alpha=0.1, gamma=0.99, epsilon=0.1):
    """Run tabular Q-learning on ``env`` with an epsilon-greedy policy.

    Args:
        env: Environment exposing ``width``, ``height``, ``reset()`` returning
            an ``(x, y)`` state, and ``step(action)`` returning
            ``(next_state, reward, done)``.
        episodes: Number of episodes to run.
        alpha: Learning rate.
        gamma: Discount factor.
        epsilon: Probability of taking a uniformly random action.

    Returns:
        Array of shape ``(env.height, env.width, 4)`` holding the learned
        action values, indexed as ``[y, x, action]``.
    """
    q_table = np.zeros((env.height, env.width, 4))

    for _ in range(episodes):
        state = env.reset()
        finished = False
        while not finished:
            row, col = state[1], state[0]

            explore = random.uniform(0, 1) < epsilon
            if explore:
                action = random.choice([0, 1, 2, 3])
            else:
                action = np.argmax(q_table[row, col])

            next_state, reward, finished = env.step(action)

            previous = q_table[row, col, action]
            best_next = np.max(q_table[next_state[1], next_state[0]])
            # Blend the old estimate with the one-step bootstrapped target.
            q_table[row, col, action] = (
                (1 - alpha) * previous + alpha * (reward + gamma * best_next)
            )

            state = next_state

    return q_table

# Demo: train on a 10x10 grid with three obstacles on the diagonal and
# print the learned Q-table (shape: height x width x 4 actions).
if __name__ == "__main__":
    obstacles = [(1, 1), (2, 2), (3, 3)]
    env = MazeEnv(width=10, height=10, start=(0, 0), goal=(9, 9), obstacles=obstacles)
    # Uses q_learning's defaults: 500 episodes, alpha=0.1, gamma=0.99, epsilon=0.1.
    q_table = q_learning(env)
    print(q_table)
    print("Q-Learning completed.")


[[[ 3.90251828e+01  6.85884161e+01  1.14233630e+00  1.46486740e+01]
  [ 4.49134695e+01  7.02914859e+01  1.39407148e+01  3.10686781e+01]
  [ 1.83563426e+01  7.20116491e+01  3.90689374e-01  4.59735714e+01]
  [ 2.14644782e+01  7.37491551e+01  1.54248198e+01  4.88479437e+01]
  [ 3.83787901e+01  1.15518943e+00  7.55042028e+01  3.20898970e+01]
  [-3.15008834e+00 -3.17983619e+00  2.32352744e+01 -3.23555157e+00]
  [-2.75115378e+00 -2.72092491e+00  1.99741086e+00 -2.70497108e+00]
  [-2.36198355e+00 -2.35832290e+00 -2.33694648e+00 -2.30457284e+00]
  [-2.07417514e+00 -2.08133187e+00 -2.01688352e+00 -2.01783863e+00]
  [-1.97139021e+00 -1.98111352e+00 -1.99329691e+00 -1.93833626e+00]]

 [[ 2.79494143e+01 -6.35081395e+00 -6.27073080e+00 -6.27442960e+00]
  [ 0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00]
  [ 2.79059657e+00  1.66530790e+01 -4.74915299e+00 -4.77039628e+00]
  [-4.06173113e+00  5.99850798e+01 -4.06482455e+00 -4.23832954e+00]
  [ 5.13395173e+01  2.26827293e+01  7.72769740

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import random

class ComplexMazeEnv:
    """Square grid maze with randomly grown walls.

    Positions are ``(row, col)`` tuples and ``maze`` is a boolean array in
    which ``True`` marks a wall. Actions are encoded as 0 = up, 1 = right,
    2 = down, 3 = left. The agent may only occupy cells with both
    coordinates in ``[0, size)``.
    """

    def __init__(self, size=10, start=(0, 0), goal=None, complexity=0.3, density=0.3):
        """Generate a maze and place the agent on ``start``.

        Args:
            size: Side length of the area the agent can move in.
            start: ``(row, col)`` cell the agent is placed on by ``reset``.
            goal: ``(row, col)`` terminal cell; defaults to
                ``(size - 1, size - 1)``.
            complexity: Scales how many growth steps each wall seed gets.
            density: Scales how many wall seeds are placed.

        Raises:
            ValueError: If ``start`` or ``goal`` lies outside the grid.
        """
        self.size = size
        self.start = start
        self.complexity = complexity
        self.density = density
        self.maze = self._generate_maze()
        self.goal = goal if goal else (size - 1, size - 1)
        # The generator always walls the outer border, so the default corner
        # start (and the default goal for odd sizes) would sit inside a wall
        # with no open neighbour. Open both endpoints so episodes can end.
        self._open_cell(self.start)
        self._open_cell(self.goal)
        self.current_position = start

    def _generate_maze(self):
        """Return a boolean wall array with an odd side length >= ``size``.

        The outer border is walled. Walls then grow from randomly chosen
        seeds in two-cell strides, so every wall cell has at least one even
        coordinate and cells with two odd coordinates are never walled.
        """
        shape = ((self.size // 2) * 2 + 1, (self.size // 2) * 2 + 1)
        complexity = int(self.complexity * (5 * (shape[0] + shape[1])))
        density = int(self.density * ((shape[0] // 2) * (shape[1] // 2)))
        maze = np.zeros(shape, dtype=bool)

        maze[0, :] = maze[-1, :] = maze[:, 0] = maze[:, -1] = 1
        for _ in range(density):
            # Seeds always land on even/even coordinates.
            x, y = random.randint(0, shape[1] // 2) * 2, random.randint(0, shape[0] // 2) * 2
            maze[y, x] = 1
            for _ in range(complexity):
                neighbours = []
                if x > 1:
                    neighbours.append((y, x - 2))
                if x < shape[1] - 2:
                    neighbours.append((y, x + 2))
                if y > 1:
                    neighbours.append((y - 2, x))
                if y < shape[0] - 2:
                    neighbours.append((y + 2, x))
                if neighbours:
                    y_, x_ = neighbours[random.randint(0, len(neighbours) - 1)]
                    if maze[y_, x_] == 0:
                        # Wall the new node and the cell between it and the
                        # current node, then continue growing from there.
                        maze[y_, x_] = 1
                        maze[y_ + (y - y_) // 2, x_ + (x - x_) // 2] = 1
                        x, y = x_, y_
        return maze

    def _open_cell(self, cell):
        """Clear ``cell`` if it is a wall and link it to an odd/odd cell.

        Cells with two odd coordinates are never walled by the generator, so
        clearing ``cell`` plus the cell between it and the adjacent odd/odd
        cell gives the agent a way in and out.

        Raises:
            ValueError: If ``cell`` lies outside the ``size`` x ``size`` grid.
        """
        row, col = cell
        if not (0 <= row < self.size and 0 <= col < self.size):
            raise ValueError(
                f"cell {cell!r} is outside the {self.size}x{self.size} grid"
            )
        if not self.maze[row, col]:
            return
        # Step an even coordinate to the adjacent odd one, staying in-grid.
        anchor_row = row if row % 2 else (row - 1 if row > 0 else min(1, self.size - 1))
        self.maze[row, col] = False
        # (anchor_row, col) is the only cell between `cell` and the odd/odd
        # anchor; the anchor itself is already open.
        self.maze[anchor_row, col] = False

    def reset(self):
        """Move the agent back to the start cell and return that cell."""
        self.current_position = self.start
        return self.current_position

    def step(self, action):
        """Apply ``action`` and return ``(position, reward, done)``.

        A move into a wall or off the grid leaves the agent where it is.
        The reward is 10 when the agent stands on the goal after the move
        and -0.1 otherwise.
        """
        y, x = self.current_position
        if action == 0 and y > 0 and not self.maze[y - 1, x]:  # Up
            self.current_position = (y - 1, x)
        elif action == 1 and x < self.size - 1 and not self.maze[y, x + 1]:  # Right
            self.current_position = (y, x + 1)
        elif action == 2 and y < self.size - 1 and not self.maze[y + 1, x]:  # Down
            self.current_position = (y + 1, x)
        elif action == 3 and x > 0 and not self.maze[y, x - 1]:  # Left
            self.current_position = (y, x - 1)

        reward = -0.1
        done = False
        if self.current_position == self.goal:
            reward = 10
            done = True

        return self.current_position, reward, done

    def render(self):
        """Plot the walls with start (green), goal (red) and agent (blue)."""
        plt.figure(figsize=(5, 5))
        plt.imshow(self.maze, cmap='binary')
        y, x = self.start
        gy, gx = self.goal
        plt.scatter([x, gx], [y, gy], color=['green', 'red'])  # start and goal
        cy, cx = self.current_position
        plt.scatter(cx, cy, color='blue')  # current position
        plt.show()

def q_learning(env, episodes=1000, alpha=0.1, gamma=0.99, epsilon=0.1):
    """Train a tabular Q-learning agent on ``env`` with epsilon-greedy actions.

    Args:
        env: Environment exposing ``size``, ``reset()`` returning a
            ``(row, col)`` state, and ``step(action)`` returning
            ``(next_state, reward, done)``.
        episodes: Number of episodes to run.
        alpha: Learning rate.
        gamma: Discount factor.
        epsilon: Probability of taking a uniformly random action.

    Returns:
        Array of shape ``(env.size, env.size, 4)`` holding the learned action
        values, indexed as ``[row, col, action]``.
    """
    q_table = np.random.uniform(low=-1, high=1, size=(env.size, env.size, 4))
    for episode in range(episodes):
        state = env.reset()
        done = False
        while not done:
            if random.uniform(0, 1) < epsilon:
                action = random.randint(0, 3)
            else:
                action = np.argmax(q_table[state[0], state[1]])

            next_state, reward, done = env.step(action)
            old_value = q_table[state[0], state[1], action]

            if done:
                # No future return after a terminal transition. The terminal
                # row keeps its random initial values (it is never updated),
                # so bootstrapping from it would add noise to the target.
                target = reward
            else:
                next_max = np.max(q_table[next_state[0], next_state[1]])
                target = reward + gamma * next_max

            q_table[state[0], state[1], action] = (1 - alpha) * old_value + alpha * target
            state = next_state

        if (episode + 1) % 100 == 0:
            print(f"Episode {episode + 1}/{episodes} completed")

    return q_table

# Demo: build a randomly generated 15x15 maze and train on it for 1000 episodes.
if __name__ == "__main__":
    env = ComplexMazeEnv(size=15, complexity=0.2, density=0.3)
    # q_learning prints a progress line every 100 episodes.
    q_table = q_learning(env, episodes=1000)
    print("Complex Q-Learning with custom maze environment completed.")


In [1]:
import numpy as np
import random

# --- Custom Maze Environment ---

class MazeEnv:
    """Square grid maze whose states are flat integer indices.

    Cells are ``(row, col)`` tuples; ``maze`` holds 0 for free cells and 1
    for walls. Actions are encoded as 0 = up, 1 = down, 2 = left, 3 = right.
    """

    def __init__(self, size=5):
        """Build a ``size`` x ``size`` maze with the agent on the start cell."""
        self.size = size
        self.maze = np.zeros((size, size))
        self.generate_maze()
        self.start = (0, 0)
        self.goal = (size - 1, size - 1)
        # Set here as well so step() works even if reset() was never called.
        self.agent_pos = self.start

    def generate_maze(self):
        """Mark the example obstacles: rows 1-2 of column 2 become walls.

        NOTE: indexing column 2 requires ``size >= 3``, and with
        ``size == 3`` the goal cell ``(2, 2)`` is itself walled off.
        """
        self.maze[1:3, 2] = 1

    def reset(self):
        """Move the agent back to the start cell and return its state index."""
        self.agent_pos = self.start
        return self.state_to_index(self.agent_pos)

    def step(self, action):
        """Apply ``action`` and return ``(state_index, reward, done)``.

        A valid move costs -1, or pays 100 and ends the episode when it
        reaches the goal. An invalid move (off the grid or into a wall)
        costs -5 and leaves the agent in place. The returned state index
        always describes the cell the agent actually occupies.
        """
        new_row, new_col = self.agent_pos

        if action == 0:  # Up
            new_row -= 1
        elif action == 1:  # Down
            new_row += 1
        elif action == 2:  # Left
            new_col -= 1
        elif action == 3:  # Right
            new_col += 1

        new_pos = (new_row, new_col)

        if self.is_valid_move(new_pos):
            self.agent_pos = new_pos
            if new_pos == self.goal:
                reward = 100
                done = True
            else:
                reward = -1
                done = False
        else:
            reward = -5  # Penalty for invalid move
            done = False

        # Report the agent's real cell, not the rejected target: indexing the
        # rejected cell yields negative or goal-aliased indices that
        # desynchronise the learner's state from the environment.
        return self.state_to_index(self.agent_pos), reward, done

    def is_valid_move(self, pos):
        """Return True when ``pos`` is inside the grid and not a wall."""
        row, col = pos
        return (0 <= row < self.size and 0 <= col < self.size and
                self.maze[row, col] == 0)

    def state_to_index(self, state):
        """Flatten a ``(row, col)`` cell into a row-major index.

        The result is capped at ``size * size - 1``; the cap has no effect
        for cells inside the grid.
        """
        row, col = state
        return min(row * self.size + col, self.size * self.size - 1)

# --- Q-Learning Implementation ---

class QLearningAgent:
    """Tabular Q-learning agent over flat integer state indices."""

    def __init__(self, env, learning_rate=0.8, discount_factor=0.95):
        """Create a zero-initialised Q-table sized for ``env``.

        Args:
            env: Environment exposing ``size``; the table gets one row per
                cell of a ``size`` x ``size`` grid.
            learning_rate: Weight given to each new target estimate.
            discount_factor: Discount applied to the best next-state value.
        """
        self.env = env
        self.lr = learning_rate
        self.gamma = discount_factor
        self.actions = [0, 1, 2, 3]  # Up, Down, Left, Right
        n_states = env.size * env.size
        self.q_table = np.zeros((n_states, len(self.actions)))

    def choose_action(self, state, epsilon=0.1):
        """Pick an action for ``state`` using an epsilon-greedy rule.

        With probability ``epsilon`` a random action is returned; otherwise
        the action with the highest Q-value for ``state``.
        """
        if random.random() < epsilon:
            return random.choice(self.actions)
        return np.argmax(self.q_table[state])

    def learn(self, state, action, reward, next_state):
        """Update ``Q[state, action]`` from one observed transition."""
        best_next = np.max(self.q_table[next_state])
        target = reward + self.gamma * best_next
        previous = self.q_table[state, action]
        self.q_table[state, action] = (1 - self.lr) * previous + self.lr * target

# --- Training ---

# Train a QLearningAgent on the default 5x5 MazeEnv for 5000 episodes.
if __name__ == "__main__":
    env = MazeEnv()
    agent = QLearningAgent(env)
    episodes = 5000

    for episode in range(episodes):
        state = env.reset()
        done = False

        # Run one episode: act, observe the transition, update the Q-table.
        while not done:
            action = agent.choose_action(state)
            next_state, reward, done = env.step(action)
            agent.learn(state, action, reward, next_state)
            state = next_state

    print("Training complete!")


Training complete!
