### Closed Maze

In this notebook, we have trained a reinforcement learning model for a closed maze. The closed maze is imagined like a sphere. For example, if we reach end of first row of the maze and we are moving up, the agent should reach end of the bottom row. A fun way of imagining a maze and seeing whether the agent learns this structure.

In [2]:
import pygame
import numpy as np
import sys
import matplotlib.pyplot as plt

pygame 2.4.0 (SDL 2.26.4, Python 3.8.8)
Hello from the pygame community. https://www.pygame.org/contribute.html


In [3]:
def create_maze(dim, num_of_walls):
    # Define the maze
    maze = np.zeros((dim,dim))

    #Select random walls
    for i in range(num_of_walls):
        a,b = np.random.random_integers(0,dim-1,2)
        maze[a,b] = 1

    # Define the goal state
    maze[dim-1, dim-1] = 2
    goal_state = (dim-1, dim-1)
    return maze, goal_state

In [4]:
# Define the actions and their corresponding changes in position
actions = ['up', 'down', 'left', 'right']
action_deltas = [(-1, 0), (1, 0), (0, -1), (0, 1)]

In [5]:
# Define the rewards for each state
rewards = {
    0: -1,   # Maze cell
    1: -2,   # Wall
    2: 10    # Goal
}

In [6]:
# Define a function to choose an action using the epsilon-greedy policy
def choose_action(state, epsilon):
    if np.random.random() < epsilon:
        # Choose a random action
        return actions[np.random.randint(0, len(actions))]
    else:
        # Choose the action with the highest Q-value
        return actions[np.argmax(Q[state[0], state[1]])]

In [7]:
# Define a function to update the Q-value using SARSA
def update_q_value(state, action, reward, next_state, next_action, alpha, gamma):
    # Calculate the TD error
    td_error = reward + gamma * Q[next_state[0], next_state[1], actions.index(next_action)] - Q[state[0], state[1], actions.index(action)]
    
    # Update the Q-value for the current state and action
    Q[state[0], state[1], actions.index(action)] += alpha * td_error

In [8]:
#Define the colors to use
BLACK = (0, 0, 0)
WHITE = (255, 255, 255)
RED = (255, 0, 0)
GREEN = (0, 255, 0)
BLUE = (0, 0, 255)
YELLOW = (255,255,0)

In [9]:
# Define a function to simulate an episode for a closed maze
def run_episode_closed_maze(alpha, gamma, epsilon, training=True):
    # Choose a random starting state that is not the goal state
    state = (np.random.randint(0, maze.shape[0]), np.random.randint(0, maze.shape[1]))

    while state == goal_state:
        state = (np.random.randint(0, maze.shape[0]), np.random.randint(0, maze.shape[1]))
    
    init = state
    
    if training == False:
        # Clear the screen
        screen.fill(WHITE)

        # Draw the maze and the goal
        for i in range(maze.shape[0]):
            for j in range(maze.shape[1]):
                rect = pygame.Rect(j*cell_size, i*cell_size, cell_size, cell_size)
                if i == init[0] and j == init[1]: #Start cell
                    pygame.draw.rect(screen, YELLOW, rect)
                    pygame.draw.circle(screen, RED, (state[1]*cell_size+25,state[0]*cell_size+25), 5)
                elif maze[i,j] == 0:  # Maze cell
                    pygame.draw.rect(screen, BLACK, rect)
                elif maze[i,j] == 1:  #Wall
                    pygame.draw.rect(screen, BLUE, rect)
                elif maze[i,j] == 2: #Goal
                    pygame.draw.rect(screen, GREEN, rect)
    
    # Choose the first action using an epsilon-greedy policy
    action = choose_action(state, epsilon)
    
    # Initialize the total reward
    total_reward = 0
    
    # Repeat until the goal state is reached
    while state != goal_state:
        # Move to the next state
        delta = action_deltas[actions.index(action)]
        next_state = (state[0]+delta[0], state[1]+delta[1])
        
        # If the next state is outside the maze, set it to the closed loop state
        if next_state[0] < 0:
            print('Before looping-',next_state)
            x = dim-1
            y = next_state[1]
            next_state = (x,y)
            print('After looping-',next_state)
        elif next_state[0] >= maze.shape[0]:
            print('Before looping-',next_state)
            x = 0
            y = next_state[1]
            next_state = (x,y)
            print('After looping-',next_state)
        elif next_state[1] < 0:
            print('Before looping-',next_state)
            x = next_state[0]
            y = dim-1
            next_state = (x,y)
            print('After looping-',next_state)
        elif next_state[1] >= maze.shape[1]:
            print('Before looping-',next_state)
            x = next_state[0]
            y = 0
            next_state = (x,y)
            print('After looping-',next_state)
        
        # Get the reward for the next state
        reward = rewards[maze[next_state[0], next_state[1]]]
    
        # Update the Q-value using SARSA
        next_action = choose_action(next_state, epsilon)
        if training == True:
            update_q_value(state, action, reward, next_state, next_action, alpha, gamma)

        # Update the total reward and current state and action
        total_reward += reward
        state = next_state
        action = next_action
        
        if training == False:
            # Draw the current state of the agent
            for i in range(maze.shape[0]):
                for j in range(maze.shape[1]):
                    if i == state[0] and j == state[1]: #Next cell
                        pygame.draw.circle(screen, RED, (state[1]*cell_size+25,state[0]*cell_size+25), 5)

    return total_reward

In [10]:
# Train the agent for the specified number of episodes
alpha = 0.5
gamma = 0.9
epsilon = 0.2

#Create maze
dim=10
maze, goal_state = create_maze(dim,15)

# Initialize the Q-matrix with zeros
Q = np.zeros((maze.shape[0], maze.shape[1], 4))

for i in range(100):
    total_reward = run_episode_closed_maze(alpha, gamma, epsilon)
    
    # Print the total reward for the episode
    print(f"Episode {i+1}: Total Reward = {total_reward}")

Before looping- (-1, 0)
After looping- (9, 0)
Before looping- (10, 0)
After looping- (0, 0)
Before looping- (-1, 1)
After looping- (9, 1)
Before looping- (-1, 2)
After looping- (9, 2)
Before looping- (-1, 3)
After looping- (9, 3)
Before looping- (10, 2)
After looping- (0, 2)
Before looping- (10, 1)
After looping- (0, 1)
Before looping- (4, -1)
After looping- (4, 9)
Before looping- (2, 10)
After looping- (2, 0)
Before looping- (2, -1)
After looping- (2, 9)
Before looping- (-1, 9)
After looping- (9, 9)
Episode 1: Total Reward = -73
Before looping- (-1, 6)
After looping- (9, 6)
Before looping- (-1, 7)
After looping- (9, 7)
Before looping- (10, 7)
After looping- (0, 7)
Before looping- (-1, 8)
After looping- (9, 8)
Before looping- (-1, 5)
After looping- (9, 5)
Before looping- (10, 5)
After looping- (0, 5)
Before looping- (-1, 4)
After looping- (9, 4)
Before looping- (10, 3)
After looping- (0, 3)
Before looping- (8, -1)
After looping- (8, 9)
Before looping- (6, 10)
After looping- (6, 0)
Befo

  a,b = np.random.random_integers(0,dim-1,2)


In [11]:
#Initialize Pygame
pygame.init()

#Define the size of each cell in the maze and the size of the window
cell_size = 50
window_size = (maze.shape[1]*cell_size, maze.shape[0]*cell_size)

#Create the window
screen = pygame.display.set_mode(window_size)
pygame.display.set_caption("Maze")
clock = pygame.time.Clock()

# Test the agent for the specified number of episodes
for i in range(10):
    total_reward = run_episode_closed_maze(alpha, gamma, epsilon, training=False)
    
    # Print the total reward for the episode
    print(f"Episode {i+1}: Total Reward = {total_reward}")
    
    # Update the Pygame window
    pygame.display.flip()
    
    # Pause briefly to allow the user to see the visualization
    pygame.time.wait(3000)

# Wait for the user to close the Pygame window
while True:
    for event in pygame.event.get():
        if event.type == pygame.QUIT:
            pygame.quit()
            sys.exit()

Episode 1: Total Reward = 3
Episode 2: Total Reward = 9
Episode 3: Total Reward = 0
Episode 4: Total Reward = 7
Episode 5: Total Reward = 8
Before looping- (6, -1)
After looping- (6, 9)
Episode 6: Total Reward = 2
Before looping- (6, -1)
After looping- (6, 9)
Episode 7: Total Reward = -10
Before looping- (-1, 5)
After looping- (9, 5)
Episode 8: Total Reward = -28
Before looping- (-1, 8)
After looping- (9, 8)
Episode 9: Total Reward = -36
Before looping- (6, -1)
After looping- (6, 9)
Episode 10: Total Reward = 3


SystemExit: 

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
