In [1]:
import pygame
import numpy as np
import sys

pygame 2.4.0 (SDL 2.26.4, Python 3.8.8)
Hello from the pygame community. https://www.pygame.org/contribute.html


In [2]:
# Define the maze and goal state
maze = np.array([
    [0, 0, 0, 0, 0],
    [0, 1, 0, 1, 0],
    [0, 0, 0, 1, 0],
    [0, 1, 0, 1, 0],
    [0, 0, 0, 0, 2]
])
goal_state = (4, 4)

In [3]:
# Define the actions and their corresponding changes in position
actions = ['up', 'down', 'left', 'right']
action_deltas = [(-1, 0), (1, 0), (0, -1), (0, 1)]

In [4]:
# Define the number of episodes to train the agent
num_episodes = 1000

# Define the learning rate and discount factor
alpha = 0.5
gamma = 0.9

# Define the epsilon value for the epsilon-greedy policy
epsilon = 0.1

In [5]:
# Initialize the Q-matrix with zeros
Q = np.zeros((maze.shape[0], maze.shape[1], 4))

# Define the rewards for each state
rewards = {
    0: -1,   # Maze cell
    1: -1,   # Wall
    2: 10    # Goal
}

In [6]:
# Define a function to choose an action using the epsilon-greedy policy
def choose_action(state):
    if np.random.random() < epsilon:
        # Choose a random action
        return actions[np.random.randint(0, len(actions))]
    else:
        # Choose the action with the highest Q-value
        return actions[np.argmax(Q[state[0], state[1]])]

In [7]:
# Define a function to update the Q-value using SARSA
def update_q_value(state, action, reward, next_state, next_action):
    # Calculate the TD error
    td_error = reward + gamma * Q[next_state[0], next_state[1], actions.index(next_action)] - Q[state[0], state[1], actions.index(action)]
    
    # Update the Q-value for the current state and action
    Q[state[0], state[1], actions.index(action)] += alpha * td_error

In [8]:
#Define the colors to use
BLACK = (0, 0, 0)
WHITE = (255, 255, 255)
RED = (255, 0, 0)
GREEN = (0, 255, 0)
BLUE = (0, 0, 255)
YELLOW = (255,255,0)

In [9]:
# Define a function to simulate an episode
def run_episode(training=True):
    # Choose a random starting state that is not the goal state
    state = (np.random.randint(0, maze.shape[0]), np.random.randint(0, maze.shape[1]))

    while state == goal_state:
        state = (np.random.randint(0, maze.shape[0]), np.random.randint(0, maze.shape[1]))
    
    init = state
    
    # Clear the screen
    screen.fill(WHITE)
    
    # Draw the maze and the goal
    for i in range(maze.shape[0]):
        for j in range(maze.shape[1]):
            rect = pygame.Rect(j*cell_size, i*cell_size, cell_size, cell_size)
            if i == init[0] and j == init[1]: #Start cell
                pygame.draw.rect(screen, YELLOW, rect)
                pygame.draw.circle(screen, RED, (state[1]*cell_size+25,state[0]*cell_size+25), 5)
            elif maze[i,j] == 0:  # Maze cell
                pygame.draw.rect(screen, BLACK, rect)
            elif maze[i,j] == 1:  #Wall
                pygame.draw.rect(screen, BLUE, rect)
            elif maze[i,j] == 2: #Goal
                pygame.draw.rect(screen, GREEN, rect)
    
    # Choose the first action using an epsilon-greedy policy
    action = choose_action(state)
    
    # Initialize the total reward
    total_reward = 0
    
    # Repeat until the goal state is reached
    while state != goal_state:
        # Move to the next state
        delta = action_deltas[actions.index(action)]
        next_state = (state[0]+delta[0], state[1]+delta[1])
        
        # If the next state is outside the maze, set it to the current state
        if next_state[0] < 0 or next_state[0] >= maze.shape[0] or next_state[1] < 0 or next_state[1] >= maze.shape[1]:
            next_state = state
        
        # Get the reward for the next state
        reward = rewards[maze[next_state[0], next_state[1]]]
    
        # Update the Q-value using SARSA
        next_action = choose_action(next_state)
        if training == True:
            update_q_value(state, action, reward, next_state, next_action)

        # Update the total reward and current state and action
        total_reward += reward
        state = next_state
        action = next_action

        # Draw the current state of the agent
        for i in range(maze.shape[0]):
            for j in range(maze.shape[1]):
                if i == state[0] and j == state[1]: #Next cell
                    pygame.draw.circle(screen, RED, (state[1]*cell_size+25,state[0]*cell_size+25), 5)

    return total_reward

In [10]:
#Initialize Pygame
pygame.init()

#Define the size of each cell in the maze and the size of the window
cell_size = 50
window_size = (maze.shape[1]*cell_size, maze.shape[0]*cell_size)

#Create the window
screen = pygame.display.set_mode(window_size)
pygame.display.set_caption("Maze")
clock = pygame.time.Clock()

# Train the agent for the specified number of episodes
for i in range(30):
    total_reward = run_episode()
    
    # Print the total reward for the episode
    print(f"Episode {i+1}: Total Reward = {total_reward}")
    
    # Update the Pygame window
    pygame.display.flip()
    
    # Pause briefly to allow the user to see the visualization
    #pygame.time.wait(3000)

# Wait for the user to close the Pygame window
while True:
    for event in pygame.event.get():
        if event.type == pygame.QUIT:
            pygame.quit()
            sys.exit()

Episode 1: Total Reward = -172
Episode 2: Total Reward = -2
Episode 3: Total Reward = 2
Episode 4: Total Reward = -5
Episode 5: Total Reward = -17
Episode 6: Total Reward = -47
Episode 7: Total Reward = 7
Episode 8: Total Reward = 0
Episode 9: Total Reward = 0
Episode 10: Total Reward = 9
Episode 11: Total Reward = 5
Episode 12: Total Reward = 10
Episode 13: Total Reward = 9
Episode 14: Total Reward = 10
Episode 15: Total Reward = 3
Episode 16: Total Reward = 6
Episode 17: Total Reward = 9
Episode 18: Total Reward = 9
Episode 19: Total Reward = -1
Episode 20: Total Reward = -1
Episode 21: Total Reward = 9
Episode 22: Total Reward = 5
Episode 23: Total Reward = 6
Episode 24: Total Reward = -6
Episode 25: Total Reward = 7
Episode 26: Total Reward = 9
Episode 27: Total Reward = 8
Episode 28: Total Reward = 4
Episode 29: Total Reward = 6
Episode 30: Total Reward = 4


SystemExit: 

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [None]:
#Initialize Pygame
pygame.init()

#Define the size of each cell in the maze and the size of the window
cell_size = 50
window_size = (maze.shape[1]*cell_size, maze.shape[0]*cell_size)

#Create the window
screen = pygame.display.set_mode(window_size)
pygame.display.set_caption("Maze")
clock = pygame.time.Clock()

# Test the agent for the specified number of episodes
for i in range(10):
    total_reward = run_episode(training=False)
    
    # Print the total reward for the episode
    print(f"Episode {i+1}: Total Reward = {total_reward}")
    
    # Update the Pygame window
    pygame.display.flip()
    
    # Pause briefly to allow the user to see the visualization
    pygame.time.wait(3000)

# Wait for the user to close the Pygame window
while True:
    for event in pygame.event.get():
        if event.type == pygame.QUIT:
            pygame.quit()
            sys.exit()

Episode 1: Total Reward = 10
Episode 2: Total Reward = -46
Episode 3: Total Reward = 9
Episode 4: Total Reward = 10
Episode 5: Total Reward = 7
Episode 6: Total Reward = 3
Episode 7: Total Reward = 9
Episode 8: Total Reward = -118
Episode 9: Total Reward = -150
Episode 10: Total Reward = 10
