# In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Define the environment
# Cell encoding: 0 = open floor, -1 = fire, 1 = flag (goal).
# The agent starts at (0, 0) and fire cells cannot be entered (see
# is_valid_move), so the flag must be connected to the start through open
# cells. Cell (2, 3) is therefore open floor: with fire at both (2, 3) and
# (3, 2) the flag at (3, 3) was completely walled off, no episode could ever
# reach it, and training never terminated.
maze = [
    [0, 0, 0, 0],
    [0, -1, 0, -1],
    [0, 0, 0, 0],
    [0, -1, -1, 1]  # Fire = -1, Flag = 1
]

maze = np.array(maze)

# Define parameters
# Exploration schedule for the epsilon-greedy policy: epsilon is the
# probability of taking a random action; it is multiplied by epsilon_decay
# after every episode and never drops below min_epsilon.
epsilon, epsilon_decay, min_epsilon = 1.0, 0.995, 0.01
# Update rule: alpha is the learning rate (step size of each Q-value update),
# gamma is the discount factor applied to the next state's best Q-value.
alpha, gamma = 0.1, 0.9

# Define Q-table
# The order of `actions` fixes the meaning of the last Q-table axis: index k
# along that axis holds the value of taking actions[k].
actions = ['up', 'down', 'left', 'right']
num_actions = len(actions)
# One zero-initialised Q-value per (row, column, action) triple.
grid_rows, grid_cols = maze.shape[0], maze.shape[1]
q_table = np.zeros((grid_rows, grid_cols, num_actions))

# Define helper functions
def is_valid_move(x, y):
    """Return whether the agent may occupy cell (x, y) of the maze.

    x is the row index and y the column index. A cell is enterable when it
    lies inside the grid and is not marked -1 (fire).
    """
    row_count, col_count = maze.shape[0], maze.shape[1]
    # Reject out-of-grid coordinates first so the array is never indexed
    # with an invalid (or negative, wrap-around) position.
    if not 0 <= x < row_count:
        return False
    if not 0 <= y < col_count:
        return False
    return maze[x, y] != -1

def get_next_state(x, y, action):
    """Return the position reached by taking `action` from cell (x, y).

    x is the row index and y the column index: 'up'/'down' change x by
    -1/+1 and 'left'/'right' change y by -1/+1. If the target cell is
    rejected by is_valid_move, or `action` is not one of the four known
    names, the position is returned unchanged.
    """
    step_for_action = {
        'up': (-1, 0),
        'down': (1, 0),
        'left': (0, -1),
        'right': (0, 1),
    }
    step = step_for_action.get(action)
    if step is None:
        return x, y  # Unknown action: stay put

    target_x, target_y = x + step[0], y + step[1]
    if is_valid_move(target_x, target_y):
        return target_x, target_y
    return x, y  # Invalid moves result in no movement

def get_reward(x, y):
    """Return the reward for standing on cell (x, y).

    The reward is the value stored in the maze at that cell, so the maze
    array doubles as the reward table.
    """
    cell = (x, y)
    return maze[cell]

# Train the Q-learning model
num_episodes = 1000
# Upper bound on the number of steps in one episode. Without it an episode
# ends only when the flag is reached, so if the flag cannot be reached from
# the start position (fire cells are impassable, see is_valid_move) the very
# first episode would loop forever.
max_steps_per_episode = 200
for episode in range(num_episodes):
    x, y = 0, 0  # Start position
    done = False
    steps = 0

    while not done and steps < max_steps_per_episode:
        steps += 1

        # Choose action (epsilon-greedy)
        if np.random.random() < epsilon:
            action_idx = np.random.choice(num_actions)  # Explore
        else:
            action_idx = np.argmax(q_table[x, y])  # Exploit

        # Take action
        next_x, next_y = get_next_state(x, y, actions[action_idx])
        reward = get_reward(next_x, next_y)

        # The episode terminates when the flag (reward 1) is reached
        done = bool(reward == 1)

        # Update Q-value. A terminal transition has no future return, so the
        # target is the reward alone; otherwise bootstrap from the best
        # Q-value of the next state.
        if done:
            td_target = reward
        else:
            td_target = reward + gamma * np.max(q_table[next_x, next_y])
        q_table[x, y, action_idx] += alpha * (td_target - q_table[x, y, action_idx])

        # Transition to next state
        x, y = next_x, next_y

    # Decay epsilon once per episode, never below min_epsilon
    epsilon = max(min_epsilon, epsilon * epsilon_decay)

# Visualize the Q-table
# Walk the grid in row-major order and print the per-action Q-values of
# every cell (ordered as in `actions`).
for i, j in np.ndindex(maze.shape[0], maze.shape[1]):
    print(f"State ({i}, {j}): {q_table[i, j]}")

# Save the Q-table as an image
# Collapse the action axis so each grid cell shows the value of its best
# action, then render the grid as a heatmap.
max_q_per_state = q_table.max(axis=2)
fig, ax = plt.subplots(figsize=(10, 10))
ax.set_title("Final Q-table")
heatmap = ax.imshow(max_q_per_state, cmap="coolwarm", interpolation="nearest")
fig.colorbar(heatmap, ax=ax, label="Max Q-value")
fig.savefig("final_q_table.png")  # Write the file before showing the window
plt.show()
