###### Implement reinforcement learning (tabular Q-learning) for an agent that must explore a maze environment and reach the goal.

In [2]:
import numpy as np
import random

# Define the maze environment.
# Cell values: 0 = free cell, -1 = wall (rejected by is_valid_move),
# 10 = goal cell, whose value is also the reward for reaching it.
maze = [
    [0, -1, 0, 0, 0],
    [0, -1, 0, -1, 0],
    [0, 0, 0, -1, 0],
    [0, -1, 0, 0, 0],
    [0, 0, 0, -1, 10]  # Goal is the cell with a reward of +10
]

# Parameters
SEED = 42  # Fixed seed so repeated runs of the training are repeatable
random.seed(SEED)  # Training draws its random numbers from the `random` module

# Derive the dimensions from the maze itself so they cannot drift out of sync
# with the grid literal above when the maze is edited.
num_rows, num_cols = len(maze), len(maze[0])
assert all(len(row) == num_cols for row in maze), "maze rows must have equal length"

actions = ['up', 'down', 'left', 'right']
epsilon = 0.9  # Exploration rate
alpha = 0.1  # Learning rate
gamma = 0.9  # Discount factor
num_episodes = 1000  # Number of episodes

# Initialize Q-table: one value per (row, column, action), all starting at zero
Q_table = np.zeros((num_rows, num_cols, len(actions)))

# Helper functions
def is_valid_move(x, y):
    """Return True when (x, y) is inside the maze and is not a wall cell (-1).

    x is the row index and y is the column index into `maze`.
    """
    # Bounds are checked first so the maze lookup below never receives a
    # negative or out-of-range index.
    if not (0 <= x < num_rows):
        return False
    if not (0 <= y < num_cols):
        return False
    return maze[x][y] != -1

def get_next_state(x, y, action):
    """Return the (row, column) reached by taking `action` from (x, y).

    The agent stays at (x, y) when the target cell is rejected by
    is_valid_move, or when `action` is not one of 'up', 'down', 'left',
    'right'.
    """
    # Row/column offset applied by each recognised action name.
    offsets = {
        'up': (-1, 0),
        'down': (1, 0),
        'left': (0, -1),
        'right': (0, 1),
    }
    if action not in offsets:
        return x, y

    d_row, d_col = offsets[action]
    target = (x + d_row, y + d_col)
    return target if is_valid_move(*target) else (x, y)

# Training the agent
for episode in range(num_episodes):
    x, y = 0, 0  # Every episode begins in the top-left cell
    done = False

    while not done:
        # Epsilon-greedy selection: explore with probability epsilon,
        # otherwise exploit the best-known action for the current cell.
        explore = random.random() < epsilon
        if explore:
            action_index = random.randrange(len(actions))
        else:
            action_index = Q_table[x, y].argmax()
        action = actions[action_index]

        # Step the environment; the reward is the value of the cell landed on
        next_x, next_y = get_next_state(x, y, action)
        reward = maze[next_x][next_y]

        # Q-learning update: nudge the old estimate toward
        # reward + gamma * (best value available from the next cell)
        old_value = Q_table[x, y, action_index]
        next_max = Q_table[next_x, next_y].max()
        td_error = reward + gamma * next_max - old_value
        Q_table[x, y, action_index] = old_value + alpha * td_error

        # Advance to the new cell; a reward of 10 marks the goal and ends the episode
        x, y = next_x, next_y
        done = reward == 10

    # Decay epsilon after each episode, never letting it drop below 0.1
    epsilon = epsilon * 0.99
    if epsilon < 0.1:
        epsilon = 0.1

print("Training complete! Q-Table:", Q_table, sep="\n")


Training complete! Q-Table:
[[[4.07858705e+00 4.78296900e+00 4.12258226e+00 4.07336268e+00]
  [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
  [1.47628670e-02 1.12204559e+00 6.47648353e-02 9.12077368e-03]
  [8.01244413e-03 9.13238703e-04 2.04338346e-02 9.97503302e-02]
  [4.44005319e-02 6.14097001e-01 6.11335922e-03 7.64159670e-04]]

 [[3.96732717e+00 5.31441000e+00 4.72653931e+00 4.64079392e+00]
  [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
  [2.20530095e-01 6.46795008e+00 1.56147528e+00 3.01543114e-01]
  [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
  [3.66263416e-02 4.44735293e+00 8.62721343e-01 1.98886102e-01]]

 [[4.67717358e+00 4.37400806e+00 5.26271269e+00 5.90490000e+00]
  [5.82284244e+00 5.77781717e+00 5.22906484e+00 6.56100000e+00]
  [5.46187359e+00 7.29000000e+00 5.76939139e+00 6.39872949e+00]
  [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
  [1.52648120e+00 8.98162765e+00 2.75481715e+00 3.48321285e+00]]

 [[5.2