# Title: Reinforcement Learning

<b>Implement Reinforcement Learning using an example of a maze environment that the 
agent needs to explore</b>.

In [2]:
import numpy as np

In [3]:
# Define the maze environment
maze = np.array([
    [0, 0, 0, 0, 0],
    [0, -1, -1, -1, 0],
    [0, 0, 0, 0, 0],
    [0, -1, -1, -1, 0],
    [0, 0, 0, 0, 0],
])

In [4]:
start_state = (0, 0)
goal_state = (4, 4)

In [5]:
# Define hyperparameters
epsilon = 0.1           # Exploration rate
learning_rate = 0.5     # Learning rate
discount_factor = 0.9   # Discount factor
max_episodes = 1000     # Max number of episodes
max_steps = 100         # Max steps per episode

In [6]:
# Initialize the Q-table
num_rows, num_cols = maze.shape
num_actions = 4  # 4 possible actions: up, down, left, right
Q = np.zeros((num_rows, num_cols, num_actions))

In [7]:
# Q-learning algorithm
for episode in range(max_episodes):
    state = start_state
    total_reward = 0
    
    for step in range(max_steps):
        # Choose an action using epsilon-greedy policy
        if np.random.uniform(0, 1) < epsilon:
            action = np.random.randint(num_actions)  # Random action
        else:
            action = np.argmax(Q[state[0], state[1]])  # Best action based on Q-table
        
        # Take the chosen action and observe the next state and reward
        if action == 0:  # Up
            next_state = (state[0] - 1, state[1])
        elif action == 1:  # Down
            next_state = (state[0] + 1, state[1])
        elif action == 2:  # Left
            next_state = (state[0], state[1] - 1)
        elif action == 3:  # Right
            next_state = (state[0], state[1] + 1)

        # Check if the next state is valid (within bounds)
        if 0 <= next_state[0] < num_rows and 0 <= next_state[1] < num_cols:
            # Reward for moving to the next state
            reward = maze[next_state]
        else:
            # Penalize for going out of bounds
            reward = -5
            next_state = state  # Stay in the current state if out of bounds

        # Update the Q-value for the previous state-action pair
        Q[state[0], state[1], action] = Q[state[0], state[1], action] + learning_rate * (
            reward + discount_factor * np.max(Q[next_state[0], next_state[1]]) - Q[state[0], state[1], action]
        )
        
        # Update the current state
        state = next_state
        total_reward += reward

        # Check if the agent reached the goal state
        if state == goal_state:
            break
    
    print(f"Episode: {episode+1}, Steps: {step+1}, Total Reward: {total_reward}")

Episode: 1, Steps: 100, Total Reward: -36
Episode: 2, Steps: 100, Total Reward: -11
Episode: 3, Steps: 100, Total Reward: -6
Episode: 4, Steps: 100, Total Reward: -5
Episode: 5, Steps: 100, Total Reward: -12
Episode: 6, Steps: 100, Total Reward: -10
Episode: 7, Steps: 100, Total Reward: -15
Episode: 8, Steps: 100, Total Reward: -16
Episode: 9, Steps: 100, Total Reward: -15
Episode: 10, Steps: 100, Total Reward: -5
Episode: 11, Steps: 100, Total Reward: -5
Episode: 12, Steps: 100, Total Reward: -6
Episode: 13, Steps: 100, Total Reward: -25
Episode: 14, Steps: 100, Total Reward: -25
Episode: 15, Steps: 100, Total Reward: -20
Episode: 16, Steps: 100, Total Reward: -12
Episode: 17, Steps: 100, Total Reward: -47
Episode: 18, Steps: 100, Total Reward: -15
Episode: 19, Steps: 100, Total Reward: -36
Episode: 20, Steps: 100, Total Reward: -29
Episode: 21, Steps: 100, Total Reward: -21
Episode: 22, Steps: 100, Total Reward: -11
Episode: 23, Steps: 100, Total Reward: -24
Episode: 24, Steps: 100, 

In [8]:
# Testing the learned policy
state = start_state
steps = 0
success_count = 0

In [9]:
while state != goal_state and steps < max_steps:
    action = np.argmax(Q[state[0], state[1]])  # Choose the best action
    if action == 0:  # Up
        next_state = (state[0] - 1, state[1])
    elif action == 1:  # Down
        next_state = (state[0] + 1, state[1])
    elif action == 2:  # Left
        next_state = (state[0], state[1] - 1)
    elif action == 3:  # Right
        next_state = (state[0], state[1] + 1)

    # Check if the next state is valid (within bounds)
    if 0 <= next_state[0] < num_rows and 0 <= next_state[1] < num_cols:
        state = next_state
        steps += 1
        if state == goal_state:
            success_count += 1
    else:
        break

print(f"Testing Results - Steps: {steps}, Success Rate: {success_count / max_steps}")

Testing Results - Steps: 100, Success Rate: 0.0
