In [None]:
import numpy as np

# Create the gridworld environment (1 = obstacle, 0 = empty space).
# The grid is kept purely numeric on purpose: putting the string 'G' next to
# integers makes NumPy coerce every cell to a string, and then a check such as
# `gridworld[state] == 1` can never be true (obstacles would go unpenalised).
# The goal cell is stored as free space (0) and is identified by `goal_state`.
gridworld = np.array([
    [0, 0, 0, 1, 0],
    [0, 1, 0, 1, 0],
    [0, 1, 0, 0, 0],
    [0, 0, 0, 1, 0],
    [1, 1, 0, 0, 0]
])

start_state = (0, 0)  # (row, column) index of the starting cell
goal_state = (4, 4)  # (row, column) index of the goal cell


In [None]:
# Action lookup table: integer action id -> direction name.
# Ids are assigned in order, so 0 = up, 1 = down, 2 = left, 3 = right.
action_space = dict(enumerate(('up', 'down', 'left', 'right')))

def is_terminal(state):
    """Tell whether `state` is the goal cell.

    Args:
        state: Grid position to check; compared for equality against the
            module-level `goal_state`.

    Returns:
        The result of `state == goal_state`.
    """
    reached_goal = (state == goal_state)
    return reached_goal

def get_reward(state):
    """Return the reward for entering `state`.

    Args:
        state: (row, column) tuple used to index `gridworld`.

    Returns:
        100 if `state` equals `goal_state`, -10 if the cell holds the
        obstacle value 1, otherwise -1 for an ordinary step.
    """
    if state == goal_state:
        return 100  # Goal

    # The grid may be stored with a string dtype (NumPy coerces every cell to
    # a string when the array literal contains a marker such as 'G'), in which
    # case a direct `== 1` comparison never matches. Converting the cell with
    # int() makes the obstacle check work for string and numeric grids alike;
    # cells that are not numeric are simply not obstacles.
    try:
        is_obstacle = int(gridworld[state]) == 1
    except (TypeError, ValueError):
        is_obstacle = False

    if is_obstacle:
        return -10  # Obstacle
    return -1  # Standard step


In [None]:
import random

# Hyperparameters
alpha = 0.1  # Learning rate
gamma = 0.9  # Discount factor
epsilon = 0.1  # Exploration rate
seed = 42  # Seed for Python's `random` module

# Seed the RNG used by the epsilon-greedy policy so that re-running this cell
# produces the same sequence of exploratory actions.
random.seed(seed)

# Initialize Q-table: one value per (row, column, action), all starting at zero
q_table = np.zeros((gridworld.shape[0], gridworld.shape[1], len(action_space)))

# Get next state based on action
def next_state(state, action):
    """Compute the cell reached by taking `action` from `state`.

    Moves that would leave the grid are clamped to the border, so the
    coordinate being moved never goes below 0 or past the last row/column.

    Args:
        state: (row, column) pair of the current cell.
        action: 0 = up, 1 = down, 2 = left, 3 = right. Any other value
            leaves the position unchanged.

    Returns:
        The resulting (row, column) tuple.
    """
    row, col = state

    if action == 0:  # Up
        return (max(row - 1, 0), col)

    if action == 1:  # Down
        last_row = gridworld.shape[0] - 1
        return (min(row + 1, last_row), col)

    if action == 2:  # Left
        return (row, max(col - 1, 0))

    if action == 3:  # Right
        last_col = gridworld.shape[1] - 1
        return (row, min(col + 1, last_col))

    # Unrecognised action: stay where we are.
    return (row, col)

# Choose action using epsilon-greedy policy
def choose_action(state):
    """Pick an action for `state` with an epsilon-greedy rule.

    Args:
        state: (row, column) tuple used to index `q_table`.

    Returns:
        A random key of `action_space` when the uniform draw falls below
        `epsilon`; otherwise the index of the largest Q-value for `state`.
    """
    should_explore = random.uniform(0, 1) < epsilon
    if not should_explore:
        # Exploit: take the action with the highest current Q-value.
        return np.argmax(q_table[state])

    # Explore: sample uniformly among all action ids.
    candidate_actions = list(action_space)
    return random.choice(candidate_actions)

# Q-learning algorithm
def q_learning(episodes=1000, max_steps=None):
    """Train `q_table` in place with tabular Q-learning.

    Every episode starts at `start_state` and runs until `is_terminal`
    reports the goal (or until `max_steps` is hit, when given).

    Args:
        episodes: Number of training episodes to run. Defaults to 1000,
            the value that was previously hard-coded.
        max_steps: Optional cap on the number of steps per episode. `None`
            (the default) means no cap, i.e. an episode only ends at the goal.

    Returns:
        None. The learned values are written to the module-level `q_table`.
    """
    for _ in range(episodes):
        state = start_state
        steps_taken = 0

        while not is_terminal(state):
            if max_steps is not None and steps_taken >= max_steps:
                break  # Give up on this episode; keep what was learned so far

            action = choose_action(state)
            next_s = next_state(state, action)
            reward = get_reward(next_s)

            # Q-value update: move the current estimate towards the
            # bootstrapped target reward + gamma * max_a' Q(next_s, a').
            current_q = q_table[state][action]
            td_target = reward + gamma * np.max(q_table[next_s])
            q_table[state][action] = current_q + alpha * (td_target - current_q)

            # Update state
            state = next_s
            steps_taken += 1

# Train the agent: runs the Q-learning loop, which updates q_table in place
q_learning()


In [None]:
def test_agent():
    """Roll out the greedy policy from `start_state` and print the result.

    At every step the action with the highest Q-value is taken. Both the
    policy and the transitions are deterministic, so reaching a cell that was
    already visited means the agent would cycle forever; in that case the
    rollout is aborted and a failure message is printed instead of hanging.

    Returns:
        None. The outcome and the visited path are printed.
    """
    state = start_state
    steps = 0
    path = [state]
    visited = {state}

    while not is_terminal(state):
        action = np.argmax(q_table[state])
        state = next_state(state, action)
        path.append(state)
        steps += 1

        # A repeated state (including staying put against a wall) can never
        # lead to the goal under a deterministic greedy policy.
        if state in visited:
            print(f"Agent got stuck in a loop at {state} after {steps} steps without reaching the goal.")
            print("Path taken:", path)
            return
        visited.add(state)

    print(f"Agent reached the goal in {steps} steps!")
    print("Path taken:", path)

# Test the agent: follow the greedy policy from start_state and print the path taken
test_agent()


Agent reached the goal in 8 steps!
Path taken: [(0, 0), (0, 1), (0, 2), (1, 2), (2, 2), (3, 2), (3, 3), (3, 4), (4, 4)]


# Limitations and Possible Extensions

😭 Limitations:

1. Tabular Q-learning requires significant training time in large environments.
2. It does not work well with continuous state or action spaces.
3. Balancing exploration and exploitation can be tricky.

😃 Possible Extensions:

1. Implement Deep Q-Networks (DQN) to handle large or continuous state spaces.
2. Use dynamic obstacles or changing environments to make the problem more complex.