In [1]:
import numpy as np
import random

# Grid-world layout and reward values
GRID_SIZE = 5     # The grid is GRID_SIZE x GRID_SIZE cells
REWARD_GOAL = 10  # Value stored in the rewards grid at the goal cell
REWARD_STEP = -1  # Value stored in the rewards grid at every other cell

# Q-learning hyperparameters
EPISODES = 1000   # Number of training episodes
ALPHA = 0.5       # Learning rate
GAMMA = 0.9       # Discount factor

# One-letter label -> (dx, dy) offset added to a state's (x, y).
# Insertion order (Up, Down, Left, Right) fixes the action indices.
_MOVES = {'U': (-1, 0), 'D': (1, 0), 'L': (0, -1), 'R': (0, 1)}
actions = list(_MOVES.values())
action_names = list(_MOVES)
num_actions = len(actions)

def is_valid(state):
    """Return whether ``state`` lies on the grid.

    ``state`` is an ``(x, y)`` pair; it is valid when both coordinates are
    in the half-open range ``[0, GRID_SIZE)``.
    """
    first, second = state
    if not (0 <= first < GRID_SIZE):
        return False
    return 0 <= second < GRID_SIZE

def get_next_state(state, action):
    """Return the state reached by applying ``action`` to ``state``.

    The action's two offsets are added component-wise to the state's
    coordinates. If the resulting position is rejected by ``is_valid``
    (i.e. it is off the grid), the move is ignored and the original
    ``state`` is returned unchanged.
    """
    candidate = (state[0] + action[0], state[1] + action[1])
    return candidate if is_valid(candidate) else state

# Reward grid: REWARD_STEP in every cell, overridden by REWARD_GOAL at the
# goal, which is the last cell along both axes.
rewards = np.full(shape=(GRID_SIZE, GRID_SIZE), fill_value=REWARD_STEP)
rewards[-1, -1] = REWARD_GOAL

# Q-table: one zero-initialised value per (x, y, action index).
Q = np.zeros(rewards.shape + (num_actions,))

def choose_action(state, epsilon):
    """Pick an action index for ``state`` with an epsilon-greedy rule.

    A uniform draw from [0, 1] is compared against ``epsilon``: when the
    draw is smaller, a uniformly random action index is returned;
    otherwise the index of the largest Q-value stored for ``state`` is
    returned (``np.argmax`` resolves ties in favour of the lowest index).
    """
    explore = random.uniform(0, 1) < epsilon
    if explore:
        return random.randint(0, num_actions - 1)

    # Greedy branch: best-known action for this cell.
    row, col = state
    return np.argmax(Q[row, col])

# Tabular Q-learning training loop
def q_learning():
    """Train the module-level ``Q`` table in place.

    Runs ``EPISODES`` episodes. Each episode starts at ``(0, 0)`` and ends
    when the agent reaches the cell ``(GRID_SIZE - 1, GRID_SIZE - 1)``.
    Epsilon falls linearly from 1 over the first half of the episodes and
    is then held at a floor of 0.1. Returns ``None``.
    """
    goal = (GRID_SIZE - 1, GRID_SIZE - 1)
    half_run = EPISODES / 2

    for episode in range(EPISODES):
        # Linear decay of the exploration rate, clamped at 0.1.
        epsilon = max(0.1, 1 - episode / half_run)
        state = (0, 0)

        while state != goal:
            row, col = state

            # Epsilon-greedy action selection
            chosen = choose_action(state, epsilon)
            successor = get_next_state(state, actions[chosen])
            next_row, next_col = successor

            # The reward is read from the cell the agent ends up in.
            reward = rewards[next_row, next_col]

            # Move the stored value a fraction ALPHA towards the target.
            td_target = reward + GAMMA * np.max(Q[next_row, next_col])
            td_error = td_target - Q[row, col, chosen]
            Q[row, col, chosen] += ALPHA * td_error

            state = successor

def print_q_values():
    """Print the learned Q-values for each state and action.

    Emits one line per grid cell, in order of ``x`` then ``y``, of the form
    ``State (x, y): {'U': ..., 'D': ..., 'L': ..., 'R': ...}``.
    """
    for x in range(GRID_SIZE):
        for y in range(GRID_SIZE):
            # Convert to built-in floats: NumPy >= 2.0 renders np.float64
            # values inside a dict as "np.float64(...)", which would clutter
            # the output; plain floats print the same on every version.
            q_by_action = {
                action_names[i]: float(Q[x, y, i]) for i in range(num_actions)
            }
            print(f"State ({x}, {y}):", q_by_action)

# Train the agent and display the results.
# q_learning() fills the module-level Q table in place, so it has to run
# before print_q_values() reads that table.
q_learning()
print("Learned Q-values:")
print_q_values()

Learned Q-values:
State (0, 0): {'U': -1.3906558000000007, 'D': -0.4340620000000006, 'L': -1.3906558000000007, 'R': -0.4340620000000006}
State (0, 1): {'U': -0.4340620000000094, 'D': 0.6288199999999993, 'L': -1.3906558000000007, 'R': 0.6288199999999993}
State (0, 2): {'U': 0.6288199999926539, 'D': 1.8097999999938381, 'L': -0.43406200000001105, 'R': 1.8097999999999992}
State (0, 3): {'U': 1.8097999999900245, 'D': 3.121999999999999, 'L': 0.628819999999326, 'R': 3.121999999484834}
State (0, 4): {'U': 3.1211145877283344, 'D': 4.579999999998473, 'L': 1.8097999977283479, 'R': 3.1219999587323155}
State (1, 0): {'U': -1.3906558000000007, 'D': 0.6288199999999993, 'L': -0.4340620000000006, 'R': 0.6288199999999993}
State (1, 1): {'U': -0.4340620000000006, 'D': 1.8097999999999992, 'L': -0.4340620000000006, 'R': 1.8097999999999992}
State (1, 2): {'U': 0.6288199999999974, 'D': 3.121999999999999, 'L': 0.6288199999999993, 'R': 3.121999999999999}
State (1, 3): {'U': 1.8097999999999983, 'D': 4.579999999