# In [15]:
import numpy as np
import random
# Grid dimensions
ROWS = 4
COLS = 4
# Available moves (Up, Down, Left, Right) and each move's position in ACTIONS
ACTIONS = ['U', 'D', 'L', 'R']
ACTION_IDX = {name: idx for idx, name in enumerate(ACTIONS)}
# Q-learning hyperparameters
alpha = 0.1      # learning rate: step size of each Q-value update
gamma = 0.9      # discount factor applied to the next state's best Q-value
epsilon = 0.1    # exploration rate: probability of picking a random action
episodes = 500   # number of training episodes
# Reward structure
def get_reward(state, goal=(3, 3)):
    """Return the reward for arriving in ``state``.

    Args:
        state: ``(row, col)`` of the cell just entered. Any two-item
            sequence is accepted (tuple or list).
        goal: ``(row, col)`` of the goal cell. Defaults to ``(3, 3)``, so
            existing one-argument calls behave as before.

    Returns:
        ``10`` if ``state`` is the goal cell, otherwise ``-1``.
    """
    # Normalise to tuples so a list such as [3, 3] still matches the goal;
    # a bare list == tuple comparison is always False.
    return 10 if tuple(state) == tuple(goal) else -1
# Get next state from current state and action
def step(state, action, *, rows=None, cols=None):
    """Return the cell reached by taking ``action`` from ``state``.

    A move that would leave the grid leaves the position unchanged, as does
    any action other than ``'U'``, ``'D'``, ``'L'`` or ``'R'``.

    Args:
        state: ``(row, col)`` of the current cell.
        action: ``'U'`` (row - 1), ``'D'`` (row + 1), ``'L'`` (col - 1) or
            ``'R'`` (col + 1).
        rows: Number of grid rows. Defaults to the module-level ``ROWS``.
        cols: Number of grid columns. Defaults to the module-level ``COLS``.

    Returns:
        The resulting ``(row, col)`` tuple.
    """
    # Resolve the defaults at call time so the function tracks the current
    # module-level grid size while still allowing an explicit override.
    if rows is None:
        rows = ROWS
    if cols is None:
        cols = COLS

    i, j = state
    if action == 'U' and i > 0:
        i -= 1
    elif action == 'D' and i < rows - 1:
        i += 1
    elif action == 'L' and j > 0:
        j -= 1
    elif action == 'R' and j < cols - 1:
        j += 1
    return (i, j)
# Q-table: one value per (row, col, action), all starting at zero
Q = np.zeros((ROWS, COLS, len(ACTIONS)))
goal = (3, 3)  # reaching this cell ends an episode

# Tabular Q-learning with epsilon-greedy action selection
for ep in range(episodes):
    state = (0, 0)  # every episode starts at row 0, column 0
    while state != goal:
        r, c = state
        # Explore with probability epsilon; otherwise take the greedy action
        # (np.argmax resolves ties in favour of the lowest action index).
        if random.random() < epsilon:
            action = random.choice(ACTIONS)
        else:
            action = ACTIONS[np.argmax(Q[r, c])]
        a_idx = ACTION_IDX[action]

        next_state = step(state, action)
        reward = get_reward(next_state)

        # Move Q(s, a) a fraction alpha towards reward + gamma * max_a' Q(s', a')
        nr, nc = next_state
        td_target = reward + gamma * np.max(Q[nr, nc])
        Q[r, c, a_idx] += alpha * (td_target - Q[r, c, a_idx])

        state = next_state

# Greedy policy: the highest-valued action letter per cell, goal marked 'G'
policy = np.array(ACTIONS)[np.argmax(Q, axis=2)]
policy[goal] = 'G'

# Show the policy grid, one row per line
print("Learned Policy (after training):")
print('\n'.join(' '.join(cells) for cells in policy))


# Example output from one run of the cell above:
# Learned Policy (after training):
# R D D D
# R R D D
# R R R D
# R R R G
