In [1]:
import numpy as np
import random

In [3]:
# Define the gridworld environment
class GridWorld:
    """Deterministic grid environment whose cell values double as rewards.

    Entering a cell that holds ``1`` (goal) or ``-1`` (penalty) ends the
    episode; every other cell is an ordinary square.

    Actions are integer codes: 0 = up, 1 = right, 2 = down, 3 = left.  A move
    that would leave the grid is clamped, so the agent stays where it is.  Any
    other action code is treated as "stay put".
    """

    # (row delta, column delta) for each action code.
    _MOVES = {0: (-1, 0), 1: (0, 1), 2: (1, 0), 3: (0, -1)}

    def __init__(self, grid=None, start_state=None):
        """Build the environment.

        Args:
            grid: Optional 2-D array-like of cell rewards.  Defaults to the
                4x4 layout with the goal at (0, 3) and a penalty at (1, 1).
            start_state: Optional ``(row, col)`` start cell.  Defaults to the
                bottom-left corner, which is (3, 0) on the default grid.

        Raises:
            ValueError: If ``grid`` is not 2-D or ``start_state`` lies outside it.
        """
        if grid is None:
            grid = [
                [0, 0, 0, 1],   # Goal at (0, 3)
                [0, -1, 0, 0],  # Penalty cell (reward -1) at (1, 1)
                [0, 0, 0, 0],
                [0, 0, 0, 0],   # Start at (3, 0)
            ]
        self.grid = np.array(grid)
        if self.grid.ndim != 2:
            raise ValueError("grid must be two-dimensional")

        n_rows, n_cols = self.grid.shape
        if start_state is None:
            start_state = (n_rows - 1, 0)
        start_state = tuple(start_state)
        if not (0 <= start_state[0] < n_rows and 0 <= start_state[1] < n_cols):
            raise ValueError("start_state lies outside the grid")

        self.start_state = start_state
        self.state = self.start_state

    def reset(self):
        """Put the agent back on the start cell and return that cell."""
        self.state = self.start_state
        return self.state

    def is_terminal(self, state):
        """Return True when ``state`` is a goal (1) or penalty (-1) cell."""
        cell = self.grid[state]
        # bool() so callers get a plain Python bool rather than numpy.bool_.
        return bool(cell == 1 or cell == -1)

    def get_next_state(self, state, action):
        """Return the cell reached from ``state`` by ``action`` (pure; no side effects).

        Bounds come from the grid's own shape, so the method stays correct if
        the grid is resized.
        """
        row, col = state
        d_row, d_col = self._MOVES.get(action, (0, 0))
        last_row = self.grid.shape[0] - 1
        last_col = self.grid.shape[1] - 1
        return (
            min(max(row + d_row, 0), last_row),
            min(max(col + d_col, 0), last_col),
        )

    def step(self, action):
        """Apply ``action`` to the current state.

        Returns:
            ``(next_state, reward, done)`` where ``reward`` is the value of the
            cell that was entered and ``done`` says whether that cell is terminal.
        """
        next_state = self.get_next_state(self.state, action)
        reward = self.grid[next_state]
        self.state = next_state
        done = self.is_terminal(next_state)
        return next_state, reward, done

In [4]:
#Q-learning agent
class QLearningAgent:
    """Tabular Q-learning agent with an epsilon-greedy behaviour policy.

    The Q-table is indexed as ``q_table[state][action]`` where ``state`` is a
    tuple of grid coordinates and ``action`` is an integer in
    ``range(n_actions)``.
    """

    def __init__(self, learning_rate=0.1, discount_factor=0.9, exploration_rate=0.1,
                 state_shape=(4, 4), n_actions=4):
        """Create an agent with an all-zero Q-table.

        Args:
            learning_rate: Step size applied to each temporal-difference error.
            discount_factor: Weight given to the estimated future return.
            exploration_rate: Probability of picking a uniformly random action.
            state_shape: Dimensions of the state space (defaults to a 4x4 grid).
            n_actions: Number of discrete actions (defaults to 4).
        """
        self.n_actions = n_actions
        # One Q-value per (state, action) pair.
        self.q_table = np.zeros(tuple(state_shape) + (n_actions,))
        self.learning_rate = learning_rate
        self.discount_factor = discount_factor
        self.exploration_rate = exploration_rate

    def choose_action(self, state):
        """Pick an action for ``state`` epsilon-greedily; always returns a plain int.

        Ties between equal Q-values resolve to the lowest action index, because
        ``np.argmax`` returns the first maximum.
        """
        if random.uniform(0, 1) < self.exploration_rate:
            return random.randrange(self.n_actions)  # Explore
        return int(np.argmax(self.q_table[state]))  # Exploit

    def update_q_value(self, state, action, reward, next_state, done=False):
        """Apply one Q-learning update for the observed transition.

        Args:
            state: State the action was taken in.
            action: Action that was taken.
            reward: Reward received for the transition.
            next_state: State that was reached.
            done: True when ``next_state`` is terminal.  No future value is
                bootstrapped in that case.  Defaults to False, which reproduces
                the unconditional bootstrap used by callers that do not pass it.
        """
        if done:
            max_future_q = 0.0
        else:
            max_future_q = np.max(self.q_table[next_state])  # Best Q-value for next state
        current_q = self.q_table[state][action]
        # Q-learning formula: move the estimate toward the TD target.
        self.q_table[state][action] = current_q + self.learning_rate * (
            reward + self.discount_factor * max_future_q - current_q
        )

In [6]:
# Seed the RNG so a fresh-kernel "Run All" reproduces the same Q-table.
# The agent's epsilon-greedy choice draws only from the `random` module.
SEED = 42
random.seed(SEED)

env = GridWorld()
agent = QLearningAgent()

episodes = 1000  # Number of training episodes

for episode in range(episodes):
    state = env.reset()  # Every episode starts from the environment's start cell
    done = False

    # Run until the environment reports a terminal cell.
    while not done:
        action = agent.choose_action(state)  # Epsilon-greedy action
        next_state, reward, done = env.step(action)  # Observe the transition
        agent.update_q_value(state, action, reward, next_state)  # Learn from it
        state = next_state  # Continue from where we landed

In [7]:
# Print the learned Q-values after training
print("Learned Q-values:")
print(agent.q_table)

# Test the learned policy.
# Act greedily on the Q-table: going through agent.choose_action would keep
# injecting random exploratory moves, so the trace would not show what was learned.
# A greedy rollout has no randomness to escape a loop (e.g. repeatedly bumping
# into a wall), so the number of steps is capped.
MAX_EVAL_STEPS = 100

state = env.reset()
done = False
for _ in range(MAX_EVAL_STEPS):
    action = int(np.argmax(agent.q_table[state]))
    next_state, reward, done = env.step(action)
    state = next_state
    # `state` here is the cell just entered by taking `action`.
    print(f"State: {state}, Action: {action}, Reward: {reward}")
    if done:
        break
else:
    print(f"Stopped after {MAX_EVAL_STEPS} steps without reaching a terminal state")
# Print the final state
print(f"Final State: {state}")

Learned Q-values:
[[[ 0.6302412   0.81        0.57620767  0.63392118]
  [ 0.64222494  0.9        -0.9282102   0.56937927]
  [ 0.84766518  1.          0.74311535  0.71151853]
  [ 0.          0.          0.          0.        ]]

 [[ 0.729      -0.92023356  0.53814665  0.45015541]
  [ 0.          0.          0.          0.        ]
  [ 0.89984312  0.0252      0.0024152  -0.271     ]
  [ 0.271       0.          0.          0.08095951]]

 [[ 0.6561      0.49508585  0.50325139  0.56107133]
  [-0.271       0.70388531  0.01679606  0.059049  ]
  [ 0.80417706  0.          0.01635983  0.07722434]
  [ 0.0171      0.          0.          0.        ]]

 [[ 0.59049     0.31368282  0.4815671   0.42865183]
  [ 0.51127169  0.          0.          0.        ]
  [ 0.18534253  0.          0.          0.        ]
  [ 0.          0.          0.          0.        ]]]
State: (2, 0), Action: 0, Reward: 0
State: (1, 0), Action: 0, Reward: 0
State: (0, 0), Action: 0, Reward: 0
State: (0, 1), Action: 1, Reward: 