In [1]:
import numpy as np
import random

class GridWorld:
    def __init__(self):
        self.rows = 4
        self.cols = 4
        self.state = (0, 0)
        self.goal = (3, 3)
        self.actions = [0, 1, 2, 3]

    def reset(self):
        self.state = (0, 0)
        return self.state

    def step(self, action):
        x, y = self.state
        if action == 0:   x = max(0, x - 1)
        elif action == 1: y = min(self.cols - 1, y + 1)
        elif action == 2: x = min(self.rows - 1, x + 1)
        elif action == 3: y = max(0, y - 1)
        self.state = (x, y)
        if self.state == self.goal: return self.state, 10, True
        else: return self.state, -1, False

def run_marl():
    # Simulate 2 independent agents with 2 Q-tables
    Q1 = np.zeros((4, 4, 4))
    Q2 = np.zeros((4, 4, 4))
    
    env1 = GridWorld()
    env2 = GridWorld()
    
    # Initialize Positions
    p1 = env1.reset()
    env2.state = (0, 3) # Agent 2 starts at top right
    p2 = env2.state
    
    alpha = 0.1
    gamma = 0.9
    
    print("Running MARL (Independent Q-Learning)...")
    for _ in range(500):
        # --- AGENT 1 Logic ---
        if random.random() < 0.1: a1 = random.choice([0,1,2,3])
        else: a1 = np.argmax(Q1[p1])
        
        np1, r1, d1 = env1.step(a1)
        Q1[p1][a1] += alpha * (r1 + gamma * np.max(Q1[np1]) - Q1[p1][a1])
        
        # --- AGENT 2 Logic ---
        if random.random() < 0.1: a2 = random.choice([0,1,2,3])
        else: a2 = np.argmax(Q2[p2])
        
        np2, r2, d2 = env2.step(a2)
        Q2[p2][a2] += alpha * (r2 + gamma * np.max(Q2[np2]) - Q2[p2][a2])
        
        # Resets
        if d1: p1 = env1.reset()
        else: p1 = np1
        
        if d2: 
            env2.state = (0,3)
            p2 = env2.state
        else: p2 = np2
        
    print("MARL Finished.")
    print("Agent 1 Q(0,0):", np.round(Q1[0,0], 2))
    print("Agent 2 Q(0,3):", np.round(Q2[0,3], 2))

if __name__ == "__main__":
    run_marl()

Running MARL (Independent Q-Learning)...
MARL Finished.
Agent 1 Q(0,0): [-1.8  -1.73 -1.71 -1.74]
Agent 2 Q(0,3): [ 0.08  0.    6.2  -0.59]


In [2]:
# exp13_marl_gridworld.py

import numpy as np
import random

class GridWorld4x4:
    def __init__(self, max_steps=100):
        self.n_rows = 4
        self.n_cols = 4
        self.n_states = self.n_rows * self.n_cols
        self.n_actions = 4
        self.max_steps = max_steps

    def state_to_xy(self, s):
        return (s // self.n_cols, s % self.n_cols)

    def xy_to_state(self, r, c):
        return r * self.n_cols + c

    def step_from(self, s, action):
        r, c = self.state_to_xy(s)
        if action == 0:
            r = max(0, r - 1)
        elif action == 1:
            c = min(self.n_cols - 1, c + 1)
        elif action == 2:
            r = min(self.n_rows - 1, r + 1)
        elif action == 3:
            c = max(0, c - 1)
        ns = self.xy_to_state(r, c)
        reward = -1  # step penalty; goal handled in MARL env
        return ns, reward

class MultiAgentGridWorld:
    def __init__(self, base_env, start_positions=(0, 3), goal_positions=(12, 15), max_steps=50):
        self.env = base_env
        self.start_positions = list(start_positions)
        self.goal_positions = list(goal_positions)
        self.n_agents = len(start_positions)
        self.max_steps = max_steps
        self.reset()

    def reset(self):
        self.positions = self.start_positions.copy()
        self.steps = 0
        return tuple(self.positions)

    def step(self, actions):
        rewards = [0.0 for _ in range(self.n_agents)]
        next_positions = []

        for i in range(self.n_agents):
            s = self.positions[i]
            a = actions[i]
            ns, r = self.env.step_from(s, a)
            # base reward
            reward = r
            # extra reward for reaching goal
            if ns == self.goal_positions[i]:
                reward += 10.0
            rewards[i] = reward
            next_positions.append(ns)

        # collision penalty
        if next_positions[0] == next_positions[1]:
            rewards[0] -= 5.0
            rewards[1] -= 5.0

        self.positions = next_positions
        self.steps += 1
        dones = [False for _ in range(self.n_agents)]
        for i in range(self.n_agents):
            if self.positions[i] == self.goal_positions[i] or self.steps >= self.max_steps:
                dones[i] = True
        done_all = all(dones)
        return tuple(self.positions), rewards, dones, {"done_all": done_all}

def epsilon_greedy(Q, s, n_actions, eps):
    if random.random() < eps:
        return random.randrange(n_actions)
    return int(np.argmax(Q[s]))

def independent_q_learning(menv, episodes=2000, alpha=0.5, gamma=0.95, eps=0.2):
    base_env = menv.env
    Qs = [np.zeros((base_env.n_states, base_env.n_actions)) for _ in range(menv.n_agents)]

    for ep in range(episodes):
        state = menv.reset()
        done_all = False
        while not done_all:
            actions = []
            for i in range(menv.n_agents):
                s_i = state[i]
                a_i = epsilon_greedy(Qs[i], s_i, base_env.n_actions, eps)
                actions.append(a_i)
            next_state, rewards, dones, info = menv.step(actions)
            for i in range(menv.n_agents):
                s_i = state[i]
                a_i = actions[i]
                r_i = rewards[i]
                ns_i = next_state[i]
                Qs[i][s_i, a_i] = Qs[i][s_i, a_i] + alpha * (
                    r_i + gamma * np.max(Qs[i][ns_i]) - Qs[i][s_i, a_i]
                )
            state = next_state
            done_all = info["done_all"]

    policies = [np.argmax(Qs[i], axis=1) for i in range(menv.n_agents)]
    return Qs, policies

if __name__ == "__main__":
    base_env = GridWorld4x4()
    menv = MultiAgentGridWorld(base_env, start_positions=(0, 3), goal_positions=(12, 15))
    Qs, policies = independent_q_learning(menv, episodes=3000)
    print("Agent 0 policy (0:U,1:R,2:D,3:L):")
    print(policies[0].reshape(4, 4))
    print("\nAgent 1 policy (0:U,1:R,2:D,3:L):")
    print(policies[1].reshape(4, 4))


Agent 0 policy (0:U,1:R,2:D,3:L):
[[2 3 3 3]
 [2 2 2 0]
 [2 3 3 3]
 [2 3 0 3]]

Agent 1 policy (0:U,1:R,2:D,3:L):
[[1 1 2 2]
 [3 1 1 2]
 [1 1 1 2]
 [1 1 1 1]]
