# In [19]:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm

# In [20]:
# =============================================
# Simulation settings
# =============================================

# Number of agents (keep odd).
N = 3

# Total number of rounds to simulate.
ROUNDS = 50_000

# History length m; the state space has 2^m entries.
MEMORY = 5

# Q-learning step size (learning rate).
ALPHA = 0.1

# Discount factor applied to future value.
GAMMA = 0.9

# Exploration rate at the start of a run.
EPSILON = 0.2

# Exploration is multiplied by this factor every round ...
EPSILON_DECAY = 0.997

# ... but never drops below this floor.
EPSILON_MIN = 0.01

# Window length for rolling averages.
WINDOW = 50

# Random seed, for reproducibility.
SEED = 42

# In [26]:
import numpy as np
import matplotlib.pyplot as plt
from collections import deque

# ----------------------------
# Minority Game Environment (N=3)
# ----------------------------
class MinorityGame:
    """
    Minority game with binary actions {0, 1}.

    Every round each agent picks an action.  The side chosen by strictly
    fewer agents (and by at least one agent) is the minority; agents on
    that side receive reward 1, everyone else receives 0.  When all agents
    pick the same action, or the two sides are tied (possible only for an
    even number of agents), there is no minority and nobody is rewarded.

    Public history signal: the minority action (0/1) of the round, or 0
    when no minority exists.  The last ``m`` signals form the state, encoded
    as an integer in [0, 2^m - 1] with the oldest signal as the most
    significant bit.

    Parameters
    ----------
    m : int
        History length in bits.
    seed : int
        Seed for ``self.rng``.  The generator is kept as a public attribute
        but is not used by any method of this class.
    n_agents : int
        Number of agents (default 3).  Use an odd value to rule out ties.

    Raises
    ------
    ValueError
        If ``n_agents`` is smaller than 1.
    """
    def __init__(self, m=5, seed=0, n_agents=3):
        n_agents = int(n_agents)
        if n_agents < 1:
            raise ValueError("n_agents must be a positive integer")
        self.n_agents = n_agents
        self.m = m
        self.base_states = 2 ** m
        self.rng = np.random.default_rng(seed)
        self.reset()

    def reset(self):
        """Clear the round counter and history; return the initial state (0)."""
        self.t = 0
        # maxlen makes the deque drop the oldest signal on every append.
        self.hist = deque([0] * self.m, maxlen=self.m)
        return self._hist_to_int()

    def _hist_to_int(self):
        """Pack the history bits into an integer, oldest bit most significant."""
        s = 0
        for b in self.hist:
            s = (s << 1) | int(b)
        return s

    def step(self, actions):
        """
        Play one round.

        Parameters
        ----------
        actions : sequence of int
            One action (0 or 1) per agent, length ``n_agents``.

        Returns
        -------
        next_state : int
            Integer encoding of the updated history.
        rewards : numpy.ndarray
            Float array of length ``n_agents``; 1.0 for minority agents.
        minority_action : int or None
            The winning action, or None when the round has no minority.

        Raises
        ------
        ValueError
            If ``actions`` does not hold exactly ``n_agents`` binary values.
        """
        actions = np.asarray(actions, dtype=int)
        if actions.shape != (self.n_agents,):
            raise ValueError(
                "expected %d actions, got array of shape %s"
                % (self.n_agents, actions.shape)
            )
        if not ((actions == 0) | (actions == 1)).all():
            raise ValueError("actions must be 0 or 1")

        ones = int(actions.sum())
        zeros = self.n_agents - ones

        rewards = np.zeros(self.n_agents, dtype=float)
        minority_action = None

        # A minority needs at least one member and strictly fewer members
        # than the other side; unanimous rounds and ties have none.
        if 0 < ones < zeros:
            minority_action = 1
        elif 0 < zeros < ones:
            minority_action = 0

        if minority_action is not None:
            winners = (actions == minority_action)
            rewards[winners] = 1.0

        # public signal for next state (0 if no minority)
        signal = 0 if minority_action is None else int(minority_action)
        self.hist.append(signal)

        self.t += 1
        next_state = self._hist_to_int()
        return next_state, rewards, minority_action


# ----------------------------
# Basic Q-learning Agent
# ----------------------------
class QAgent:
    """
    Tabular Q-learning agent with an epsilon-greedy policy.

    Holds a ``(n_states, n_actions)`` Q-table initialised to zero, its own
    random generator seeded with ``seed``, and an exploration rate that
    starts at ``eps_start`` and is shrunk by ``decay_eps``.
    """

    def __init__(self, n_states, n_actions=2, alpha=0.1, gamma=0.95,
                 eps_start=1.0, eps_end=0.05, eps_decay=0.999, seed=0):
        # Value table, one row per state.
        self.Q = np.zeros((n_states, n_actions), dtype=float)

        # Learning hyper-parameters.
        self.alpha, self.gamma = float(alpha), float(gamma)

        # Exploration schedule; the current rate begins at eps_start.
        self.eps_start = float(eps_start)
        self.eps_end = float(eps_end)
        self.eps_decay = float(eps_decay)
        self.eps = float(eps_start)

        self.n_actions = int(n_actions)
        self.rng = np.random.default_rng(seed)

    def act(self, s):
        """Pick an action for state ``s``: random with probability eps, else greedy."""
        explore = self.rng.random() < self.eps
        if explore:
            return int(self.rng.integers(self.n_actions))

        # Greedy branch: ties between (numerically) equal best values are
        # broken uniformly at random.
        row = self.Q[s]
        best_value = row.max()
        candidates = np.nonzero(np.isclose(row, best_value))[0]
        return int(self.rng.choice(candidates))

    def update(self, s, a, r, s_next):
        """Apply one Q-learning update for the transition (s, a, r, s_next)."""
        bootstrap = self.gamma * np.max(self.Q[s_next])
        current = self.Q[s, a]
        td_error = (r + bootstrap) - current
        self.Q[s, a] = current + self.alpha * td_error

    def decay_eps(self):
        """Multiply eps by eps_decay, never letting it fall below eps_end."""
        decayed = self.eps * self.eps_decay
        self.eps = decayed if decayed > self.eps_end else self.eps_end


# ----------------------------
# Train: independent Q-learning (all update every step)
# ----------------------------
def run_basic_qlearning(
    m=3,
    steps=100000,
    seed=0,
    alpha=0.12,
    gamma=0.95,
    eps_start=0.8,
    eps_end=0.02,
    eps_decay=0.9997,
    log_window=2000
):
    """
    Train independent Q-learners on the minority game.

    Every agent observes the same public history state, acts, and then
    updates its own Q-table and decays its own exploration rate on every
    step.

    Parameters
    ----------
    m : int
        History length passed to ``MinorityGame``.
    steps : int
        Number of rounds to play.
    seed : int
        Seed for the environment; agent ``i`` is seeded with
        ``seed + 10 + i``.
    alpha, gamma, eps_start, eps_end, eps_decay : float
        Hyper-parameters forwarded to every ``QAgent``.
    log_window : int
        Length of the trailing window for the rolling win-rate.  Early
        steps with fewer than ``log_window`` rounds average over the rounds
        played so far.

    Returns
    -------
    rewards_hist : numpy.ndarray
        Array of shape ``(steps, n_agents)`` with the per-round rewards.
    win_rate : numpy.ndarray
        Array of the same shape holding the rolling mean of ``rewards_hist``
        for each agent.

    Raises
    ------
    ValueError
        If ``log_window`` is smaller than 1.
    """
    log_window = int(log_window)
    if log_window < 1:
        raise ValueError("log_window must be at least 1")

    env = MinorityGame(m=m, seed=seed)
    n_states = env.base_states
    # Take the population size from the environment instead of repeating it.
    n_agents = env.n_agents

    agents = [
        QAgent(n_states, alpha=alpha, gamma=gamma,
               eps_start=eps_start, eps_end=eps_end, eps_decay=eps_decay,
               seed=seed + 10 + i)
        for i in range(n_agents)
    ]

    s = env.reset()
    rewards_hist = np.zeros((steps, n_agents), dtype=float)

    for t in range(steps):
        actions = [ag.act(s) for ag in agents]
        s_next, rewards, _ = env.step(actions)

        # all agents update every step (independent learners)
        for ag, action, reward in zip(agents, actions, rewards):
            ag.update(s, action, reward, s_next)
            ag.decay_eps()

        rewards_hist[t] = rewards
        s = s_next

    # Rolling win-rate: the sum over the trailing window is a difference of
    # cumulative sums, computed for all steps and agents at once.
    csum = np.cumsum(rewards_hist, axis=0)
    window_sum = csum.copy()
    window_sum[log_window:] -= csum[:-log_window]
    # The window is shorter than log_window during the first rounds.
    window_len = np.minimum(np.arange(1, steps + 1), log_window)
    win_rate = window_sum / window_len[:, None]

    return rewards_hist, win_rate


if __name__ == "__main__":
    # Experiment configuration for this run.
    run_config = dict(
        m=5,
        steps=50000,
        seed=18,
        alpha=0.12,
        gamma=0.95,
        eps_start=0.8,
        eps_end=0.02,
        eps_decay=0.9997,
        log_window=2000,
    )
    rewards_hist, win_rate = run_basic_qlearning(**run_config)

    t = np.arange(rewards_hist.shape[0])

    # One rolling win-rate curve per agent, plus the 1/3 reference line.
    plt.figure()
    for agent_idx in range(3):
        plt.plot(t, win_rate[:, agent_idx],
                 label=f"agent {agent_idx} rolling win-rate")
    plt.axhline(1/3, linestyle="--", linewidth=1, label="target 1/3")
    plt.xlabel("time")
    plt.ylabel("win-rate")
    plt.legend()
    plt.title("Basic Independent Q-learning on Minority Game")
    plt.show()

    # Per-agent summary statistics.
    tail_mean = rewards_hist[-5000:].mean(axis=0)
    overall_mean = rewards_hist.mean(axis=0)
    print("Final avg win-rate (last 5000 steps):", tail_mean)
    print("Total avg win-rate:", overall_mean)