<a href="https://colab.research.google.com/github/AlirezaRamezaney/MARL-Cournot-game/blob/main/MARL_CODE.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import gym
import numpy as np
from gym import spaces

class CournotCompetitionEnv(gym.Env):
    """One firm's view of an ``n_agents``-firm Cournot quantity game.

    Each firm chooses a quantity in ``[0, 1]``. Once every slot of
    ``self.actions`` holds a quantity, the market clears at price
    ``max(1 - total_output, 0)`` and each firm earns
    ``q * price - 0.1 * q``. The observation is the pair
    ``[last_total_output, last_reward]`` for this environment's agent.

    NOTE(review): ``step`` only ever writes the slot ``self.actions[agent_id]``
    and the list is owned by this instance. Unless code outside this class
    fills the remaining slots, the market-clearing branch is not reached when
    ``n_agents > 1`` and every step returns reward ``0.0`` with ``done=False``.

    NOTE(review): ``observation_space`` is declared as ``[0, 1]^2``, but the
    total output is a sum of ``n_agents`` quantities and the reward is negative
    when the price is zero, so observations can fall outside those bounds.
    """

    def __init__(self, agent_id, n_agents=4):
        """Create the environment for one firm.

        Args:
            agent_id: Index of the firm this environment acts for; used to
                index ``self.actions`` and the per-firm reward list.
            n_agents: Number of firms in the market.
        """
        super().__init__()
        self.n_agents = n_agents
        self.agent_id = agent_id
        self.action_space = spaces.Box(low=0, high=1, shape=(1,), dtype=np.float32)
        self.observation_space = spaces.Box(low=0, high=1, shape=(2,), dtype=np.float32)
        self.last_total_output = 0.5
        self.last_reward = 0.0
        # Created here as well as in reset() so that step() does not fail with
        # AttributeError when it is called on a freshly built environment.
        self.actions = [None] * self.n_agents

    def _observation(self):
        """Return ``[last_total_output, last_reward]`` as a float32 array."""
        return np.array([self.last_total_output, self.last_reward], dtype=np.float32)

    def step(self, action):
        """Record this firm's quantity and clear the market if all are in.

        Args:
            action: Quantity for this firm, as a scalar or a one-element
                array; it is clipped to ``[0, 1]``.

        Returns:
            ``(obs, reward, done, info)``. While some slot of ``self.actions``
            is still ``None`` this is the previous observation, ``0.0``,
            ``False`` and ``{}``. Once all slots are filled it is the new
            observation, this firm's profit, ``True`` and ``{}``, and the
            action slots are cleared for the next round.
        """
        # .item() extracts the single value explicitly; calling float() on a
        # one-element array is a deprecated conversion in recent NumPy.
        quantity = float(np.clip(np.asarray(action), 0, 1).item())
        self.actions[self.agent_id] = quantity

        if any(q is None for q in self.actions):
            return self._observation(), 0.0, False, {}

        total_output = sum(self.actions)
        price = max(1 - total_output, 0)
        rewards = [q * price - 0.1 * q for q in self.actions]
        reward = rewards[self.agent_id]

        self.last_total_output = total_output
        self.last_reward = reward
        self.actions = [None] * self.n_agents
        return self._observation(), reward, True, {}

    def reset(self):
        """Restore the initial state and return the initial observation."""
        self.last_total_output = 0.5
        self.last_reward = 0.0
        self.actions = [None] * self.n_agents
        return self._observation()

In [None]:
from stable_baselines3 import PPO, DDPG
from stable_baselines3.common.env_util import DummyVecEnv

n_agents = 4
models = []

# One independent model per agent: indices 0 and 1 are trained with PPO,
# indices 2 and 3 with DDPG. Each model gets its own single-env DummyVecEnv.
for i in range(n_agents):
    # The default argument freezes the current index inside the factory, so
    # the environment is built for this agent regardless of later loop values.
    env = DummyVecEnv([lambda i=i: CournotCompetitionEnv(agent_id=i)])
    if i < 2:
        model = PPO("MlpPolicy", env, verbose=0)
    else:
        model = DDPG("MlpPolicy", env, verbose=0)
    model.learn(total_timesteps=100_000)
    models.append(model)

In [None]:
import matplotlib.pyplot as plt
# Roll out the trained policies for 500 steps and plot each agent's quantity.
n_agents = 4
actions_history = [[] for _ in range(n_agents)]
rewards_history = [[] for _ in range(n_agents)]

# One environment per agent; n_agents is passed explicitly so the environment
# size always matches the value used in this cell.
envs = [CournotCompetitionEnv(agent_id=i, n_agents=n_agents) for i in range(n_agents)]
obs = [env.reset() for env in envs]

for _ in range(500):
    # predict() returns (action, state); the action is a one-element array, so
    # .item() is used to extract the scalar instead of the deprecated
    # float(array) conversion.
    actions = [
        float(np.asarray(models[i].predict(obs[i], deterministic=True)[0]).item())
        for i in range(n_agents)
    ]
    results = [env.step(action) for env, action in zip(envs, actions)]
    obs = [r[0] for r in results]
    for i in range(n_agents):
        actions_history[i].append(actions[i])
        rewards_history[i].append(results[i][1])

# Symmetric Cournot-Nash quantity for the payoff used by the environment
# (price = 1 - Q, unit cost 0.1): the first-order condition
# 1 - Q - q - 0.1 = 0 with Q = n * q gives q* = 0.9 / (n + 1), i.e. 0.18 for
# four firms. The value 0.2 would only be correct with zero cost.
nash_quantity = (1 - 0.1) / (n_agents + 1)

plt.figure(figsize=(12, 5))
for i in range(n_agents):
    plt.plot(actions_history[i], label=f"Agent {i} ({'PPO' if i<2 else 'DDPG'})")
plt.axhline(nash_quantity, color='black', linestyle='--', label=f'Nash q={nash_quantity:.2f}')
plt.title("Agent Actions Over Time")
plt.xlabel("Episode")
plt.ylabel("Quantity (Action)")
plt.legend()
plt.show()