## 3IPD LLM Strategy Experiments

In [1]:
import os
import sys
sys.path.append(os.path.abspath(os.path.join(os.path.abspath(""), os.pardir)))

In [2]:
import time
import tqdm
from langchain.chat_models.openai import ChatOpenAI
from эipdai import IPDGame, Actions
from эipdai.strategies import *
from эipdai.tournament import *

In [3]:
def last_opponent_actions(obs):
    """Extract the opponents' most recent actions from an observation.

    Args:
        obs: 2-D NumPy array holding the action history. The code reads the
            last row of ``obs.T`` and drops its first entry, i.e. it assumes a
            (players, rounds) layout with the observing agent in row 0 --
            TODO confirm against IPDGame.

    Returns:
        A list of built-in ``int`` actions, one per opponent, or ``None`` when
        there is no completed round to report: the history is empty, or the
        latest round still contains a negative value (presumably the
        "not played yet" placeholder -- verify against IPDGame).
    """
    history = obs.T
    if len(history) == 0:
        # No rounds recorded at all; indexing [-1] would raise IndexError.
        return None

    actions = history[-1][1:]
    if (actions < 0).any():
        return None

    # tolist() yields built-in ints, so the list formats as "[0, 1]" even on
    # NumPy >= 2, where list(actions) would render as
    # "[np.int64(0), np.int64(1)]" when placed in a prompt string.
    return actions.tolist()

### Experiment 1
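* Minimal background info: the prompt states only the rules and payoffs, and the LLM observes just the opponents' actions from the last round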

In [4]:
# Prompt for the LLM player: describes the three-player game, the payoff of
# each outcome, and the exact "Observation/Reward/Return" -> "Action" message
# format. The text reaches the model verbatim as the system message, so any
# edit here (even whitespace) changes the experiment.
instructions = '''
Three-player iterated prisoner's dilemma is a game involving three players who repeatedly choose whether to cooperate or betray each other. Here's how it works:

Players: You, Player A, and Player B.
Actions: In each round, you have two choices: cooperate (0) or defect (1).
Rewards: The outcomes depend on everyone's choices:
* If all three players cooperate, they all receive a reward of 6.
* If one player cooperates while the other two defect, the defectors get a reward of 5 and the cooperator gets a payoff of 0.
* If two players cooperate while the third defects, the two cooperators receive a reward of 3, and the defector gets a high reward of 8.
* If all three players defect on each other, they all receive a low payoff of 2.

Your goal is to maximize your return, i.e. the sum of the rewards you receive.
I will give you an observation, reward and the return so far, formatted as:

Observation: <observation>
Reward: <reward>
Return: <return>

where <observation> will contain the actions of opponents from the last round, or None if the game has just started. 

You will respond with an action, formatted as:

Action: <action>

where you replace <action> with your actual action, either 0 or 1.
Do nothing else but return the action.
'''

In [5]:
# LLM-backed player: an OpenAI chat model (API key taken from the environment,
# KeyError if it is unset) driven by `instructions`; observations are reduced
# to the opponents' last-round actions before they reach the strategy.
llm_agent = LLMStrategy(
    ChatOpenAI(openai_api_key=os.environ["OPENAI_API_KEY"]),
    instructions,
    name="LLM",
    obs_preprocessor=last_opponent_actions,
)

#### Run 1
* 50 rounds
* Naive opponents

In [9]:
# Fresh game with both round bounds set to 50, matching the 50 iterations of
# the match loop below.
environment = IPDGame(min_rounds=50, max_rounds=50)

In [8]:
# Order matters: players[k] acts for environment.possible_agents[k] in the
# match loop, so the LLM takes the first seat against two Naive opponents.
players = [llm_agent, Naive(), Naive()]

In [10]:
# Reset every strategy before the match; llm_agent is shared between runs, so
# this presumably clears whatever state reset() manages (verify in LLMStrategy).
for strategy in players:
    strategy.reset()

In [13]:
# Start a fresh match. No reward exists before the first step, so every
# strategy is handed None as its previous reward on the first round.
observations, infos = environment.reset()
rewards = {agent_id: None for agent_id in environment.possible_agents}
returns = {agent_id: 0 for agent_id in environment.possible_agents}

for round_idx in tqdm.tqdm(range(50)):
    # players[k] chooses the action for the k-th entry of possible_agents.
    actions = {
        agent_id: players[seat].play(observations[agent_id], rewards[agent_id])
        for seat, agent_id in enumerate(environment.possible_agents)
    }

    observations, rewards, terminations, truncations, infos = environment.step(actions)

    # Accumulate each agent's total reward (its return).
    for agent_id in environment.possible_agents:
        returns[agent_id] += rewards[agent_id]

    # Stop once every agent is reported terminated or truncated, rather than
    # stepping a finished game (and skip the pointless final pause).
    if all(
        terminations.get(agent_id, False) or truncations.get(agent_id, False)
        for agent_id in environment.possible_agents
    ):
        break

    # Pause between rounds -- presumably to stay under the OpenAI API rate
    # limit; confirm before changing.
    time.sleep(1)

 98%|██████████████████████████████▍| 49/50 [01:40<00:02,  2.05s/it]


In [17]:
# Per-agent termination flags returned by the last environment.step() call of
# the match loop.
terminations

{'player_0': True, 'player_1': True, 'player_2': True}

In [14]:
# Final observation of every agent, as returned by the last environment.step()
# call (full dump of all three arrays, so the output is long).
observations

{'player_0': array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0]]),
 'player_1': array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         

In [19]:
# Cumulative reward per agent, summed over every step of the match loop.
returns

{'player_0': 300, 'player_1': 300, 'player_2': 300}

#### Run 2
* 50 rounds
* Defector and Naive opponents

In [20]:
# New game instance for Run 2; both round bounds are 50, matching the 50
# iterations of the match loop below.
environment = IPDGame(min_rounds=50, max_rounds=50)

In [21]:
# Order matters: players[k] acts for environment.possible_agents[k] in the
# match loop. The LLM takes the first seat, then a Naive and a Defector.
players = [llm_agent, Naive(), Defector()]

In [22]:
# Reset every strategy before the match; llm_agent is reused from the previous
# run, so this presumably clears its carried-over state (verify in LLMStrategy).
for strategy in players:
    strategy.reset()

In [23]:
# Start a fresh match. No reward exists before the first step, so every
# strategy is handed None as its previous reward on the first round.
observations, infos = environment.reset()
rewards = {agent_id: None for agent_id in environment.possible_agents}
returns = {agent_id: 0 for agent_id in environment.possible_agents}

for round_idx in tqdm.tqdm(range(50)):
    # players[k] chooses the action for the k-th entry of possible_agents.
    actions = {
        agent_id: players[seat].play(observations[agent_id], rewards[agent_id])
        for seat, agent_id in enumerate(environment.possible_agents)
    }

    observations, rewards, terminations, truncations, infos = environment.step(actions)

    # Accumulate each agent's total reward (its return).
    for agent_id in environment.possible_agents:
        returns[agent_id] += rewards[agent_id]

    # Stop once every agent is reported terminated or truncated, rather than
    # stepping a finished game (and skip the pointless final pause).
    if all(
        terminations.get(agent_id, False) or truncations.get(agent_id, False)
        for agent_id in environment.possible_agents
    ):
        break

    # Pause between rounds -- presumably to stay under the OpenAI API rate
    # limit; confirm before changing.
    time.sleep(1)

100%|███████████████████████████████| 50/50 [01:35<00:00,  1.91s/it]


In [25]:
# Inspect the message list stored on the LLM strategy after this run. This
# reads a private attribute, so it may break if LLMStrategy's internals change.
llm_agent._message_history

[SystemMessage(content="\nThree-player iterated prisoner's dilemma is a game involving three players who repeatedly choose whether to cooperate or betray each other. Here's how it works:\n\nPlayers: You, Player A, and Player B.\nActions: In each round, you have two choices: cooperate (0) or defect (1).\nRewards: The outcomes depend on everyone's choices:\n* If all three players cooperate, they all receive a reward of 6.\n* If one player cooperates while the other two defect, the defectors get a reward of 5 and the cooperator gets a payoff of 0.\n* If two players cooperate while the third defects, the two cooperators receive a reward of 3, and the defector gets a high reward of 8.\n* If all three players defect on each other, they all receive a low payoff of 2.\n\nYour goal is to maximize your return, i.e. the sum of the rewards you receive.\nI will give you an observation, reward and the return so far, formatted as:\n\nObservation: <observation>\nReward: <reward>\nReturn: <return>\n\nw

In [24]:
# Final observation of 'player_0', the seat played by players[0] (llm_agent).
# Rows appear to be one per player and columns one per round -- TODO confirm.
observations['player_0']

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0],
       [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1]])

In [26]:
# Cumulative reward per agent, summed over every step of the match loop.
returns

{'player_0': 150, 'player_1': 150, 'player_2': 400}

#### Run 3
* 50 rounds
* Defector opponents

In [28]:
# New game instance for Run 3; both round bounds are 50, matching the 50
# iterations of the match loop below.
environment = IPDGame(min_rounds=50, max_rounds=50)

In [29]:
# Order matters: players[k] acts for environment.possible_agents[k] in the
# match loop. The LLM takes the first seat against two Defectors.
players = [llm_agent, Defector(), Defector()]

In [30]:
# Reset every strategy before the match; llm_agent is reused from the previous
# run, so this presumably clears its carried-over state (verify in LLMStrategy).
for strategy in players:
    strategy.reset()

In [31]:
# Start a fresh match. No reward exists before the first step, so every
# strategy is handed None as its previous reward on the first round.
observations, infos = environment.reset()
rewards = {agent_id: None for agent_id in environment.possible_agents}
returns = {agent_id: 0 for agent_id in environment.possible_agents}

for round_idx in tqdm.tqdm(range(50)):
    # players[k] chooses the action for the k-th entry of possible_agents.
    actions = {
        agent_id: players[seat].play(observations[agent_id], rewards[agent_id])
        for seat, agent_id in enumerate(environment.possible_agents)
    }

    observations, rewards, terminations, truncations, infos = environment.step(actions)

    # Accumulate each agent's total reward (its return).
    for agent_id in environment.possible_agents:
        returns[agent_id] += rewards[agent_id]

    # Stop once every agent is reported terminated or truncated, rather than
    # stepping a finished game (and skip the pointless final pause).
    if all(
        terminations.get(agent_id, False) or truncations.get(agent_id, False)
        for agent_id in environment.possible_agents
    ):
        break

    # Pause between rounds -- presumably to stay under the OpenAI API rate
    # limit; confirm before changing.
    time.sleep(1)

100%|███████████████████████████████| 50/50 [01:32<00:00,  1.85s/it]


In [32]:
# Final observation of 'player_0', the seat played by players[0] (llm_agent).
# Rows appear to be one per player and columns one per round -- TODO confirm.
observations['player_0']

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0],
       [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1],
       [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1]])

In [33]:
# Inspect the message list stored on the LLM strategy after this run. This
# reads a private attribute, so it may break if LLMStrategy's internals change.
llm_agent._message_history

[SystemMessage(content="\nThree-player iterated prisoner's dilemma is a game involving three players who repeatedly choose whether to cooperate or betray each other. Here's how it works:\n\nPlayers: You, Player A, and Player B.\nActions: In each round, you have two choices: cooperate (0) or defect (1).\nRewards: The outcomes depend on everyone's choices:\n* If all three players cooperate, they all receive a reward of 6.\n* If one player cooperates while the other two defect, the defectors get a reward of 5 and the cooperator gets a payoff of 0.\n* If two players cooperate while the third defects, the two cooperators receive a reward of 3, and the defector gets a high reward of 8.\n* If all three players defect on each other, they all receive a low payoff of 2.\n\nYour goal is to maximize your return, i.e. the sum of the rewards you receive.\nI will give you an observation, reward and the return so far, formatted as:\n\nObservation: <observation>\nReward: <reward>\nReturn: <return>\n\nw

#### Run 4
* 50 rounds
* Defector and ToughT4T opponents

In [38]:
# New game instance for Run 4; both round bounds are 50, matching the 50
# iterations of the match loop below.
environment = IPDGame(min_rounds=50, max_rounds=50)

In [39]:
# Order matters: players[k] acts for environment.possible_agents[k] in the
# match loop. The LLM takes the first seat, then a Defector and a ToughT4T.
players = [llm_agent, Defector(), ToughT4T()]

In [40]:
# Reset every strategy before the match; llm_agent is reused from the previous
# run, so this presumably clears its carried-over state (verify in LLMStrategy).
for strategy in players:
    strategy.reset()

In [41]:
# Start a fresh match. No reward exists before the first step, so every
# strategy is handed None as its previous reward on the first round.
observations, infos = environment.reset()
rewards = {agent_id: None for agent_id in environment.possible_agents}
returns = {agent_id: 0 for agent_id in environment.possible_agents}

for round_idx in tqdm.tqdm(range(50)):
    # players[k] chooses the action for the k-th entry of possible_agents.
    actions = {
        agent_id: players[seat].play(observations[agent_id], rewards[agent_id])
        for seat, agent_id in enumerate(environment.possible_agents)
    }

    observations, rewards, terminations, truncations, infos = environment.step(actions)

    # Accumulate each agent's total reward (its return).
    for agent_id in environment.possible_agents:
        returns[agent_id] += rewards[agent_id]

    # Stop once every agent is reported terminated or truncated, rather than
    # stepping a finished game (and skip the pointless final pause).
    if all(
        terminations.get(agent_id, False) or truncations.get(agent_id, False)
        for agent_id in environment.possible_agents
    ):
        break

    # Pause between rounds -- presumably to stay under the OpenAI API rate
    # limit; confirm before changing.
    time.sleep(1)

100%|███████████████████████████████| 50/50 [01:33<00:00,  1.86s/it]


In [42]:
# Final observation of 'player_0', the seat played by players[0] (llm_agent).
# Rows appear to be one per player and columns one per round -- TODO confirm.
observations['player_0']

array([[0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1],
       [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1],
       [0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1]])

#### Run 5
* 50 rounds
* Grofman and DecayingT4T opponents

In [6]:
# New game instance for Run 5; both round bounds are 50, matching the 50
# iterations of the match loop below.
environment = IPDGame(min_rounds=50, max_rounds=50)

In [7]:
# Order matters: players[k] acts for environment.possible_agents[k] in the
# match loop. The LLM takes the first seat, then a Grofman and a DecayingT4T.
players = [llm_agent, Grofman(), DecayingT4T()]

In [8]:
# Reset every strategy before the match starts; what reset() clears is defined
# by each strategy class (not visible in this notebook).
for strategy in players:
    strategy.reset()

In [9]:
# Start a fresh match. No reward exists before the first step, so every
# strategy is handed None as its previous reward on the first round.
observations, infos = environment.reset()
rewards = {agent_id: None for agent_id in environment.possible_agents}
returns = {agent_id: 0 for agent_id in environment.possible_agents}

for round_idx in tqdm.tqdm(range(50)):
    # players[k] chooses the action for the k-th entry of possible_agents.
    actions = {
        agent_id: players[seat].play(observations[agent_id], rewards[agent_id])
        for seat, agent_id in enumerate(environment.possible_agents)
    }

    observations, rewards, terminations, truncations, infos = environment.step(actions)

    # Accumulate each agent's total reward (its return).
    for agent_id in environment.possible_agents:
        returns[agent_id] += rewards[agent_id]

    # Stop once every agent is reported terminated or truncated, rather than
    # stepping a finished game (and skip the pointless final pause).
    if all(
        terminations.get(agent_id, False) or truncations.get(agent_id, False)
        for agent_id in environment.possible_agents
    ):
        break

    # Pause between rounds -- presumably to stay under the OpenAI API rate
    # limit; confirm before changing.
    time.sleep(1)

100%|██████████████████████████████████████████████| 50/50 [01:33<00:00,  1.88s/it]


In [11]:
# Final observation of 'player_0', the seat played by players[0] (llm_agent).
# Rows appear to be one per player and columns one per round -- TODO confirm.
observations['player_0']

array([[0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1,
        0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1,
        0, 1, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1,
        0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1,
        0, 1, 1, 1, 1, 1],
       [0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1,
        1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 0, 1, 1, 1, 1]])

In [12]:
# Cumulative reward per agent, summed over every step of the match loop.
returns

{'player_0': 145, 'player_1': 205, 'player_2': 230}