# Brick 1: Constants and Setup
Here we define the rules of the game.
- **0** means Cooperate.
- **1** means Defect.
- We also define the **Payoff Matrix** (the points we get).

In [None]:
!pip install gymnasium==1.1.1 moviepy==1.0.3

In [1]:
import gymnasium as gym
import numpy as np
import random

# Action codes. Named constants keep bare 0/1 out of the rest of the notebook.
COOPERATE, DEFECT = 0, 1

# Points awarded to the agent for one round.
# Key:   (agent_action, opponent_action)
# Value: the agent's reward for that outcome
PAYOFF_MATRIX = {
    (COOPERATE, COOPERATE): 3,  # both cooperate      -> "Reward"
    (COOPERATE, DEFECT): 0,     # agent is exploited  -> "Sucker"
    (DEFECT, COOPERATE): 5,     # agent exploits      -> "Temptation"
    (DEFECT, DEFECT): 1,        # both defect         -> "Punishment"
}

print("Setup Complete! Actions and Rewards defined.")

Setup Complete! Actions and Rewards defined.


# Brick 2: Opponent Strategies
This function decides the opponent's move based on the history of the game.
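- **ALL_C**: Always cooperates.
- **ALL_D**: Always defects.
- **TFT (Tit-for-Tat)**: Cooperates first, then copies the agent's previous move.
- **IMPERFECT_TFT (Imperfect Tit-for-Tat)**: Plays like TFT 90% of the time; the other 10% of the time it slips and plays the opposite move.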

In [2]:
def get_opponent_action(strategy, history, error_rate=0.10, rng=None):
    """
    Decide the opponent's move for the current round.

    Parameters
    ----------
    strategy : str
        One of "ALL_C", "ALL_D", "TFT" or "IMPERFECT_TFT".
    history : list of (agent_move, opponent_move) tuples
        Past rounds, oldest first. Only the agent's move in the most recent
        round (``history[-1][0]``) is read, and only by the TFT variants.
    error_rate : float, optional
        Probability that "IMPERFECT_TFT" plays the opposite of what plain
        Tit-for-Tat would play. Defaults to 0.10. Ignored by other strategies.
    rng : object with a ``random()`` method, optional
        Source of uniform [0, 1) draws for "IMPERFECT_TFT" (for example a
        ``random.Random`` or a ``numpy.random.Generator``). When omitted, the
        global ``random`` module is used.

    Returns
    -------
    int
        COOPERATE or DEFECT. An unrecognised strategy falls back to COOPERATE
        and emits a ``UserWarning`` so that a misspelled name is noticed.
    """

    # Unconditional strategies ignore the history entirely.
    if strategy == "ALL_C":
        return COOPERATE

    if strategy == "ALL_D":
        return DEFECT

    # Both Tit-for-Tat variants share the same "intended" move.
    if strategy in ("TFT", "IMPERFECT_TFT"):
        if len(history) == 0:
            return COOPERATE  # Tit-for-Tat opens by cooperating

        # Each round is stored as (agent_move, opponent_move); copy the agent.
        intended_action = history[-1][0]

        if strategy == "IMPERFECT_TFT":
            # Exactly one uniform draw per call decides whether the move slips.
            draw = rng.random() if rng is not None else random.random()
            if draw < error_rate:
                # Flip the move: 0 becomes 1, 1 becomes 0.
                return 1 - intended_action

        return intended_action

    # Unknown strategy: keep the cooperative fallback, but do not stay silent,
    # otherwise a typo such as "tft" would quietly run an ALL_C experiment.
    import warnings

    warnings.warn(
        f"Unknown opponent strategy {strategy!r}; falling back to COOPERATE.",
        stacklevel=2,
    )
    return COOPERATE

# Brick 3: The Game Environment (The Class)
This class puts everything together.
- **__init__**: Sets up the game options (Opponent type, Memory length).
- **step**: Plays one round (Agent moves -> Opponent moves -> Calculate Score -> Update History).
- **reset**: Clears the history and pre-fills it with `memory_length` rounds of mutual cooperation, so every new game starts from the same state.

In [3]:
class PrisonerDilemmaEnv(gym.Env):
    """
    Iterated Prisoner's Dilemma against a fixed-strategy opponent.

    Observation: a tuple holding the last `memory_length` rounds, oldest
    first, each round being (agent_action, opponent_action).
    Example for memory 1: ((0, 0),)

    Action: 0 (COOPERATE) or 1 (DEFECT).

    Reward: the agent's entry in PAYOFF_MATRIX for the round just played.

    Episodes never terminate on their own; the caller decides how many
    steps to play.

    Note: `reset(seed=...)` is forwarded to `gym.Env.reset`, but the opponent's
    random slips ("IMPERFECT_TFT") are drawn by `get_opponent_action` from the
    global `random` module. Call `random.seed(...)` as well if those slips
    must be reproducible.
    """

    def __init__(self, opponent_strategy, memory_length=1):
        """
        opponent_strategy: strategy name understood by get_opponent_action.
        memory_length: number of past rounds exposed in the state (>= 0).

        Raises ValueError if memory_length is negative.
        """
        super().__init__()
        if memory_length < 0:
            raise ValueError(f"memory_length must be >= 0, got {memory_length}")

        self.opponent_strategy = opponent_strategy
        self.memory_length = memory_length

        # Define Action Space (0 or 1)
        self.action_space = gym.spaces.Discrete(2)

        # Define Observation Space: `memory_length` rounds, each a pair of
        # binary actions. Mirrors the nested tuple built by _get_state().
        self.observation_space = gym.spaces.Tuple(
            tuple(
                gym.spaces.Tuple((gym.spaces.Discrete(2), gym.spaces.Discrete(2)))
                for _ in range(memory_length)
            )
        )

        # Rounds visible to the agent (at most `memory_length` entries)
        self.history = []

        # Most recent round, tracked separately from `history` so the opponent
        # still sees the agent's last move when memory_length is 0.
        self._last_round = (COOPERATE, COOPERATE)

    def reset(self, seed=None, options=None):
        """
        Resets the game to the starting state.

        The assignment says to assume everyone cooperated before the game
        started, so the history is pre-filled with `memory_length` rounds of
        (COOPERATE, COOPERATE).

        Returns (state, info) where info is an empty dict.
        """
        super().reset(seed=seed)

        # If memory is 1, we pretend the last round was (C, C)
        # If memory is 2, we pretend the last two rounds were (C, C), (C, C)
        self.history = [(COOPERATE, COOPERATE) for _ in range(self.memory_length)]
        self._last_round = (COOPERATE, COOPERATE)

        return self._get_state(), {}

    def step(self, action):
        """
        Plays one round of the game.

        action: the agent's move (COOPERATE or DEFECT).

        Returns (state, reward, terminated, truncated, info). `terminated` and
        `truncated` are always False and `info` is always an empty dict.
        Raises KeyError if `action` is not a valid move.
        """
        # 1. Opponent decides from the rounds played so far. When the agent
        #    keeps no memory, `history` is empty, so hand over the last round.
        opponent_view = self.history if self.history else [self._last_round]
        opp_action = get_opponent_action(self.opponent_strategy, opponent_view)

        # 2. Calculate Reward (an invalid action fails here with KeyError)
        reward = PAYOFF_MATRIX[(action, opp_action)]

        # 3. Store plain Python ints so states look the same whether the agent
        #    passed an int or a NumPy integer (e.g. from np.argmax).
        self._last_round = (int(action), int(opp_action))

        # 4. Update History: add the new round, drop the oldest beyond memory
        self.history.append(self._last_round)
        if len(self.history) > self.memory_length:
            self.history.pop(0)

        # 5. Get the new State
        state = self._get_state()

        # The game repeats indefinitely (no "Game Over"); the experiment loop
        # decides how many rounds to play.
        terminated = False
        truncated = False

        return state, reward, terminated, truncated, {}

    def _get_state(self):
        """
        Helper to format the current history as a Tuple (hashable and easy to read).
        Example Memory-1: ((0, 0),)
        """
        return tuple(self.history)

print("Environment Class Defined Successfully!")

Environment Class Defined Successfully!


# Brick 4: Sanity Check
Let's play 5 rounds against a **Tit-for-Tat** opponent to see if it works.
We will play: C, D, D, C, C.
Since the opponent is TFT, they should copy our move from the *previous* turn.

In [4]:
# 1. Create the game environment
env = PrisonerDilemmaEnv(opponent_strategy="TFT", memory_length=1)

# 2. Reset to start
state, _ = env.reset()
print(f"Start State: {state}")

# 3. Play a few manual moves
my_moves = [COOPERATE, DEFECT, DEFECT, COOPERATE, COOPERATE]

# Tit-for-Tat echoes our previous move; the pre-game round counts as Cooperate.
expected_opp_moves = [COOPERATE] + my_moves[:-1]

for move, expected_opp in zip(my_moves, expected_opp_moves):
    # Take step
    state, reward, done, _, _ = env.step(move)

    # Enforce the check instead of relying on reading the printout
    assert state == ((move, expected_opp),), f"Unexpected state: {state}"
    assert reward == PAYOFF_MATRIX[(move, expected_opp)], f"Unexpected reward: {reward}"

    # Translate numbers to words for printing
    move_name = "Cooperate" if move == COOPERATE else "Defect"

    print(f"I played: {move_name} | New State: {state} | Reward: {reward}")

Start State: ((0, 0),)
I played: Cooperate | New State: ((0, 0),) | Reward: 3
I played: Defect | New State: ((1, 0),) | Reward: 5
I played: Defect | New State: ((1, 1),) | Reward: 1
I played: Cooperate | New State: ((0, 1),) | Reward: 0
I played: Cooperate | New State: ((0, 0),) | Reward: 3
