In [None]:
import numpy as np
from rpd_env import RPDEnv, COOPERATE, DEFECT
from policy_iteration import policy_iteration
import matplotlib.pyplot as plt

In [None]:
print("--- 1. Discount Factor Analysis ---")
# Opponent strategy names; this list is also iterated by the memory-depth cell.
strategies = ['ALL-C', 'ALL-D', 'TFT', 'Imperfect-TFT']
gammas = np.linspace(0.1, 0.99, 20)

# Rollout budget used only to produce the plotted reward curve.
n_episodes = 10
n_steps = 50

# Check when cooperation becomes optimal against TFT (Depth 1)
env = RPDEnv(opponent_strategy='TFT', memory_depth=1)
P, R = env.get_mdp()

# First gamma in the sweep whose optimal policy cooperates in every state;
# stays None if no tested gamma produces such a policy.
coop_gamma = None
avg_rewards = []

for gamma in gammas:
    policy, _ = policy_iteration(P, R, gamma=gamma, states=env.states)

    # Simulation for plotting: follow the computed policy and accumulate the
    # raw (undiscounted) reward over n_episodes rollouts of n_steps each.
    total_reward = 0
    for _episode in range(n_episodes):
        state_idx, _ = env.reset()
        for _step in range(n_steps):
            action = policy[state_idx]
            # The last three values returned by step() are ignored, so the
            # rollout always runs for the full n_steps.
            next_state_idx, reward, _, _, _ = env.step(action)
            total_reward += reward
            state_idx = next_state_idx
    avg_rewards.append(total_reward / n_episodes)

    # Record only the first gamma at which every state's action is COOPERATE.
    # np.asarray keeps the comparison elementwise even if policy_iteration
    # returns a plain sequence rather than an ndarray (return type is not
    # visible in this notebook).
    if coop_gamma is None and np.all(np.asarray(policy) == COOPERATE):
        coop_gamma = gamma

# Formatting None with ':.2f' raises TypeError, so report the "never" case
# explicitly instead of crashing the cell.
if coop_gamma is None:
    print(
        "Cooperation was not optimal against TFT for any tested Gamma "
        f"in [{gammas[0]:.2f}, {gammas[-1]:.2f}]"
    )
else:
    print(f"Cooperation becomes optimal against TFT at Gamma >= {coop_gamma:.2f}")

# Plot
plt.figure(figsize=(10, 6))
plt.plot(gammas, avg_rewards, marker='o')
plt.title('Average Reward vs Discount Factor (against TFT)')
plt.xlabel('Discount Factor (Gamma)')
plt.ylabel(f'Average Cumulative Reward ({n_steps} steps)')
plt.grid(True)
plt.show()

In [None]:
print("\n--- 2. Memory Depth Analysis ---")
gamma = 0.9
strategies_list, mem1_rewards, mem2_rewards = [], [], []

# `strategies` is the opponent list defined in the discount-factor cell above.
for strategy in strategies:
    # Average simulated reward of the computed policy, keyed by memory depth.
    rewards = {}
    for depth in (1, 2):
        env = RPDEnv(opponent_strategy=strategy, memory_depth=depth)
        P, R = env.get_mdp()
        policy, _ = policy_iteration(P, R, gamma=gamma, states=env.states)

        # Simulation: 50 rollouts of 50 steps each, summing the raw rewards.
        total_reward = 0
        for _episode in range(50):
            state_idx, _ = env.reset()
            for _step in range(50):
                # Act from the current state, then move straight to the next one.
                state_idx, reward, _, _, _ = env.step(policy[state_idx])
                total_reward += reward
        rewards[depth] = total_reward / 50

    strategies_list.append(strategy)
    mem1_rewards.append(rewards[1])
    mem2_rewards.append(rewards[2])

    print(f"Opponent: {strategy}")
    print(f"  Memory-1: {rewards[1]:.2f}")
    print(f"  Memory-2: {rewards[2]:.2f}")

# Plot: grouped bars, one pair (memory-1 / memory-2) per opponent strategy.
x = np.arange(len(strategies_list))
width = 0.35

fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(x - width/2, mem1_rewards, width, label='Memory-1')
ax.bar(x + width/2, mem2_rewards, width, label='Memory-2')

ax.set_xlabel('Opponent Strategy')
ax.set_ylabel('Average Cumulative Reward')
ax.set_title('Memory Depth Comparison')
ax.set_xticks(x)
ax.set_xticklabels(strategies_list)
ax.legend()
ax.grid(axis='y')
plt.show()

In [None]:
print("\n--- 3. Noise Analysis ---")
gamma = 0.9

# Solve the depth-1 MDP for each of the two opponents and print the resulting
# action for every state so the two policies can be compared side by side.
for strategy in ('TFT', 'Imperfect-TFT'):
    env = RPDEnv(opponent_strategy=strategy, memory_depth=1)
    P, R = env.get_mdp()
    policy, _ = policy_iteration(P, R, gamma=gamma, states=env.states)

    print(f"Strategy: {strategy}")
    print("Optimal Policy:")
    for s_idx, action in enumerate(policy):
        # Any action other than COOPERATE is displayed as "D".
        if action == COOPERATE:
            action_str = "C"
        else:
            action_str = "D"
        # Map the policy index back to the environment's state representation.
        state = env.idx_to_state[s_idx]
        print(f"  State {state}: {action_str}")