## Convergence to Equilibria

In [1]:
import itertools
import numpy as np
from tqdm import tqdm

## Symmetric Prisoner's Dilemma

### Game Setup: A simple prisoner's dilemma

In [2]:
def get_initial_state():
    """Return the initial state of the game.

    There are no real states in this game, so the placeholder ``0`` is
    always returned.
    """
    initial_state = 0  # single placeholder state
    return initial_state

In [3]:
# Payoff table indexed as reward_matrix[action of agent 0, action of agent 1].
reward_matrix = np.array([
    [0, -1],
    [-1, 1],
])


def reward(state, actions):
    """Look up the payoff of a joint action in ``reward_matrix``.

    ``state`` is accepted but not used. ``actions`` must unpack into exactly
    two action indices (first agent, second agent).
    """
    first_action, second_action = actions
    return reward_matrix[first_action, second_action]


# Both agents are given the same reward function.
reward_funcs = [reward] * 2

In [4]:
def transition_state(x, us):
    """Return the next state given current state ``x`` and joint action ``us``.

    The state is passed through unchanged; ``us`` is ignored.
    """
    next_state = x
    return next_state

In [5]:
# Exponent used by calc_alpha (0.51 was noted as an alternative value).
r = 0.751


def calc_alpha(n):
    """Return ``1 / n**r`` for the given ``n``, using the module-level ``r``."""
    decayed = n ** r
    return 1 / decayed

In [6]:
# --- Game dimensions ---
n_states = 1  # number of states
n_agents = 2  # number of agents

# number of actions per agent (both agents have two actions)
n_Us = [2] * n_agents

# --- Learning parameters ---
# probability of experimentation at each action (\rho)
experimentation_probs = [0.1] * n_agents
# inertias of each agent (\lambda)
inertias = [0.25, 0.75]
# discount factor
betas = [0.9] * n_agents
# (in paper it's 0) [this is critical; does T depend on this?]
deltas = [0] * n_agents

# --- Run length ---
n_exploration_phases = 100  # number of exploration phases
T = 1000  # length of exploration phase

### Run Simulations

In [7]:
from multi_agent_learning import q_learning_alg1

In [8]:
# Single run of q_learning_alg1 on the game configured above, with early
# stopping enabled. The fourth returned element unpacks into three histories.
K, Qs, agent_policies, (policy_history, Qs_history, is_BR_history) = q_learning_alg1(
    n_Us, n_states, reward_funcs, betas,
    get_initial_state, transition_state,
    n_exploration_phases, T, experimentation_probs,
    calc_alpha, deltas, inertias,
    early_stopping=True,
)

  2%|█▋                                                                                | 2/100 [00:00<00:12,  7.60it/s]


In [9]:
# Display the Qs value returned by the most recent q_learning_alg1 call.
Qs

[array([[-0.24883636, -1.19108541]]), array([[-0.34648114, -1.24413879]])]

In [10]:
n_actions = 2

# Every way of assigning one action index to each state (one agent's policies).
agent_policy_space = [
    policy for policy in itertools.product(range(n_actions), repeat=n_states)
]

# Every combination of one such policy per agent.
joint_policy_space = [
    joint for joint in itertools.product(agent_policy_space, repeat=n_agents)
]

In [11]:
# Per-joint-policy observation lists, each starting empty:
#   end_data[p]        collects final joint policies of runs whose first recorded policy is p
#   transition_data[p] collects the joint policies recorded right after p
end_data = {policy: list() for policy in joint_policy_space}
transition_data = {policy: list() for policy in joint_policy_space}

In [12]:
# Run many independent trials and record, for each one, the first recorded
# joint policy, the final joint policy, and every consecutive pair of
# recorded joint policies.
from tqdm import tqdm
from functools import partialmethod

n_trials = 1000

# Disable inner tqdm bars by making `disable=True` the default; the outer bar
# below opts back in with an explicit disable=False.
# The original __init__ is saved and restored in `finally` so the patch does
# not leak into later cells and does not stack up when this cell is re-run.
_tqdm_init = tqdm.__init__
tqdm.__init__ = partialmethod(_tqdm_init, disable=True)
try:
    for _ in tqdm(range(n_trials), disable=False):
        K, Qs, agent_policies, (policy_history, Qs_history, is_BR_history) = q_learning_alg1(
            n_Us, n_states, reward_funcs, betas,
            get_initial_state, transition_state,
            n_exploration_phases, T, experimentation_probs,
            calc_alpha, deltas, inertias,
            early_stopping=True,
        )

        # Group the final joint policy by the first recorded joint policy.
        init_policy = tuple(policy_history[0])
        end_data[init_policy].append(tuple(agent_policies))

        # Record every (policy, next policy) pair along the run.
        for p1, p2 in zip(policy_history[:-1], policy_history[1:]):
            transition_data[tuple(p1)].append(tuple(p2))
finally:
    tqdm.__init__ = _tqdm_init

100%|██████████████████████████████████████████████████████████████████████████████| 1000/1000 [02:49<00:00,  5.90it/s]


In [13]:
# Empirical probability of ending at each joint policy, given the initial one.
markov_end_prob = {p1: {} for p1 in joint_policy_space}

for p1 in joint_policy_space:
    arr = np.array(end_data[p1])
    n_obs = len(arr)
    # One row per trial with all (agent, state) entries flattened, so a trial
    # counts as a match only when its entire joint policy equals p2. Reducing
    # over axis 1 of the unflattened array would cover the agent axis only.
    flat = arr.reshape(n_obs, -1) if n_obs > 0 else None
    print('initial_policy = ', p1)
    for p2 in joint_policy_space:
        if flat is None:
            # No trial started at p1, so the probability is undefined. Without
            # this guard the comparison broadcasts against an empty array and
            # silently reports 1.0 for every p2.
            P12 = float('nan')
        else:
            P12 = np.average(np.all(flat == np.ravel(p2), axis=1))
        markov_end_prob[p1][p2] = P12
        print(f'prob ending at {p2} = {P12}')
    print()

initial_policy =  ((0,), (0,))
prob ending at ((0,), (0,)) = 1.0
prob ending at ((0,), (1,)) = 0.0
prob ending at ((1,), (0,)) = 0.0
prob ending at ((1,), (1,)) = 0.0

initial_policy =  ((0,), (1,))
prob ending at ((0,), (0,)) = 0.26582278481012656
prob ending at ((0,), (1,)) = 0.0
prob ending at ((1,), (0,)) = 0.0
prob ending at ((1,), (1,)) = 0.7341772151898734

initial_policy =  ((1,), (0,))
prob ending at ((0,), (0,)) = 0.7529880478087649
prob ending at ((0,), (1,)) = 0.0
prob ending at ((1,), (0,)) = 0.0
prob ending at ((1,), (1,)) = 0.24701195219123506

initial_policy =  ((1,), (1,))
prob ending at ((0,), (0,)) = 0.0
prob ending at ((0,), (1,)) = 0.0
prob ending at ((1,), (0,)) = 0.0
prob ending at ((1,), (1,)) = 1.0



In [14]:
# Empirical one-step transition probabilities between joint policies.
markov_transition_prob = {p1: {} for p1 in joint_policy_space}

for p1 in joint_policy_space:
    arr = np.array(transition_data[p1])
    print('initial_policy = ', p1)
    if len(arr) > 0:
        # One row per observed transition with all (agent, state) entries
        # flattened, so a row matches only when the entire joint policy equals
        # p2. Reducing over axis 1 of the unflattened array would cover the
        # agent axis only.
        flat = arr.reshape(len(arr), -1)
        for p2 in joint_policy_space:
            P12 = np.average(np.all(flat == np.ravel(p2), axis=1))
            markov_transition_prob[p1][p2] = P12
            print(f'prob transitioning to {p2} = {P12}')
    else:
        # No transition out of p1 was recorded: fall back to a self-loop row.
        for p2 in joint_policy_space:
            P12 = 1 if p1==p2 else 0
            markov_transition_prob[p1][p2] = P12
            print(f'prob transitioning to {p2} = {P12}')

    print()


initial_policy =  ((0,), (0,))
prob transitioning to ((0,), (0,)) = 1
prob transitioning to ((0,), (1,)) = 0
prob transitioning to ((1,), (0,)) = 0
prob transitioning to ((1,), (1,)) = 0

initial_policy =  ((0,), (1,))
prob transitioning to ((0,), (0,)) = 0.06896551724137931
prob transitioning to ((0,), (1,)) = 0.17771883289124668
prob transitioning to ((1,), (0,)) = 0.19363395225464192
prob transitioning to ((1,), (1,)) = 0.5596816976127321

initial_policy =  ((1,), (0,))
prob transitioning to ((0,), (0,)) = 0.5794871794871795
prob transitioning to ((0,), (1,)) = 0.18717948717948718
prob transitioning to ((1,), (0,)) = 0.16923076923076924
prob transitioning to ((1,), (1,)) = 0.0641025641025641

initial_policy =  ((1,), (1,))
prob transitioning to ((0,), (0,)) = 0
prob transitioning to ((0,), (1,)) = 0
prob transitioning to ((1,), (0,)) = 0
prob transitioning to ((1,), (1,)) = 1



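Starting at an equilibrium ⇒ the agents remain at that equilibrium. \
Starting at a non-equilibrium ⇒ the agents end up at one of the two equilibria; with equal inertias (e.g. 0.5 each) both are reached with equal probability \
(as expected: the BR graph is symmetric w.r.t. the team-optimal equilibrium and the Nash equilibrium). \
Note: with the inertias configured above (0.25 and 0.75), the recorded runs split roughly 0.27 / 0.73 rather than evenly.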


In [118]:
# Theoretical transition matrix from the BR graph (inertia=0.5).
# Rows/columns are presumably ordered like joint_policy_space — TODO confirm.
# NOTE(review): the configuration cell sets inertias = [0.25, 0.75], so this
# matrix is a reference for the equal-inertia case, not for the recorded runs.
markov_matrix = np.array([
    [1.00, 0.00, 0.00, 0.00],
    [0.25, 0.25, 0.25, 0.25],
    [0.25, 0.25, 0.25, 0.25],
    [0.00, 0.00, 0.00, 1.00],
])
markov_matrix

array([[1.  , 0.  , 0.  , 0.  ],
       [0.25, 0.25, 0.25, 0.25],
       [0.25, 0.25, 0.25, 0.25],
       [0.  , 0.  , 0.  , 1.  ]])

In [15]:
# Empirical transition matrix: one row per source joint policy and one column
# per destination joint policy, both in joint_policy_space order.
np.array([
    [markov_transition_prob[src][dst] for dst in joint_policy_space]
    for src in joint_policy_space
])

array([[1.        , 0.        , 0.        , 0.        ],
       [0.06896552, 0.17771883, 0.19363395, 0.5596817 ],
       [0.57948718, 0.18717949, 0.16923077, 0.06410256],
       [0.        , 0.        , 0.        , 1.        ]])

In [137]:
# theoretical end probabilities: the 100th power of the theoretical transition
# matrix, used as an approximation of its limit
np.linalg.matrix_power(markov_matrix, 100) # ~ lim T^N (N-> oo)

array([[1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [5.00000000e-01, 3.94430453e-31, 3.94430453e-31, 5.00000000e-01],
       [5.00000000e-01, 3.94430453e-31, 3.94430453e-31, 5.00000000e-01],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.00000000e+00]])

In [16]:
# Empirical end probabilities: one row per initial joint policy and one column
# per final joint policy, both in joint_policy_space order.
np.array([
    [markov_end_prob[start][final] for final in joint_policy_space]
    for start in joint_policy_space
])

array([[1.        , 0.        , 0.        , 0.        ],
       [0.26582278, 0.        , 0.        , 0.73417722],
       [0.75298805, 0.        , 0.        , 0.24701195],
       [0.        , 0.        , 0.        , 1.        ]])

## Problem 2: Asymmetric Prisoner's Dilemma

### Game setup

In [187]:
# Only the reward definitions change for this game; everything else is reused.

# Payoff table indexed as reward_matrix[action of agent 0, action of agent 1].
reward_matrix = np.array([
    [0, 0],
    [-1, 1],
])


def reward(state, actions):
    """Look up the payoff of a joint action in ``reward_matrix``.

    ``state`` is accepted but not used. ``actions`` must unpack into exactly
    two action indices (first agent, second agent).
    """
    first_action, second_action = actions
    return reward_matrix[first_action, second_action]


# Both agents are given the same reward function.
reward_funcs = [reward] * 2

In [215]:
# Delta also needs to be strictly positive for this game; otherwise the agents
# will transition even when they are at an equilibrium.
# Open question carried over from the first configuration: this is critical —
# does T depend on delta?
deltas = [0.05]*n_agents # (in paper it's 0)

### Run Simulations

In [38]:
n_actions = 2

# Every way of assigning one action index to each state (one agent's policies).
agent_policy_space = [
    policy for policy in itertools.product(range(n_actions), repeat=n_states)
]

# Every combination of one such policy per agent.
joint_policy_space = [
    joint for joint in itertools.product(agent_policy_space, repeat=n_agents)
]

# Fresh, empty observation lists for this game, keyed by joint policy
# (end policies and recorded transitions).
end_data = {policy: list() for policy in joint_policy_space}
transition_data = {policy: list() for policy in joint_policy_space}

In [217]:
# Run many independent trials and record, for each one, the first recorded
# joint policy, the final joint policy, and every consecutive pair of
# recorded joint policies.
from tqdm import tqdm
from functools import partialmethod

n_trials = 1000

# Disable inner tqdm bars by making `disable=True` the default; the outer bar
# below opts back in with an explicit disable=False.
# The original __init__ is saved and restored in `finally` so the patch does
# not leak into later cells and does not stack up when this cell is re-run.
_tqdm_init = tqdm.__init__
tqdm.__init__ = partialmethod(_tqdm_init, disable=True)
try:
    for _ in tqdm(range(n_trials), disable=False):
        K, Qs, agent_policies, (policy_history, Qs_history, is_BR_history) = q_learning_alg1(
            n_Us, n_states, reward_funcs, betas,
            get_initial_state, transition_state,
            n_exploration_phases, T, experimentation_probs,
            calc_alpha, deltas, inertias,
            early_stopping=True,
        )

        # Group the final joint policy by the first recorded joint policy.
        init_policy = tuple(policy_history[0])
        end_data[init_policy].append(tuple(agent_policies))

        # Record every (policy, next policy) pair along the run.
        for p1, p2 in zip(policy_history[:-1], policy_history[1:]):
            transition_data[tuple(p1)].append(tuple(p2))
finally:
    tqdm.__init__ = _tqdm_init

100%|████████████████████████████████████████████████████████████████████| 1000/1000 [03:41<00:00,  4.51it/s]


In [218]:
# calculate frequency/probability of ending at each policy given a starting policy

markov_end_prob = {p1: {} for p1 in joint_policy_space}

for p1 in joint_policy_space:
    arr = np.array(end_data[p1])
    n_obs = len(arr)
    # One row per trial with all (agent, state) entries flattened, so a trial
    # counts as a match only when its entire joint policy equals p2. Reducing
    # over axis 1 of the unflattened array would cover the agent axis only.
    flat = arr.reshape(n_obs, -1) if n_obs > 0 else None
    print('initial_policy = ', p1)
    for p2 in joint_policy_space:
        if flat is None:
            # No trial started at p1, so the probability is undefined. Without
            # this guard the comparison broadcasts against an empty array and
            # silently reports 1.0 for every p2.
            P12 = float('nan')
        else:
            P12 = np.average(np.all(flat == np.ravel(p2), axis=1))
        markov_end_prob[p1][p2] = P12
        print(f'prob ending at {p2} = {P12}')
    print()

initial_policy =  ((0,), (0,))
prob ending at ((0,), (0,)) = 0.18875502008032127
prob ending at ((0,), (1,)) = 0.0
prob ending at ((1,), (0,)) = 0.0
prob ending at ((1,), (1,)) = 0.8112449799196787

initial_policy =  ((0,), (1,))
prob ending at ((0,), (0,)) = 0.0
prob ending at ((0,), (1,)) = 0.0
prob ending at ((1,), (0,)) = 0.0
prob ending at ((1,), (1,)) = 1.0

initial_policy =  ((1,), (0,))
prob ending at ((0,), (0,)) = 0.045454545454545456
prob ending at ((0,), (1,)) = 0.0
prob ending at ((1,), (0,)) = 0.0
prob ending at ((1,), (1,)) = 0.9545454545454546

initial_policy =  ((1,), (1,))
prob ending at ((0,), (0,)) = 0.0
prob ending at ((0,), (1,)) = 0.0
prob ending at ((1,), (0,)) = 0.0
prob ending at ((1,), (1,)) = 1.0



In [219]:
# calculate frequency/probability of transitioning to each policy given a starting policy


markov_transition_prob = {p1: {} for p1 in joint_policy_space}

for p1 in joint_policy_space:
    arr = np.array(transition_data[p1])
    print('initial_policy = ', p1)
    if len(arr) > 0:
        # One row per observed transition with all (agent, state) entries
        # flattened, so a row matches only when the entire joint policy equals
        # p2. Reducing over axis 1 of the unflattened array would cover the
        # agent axis only.
        flat = arr.reshape(len(arr), -1)
        for p2 in joint_policy_space:
            P12 = np.average(np.all(flat == np.ravel(p2), axis=1))
            markov_transition_prob[p1][p2] = P12
            print(f'prob transitioning to {p2} = {P12}')
    else:
        # No transition out of p1 was recorded: fall back to a self-loop row.
        for p2 in joint_policy_space:
            P12 = 1 if p1==p2 else 0
            markov_transition_prob[p1][p2] = P12
            print(f'prob transitioning to {p2} = {P12}')

    print()


initial_policy =  ((0,), (0,))
prob transitioning to ((0,), (0,)) = 0.4847328244274809
prob transitioning to ((0,), (1,)) = 0.5152671755725191
prob transitioning to ((1,), (0,)) = 0.0
prob transitioning to ((1,), (1,)) = 0.0

initial_policy =  ((0,), (1,))
prob transitioning to ((0,), (0,)) = 0.0
prob transitioning to ((0,), (1,)) = 0.46562786434463793
prob transitioning to ((1,), (0,)) = 0.0
prob transitioning to ((1,), (1,)) = 0.534372135655362

initial_policy =  ((1,), (0,))
prob transitioning to ((0,), (0,)) = 0.215633423180593
prob transitioning to ((0,), (1,)) = 0.23450134770889489
prob transitioning to ((1,), (0,)) = 0.2884097035040431
prob transitioning to ((1,), (1,)) = 0.261455525606469

initial_policy =  ((1,), (1,))
prob transitioning to ((0,), (0,)) = 0
prob transitioning to ((0,), (1,)) = 0
prob transitioning to ((1,), (0,)) = 0
prob transitioning to ((1,), (1,)) = 1



In [4]:
# Theoretical transition matrix from the BR graph (inertia=0.5).
# Rows/columns are presumably ordered like joint_policy_space — TODO confirm.
markov_matrix = np.array([
    [1.00, 0.00, 0.00, 0.00],
    [0.00, 0.50, 0.00, 0.50],
    [0.25, 0.25, 0.25, 0.25],
    [0.00, 0.00, 0.00, 1.00],
])
markov_matrix

array([[1.  , 0.  , 0.  , 0.  ],
       [0.  , 0.5 , 0.  , 0.5 ],
       [0.25, 0.25, 0.25, 0.25],
       [0.  , 0.  , 0.  , 1.  ]])

In [221]:
# Empirical transition matrix: one row per source joint policy and one column
# per destination joint policy, both in joint_policy_space order.
np.array([
    [markov_transition_prob[src][dst] for dst in joint_policy_space]
    for src in joint_policy_space
])

array([[0.48473282, 0.51526718, 0.        , 0.        ],
       [0.        , 0.46562786, 0.        , 0.53437214],
       [0.21563342, 0.23450135, 0.2884097 , 0.26145553],
       [0.        , 0.        , 0.        , 1.        ]])

In [222]:
# theoretical end probabilities: the 100th power of the theoretical transition
# matrix, used as an approximation of its limit
np.linalg.matrix_power(markov_matrix, 100) # ~ lim T^N (N-> oo)

array([[1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 7.88860905e-31, 0.00000000e+00, 1.00000000e+00],
       [3.33333333e-01, 7.88860905e-31, 6.22301528e-61, 6.66666667e-01],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.00000000e+00]])

In [223]:
# Empirical end probabilities: one row per initial joint policy and one column
# per final joint policy, both in joint_policy_space order.
np.array([
    [markov_end_prob[start][final] for final in joint_policy_space]
    for start in joint_policy_space
])

array([[0.18875502, 0.        , 0.        , 0.81124498],
       [0.        , 0.        , 0.        , 1.        ],
       [0.04545455, 0.        , 0.        , 0.95454545],
       [0.        , 0.        , 0.        , 1.        ]])

In [1]:
import sympy

In [15]:
# Wrap the theoretical transition matrix in a sympy Matrix and display it.
markov_matrix_sym = sympy.Matrix(markov_matrix)
markov_matrix_sym

Matrix([
[ 1.0,  0.0,  0.0,  0.0],
[ 0.0,  0.5,  0.0,  0.5],
[0.25, 0.25, 0.25, 0.25],
[ 0.0,  0.0,  0.0,  1.0]])

In [32]:
# 100th power of the transition matrix, evaluated to 5 significant digits.
(markov_matrix_sym**100).evalf(5)

Matrix([
[    1.0,          0,         0,       0],
[      0, 7.8886e-31,         0,     1.0],
[0.33333, 7.8886e-31, 6.223e-61, 0.66667],
[      0,          0,         0,     1.0]])

In [None]:
# 100th power of the transition matrix, evaluated numerically at sympy's
# default precision. `evalf` must be called; a bare `.evalf` only displays the
# bound method.
(markov_matrix_sym**100).evalf()

In [12]:
# Symbolic row vector (p0, p1, p2, p3): the symbols are stacked into a column
# Matrix and then transposed.
invar_meas = sympy.Matrix(sympy.symbols('p0:4')).T
invar_meas

Matrix([[p0, p1, p2, p3]])

In [17]:
# Solve p @ P - p = 0 for the symbols, i.e. find the row vectors p that the
# transition matrix leaves unchanged.
sympy.solve(invar_meas@markov_matrix_sym - invar_meas)

{p1: 0.0, p2: 0.0}

In [19]:
# Display the residual p @ P - p that the solve call above sets to zero.
invar_meas@markov_matrix_sym - invar_meas

Matrix([[0.25*p2, -0.5*p1 + 0.25*p2, -0.75*p2, 0.5*p1 + 0.25*p2]])

In [39]:
# Display the joint policies in the order used for the matrix rows/columns
# built from joint_policy_space.
joint_policy_space

[((0,), (0,)), ((0,), (1,)), ((1,), (0,)), ((1,), (1,))]

In [40]:
# Display the policies available to a single agent.
agent_policy_space

[(0,), (1,)]