## Mean-field Simulations

In [1]:
import itertools
import numpy as np
from tqdm import tqdm

### Constant Parameters

In [2]:
n_states = 2 # number of states
n_actions_const = 2 # all agents have the same action spaces

In [3]:
def get_initial_state():
    # initial state distribution
    return np.random.choice(range(n_states)) # uniform over states for now 

In [4]:
def transition_state(x, us):
    '''returns next state given current state and action'''
    
    # switches state if majority of agents "affirm"
    mean_u = np.average(us)
    if mean_u > 0.5:
        return (x+1)%2
    else:
        return x

In [5]:
r = 1
def calc_alpha(n):
    return 1/(n**r)

In [6]:
def mf_reward(x, u_i, mean_u):
    return float(x==1) # state 1 is desirable

def gen_reward_i(i):
    def reward_i(state, actions):
        ui = actions[i]
        mean_u = np.average(actions)
        return mf_reward(state, ui, mean_u)
    
    return reward_i

In [7]:
# # temp problem: this one didn't converge to correct sln... (is it weakly acyclic)
# def tmp_reward(x, us):
#     return float(x==1)


# def tmp_transition_state(x, us):
#     # switch to other state if u0 = 0, u1 = 1
#     if us[0]==0 and us[1]==1:
#         return (x+1)%2
#     else: return x

### N-Agent Simulations

In [8]:
def is_solution(agent_policies):
    state_0_correct = np.average(np.array(agent_policies)[:,0]) > 0.5
    state_1_correct = np.average(np.array(agent_policies)[:, 1]) <= 0.5
    return state_0_correct and state_1_correct

In [9]:
n_exploration_phases = 100 # number of exploration phases
T = 1000 # length of exploration phase

In [10]:
# works (i.e.: finds optimal set of policies) for 1, 2, 3, 4, 5 agents 
# optimal policy not found for 6 or 7 within K=100; T=1000
n_agents = 5 # number of agents

n_Us = [n_actions_const] * n_agents # number of actions per agent (both agents have two actions)

experimentation_probs = [0.01]*n_agents # probability of experimentation at each action (\rho)
inertias = [0.5] * n_agents # inertias of each agent (\lambda)
betas = [0.9]*n_agents # discount factor
deltas = [1e-6]*n_agents # (in paper it's 0) [this is critical; does T depend on this?]

In [11]:
reward_funcs = [gen_reward_i(i) for i in range(n_agents)]
# reward_funcs = [tmp_reward]*n_agents

In [12]:
from multi_agent_learning import q_learning_alg1

In [13]:
Qs, agent_policies, (policy_history, Qs_history, is_BR_history) = q_learning_alg1(n_Us, n_states, reward_funcs, betas,
                                                                    get_initial_state, transition_state, 
                                                                    n_exploration_phases, T, experimentation_probs,
                                                                    calc_alpha, deltas, inertias)

100%|███████████████████████████| 100/100 [00:17<00:00,  5.80it/s]


In [14]:
agent_policies

[(0, 1), (1, 0), (1, 0), (1, 0), (0, 0)]

In [15]:
is_solution(agent_policies)

True

In [16]:
Qs

[array([[4.39605526, 0.        ],
        [5.63385177, 5.37018495]]),
 array([[0.        , 4.64136554],
        [5.90917197, 5.61060177]]),
 array([[0.        , 4.26106302],
        [5.26925633, 3.41403899]]),
 array([[0.        , 4.18302523],
        [5.1888442 , 4.55361192]]),
 array([[4.21709082, 0.        ],
        [5.22068507, 4.38969588]])]

In [22]:
[(np.argmax(Qs[i][x,:]) for x in range(n_states)) for i in range(n_agents)]

[<generator object <listcomp>.<genexpr> at 0x000002DDF4675BA0>,
 <generator object <listcomp>.<genexpr> at 0x000002DDF4675890>,
 <generator object <listcomp>.<genexpr> at 0x000002DDF4675820>,
 <generator object <listcomp>.<genexpr> at 0x000002DDF4675C80>,
 <generator object <listcomp>.<genexpr> at 0x000002DDF4675AC0>]

In [17]:
is_BR_history[-1]

[False, False, True, True, True]

In [18]:
policy_history[-10:]

[[(1, 0), (1, 0), (1, 1), (1, 1), (0, 1)],
 [(1, 0), (1, 0), (1, 0), (0, 0), (1, 0)],
 [(1, 0), (1, 0), (1, 0), (0, 0), (1, 0)],
 [(1, 0), (1, 1), (1, 0), (0, 0), (1, 0)],
 [(1, 1), (1, 1), (1, 1), (0, 0), (1, 0)],
 [(0, 0), (0, 0), (1, 0), (1, 0), (0, 0)],
 [(0, 0), (0, 1), (1, 1), (1, 0), (0, 0)],
 [(0, 0), (1, 1), (1, 0), (1, 0), (0, 0)],
 [(0, 1), (1, 1), (1, 0), (1, 0), (0, 0)],
 [(0, 1), (1, 1), (1, 0), (1, 0), (0, 0)]]

In [19]:
np.all(is_BR_history, axis=1)

array([False, False, False,  True, False, False,  True, False, False,
       False, False,  True,  True, False, False, False, False, False,
        True, False, False, False, False, False,  True, False, False,
       False, False, False,  True, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False,  True,  True, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False,  True,
       False, False, False,  True, False, False,  True, False, False,
       False, False,  True, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False])

In [20]:
policies_check = np.array(policy_history)[np.all(is_BR_history, axis=1)]
is_correct = [is_solution(p) for p in policies_check]

In [21]:
is_correct

[True, True, True, True, True, True, True, True, True, True, True, True, True]