## Mean-field Simulations

In [1]:
import itertools
import numpy as np
from tqdm import tqdm

from multi_agent_learning import q_learning_alg1

### Problem 1

#### Problem Setup

In [2]:
# Problem dimensions: every agent shares one action-space size.
n_actions_const = 2  # all agents have the same action spaces
n_states = 2  # number of states

In [3]:
def get_initial_state():
    '''Sample the initial state uniformly from the `n_states` states (for now).'''
    candidate_states = range(n_states)
    initial_state = np.random.choice(candidate_states)
    return initial_state

In [4]:
def transition_state(x, us):
    '''Return the next state given the current state `x` and the actions `us`.

    The state flips (0 <-> 1) when the average of `us` is strictly greater
    than 0.5, i.e. when a majority of agents "affirm"; otherwise the state
    is returned unchanged.
    '''
    majority_affirms = np.average(us) > 0.5
    return (x + 1) % 2 if majority_affirms else x

In [5]:
r = 1  # exponent used by calc_alpha


def calc_alpha(n):
    '''Return `1 / n**r`.

    Passed to `q_learning_alg1`; presumably `n` is a visit or iteration
    count, making this a decaying step size -- confirm against that function.
    '''
    return 1 / pow(n, r)

In [6]:
def mf_reward(x, u_i, mean_u):
    '''Mean-field reward: 1.0 when the state `x` is 1 (the desirable state), else 0.0.

    `u_i` and `mean_u` are accepted but not used by this reward.
    '''
    if x == 1:
        return 1.0
    return 0.0

def gen_reward_i(i):
    '''Build the reward function for agent `i`.

    The returned callable takes `(state, actions)` and evaluates `mf_reward`
    on the state, agent `i`'s own action and the average of all actions.
    '''
    def reward_i(state, actions):
        return mf_reward(state, actions[i], np.average(actions))

    return reward_i

In [7]:
# # temp problem: this one didn't converge to the correct solution... (is it weakly acyclic?)
# def tmp_reward(x, us):
#     return float(x==1)


# def tmp_transition_state(x, us):
#     # switch to other state if u0 = 0, u1 = 1
#     if us[0]==0 and us[1]==1:
#         return (x+1)%2
#     else: return x

#### N-Agent Simulations

In [8]:
def is_solution(agent_policies):
    '''Check whether a set of agent policies is the desired solution.

    `agent_policies` is indexed as [agent][state]. The policies count as a
    solution when the average action in state 0 is strictly above 0.5 and
    the average action in state 1 is at most 0.5.
    '''
    policy_table = np.array(agent_policies)
    mean_action_state_0 = np.average(policy_table[:, 0])
    mean_action_state_1 = np.average(policy_table[:, 1])
    return mean_action_state_0 > 0.5 and mean_action_state_1 <= 0.5

In [9]:
# Exploration schedule handed to q_learning_alg1.
T = 1_000  # length of exploration phase
n_exploration_phases = 100  # number of exploration phases

In [10]:
# works (i.e.: finds optimal set of policies) for 1, 2, 3, 4, 5 agents
# optimal policy not found for 6 or 7 within K=100; T=1000
n_agents = 6  # number of agents

# number of actions per agent (every agent has the same action space)
n_Us = [n_actions_const for _ in range(n_agents)]

# per-agent hyperparameters; the same value is used for every agent
experimentation_probs = [0.005 for _ in range(n_agents)]  # probability of experimentation at each action (\rho)
inertias = [0.5 for _ in range(n_agents)]  # inertias of each agent (\lambda)
betas = [0.9 for _ in range(n_agents)]  # discount factor
deltas = [1e-6 for _ in range(n_agents)]  # tolerance for suboptimality

In [11]:
# one reward function per agent, built from that agent's index
reward_funcs = list(map(gen_reward_i, range(n_agents)))
# reward_funcs = [tmp_reward]*n_agents

In [12]:
# Run the learner and unpack its result: Qs, agent_policies, and a 3-tuple
# of histories (policies, Qs, best-response flags, going by the names used here).
Qs, agent_policies, (policy_history, Qs_history, is_BR_history) = q_learning_alg1(
    n_Us,
    n_states,
    reward_funcs,
    betas,
    get_initial_state,
    transition_state,
    n_exploration_phases,
    T,
    experimentation_probs,
    calc_alpha,
    deltas,
    inertias,
)

100%|██████████████████████████████████████████████████████████████████████| 100/100 [00:31<00:00,  3.15it/s]


In [13]:
agent_policies

[(0, 1), (1, 1), (0, 1), (1, 1), (0, 1), (1, 1)]

In [14]:
is_solution(agent_policies)

False

In [15]:
# fraction of entries in policy_history that satisfy is_solution
np.average(list(map(is_solution, policy_history)))

0.27

In [16]:
is_BR_history[-1]

[False, True, False, True, False, True]

In [17]:
Qs

[array([[0.43386376, 0.9255723 ],
        [0.        , 1.22956401]]),
 array([[0.03596483, 0.03928572],
        [0.        , 1.03077216]]),
 array([[0.79801055, 0.9       ],
        [0.        , 1.6075    ]]),
 array([[0.03568341, 0.03928271],
        [0.        , 1.03078034]]),
 array([[0.55331219, 0.92144118],
        [0.        , 1.42656029]]),
 array([[0.02920546, 0.04129203],
        [0.        , 1.03251056]])]

### Problem 2

#### Problem Setup

same problem as above except the transition now probabilistically depends on `mean_u` rather than being threshold-based

In [18]:
# Problem dimensions (same as Problem 1).
n_actions_const = 2  # all agents have the same action spaces
n_states = 2  # number of states

def get_initial_state():
    '''Draw the initial state; currently uniform over the `n_states` states.'''
    state_space = range(n_states)
    return np.random.choice(state_space)

def transition_state(x, us):
    '''Return the next state given the current state `x` and the actions `us`.

    The state flips (0 <-> 1) with probability equal to the average of `us`;
    otherwise it is returned unchanged. One uniform random number is drawn
    per call.
    '''
    switch_prob = np.average(us)
    flipped = np.random.random() < switch_prob
    return (x + 1) % 2 if flipped else x

r = 1  # exponent used by calc_alpha


def calc_alpha(n):
    '''Return `1 / n**r`.

    Passed to `q_learning_alg1`; presumably a decaying step size in a
    count `n` -- confirm against that function.
    '''
    denominator = n ** r
    return 1 / denominator

def mf_reward(x, u_i, mean_u):
    '''Reward of 1.0 in state 1 (the desirable state) and 0.0 otherwise.

    The action arguments `u_i` and `mean_u` do not affect the result.
    '''
    return 1.0 if x == 1 else 0.0

def gen_reward_i(i):
    '''Return agent `i`'s reward function with signature `(state, actions)`.

    The reward is `mf_reward` applied to the state, the agent's own action
    `actions[i]` and the average over all actions.
    '''
    def reward_i(state, actions):
        own_action, avg_action = actions[i], np.average(actions)
        return mf_reward(state, own_action, avg_action)

    return reward_i

#### N-agent Simulations

In [None]:
# note: gets closer to the team-optimal policy than problem 1,
# but requires tinkering with parameters to reach the single optimal policy
# for larger N, convergence to the optimal policy becomes harder, but we remain around a near-optimal policy

# note also: depending on the transition kernel, a mean-field formulation may result in low visitation of a certain subset of
# states, thus affecting convergence and the learned Q-factors and policies for those states
# e.g.: in this case, if we are in state 1 and all actions in state 1 are 0, the policies alone never take us to state 0 to learn
# the optimal policy there; visiting state 0 in that case requires exploration and has low probability.

In [22]:
# Exploration schedule handed to q_learning_alg1.
T = 10_000  # length of exploration phase
n_exploration_phases = 100  # number of exploration phases

In [23]:
#  w/ T=1000, K=100 found team optimal policies for n=2, 3
# w/ T=10,000 K=100 found team optimal policies for n=4
n_agents = 6  # number of agents

# number of actions per agent (every agent has the same action space)
n_Us = [n_actions_const for _ in range(n_agents)]

# per-agent hyperparameters; the same value is used for every agent
experimentation_probs = [0.05 for _ in range(n_agents)]  # probability of experimentation at each action (\rho)
inertias = [0.5 for _ in range(n_agents)]  # inertias of each agent (\lambda)
betas = [0.9 for _ in range(n_agents)]  # discount factor
deltas = [1e-6 for _ in range(n_agents)]

In [24]:
# one reward function per agent, built from that agent's index
reward_funcs = list(map(gen_reward_i, range(n_agents)))
# reward_funcs = [tmp_reward]*n_agents

In [25]:
# Run the learner and unpack its result: Qs, agent_policies, and a 3-tuple
# of histories (policies, Qs, best-response flags, going by the names used here).
Qs, agent_policies, (policy_history, Qs_history, is_BR_history) = q_learning_alg1(
    n_Us,
    n_states,
    reward_funcs,
    betas,
    get_initial_state,
    transition_state,
    n_exploration_phases,
    T,
    experimentation_probs,
    calc_alpha,
    deltas,
    inertias,
)

100%|██████████████████████████████████████████████████████████████████████| 100/100 [12:08<00:00,  7.29s/it]


In [26]:
agent_policies

[(0, 0), (1, 1), (1, 0), (1, 1), (1, 1), (1, 0)]

In [27]:
policy_history[:10]

[[(0, 0), (1, 1), (1, 0), (0, 0), (0, 1), (0, 0)],
 [(0, 0), (1, 0), (1, 0), (1, 0), (0, 1), (1, 0)],
 [(0, 0), (1, 0), (1, 0), (1, 0), (0, 1), (1, 0)],
 [(0, 0), (1, 0), (1, 0), (1, 0), (0, 0), (1, 0)],
 [(1, 1), (1, 0), (1, 0), (1, 0), (0, 0), (1, 0)],
 [(1, 1), (1, 0), (1, 0), (1, 0), (0, 0), (1, 0)],
 [(1, 1), (1, 0), (1, 0), (1, 0), (0, 0), (1, 0)],
 [(1, 0), (1, 0), (1, 0), (1, 0), (0, 0), (1, 0)],
 [(1, 0), (1, 0), (1, 1), (1, 0), (0, 1), (1, 0)],
 [(1, 0), (1, 0), (1, 1), (0, 0), (0, 1), (1, 0)]]

In [28]:
policy_history[-10:]

[[(0, 0), (1, 0), (1, 0), (1, 0), (1, 0), (1, 1)],
 [(0, 0), (1, 0), (1, 0), (1, 0), (1, 0), (1, 1)],
 [(0, 0), (1, 1), (1, 0), (1, 1), (1, 0), (1, 0)],
 [(1, 0), (1, 1), (1, 0), (1, 1), (1, 0), (0, 0)],
 [(1, 0), (1, 1), (1, 0), (1, 1), (1, 0), (0, 0)],
 [(1, 0), (1, 1), (0, 0), (1, 1), (1, 0), (1, 0)],
 [(0, 0), (1, 1), (0, 0), (0, 0), (0, 0), (1, 0)],
 [(0, 0), (1, 1), (1, 0), (0, 0), (0, 1), (1, 0)],
 [(0, 0), (1, 0), (1, 0), (1, 0), (1, 1), (1, 0)],
 [(0, 0), (1, 0), (1, 0), (1, 0), (1, 1), (1, 0)]]

In [29]:
Qs

[array([[2.96220791, 0.        ],
        [6.61889065, 6.60564673]]),
 array([[0.        , 3.02008413],
        [6.73683547, 6.74328949]]),
 array([[0.        , 2.97083292],
        [6.63824011, 6.56825866]]),
 array([[0.        , 2.89305379],
        [6.46679011, 6.47451507]]),
 array([[0.        , 2.90334128],
        [6.48251763, 6.48965127]]),
 array([[0.        , 2.93422965],
        [6.51658295, 6.56714222]])]

### Problem 3

#### Problem Setup

same problem but now with base probability to switch state

In [98]:
# Problem dimensions (same as Problems 1 and 2).
n_actions_const = 2  # all agents have the same action spaces
n_states = 2  # number of states

def get_initial_state():
    '''Pick the initial state uniformly at random among the `n_states` states (for now).'''
    all_states = range(n_states)
    start = np.random.choice(all_states)
    return start

base_switch_prob = 0.1  # minimum probability of switching state on any step


def transition_state(x, us):
    '''Return the next state given the current state `x` and the actions `us`.

    The state flips (0 <-> 1) with probability
    `max(base_switch_prob, mean(us))`: the average action drives the switch,
    but the state always switches with at least the base probability
    regardless of the actions. Otherwise the state is returned unchanged.
    One uniform random number is drawn per call.
    '''
    mean_u = np.average(us)
    # Read the floor from `base_switch_prob` instead of repeating the literal
    # 0.1, so that changing the constant actually changes the dynamics.
    switch_prob = max(base_switch_prob, mean_u)
    if np.random.random() < switch_prob:
        return (x + 1) % 2
    return x

r = 1  # exponent used by calc_alpha


def calc_alpha(n):
    '''Return `1 / n**r`.

    Passed to `q_learning_alg1`; presumably a decaying step size in a
    count `n` -- confirm against that function.
    '''
    n_to_the_r = pow(n, r)
    return 1 / n_to_the_r

def mf_reward(x, u_i, mean_u):
    '''Return 1.0 if the state `x` equals 1 (the desirable state), else 0.0.

    Neither `u_i` nor `mean_u` is used in the computation.
    '''
    in_desirable_state = (x == 1)
    return float(in_desirable_state)

def gen_reward_i(i):
    '''Create the `(state, actions) -> reward` function for agent `i`.

    It forwards the state, the agent's own action `actions[i]` and the
    average of all actions to `mf_reward`.
    '''
    def reward_i(state, actions):
        action_of_i = actions[i]
        return mf_reward(state, action_of_i, np.average(actions))

    return reward_i

#### N-agent sims

In [None]:
# for large N, and this choice of parameters, we arrive at near-optimal policies, but not fully optimal policies
# why? would we arrive at the optimal policy for ...? (TODO: question left unfinished)

In [118]:
# Exploration schedule handed to q_learning_alg1.
T = 10_000  # length of exploration phase
n_exploration_phases = 100  # number of exploration phases

In [119]:
# converged to team optimal policy for n=  (TODO: value was never filled in)
n_agents = 8  # number of agents

# number of actions per agent (every agent has the same action space)
n_Us = [n_actions_const for _ in range(n_agents)]

# per-agent hyperparameters; the same value is used for every agent
experimentation_probs = [0.01 for _ in range(n_agents)]  # probability of experimentation at each action (\rho)
inertias = [0.5 for _ in range(n_agents)]  # inertias of each agent (\lambda)
betas = [0.9 for _ in range(n_agents)]  # discount factor
deltas = [1e-6 for _ in range(n_agents)]

In [120]:
# one reward function per agent, built from that agent's index
reward_funcs = list(map(gen_reward_i, range(n_agents)))
# reward_funcs = [tmp_reward]*n_agents

In [121]:
# Run the learner and unpack its result: Qs, agent_policies, and a 3-tuple
# of histories (policies, Qs, best-response flags, going by the names used here).
Qs, agent_policies, (policy_history, Qs_history, is_BR_history) = q_learning_alg1(
    n_Us,
    n_states,
    reward_funcs,
    betas,
    get_initial_state,
    transition_state,
    n_exploration_phases,
    T,
    experimentation_probs,
    calc_alpha,
    deltas,
    inertias,
)

100%|███████████████████████████| 100/100 [06:06<00:00,  3.66s/it]


In [122]:
agent_policies

[(1, 0), (1, 0), (1, 0), (1, 0), (0, 0), (0, 0), (1, 0), (0, 0)]

In [123]:
# reference joint policy: every agent plays (1, 0)
optimal_policy = [(1, 0)] * n_agents

# % of time spent at optimal policy
np.average([joint_policy == optimal_policy for joint_policy in policy_history])

0.0

In [124]:
policy_history[-10:]

[[(1, 0), (1, 0), (1, 0), (1, 0), (1, 1), (1, 0), (1, 1), (1, 0)],
 [(1, 1), (1, 0), (1, 0), (1, 0), (1, 0), (1, 0), (1, 1), (1, 0)],
 [(1, 0), (1, 1), (1, 0), (1, 0), (1, 0), (1, 0), (1, 0), (1, 0)],
 [(0, 0), (1, 1), (0, 0), (0, 0), (1, 0), (0, 0), (0, 0), (1, 0)],
 [(0, 0), (1, 1), (0, 0), (0, 0), (1, 0), (0, 0), (1, 0), (1, 0)],
 [(1, 0), (1, 1), (0, 0), (1, 0), (1, 0), (0, 0), (1, 0), (1, 0)],
 [(1, 0), (1, 1), (0, 0), (1, 0), (0, 0), (0, 0), (1, 0), (1, 0)],
 [(0, 0), (1, 1), (1, 1), (0, 0), (0, 0), (1, 0), (1, 0), (1, 0)],
 [(0, 0), (1, 1), (1, 1), (0, 0), (0, 0), (1, 0), (1, 0), (0, 0)],
 [(0, 0), (1, 0), (0, 1), (1, 0), (0, 0), (1, 0), (1, 0), (0, 0)]]

In [128]:
is_BR_history[-10:]

[[False, False, True, True, False, True, False, True],
 [False, False, True, True, True, True, False, True],
 [False, False, False, False, False, False, False, True],
 [False, True, False, False, True, False, False, True],
 [False, True, False, False, True, True, True, False],
 [True, False, False, True, False, False, True, True],
 [False, False, False, False, True, False, True, True],
 [False, False, False, True, True, True, True, False],
 [False, False, False, False, False, True, True, True],
 [False, True, False, False, True, False, False, False]]