## Simulations from AY'17 Paper

In [1]:
import itertools
import numpy as np
from tqdm import tqdm

### Simulation parameters

In [2]:
def get_initial_state():
    '''Sample the initial state of an episode.

    Draws one state index uniformly at random from 0 .. n_states - 1, where
    `n_states` is the module-level setting (read when the function is called).
    '''
    # uniform over states for now
    candidate_states = np.arange(n_states)
    return np.random.choice(candidate_states)

In [3]:
def create_cost_func(cost_matrix, i):
    '''Build the cost function (x, us) -> cost for agent `i` from `cost_matrix`.

    The returned function picks agent i's own action `us[i]` out of the joint
    action `us` and returns `cost_matrix[x, us[i]]`.
    '''
    # TODO: fix. The original note ("cost depends only on ...") was left
    # unfinished -- presumably it refers to the cost ignoring the other
    # agents' actions; confirm with the author.

    def cost_func(x, us):
        return cost_matrix[x, us[i]]

    return cost_func

In [4]:
# Payoff parameters of the 2x2 stage game.
a = -1
b = 2
c = 1

# reward_matrix[own_action, other_action] is the reward to the agent playing `own_action`.
reward_matrix = np.array([
    [c, a],
    [b, 0],
])


def reward0(state, actions):
    '''Reward of agent 0; `actions` is the pair (agent 0's action, agent 1's action).

    The `state` argument is accepted but not used.
    '''
    own_action, other_action = actions
    return reward_matrix[own_action, other_action]


def reward1(state, actions):
    '''Reward of agent 1; `actions` is the pair (agent 0's action, agent 1's action).

    The `state` argument is accepted but not used.
    '''
    other_action, own_action = actions
    return reward_matrix[own_action, other_action]


# One reward function per agent, indexed by agent number.
reward_funcs = [reward0, reward1]

In [5]:
def transition_state(x, us, gamma=0.3):
    '''Sample the next state given the current state and the joint action.

    Parameters
    ----------
    x : current state. Not used: the transition depends only on `us`.
    us : joint action, one entry per agent. May be a list, tuple or 1-D
        numpy array.
    gamma : probability of landing in the "unlikely" state (default 0.3).

    Returns
    -------
    int : if every agent played action 0, the next state is 0 with probability
        1 - gamma and 1 otherwise; for any other joint action the next state
        is 1 with probability 1 - gamma and 0 otherwise.
    '''
    # Compare as a tuple: `us == [0, 0]` is silently False for a tuple
    # (a list never equals a tuple) and raises an ambiguous-truth error for a
    # numpy array, so the joint action's container type must not matter.
    likely_state = 0 if tuple(us) == (0, 0) else 1

    # Exactly one random draw per call, as before.
    if np.random.random() < 1 - gamma:
        return likely_state
    return 1 - likely_state

In [28]:
# Decay exponent of the step size (0.51 was the previously tried value).
r = 0.751


def calc_alpha(n):
    '''Return the step size 1 / n**r for count `n`, with `r` the module-level exponent.'''
    decay = n ** r
    return 1 / decay

In [95]:
# --- problem size ---
n_states = 2  # number of states
n_agents = 2  # number of agents

# number of actions per agent (both agents have two actions)
n_Us = [2 for _ in range(n_agents)]

# --- per-agent learning parameters ---
# probability of experimentation at each action (\rho)
experimentation_probs = [0.1 for _ in range(n_agents)]
# inertias of each agent (\lambda)
inertias = [0.5 for _ in range(n_agents)]
# discount factor
betas = [0.9 for _ in range(n_agents)]
# (in paper it's 0) [this is critical; does T depend on this?]
deltas = [0 for _ in range(n_agents)]

# --- schedule ---
n_exploration_phases = 100  # number of exploration phases
T = 100  # length of exploration phase

### Run Simulation

In [96]:
from multi_agent_learning import q_learning_alg1

In [97]:
# Run the learning algorithm with the settings defined in the cells above.
# The third returned element is unpacked into the three history records.
Qs, agent_policies, (policy_history, Qs_history, is_BR_history) = q_learning_alg1(
    n_Us, n_states, reward_funcs, betas,
    get_initial_state, transition_state,
    n_exploration_phases, T, experimentation_probs,
    calc_alpha, deltas, inertias,
)

100%|██████████████████████████| 100/100 [00:00<00:00, 169.11it/s]


In [98]:
# Display the final `Qs` returned by q_learning_alg1 (presumably one Q array per agent -- confirm).
Qs

[array([[0.04036487, 0.96075697],
        [0.46607843, 1.00253599]]),
 array([[-0.46195971,  0.65680714],
        [-0.31045334,  0.6038834 ]])]

In [99]:
# Display the final `agent_policies` returned by q_learning_alg1.
agent_policies

[(1, 1), (1, 1)]

In [100]:
# Display the first 10 entries of the policy history recorded during the run.
policy_history[:10]

[[(1, 0), (1, 1)],
 [(1, 0), (1, 1)],
 [(1, 0), (1, 1)],
 [(1, 1), (1, 1)],
 [(1, 1), (1, 1)],
 [(1, 1), (1, 1)],
 [(1, 1), (1, 1)],
 [(1, 1), (1, 1)],
 [(1, 1), (1, 1)],
 [(1, 1), (1, 1)]]

In [94]:
# Fraction of rows of `is_BR_history` in which every entry along axis 1 is true
# (presumably: phases in which all agents are best-responding -- confirm).
np.mean(np.all(is_BR_history, axis=1))

0.95

In [None]:
# NOTE:
# - Both sets of code behave roughly the same in terms of the fraction of time
#   the joint policy (mu1(k), mu2(k)) is at an MPE.
# - There is a slight deviation from trial to trial, depending on the initial random joint policy.
# - Fraction of time the joint policy (mu1(k), mu2(k)) was at an MPE (same for both sets of code):
#     K=100, T=10,000, rho=0.01 -> 0.98
#     K=100, T=1,000,  rho=0.01 -> 0.97
#     K=100, T=1,000,  rho=0.1  -> 0.96