## Simulations from the AY'17 paper (Arslan & Yüksel, 2017, "Decentralized Q-Learning for Stochastic Teams and Games")

In [1]:
import itertools
import numpy as np
from tqdm import tqdm

### Simulation parameters

In [2]:
def get_initial_state():
    '''Draw the initial state of a run.

    Picks one index from ``range(n_states)`` uniformly at random using
    NumPy's global random state. `n_states` is looked up in the notebook's
    global namespace at call time, so it must be defined before the first call.
    '''
    # uniform over states for now
    state_indices = range(n_states)
    return np.random.choice(state_indices)

In [3]:
def create_cost_func(cost_matrix, i):
    '''Build a cost function ``(x, us) -> cost`` backed by `cost_matrix`.

    The returned closure selects entry ``us[i]`` from the joint action `us`
    and returns ``cost_matrix[x, us[i]]``.
    '''
    # TODO: fix. (The original note breaks off after "cost depends only on".)

    def cost_func(x, us):
        # Only the i-th entry of the joint action enters the lookup.
        return cost_matrix[x, us[i]]

    return cost_func

In [4]:
# Entries of the 2x2 cost table used by both cost functions below.
a, b, c = -1, 2, 1
cost_matrix = np.array([
    [c, a],
    [b, 0],
])


def cost0(state, actions):
    '''Cost function stored first in `cost_funcs`.

    `state` is accepted but not used. `actions` must unpack into exactly two
    indices; the result is ``cost_matrix[actions[0], actions[1]]``.
    Presumably `actions` is ordered by agent index - confirm against the caller.
    '''
    row, col = actions
    return cost_matrix[row, col]


def cost1(state, actions):
    '''Cost function stored second in `cost_funcs`.

    Same table as `cost0` with the two action indices swapped: the result is
    ``cost_matrix[actions[1], actions[0]]``. `state` is accepted but not used.
    '''
    col, row = actions
    return cost_matrix[row, col]


# Position in this list selects which cost function is handed to the learner.
cost_funcs = [cost0, cost1]

In [5]:
def transition_state(x, us, gamma=0.3):
    '''Sample the next state given the current state and the joint action.

    Parameters
    ----------
    x : current state. Not used: the transition depends on `us` only.
    us : joint action as a list, tuple or 1-D array of action indices.
    gamma : probability of moving to the less likely state (default 0.3).

    Returns
    -------
    int
        If the joint action is (0, 0): 0 with probability ``1 - gamma``,
        otherwise 1. For any other joint action the two outcomes are swapped.
    '''
    # Compare as a tuple so lists, tuples and arrays are all recognised.
    # A plain `us == [0, 0]` is False for the tuple (0, 0) and raises
    # "truth value is ambiguous" for a NumPy array.
    likely_state = 0 if tuple(us) == (0, 0) else 1

    # Exactly one draw from NumPy's global random state per call.
    if np.random.random() < 1 - gamma:
        return likely_state
    return 1 - likely_state

In [6]:
# Exponent of the step-size schedule; read by calc_alpha at call time.
r = 0.51


def calc_alpha(n):
    '''Return the step size ``1 / n**r`` for count `n`.'''
    decay = n ** r
    return 1 / decay

In [7]:
# --- Problem size ---
n_states = 2  # number of states
n_agents = 2  # number of agents

# number of actions per agent (both agents have two actions)
n_Us = [2 for _ in range(n_agents)]

# --- Per-agent learning parameters (one list entry per agent) ---
# probability of experimentation at each action (\rho)
experimentation_probs = [0.01 for _ in range(n_agents)]
# inertias of each agent (\lambda)
inertias = [0.5 for _ in range(n_agents)]
# discount factor
betas = [0.9 for _ in range(n_agents)]
# (in paper it's 0) [this is critical; does T depend on this?]
deltas = [0 for _ in range(n_agents)]

# --- Schedule ---
n_exploration_phases = 50  # number of exploration phases
T = 1000  # length of exploration phase

### Run Simulation

In [8]:
from decentralised_qlearning import q_learning_alg1

In [9]:
# Run the learner once with the parameters defined above.
# The third return value is unpacked into three history sequences.
# NOTE(review): no RNG seed is set anywhere in the visible notebook, so the
# recorded outputs below will differ from run to run.
(
    Qs,
    agent_policies,
    (policy_history, Qs_history, is_BR_history),
) = q_learning_alg1(
    n_Us,
    n_states,
    cost_funcs,
    betas,
    get_initial_state,
    transition_state,
    n_exploration_phases,
    T,
    experimentation_probs,
    calc_alpha,
    deltas,
    inertias,
)

100%|█████████████████████████████| 50/50 [00:03<00:00, 12.84it/s]


In [10]:
# First value returned by q_learning_alg1 (presumably one Q-table per agent - confirm).
Qs

[array([[-0.86669585,  0.175414  ],
        [-0.83996794,  0.17929958]]),
 array([[-0.90351572,  0.14103227],
        [-0.88649428,  0.16798276]])]

In [11]:
# Second value returned by q_learning_alg1 (presumably each agent's final policy - confirm).
agent_policies

[(1, 1), (1, 1)]

In [12]:
# Fraction of rows of is_BR_history whose entries are all truthy
# (presumably one row per exploration phase, one column per agent - confirm).
np.mean(np.all(is_BR_history, axis=1))

0.96

In [14]:
# Full policy history unpacked from q_learning_alg1's third return value
# (presumably one entry per exploration phase - confirm).
policy_history

[[(0, 0), (0, 0)],
 [(1, 1), (0, 0)],
 [(1, 1), (1, 1)],
 [(1, 1), (1, 1)],
 [(1, 1), (1, 1)],
 [(1, 1), (1, 1)],
 [(1, 1), (1, 1)],
 [(1, 1), (1, 1)],
 [(1, 1), (1, 1)],
 [(1, 1), (1, 1)],
 [(1, 1), (1, 1)],
 [(1, 1), (1, 1)],
 [(1, 1), (1, 1)],
 [(1, 1), (1, 1)],
 [(1, 1), (1, 1)],
 [(1, 1), (1, 1)],
 [(1, 1), (1, 1)],
 [(1, 1), (1, 1)],
 [(1, 1), (1, 1)],
 [(1, 1), (1, 1)],
 [(1, 1), (1, 1)],
 [(1, 1), (1, 1)],
 [(1, 1), (1, 1)],
 [(1, 1), (1, 1)],
 [(1, 1), (1, 1)],
 [(1, 1), (1, 1)],
 [(1, 1), (1, 1)],
 [(1, 1), (1, 1)],
 [(1, 1), (1, 1)],
 [(1, 1), (1, 1)],
 [(1, 1), (1, 1)],
 [(1, 1), (1, 1)],
 [(1, 1), (1, 1)],
 [(1, 1), (1, 1)],
 [(1, 1), (1, 1)],
 [(1, 1), (1, 1)],
 [(1, 1), (1, 1)],
 [(1, 1), (1, 1)],
 [(1, 1), (1, 1)],
 [(1, 1), (1, 1)],
 [(1, 1), (1, 1)],
 [(1, 1), (1, 1)],
 [(1, 1), (1, 1)],
 [(1, 1), (1, 1)],
 [(1, 1), (1, 1)],
 [(1, 1), (1, 1)],
 [(1, 1), (1, 1)],
 [(1, 1), (1, 1)],
 [(1, 1), (1, 1)],
 [(1, 1), (1, 1)]]