In [1]:
import itertools
import numpy as np

In [2]:
# Problem dimensions
n_states = 4  # size of the (shared) state space
n_agents = 3  # how many agents act simultaneously

# n_Us[i] is the number of actions available to agent i (two each, for now)
n_Us = [2 for _ in range(n_agents)]
# Us[i] is agent i's action space, encoded as the integers 0 .. n_Us[i] - 1
Us = list(map(range, n_Us))

In [3]:
# policy_space = itertools.product(Ui, repeat=n_states)
# a deterministic policy is indexed by state: policy[state] is the action taken at that state

In [4]:
# Placeholder: zero-initialised Q-factor table for a single agent (index 0).
# A Q-factor table is indexed as Qi_t[state, action].
i = 0  # temp
Qi_0 = np.zeros((n_states, n_Us[i]))

In [5]:
def get_random_policy(action_space, n_states):
    '''Draw a deterministic policy uniformly at random.

    One action is sampled from `action_space` (with replacement) for each of
    the `n_states` states; the result is a tuple so that policy[state] is the
    action prescribed at `state`.
    '''
    sampled_actions = np.random.choice(action_space, size=n_states)
    return tuple(sampled_actions)

In [6]:
def get_initial_state():
    '''Sample the initial state from the initial-state distribution.

    For now the distribution is uniform over the `n_states` states
    (`n_states` is read from module scope).
    '''
    state_space = range(n_states)
    return np.random.choice(state_space)

In [7]:
def random_policy(action_space, state):
    '''Pick an action uniformly at random from `action_space`.

    `state` is accepted so the signature mirrors a state-dependent policy,
    but it is not used: the draw is the same in every state.
    '''
    sampled_action = np.random.choice(action_space)
    return sampled_action

In [8]:
def randomize_action(policy, state, action_space, rho):
    '''Choose an action with experimentation.

    With probability `rho` the action is drawn by `random_policy` from
    `action_space`; otherwise the action prescribed by `policy` at `state`
    (i.e. policy[state]) is returned.
    '''
    experiment = np.random.random() < rho
    if not experiment:
        return policy[state]
    return random_policy(action_space, state)

In [9]:
def create_cost_func(cost_matrix, i):
    '''Build agent `i`'s cost function from a cost table.

    The returned callable takes a state `x` and the joint action `us`
    (indexable by agent) and looks up cost_matrix[x, us[i]], i.e. the cost
    depends only on the state and on agent `i`'s own action.
    '''

    def agent_cost(x, us):
        own_action = us[i]
        return cost_matrix[x, own_action]

    return agent_cost

In [10]:
# Build one cost table per agent, then wrap each table as a cost function.
# Note: each agent's cost depends only on the state and on that agent's own
# action, not on the other agents' actions.
C_matrices = [
    np.zeros((n_states, n_Us[0])),          # agent 0: zero cost everywhere
    np.ones((n_states, n_Us[1])),           # agent 1: unit cost everywhere
    np.random.random((n_states, n_Us[2])),  # agent 2: random costs
]

C_funcs = [create_cost_func(C_matrices[agent], agent) for agent in range(len(C_matrices))]

# C_funcs[i](x, us) gives agent i's cost when the joint action `us` is taken at state x

In [49]:
# temporary experiment parameters
experimentation_prob = 0.1                   # probability of experimenting each time an action is chosen
inertias = [0.05 for _ in range(n_agents)]   # per-agent inertia (\lambda)
betas = [0.9 for _ in range(n_agents)]       # per-agent discount factor
n_exploration_phases = 100                   # number of exploration phases
T = 1_000                                    # length (in time steps) of each exploration phase
deltas = [0.05 for _ in range(n_agents)]     # per-agent tolerance used in the best-reply check

def transition_state(x, us):
    '''Return the successor of state `x` under the joint action `us`.

    For now the dynamics are deterministic and driven only by the first
    agent: the state advances by us[0], wrapping around modulo `n_states`
    (read from module scope).
    '''
    first_agent_action = us[0]
    advanced = x + first_agent_action
    return advanced % n_states

In [50]:
def update_Qi(Qi, x_t, ui_t, cost_i, beta_i, alpha_i_n, x_next=None):
    '''Return agent i's Q-factor table after one Q-learning step.

    The entry for (x_t, ui_t) is moved towards the target
    ``cost_i + beta_i * min_v Qi[x_next, v]`` with step size ``alpha_i_n``;
    every other entry is carried over unchanged.

    Parameters
    ----------
    Qi : ndarray indexed as Qi[state, action]
        Current Q-factors.  Not modified; a copy is returned.
    x_t, ui_t : int
        State visited and action taken at the current step.
    cost_i : float
        Cost incurred at the current step.
    beta_i : float
        Discount factor.
    alpha_i_n : float
        Step size.
    x_next : int, optional
        Successor state used for the bootstrap term.  When omitted the
        current state `x_t` is used, which reproduces this function's
        previous behaviour but is not the standard Q-learning target;
        callers that know the successor state should pass it.

    Returns
    -------
    ndarray
        A new array holding the updated Q-factors.
    '''
    # Keep existing positional callers working: without a successor state,
    # fall back to bootstrapping from the current state as before.
    if x_next is None:
        x_next = x_t

    target = cost_i + beta_i * np.min(Qi[x_next, :])

    # Copy so the caller's array is not mutated behind its back
    # (a plain `Qi_new = Qi` would only alias the input).
    Qi_new = Qi.copy()
    Qi_new[x_t, ui_t] = (1 - alpha_i_n) * Qi[x_t, ui_t] + alpha_i_n * target

    return Qi_new

In [51]:
def calc_alpha(n):
    '''Step size for the n-th update: the reciprocal of the count `n`.'''
    step_size = 1 / n
    return step_size

In [52]:
def is_best_reply(Qi, policy, delta_i):
    '''Check whether `policy` is a delta_i-approximate best reply w.r.t. `Qi`.

    For every state, the Q-factor of the action prescribed by `policy` is
    compared with the smallest Q-factor available at that state; the policy
    passes if it is within `delta_i` of that minimum in all states.
    '''
    state_idx = np.arange(len(policy))

    # Q-factor of the prescribed action, one entry per state
    cost_under_policy = Qi[state_idx, policy]
    # best achievable Q-factor per state, relaxed by the tolerance
    tolerated_cost = Qi.min(axis=1) + delta_i

    return np.all(tolerated_cost >= cost_under_policy)

In [53]:
# TODO: wrap this cell in a function, e.g. q_learning_alg1().
#
# Decentralized Q-learning over a sequence of exploration phases.  Within a
# phase every agent keeps its baseline policy fixed (experimenting with
# probability `experimentation_prob`) and updates its own Q-factors.  At the
# end of the phase each agent checks whether its policy is still a
# deltas[i]-best reply to its learned Q-factors; if it is not, the agent
# switches to a uniformly chosen best reply with probability 1 - inertias[i].
#
# Inputs taken from earlier cells:
#   T                     length of every exploration phase (the sequence {T_k} is constant)
#   experimentation_prob  exploration probability \rho
#   inertias              inertia \lambda of each agent
#   deltas                tolerance level for sub-optimality of each agent
#   calc_alpha            step-size sequence \alpha^i_n
#
# Known simplifications:
#   * the Q-factor space is not restricted (compactness of Q_space is not accounted for);
#   * Q-factors are NOT reset / projected onto Q_space between exploration phases.

# initialize policies for each agent (agent_policies[agent] gives agent's policy)
agent_policies = [get_random_policy(Ui, n_states) for Ui in Us]

# initialize Q-factors for each agent (Qs[agent][state, action] gives agent's Q-factor)
Qs = [np.zeros(shape=(n_states, n_Ui)) for n_Ui in n_Us]

x_0 = get_initial_state()
x_t = x_0

t = 0  # global time index, counted across all exploration phases

# iterate over exploration phases
for k in range(n_exploration_phases):

    print(k)

    # n_ts[i][x, u] is the number of visits of agent i to (x, u^i) so far in
    # exploration phase k; it determines the step size of the Q-update
    n_ts = [np.zeros(shape=(n_states, n_Us[i])) for i in range(n_agents)]

    # iterate over the T time steps of exploration phase k
    for _ in range(T):

        # choose actions
        actions_t = [randomize_action(agent_policies[i], x_t, Us[i], experimentation_prob) for i in range(n_agents)]

        # receive costs
        costs = [C_funcs[i](x_t, actions_t) for i in range(n_agents)]

        # receive next state
        next_state = transition_state(x_t, actions_t)

        for i in range(n_agents):
            ui_t = actions_t[i]

            # count this visit to (x_t, u^i_t) before computing the step size,
            # so the count passed to calc_alpha is at least 1
            n_ts[i][x_t, ui_t] += 1
            alpha_i_n = calc_alpha(n_ts[i][x_t, ui_t])

            # Q-learning target: immediate cost plus the discounted best
            # (minimum) Q-factor at the SUCCESSOR state next_state.
            Qi = Qs[i]
            target = costs[i] + betas[i] * np.min(Qi[next_state, :])
            Qi[x_t, ui_t] = (1 - alpha_i_n) * Qi[x_t, ui_t] + alpha_i_n * target

        # update x_t
        x_t = next_state
        t += 1

    # end of phase: each agent revises its policy against its learned Q-factors
    for i in range(n_agents):
        # estimate of agent i's best-reply policy set (brute force over all deterministic policies)
        full_policy_space_i = itertools.product(Us[i], repeat=n_states)
        br_policy_space_i = [policy for policy in full_policy_space_i if is_best_reply(Qs[i], policy, deltas[i])]

        # if agent i's policy is not a best response replace it with a best response
        if agent_policies[i] not in br_policy_space_i:
            # with inertia, don't replace policy even if it's not a best response
            if np.random.random() < 1 - inertias[i]:
                agent_policies[i] = br_policy_space_i[np.random.choice(len(br_policy_space_i))]

# TODO: reset Q-factors to anything in Q_space (perhaps by projection) at the
# end of each exploration phase; this is ignored for now.

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99


In [54]:
Qs

[array([[0., 0.],
        [0., 0.],
        [0., 0.],
        [0., 0.]]),
 array([[10., 10.],
        [10., 10.],
        [10., 10.],
        [10., 10.]]),
 array([[0.68231688, 0.89458779],
        [6.46263277, 6.23610682],
        [3.30249794, 2.85761639],
        [7.20390153, 7.08968998]])]

In [48]:
Qs

[array([[0., 0.],
        [0., 0.],
        [0., 0.],
        [0., 0.]]),
 array([[10., 10.],
        [10., 10.],
        [10., 10.],
        [10., 10.]]),
 array([[0.68231688, 0.89458779],
        [6.46263277, 6.23610682],
        [3.30249794, 2.85761639],
        [7.20390153, 7.08968998]])]