# Testing the Quantized Decentralized Multi-Agent Q-Learning Algorithm

In [1]:
import itertools
import numpy as np
from tqdm.auto import tqdm, trange

from quantization import DiscreteSpace, ContinuousInterval, UniformQuantizer, NullQuantizer, QuantizedPolicy, TransitionKernel
from cts_multi_agent_learning import quantized_q_learning_alg

## Trivial Single-Agent 'Finite' State and Action
A simple MDP with a single state and two actions: one good (reward 1) and one bad (reward 0).

In [2]:
# Spaces for the trivial MDP: one state (labelled 0) and two actions (0 and 1).
state_space = DiscreteSpace([0]) # single state
action_space = DiscreteSpace([0, 1]) # two actions

# Both spaces are already finite, so NullQuantizer is used for each
# (presumably a pass-through quantizer -- confirm in the `quantization` module).
state_quantizer = NullQuantizer(state_space)
action_quantizer = NullQuantizer(action_space)

# Policy of the single agent; the learning algorithm takes a list with one policy per agent.
q_policy = QuantizedPolicy(state_quantizer, action_quantizer, index_policy='random_init', exploration_prob=0.5)
quantized_agent_policies = [q_policy]

def transition_func(x, us):
    """Return the next state of the single-state MDP.

    There is only one state (0), so the result is 0 whatever the current
    state ``x`` or the joint action ``us`` is.
    """
    only_state = 0
    return only_state

# Bundle the state space, the list of per-agent action spaces (one agent here)
# and the transition function into the kernel object used by the learner.
transition_kernel = TransitionKernel(state_space, [action_space], transition_func)

def reward_func(x, us):
    """Reward of the single agent: 0 for action 0, 1 for action 1.

    Parameters
    ----------
    x : current state (not used).
    us : sequence of actions; only the first entry is read.

    Raises
    ------
    ValueError
        If the action is neither 0 nor 1.
    """
    action = us[0]
    if action == 0:
        return 0
    if action == 1:
        return 1
    raise ValueError('received action outside expected range')


# One reward function per agent.
reward_funcs = [reward_func]

# Learning configuration. List-valued settings hold one entry per agent.

# Presumably the discount factor (the learned Q-values shown further down are
# consistent with that) -- confirm against quantized_q_learning_alg.
betas = [0.5]

# Iteration count; the inner progress bar of the run counts up to this value.
T = 100000


def alpha_func(n):
    """Step-size schedule 1/(n + 1) handed to the learning algorithm."""
    return 1 / (n + 1)


n_exploration_phases = 1

# Semantics of these two are defined by quantized_q_learning_alg.
deltas = [0.0001]
inertias = [0.5]


def get_initial_state():
    """Return the starting state (the only state, 0)."""
    return 0


In [3]:
# Run the learning algorithm on the configuration defined above, with early stopping disabled.
# Note: `quantized_agent_policies` is rebound to the policies returned by the call.
Qs, quantized_agent_policies, history = quantized_q_learning_alg(quantized_agent_policies, transition_kernel, get_initial_state,
                reward_funcs, betas, T, alpha_func, n_exploration_phases, deltas, inertias, early_stopping=False)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1.0), HTML(value='')))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=100000.0), HTML(value='')))




In [4]:
Qs

[array([[0.99177106, 1.99175108]])]

In [5]:
quantized_agent_policies[0].get_policy_map()

{0: 1}

## Slightly Less Trivial Single-Agent 'Finite' State and Action Spaces

An MDP with a 'good' state and a 'bad' state, and two actions each of which causes a deterministic transition to one of the states.

In [6]:
# Spaces for the two-state MDP.
state_space = DiscreteSpace(['good', 'bad']) # two states
action_space = DiscreteSpace(['a', 'b']) # two actions

# Both spaces are already finite, so NullQuantizer is used for each
# (presumably a pass-through quantizer -- confirm in the `quantization` module).
state_quantizer = NullQuantizer(state_space)
action_quantizer = NullQuantizer(action_space)

# Policy of the single agent; the learning algorithm takes a list with one policy per agent.
q_policy = QuantizedPolicy(state_quantizer, action_quantizer, index_policy='random_init', exploration_prob=0.5)
quantized_agent_policies = [q_policy]

def transition_func(x, us):
    """Deterministic transition: action 'a' leads to 'good', action 'b' to 'bad'.

    Parameters
    ----------
    x : current state (not used; the action alone decides the next state).
    us : sequence of actions; only the first entry is read.

    Raises
    ------
    ValueError
        If the action is neither 'a' nor 'b'.
    """
    action = us[0]
    if action == 'a':
        return 'good'
    if action == 'b':
        return 'bad'
    raise ValueError('received unexpected action')

# Bundle the state space, the list of per-agent action spaces (one agent here)
# and the transition function into the kernel object used by the learner.
transition_kernel = TransitionKernel(state_space, [action_space], transition_func)

def reward_func(x, us):
    """Reward 1 in the 'good' state and 0 in the 'bad' state.

    The actions ``us`` are not consulted. Any other state raises ValueError.
    """
    if x == 'good':
        return 1
    if x == 'bad':
        return 0
    raise ValueError('received state/action outside expected range')


# One reward function per agent.
reward_funcs = [reward_func]

# Learning configuration. List-valued settings hold one entry per agent.

# Presumably the discount factor -- confirm against quantized_q_learning_alg.
betas = [0.25]

# Iteration count; the inner progress bar of the run counts up to this value.
T = 100000


def alpha_func(n):
    """Step-size schedule 1/(n + 1) handed to the learning algorithm."""
    return 1 / (n + 1)


n_exploration_phases = 1

# Semantics of these two are defined by quantized_q_learning_alg.
deltas = [0.0001]
inertias = [0.5]


def get_initial_state():
    """Return the starting state, 'good'."""
    return 'good'

In [7]:
# Run the learning algorithm on the configuration defined above, with early stopping disabled.
# Note: `quantized_agent_policies` is rebound to the policies returned by the call.
Qs, quantized_agent_policies, history = quantized_q_learning_alg(quantized_agent_policies, transition_kernel, get_initial_state,
                reward_funcs, betas, T, alpha_func, n_exploration_phases, deltas, inertias, early_stopping=False)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1.0), HTML(value='')))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=100000.0), HTML(value='')))




In [8]:
Qs

[array([[1.33296191, 1.08295976],
        [0.33297202, 0.08295468]])]

In [9]:
quantized_agent_policies[0].get_policy_map()

{'good': 'a', 'bad': 'a'}

## 2-agent Team (coordinate actions), 'Finite' State and action space

A 2-agent generalization of the above, where a transition to the good state occurs if the agents coordinate (play the same action) and a transition to the bad state occurs otherwise.

In [10]:
# Shared state space and one action space per agent.
state_space = DiscreteSpace(['good', 'bad']) # two states
action_space1 = DiscreteSpace(['a', 'b']) # two actions
action_space2 = DiscreteSpace(['a', 'b']) # two actions

# Each agent gets its own quantizer objects over the shared state space.
state_quantizer1 = NullQuantizer(state_space)
state_quantizer2 = NullQuantizer(state_space)

action_quantizer1 = NullQuantizer(action_space1)
action_quantizer2 = NullQuantizer(action_space2)

# One policy per agent, both constructed with the same settings.
q_policy1 = QuantizedPolicy(state_quantizer1, action_quantizer1, index_policy='random_init', exploration_prob=0.25)
q_policy2 = QuantizedPolicy(state_quantizer2, action_quantizer2, index_policy='random_init', exploration_prob=0.25)

# Order matters: entry i of every per-agent list below refers to agent i.
quantized_agent_policies = [q_policy1, q_policy2]

def transition_func(x, us):
    """Return the next state of the two-agent coordination game.

    The agents move to 'good' when they play the same action and to 'bad'
    otherwise; the current state does not influence the transition.

    Parameters
    ----------
    x : current state (not used).
    us : pair ``(u1, u2)`` of the agents' actions, each 'a' or 'b'.

    Returns
    -------
    'good' if both actions are equal, 'bad' otherwise.

    Raises
    ------
    ValueError
        If either action is not 'a' or 'b'.
    """
    u1, u2 = us

    # The former trailing `else: raise` could never run because `==` / `!=`
    # cover every case, so invalid actions were silently accepted. Validate
    # explicitly, as the single-agent transition function does.
    valid_actions = ('a', 'b')
    if u1 not in valid_actions or u2 not in valid_actions:
        raise ValueError('received unexpected action')

    return 'good' if u1 == u2 else 'bad'

# Bundle the state space, the two per-agent action spaces and the joint
# transition function into the kernel object used by the learner.
transition_kernel = TransitionKernel(state_space, [action_space1, action_space2], transition_func)

def reward_func(x, us):
    """Shared team reward: 1 in the 'good' state, 0 in the 'bad' state.

    The joint action ``us`` is not consulted. Any other state raises
    ValueError.
    """
    state_rewards = (('good', 1), ('bad', 0))
    for state, reward in state_rewards:
        if x == state:
            return reward
    raise ValueError('received state/action outside expected range')


# Both agents receive the same reward (team problem).
reward_funcs = [reward_func, reward_func]

# Learning configuration. List-valued settings hold one entry per agent (two agents).

# Presumably the discount factors -- confirm against quantized_q_learning_alg.
betas = [0.25, 0.25]

# Iteration count; the inner progress bar of the run counts up to this value.
T = 50000


def alpha_func(n):
    """Step-size schedule 1/(n + 1) handed to the learning algorithm."""
    return 1 / (n + 1)


n_exploration_phases = 1

# Semantics of these two are defined by quantized_q_learning_alg.
deltas = [0.0001, 0.0001]
inertias = [0.5, 0.5]


def get_initial_state():
    """Return the starting state, 'good'."""
    return 'good'

In [11]:
# Run the learning algorithm on the two-agent configuration above, with early stopping disabled.
# Note: `quantized_agent_policies` is rebound to the policies returned by the call.
Qs, quantized_agent_policies, history = quantized_q_learning_alg(quantized_agent_policies, transition_kernel, get_initial_state,
                reward_funcs, betas, T, alpha_func, n_exploration_phases, deltas, inertias, early_stopping=False)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1.0), HTML(value='')))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=50000.0), HTML(value='')))




In [12]:
Qs

[array([[1.0513329 , 1.31955168],
        [0.16692366, 0.18329821]]),
 array([[1.28893766, 1.10327584],
        [0.10187907, 0.29034376]])]

In [13]:
quantized_agent_policies[0].get_policy_map(), quantized_agent_policies[1].get_policy_map()

({'good': 'a', 'bad': 'b'}, {'good': 'b', 'bad': 'b'})

## Continuous State-Space Single-Agent
A continuous space MDP over $\mathbb{X} = [0,1]$ and $\mathbb{U} = \{-1, 1\}$ where actions either transition the state forward or backward. A larger state value corresponds to higher reward.

In [14]:
# Continuous state space with a finite (two-element) action space.
state_space = ContinuousInterval(0, 1) # continuous state space [0, 1]
action_space = DiscreteSpace([-1, 1]) # two actions

# Only the state needs discretising (10 bins); the actions are already finite.
state_quantizer = UniformQuantizer(state_space, n_bins=10)
action_quantizer = NullQuantizer(action_space)

# Policy of the single agent; the learning algorithm takes a list with one policy per agent.
q_policy = QuantizedPolicy(state_quantizer, action_quantizer, index_policy='random_init', exploration_prob=0.5)
quantized_agent_policies = [q_policy]

def transition_func(x, us):
    """Move the state by 0.1 in the direction of the action, staying in [0, 1].

    Parameters
    ----------
    x : current state.
    us : sequence of actions; only the first entry (-1 or 1) is read.

    Returns
    -------
    ``x + 0.1 * u`` clipped to the interval [0, 1].
    """
    step = 0.1 * us[0]
    return np.clip(x + step, 0, 1)

# Bundle the state space, the list of per-agent action spaces (one agent here)
# and the transition function into the kernel object used by the learner.
transition_kernel = TransitionKernel(state_space, [action_space], transition_func)

def reward_func(x, us):
    """Reward equal to the state itself, so larger states pay more.

    The actions ``us`` are not consulted.
    """
    reward = x
    return reward


# One reward function per agent.
reward_funcs = [reward_func]

# Learning configuration. List-valued settings hold one entry per agent.

# Presumably the discount factor -- confirm against quantized_q_learning_alg.
betas = [0.25]

# Iteration count; the inner progress bar of the run counts up to this value.
T = 100000


def alpha_func(n):
    """Step-size schedule 1/(n + 1) handed to the learning algorithm."""
    return 1 / (n + 1)


n_exploration_phases = 1

# Semantics of these two are defined by quantized_q_learning_alg.
deltas = [0.0001]
inertias = [0.5]


def get_initial_state():
    """Return the starting state, the left end of the interval."""
    return 0

In [15]:
# Run the learning algorithm on the configuration defined above, with early stopping disabled.
# Note: `quantized_agent_policies` is rebound to the policies returned by the call.
Qs, quantized_agent_policies, history = quantized_q_learning_alg(quantized_agent_policies, transition_kernel, get_initial_state,
                reward_funcs, betas, T, alpha_func, n_exploration_phases, deltas, inertias, early_stopping=False)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1.0), HTML(value='')))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=100000.0), HTML(value='')))




In [16]:
Qs

[array([[0.0108065 , 0.04384476],
        [0.11078582, 0.17665118],
        [0.24383492, 0.30907713],
        [0.37664552, 0.4412434 ],
        [0.53842975, 0.57598689],
        [0.76172575, 0.81857168],
        [0.9027839 , 0.97159355],
        [1.04097226, 1.09899312],
        [1.17097353, 1.21966586],
        [1.29793467, 1.31495625]])]

In [17]:
quantized_agent_policies[0].get_policy_map()

{0.0: 1,
 0.1111111111111111: 1,
 0.2222222222222222: -1,
 0.3333333333333333: -1,
 0.4444444444444444: -1,
 0.5555555555555556: 1,
 0.6666666666666666: 1,
 0.7777777777777777: -1,
 0.8888888888888888: -1,
 1.0: -1}

## Continuous State-Space Single-Agent

Similar to the above, except that the reward is the negative distance from $x = 0.5$, i.e. $-|x - 0.5|$, so states near the middle of the interval are preferred.

In [18]:
# Continuous state space with a finite (two-element) action space.
state_space = ContinuousInterval(0, 1) # continuous state space [0, 1]
action_space = DiscreteSpace([-1, 1]) # two actions

# Only the state needs discretising (10 bins); the actions are already finite.
state_quantizer = UniformQuantizer(state_space, n_bins=10)
action_quantizer = NullQuantizer(action_space)

# Policy of the single agent; the learning algorithm takes a list with one policy per agent.
q_policy = QuantizedPolicy(state_quantizer, action_quantizer, index_policy='random_init', exploration_prob=0.5)
quantized_agent_policies = [q_policy]

def transition_func(x, us):
    """Shift the state by 0.1 times the action and clip the result to [0, 1].

    ``x`` is the current state; only the first entry of ``us`` (-1 or 1)
    is read.
    """
    return np.clip(x + 0.1 * us[0], 0, 1)

# Bundle the state space, the list of per-agent action spaces (one agent here)
# and the transition function into the kernel object used by the learner.
transition_kernel = TransitionKernel(state_space, [action_space], transition_func)

def reward_func(x, us):
    """Reward is minus the distance of the state from 0.5.

    The maximum (0) is attained at ``x == 0.5``; the actions ``us`` are
    not consulted.
    """
    distance_from_centre = np.abs(x - 0.5)
    return -distance_from_centre


# One reward function per agent.
reward_funcs = [reward_func]

# Learning configuration. List-valued settings hold one entry per agent.

# Presumably the discount factor -- confirm against quantized_q_learning_alg.
betas = [0.25]

# Iteration count; the inner progress bar of the run counts up to this value.
T = 100000


def alpha_func(n):
    """Step-size schedule 1/(n + 1) handed to the learning algorithm."""
    return 1 / (n + 1)


n_exploration_phases = 1

# Semantics of these two are defined by quantized_q_learning_alg.
deltas = [0.0001]
inertias = [0.5]


def get_initial_state():
    """Return the starting state, the left end of the interval."""
    return 0

In [19]:
# Run the learning algorithm on the configuration defined above, with early stopping disabled.
# Note: `quantized_agent_policies` is rebound to the policies returned by the call.
Qs, quantized_agent_policies, history = quantized_q_learning_alg(quantized_agent_policies, transition_kernel, get_initial_state,
                reward_funcs, betas, T, alpha_func, n_exploration_phases, deltas, inertias, early_stopping=False)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1.0), HTML(value='')))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=100000.0), HTML(value='')))




In [20]:
Qs

[array([[-0.64939519, -0.61987103],
        [-0.55111295, -0.48794686],
        [-0.42086477, -0.35599115],
        [-0.28781956, -0.22744549],
        [-0.13746208, -0.11201001],
        [-0.10267158, -0.12619157],
        [-0.22518406, -0.28814459],
        [-0.35583935, -0.42146178],
        [-0.48838033, -0.55489541],
        [-0.62154145, -0.654924  ]])]

In [21]:
quantized_agent_policies[0].get_policy_map()

{0.0: 1,
 0.1111111111111111: 1,
 0.2222222222222222: 1,
 0.3333333333333333: 1,
 0.4444444444444444: 1,
 0.5555555555555556: -1,
 0.6666666666666666: -1,
 0.7777777777777777: -1,
 0.8888888888888888: -1,
 1.0: -1}

## Continuous State-Space Two-Agent

A 2-agent generalization of the above, where the state moves forward when the agents coordinate (play the same action) and backward when they do not.

In [22]:
# Shared continuous state space and one finite action space per agent.
state_space = ContinuousInterval(0, 1) # continuous state space [0, 1]
action_space1 = DiscreteSpace([-1, 1]) # two actions
action_space2 = DiscreteSpace([-1, 1]) # two actions

# Each agent discretises the shared state space with its own 10-bin quantizer.
state_quantizer1 = UniformQuantizer(state_space, n_bins=10)
state_quantizer2 = UniformQuantizer(state_space, n_bins=10)

# Actions are already finite, so no real quantization is needed.
action_quantizer1 = NullQuantizer(action_space1)
action_quantizer2 = NullQuantizer(action_space2)

# One policy per agent, both constructed with the same settings.
q_policy1 = QuantizedPolicy(state_quantizer1, action_quantizer1, index_policy='random_init', exploration_prob=0.25)
q_policy2 = QuantizedPolicy(state_quantizer2, action_quantizer2, index_policy='random_init', exploration_prob=0.25)

# Order matters: entry i of every per-agent list below refers to agent i.
quantized_agent_policies = [q_policy1, q_policy2]

def transition_func(x, us):
    """Move forward by 0.1 when the agents agree, backward by 0.1 otherwise.

    Parameters
    ----------
    x : current state.
    us : pair of the two agents' actions.

    Returns
    -------
    The shifted state, clipped to the interval [0, 1].
    """
    first_action, second_action = us
    if first_action == second_action:
        direction = 1   # coordinated: go forward
    else:
        direction = -1  # not coordinated: go backward
    return np.clip(x + 0.1 * direction, 0, 1)

# Bundle the state space, the two per-agent action spaces and the joint
# transition function into the kernel object used by the learner.
transition_kernel = TransitionKernel(state_space, [action_space1, action_space2], transition_func)

def reward_func(x, us):
    """Shared team reward: minus the distance of the state from 0.5.

    The joint action ``us`` is not consulted.
    """
    offset = x - 0.5
    return -np.abs(offset)


# Both agents receive the same reward (team problem).
reward_funcs = [reward_func, reward_func]

# Learning configuration. List-valued settings hold one entry per agent (two agents).

# Presumably the discount factors -- confirm against quantized_q_learning_alg.
betas = [0.25, 0.25]

# Iteration count; the inner progress bar of the run counts up to this value.
T = 100000


def alpha_func(n):
    """Step-size schedule 1/(n + 1) handed to the learning algorithm."""
    return 1 / (n + 1)


n_exploration_phases = 1

# Semantics of these two are defined by quantized_q_learning_alg.
deltas = [0.0001, 0.0001]
inertias = [0.5, 0.5]


def get_initial_state():
    """Return the starting state, the left end of the interval."""
    return 0

In [23]:
# Run the learning algorithm on the two-agent configuration above, with early stopping disabled.
# Note: `quantized_agent_policies` is rebound to the policies returned by the call.
Qs, quantized_agent_policies, history = quantized_q_learning_alg(quantized_agent_policies, transition_kernel, get_initial_state,
                reward_funcs, betas, T, alpha_func, n_exploration_phases, deltas, inertias, early_stopping=False)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1.0), HTML(value='')))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=100000.0), HTML(value='')))




In [24]:
Qs

[array([[-0.60529981, -0.48286549],
        [-0.5018332 , -0.39595302],
        [-0.39702736, -0.35418681],
        [-0.2869505 , -0.21852211],
        [-0.07369052, -0.08097699],
        [-0.07930672, -0.08747975],
        [-0.17215772, -0.25154934],
        [-0.34313679, -0.37399459],
        [-0.35810128, -0.51415424],
        [-0.58903872, -0.64107224]]),
 array([[-0.62740738, -0.65035598],
        [-0.49869668, -0.54715235],
        [-0.36451296, -0.41522599],
        [-0.22623865, -0.28128899],
        [-0.08957105, -0.07488306],
        [-0.10114662, -0.08103516],
        [-0.22815452, -0.2814276 ],
        [-0.41423768, -0.36430974],
        [-0.49759442, -0.54713628],
        [-0.65038067, -0.62673511]])]

In [25]:
quantized_agent_policies[0].get_policy_map(), quantized_agent_policies[1].get_policy_map()

({0.0: 1,
  0.1111111111111111: 1,
  0.2222222222222222: 1,
  0.3333333333333333: 1,
  0.4444444444444444: -1,
  0.5555555555555556: -1,
  0.6666666666666666: -1,
  0.7777777777777777: -1,
  0.8888888888888888: -1,
  1.0: -1},
 {0.0: -1,
  0.1111111111111111: -1,
  0.2222222222222222: -1,
  0.3333333333333333: -1,
  0.4444444444444444: 1,
  0.5555555555555556: 1,
  0.6666666666666666: -1,
  0.7777777777777777: 1,
  0.8888888888888888: -1,
  1.0: 1})