In [52]:
import numpy as np
from scipy.special import xlogy

from base_agents import RandomAgent
from env import AssortmentEnvironment
from run_utils import run_episode, get_prior

# NOTE(review): FastRBTree is used further down but is not imported anywhere in
# this notebook; it presumably comes from the third-party `bintrees` package —
# confirm and add the import here.

In [53]:
# Experiment configuration shared by the cells below.
# Number of items (passed as n= / n_items= to the agent, the prior and the environment).
N = 10
# Passed as k= to RandomAgent — presumably the assortment size; confirm against base_agents.
K = 3
# Number of steps per episode (n_steps= in run_episode).
T = 1000
# Number of actions to keep (m_actions= in observations_to_actions).
M = 25

In [39]:
# Build the agent: a RandomAgent over N items with k=K.
agent = RandomAgent(k=K, n=N)

# Container for experiment results (not populated in this cell).
experiment_data = []

# Draw the item preferences from the "uniform" prior and build the environment from them.
run_preferences = get_prior(n_items=N, prior_type="uniform", fallback_weight=None)
env = AssortmentEnvironment(n=N, v=run_preferences)

# Roll the agent out for T steps, keeping the observations and the rewards.
obs_run, rewards_run = run_episode(envnmt=env, actor=agent, n_steps=T, verbose=False)

100%|██████████| 1000/1000 [00:00<00:00, 6908.54it/s]


In [5]:
def action_statistics(observations, n_items):
    """Summarise how often each action was played and how often it yielded ``n_items``.

    Actions are compared irrespective of element order: each one is canonicalised
    to a sorted tuple, and ids are handed out in order of first appearance.

    Args:
        observations: sized iterable of ``(action, observation)`` pairs.
        n_items: observation value to track (rounds where ``observation == n_items``).

    Returns:
        A triple ``(id_to_action, probas, zero_given_x)``:
        ``id_to_action`` maps each id to its sorted action tuple,
        ``probas[i]`` is the fraction of all rounds in which action ``i`` was played,
        ``zero_given_x[i]`` is the fraction of those rounds whose observation
        equalled ``n_items``.

    Raises:
        AssertionError: if the play frequencies do not sum to 1 (this is what
            happens for an empty ``observations``).
    """
    plays = {}  # canonical action -> times played (dicts keep first-seen order)
    hits = {}   # canonical action -> times the observation equalled n_items
    for raw_action, outcome in observations:
        key = tuple(sorted(raw_action))
        if key not in plays:
            plays[key] = 0.
            hits[key] = 0.
        plays[key] += 1.
        if outcome == n_items:
            hits[key] += 1.

    count = len(plays)
    play_counts = np.array(list(plays.values()), dtype=float)
    hit_counts = np.array([hits[key] for key in plays], dtype=float)

    # Conditional frequency first (needs raw counts), then normalise the counts.
    zero_given_x = hit_counts / play_counts
    probas = play_counts / len(observations)

    assert probas.shape[0] == count
    assert zero_given_x.shape[0] == count
    assert (1 - 1e-5) < probas.sum() < (1 + 1e-5)

    id_to_action = dict(enumerate(plays))
    return id_to_action, probas, zero_given_x

def f_function(t, p0):
    """Return ``t*log(t/p0) + (1-t)*log((1-t)/(1-p0))`` with ``t`` clipped to [0, 1].

    This is the KL divergence between Bernoulli(t) and Bernoulli(p0). ``xlogy``
    makes the ``0 * log(0)`` terms evaluate to 0 instead of NaN.

    Args:
        t: scalar or array; values outside [0, 1] are clipped first.
        p0: reference probability; no guard is applied for ``p0`` equal to 0 or 1.
    """
    success = np.clip(t, 0., 1.)
    failure = 1 - success
    success_term = xlogy(success, success / p0)
    failure_term = xlogy(failure, failure / (1 - p0))
    return success_term + failure_term

def query_improvement(idx, data, p_i, p_i_c, p0):
    """Score the gain obtained by splitting the segment around ``idx`` at ``idx``.

    ``idx`` is looked up in ``data`` to find its neighbouring keys ``s_left`` and
    ``s_right``. The segment ``(s_left, s_right]`` is split into a left piece of
    mass ``p`` and a right piece of mass ``q``, with per-piece rates ``alpha`` and
    ``beta``. The score is

        p * f(alpha) + q * f(beta) - (p + q) * f((alpha*p + beta*q) / (p + q))

    where ``f`` is ``f_function(., p0)``.

    Args:
        idx: key whose neighbours delimit the segment. The caller in this file
            inserts ``idx`` into ``data`` before calling.
        data: ordered container exposing ``prev_item(key)`` / ``succ_item(key)``
            that return ``(key, value)`` pairs.
        p_i: array indexed by key; differences give the mass of each piece
            (presumably cumulative probabilities — confirm against the caller).
        p_i_c: array indexed by key; ``p_i[k] * p_i_c[k]`` is differenced to
            recover each piece's rate.
        p0: reference probability forwarded to ``f_function``.

    Returns:
        The score as a float. It can be NaN for degenerate inputs (for instance
        ``p0`` equal to 0 or 1); the value is returned as-is so the caller
        decides how to treat it, instead of dropping into a debugger.
    """
    s_left = data.prev_item(idx)[0]
    s_right = data.succ_item(idx)[0]

    # Mass of the left and right pieces produced by the split.
    p = p_i[idx] - p_i[s_left]
    q = p_i[s_right] - p_i[idx]

    # Rate inside each piece, recovered from the running products.
    alpha = (p_i[idx] * p_i_c[idx] - p_i[s_left] * p_i_c[s_left]) / p
    beta = (p_i[s_right] * p_i_c[s_right] - p_i[idx] * p_i_c[idx]) / q

    # Rate of the unsplit segment: mass-weighted mean of the two pieces.
    merged = (alpha * p + beta * q) / (p + q)

    return (
        f_function(t=alpha, p0=p0) * p
        + f_function(t=beta, p0=p0) * q
        - (p + q) * f_function(t=merged, p0=p0)
    )

In [20]:
def observations_to_actions(obs_run, n_items, m_actions, verbose=True):
    """Greedily pick up to ``m_actions`` representative actions from a run.

    Distinct actions are sorted by ``zero_given_x`` (the fraction of their rounds
    whose observation equalled ``n_items``). The sorted list is then cut into
    contiguous segments. Starting from one segment, each step samples candidate
    cut points at random, scores them with ``query_improvement`` and keeps the
    best one. Each final segment is represented by its last action.

    Args:
        obs_run: sized, re-iterable collection of ``(action, observation)`` pairs.
        n_items: observation value tracked by ``action_statistics``
            (presumably the "no purchase" outcome — confirm against the env).
        m_actions: number of actions to return; must be at least 1.
        verbose: when True (default) print the chosen cut point at every step.

    Returns:
        2-D array with one row per selected action. It has ``m_actions`` rows
        when the run contains at least ``m_actions`` distinct actions, otherwise
        one row per distinct action. All actions must have the same length for
        the rows to stack.

    Raises:
        ValueError: if ``m_actions`` is smaller than 1.
    """
    if m_actions < 1:
        raise ValueError(f"m_actions must be at least 1, got {m_actions}")

    id_to_action, x_probas, zero_given_x = action_statistics(observations=obs_run, n_items=n_items)

    # Overall frequency of the tracked observation across the whole run.
    p_0 = sum(1 for (_, it) in obs_run if it == n_items) / len(obs_run)

    # Reorder everything by increasing conditional frequency.
    sorting_indexes = np.argsort(zero_given_x)
    id_to_actions = [id_to_action[ix] for ix in sorting_indexes]
    zero_given_x = zero_given_x[sorting_indexes]
    x_probas = x_probas[sorting_indexes]

    n_actions = zero_given_x.shape[0]
    n_per_set = 2 * (n_actions // m_actions)

    # S holds the current cut points; 0 and n_actions are the fixed outer bounds.
    S = FastRBTree()
    S.insert(0, 0)
    S.insert(n_actions, n_actions)

    # p_inf[ix]: total mass of the first ix sorted actions.
    # p_inf_c[ix]: mass-weighted mean of zero_given_x over those ix actions.
    p_inf = np.zeros(n_actions + 1)
    p_inf_c = np.zeros(n_actions + 1)
    for ix in range(1, n_actions + 1):
        p_inf[ix] = p_inf[ix - 1] + x_probas[ix - 1]
        p_inf_c[ix] = (p_inf[ix - 1] * p_inf_c[ix - 1] + zero_given_x[ix - 1] * x_probas[ix - 1]) / p_inf[ix]

    # Only interior positions can become new cut points.
    indexes_available = np.ones(n_actions + 1, dtype=bool)
    indexes_available[n_actions] = False
    indexes_available[0] = False
    indexes = np.arange(n_actions + 1, dtype=int)

    for step_idx in range(m_actions - 1):
        candidate_pool = indexes[indexes_available]
        if candidate_pool.size == 0:
            # Every interior position is already a cut point: nothing left to add.
            break
        # Always try at least one candidate, never more than remain. Sampling
        # without replacement would otherwise raise, and an empty sample would
        # leave no index to add.
        sample_size = min(max(n_per_set, 1), candidate_pool.size)
        random_indexes = np.random.choice(candidate_pool, size=sample_size, replace=False)

        largest_imp = -np.inf
        index_added = None
        for index_candidate in random_indexes:
            # Temporarily insert the candidate so its neighbours can be queried.
            S.insert(index_candidate, index_candidate)
            improvement = query_improvement(idx=index_candidate, data=S, p_i=p_inf, p_i_c=p_inf_c, p0=p_0)
            if improvement > largest_imp:
                largest_imp = improvement
                index_added = index_candidate
            S.remove(index_candidate)

        if index_added is None:
            # No candidate produced a comparable score (e.g. all NaN): treat them
            # as tied and keep the first sampled one.
            index_added = random_indexes[0]
            largest_imp = float("nan")

        if verbose:
            print(f"improvement {largest_imp}, with {index_added} at step {step_idx + 1}")
        indexes_available[index_added] = False
        S.insert(index_added, index_added)

    # Skip the lower bound 0; every other key k closes a segment ending at sorted action k-1.
    return np.vstack([np.array(id_to_actions[key - 1]) for key in sorted(S.keys())[1:]])

improvement 0.04467871109800698, with 73 at step 1
improvement 0.016400526794599805, with 20 at step 2
improvement 0.005512109055157619, with 10 at step 3
improvement 0.003524423995922738, with 46 at step 4
improvement 0.005632010192837138, with 108 at step 5
improvement 0.0014865349616899007, with 86 at step 6
improvement 0.00036801816057700776, with 110 at step 7
improvement 0.0006083477571946494, with 116 at step 8
improvement 0.00046508350708036883, with 32 at step 9
improvement 0.00027770551866536375, with 94 at step 10
improvement 0.000151706728243421, with 67 at step 11
improvement 8.503307250205792e-05, with 57 at step 12
improvement 7.764816847561024e-05, with 40 at step 13
improvement 9.099093607269374e-05, with 85 at step 14
improvement 0.000110445702559624, with 14 at step 15
improvement 8.943934229299622e-05, with 118 at step 16
improvement 9.748245205930522e-05, with 114 at step 17
improvement 4.4795212243480854e-05, with 24 at step 18
improvement 4.0836956207354006e-05, 

In [55]:
# Run a fresh episode with a RandomAgent, then reduce the observed actions to M of them.
agent = RandomAgent(k=K, n=N)
run_preferences = get_prior(n_items=N, prior_type="uniform", fallback_weight=None)
env = AssortmentEnvironment(n=N, v=run_preferences)
obs_run, rewards_run = run_episode(envnmt=env, actor=agent, n_steps=T, verbose=False)
limited_actions = observations_to_actions(
    obs_run=obs_run,
    n_items=N,
    m_actions=M,
)

100%|██████████| 1000/1000 [00:00<00:00, 8493.00it/s]

improvement 0.04595445082928895, with 61 at step 1
improvement 0.012423296614438532, with 27 at step 2
improvement 0.006852594800065864, with 98 at step 3
improvement 0.0011362536151441598, with 76 at step 4
improvement 0.006620476453818559, with 6 at step 5
improvement 0.0010759474518450517, with 109 at step 6
improvement 0.0019738412427992916, with 16 at step 7
improvement 0.000589289135951891, with 53 at step 8
improvement 0.0002816402504926298, with 39 at step 9
improvement 0.0002081651937341951, with 9 at step 10
improvement 0.00020848477789806204, with 119 at step 11
improvement 0.00017594637816593407, with 100 at step 12
improvement 0.0001321651722482962, with 68 at step 13
improvement 9.798437083376882e-05, with 90 at step 14
improvement 6.607682552202138e-05, with 42 at step 15
improvement 2.1077548090748448e-05, with 73 at step 16
improvement 9.279260230032373e-05, with 117 at step 17
improvement 3.496054859009339e-05, with 47 at step 18
improvement 1.0885903623567873e-05, wi




In [57]:
# Sanity check: the rows of limited_actions are pairwise distinct and there are M of them.
len({tuple(action_row) for action_row in limited_actions}) == M

True