In [1]:
import numpy as np
from queue import Queue

In [2]:
# Encapsulates a vector of each arm
class Arms:
    # Creates a vector of N arms
    def __init__(self, N):
      self.N = N
      self.reward_weights = self._rand()
      self.bid_weights = self._rand()

      print("Average reward by arm:")
      print(self.reward_weights / 2, "\n")
    
      print("Average bid by arm:")
      print(self.bid_weights / 2, "\n")

    # Returns an np array of the reward for each arm
    def generate_rewards(self):
        return self._rand() * self.reward_weights

    # Returns an np array of the bid for each arm
    def generate_bids(self):
        return self._rand() * self.bid_weights
    
    def _rand(self):
        return np.random.rand(self.N) * .99 + 0.01

In [3]:
# Takes the average rewards vector, total visits vector, number of arms chosen per round, and current round
# and returns a vector of the upper confidence bounds
def compute_ucb_rewards(expected_rewards, visits, cooldown, K, t):
    ucf = ((K + 1) * np.log(t) / visits) ** 1/2
    ucb = expected_rewards + ucf
    available_ucb = ucb * cooldown
    return available_ucb

In [4]:
# Performs one round of selecting every arm to initialize the expected_rewards, paying the maximum cost 
def first_round(arms, N):
    visits = np.full(N, 1)
    costs = arms.generate_bids()
    cost = sum(costs)
    expected_rewards = arms.generate_rewards()
    cooldown = np.full(N, 1)
    return visits, expected_rewards, cost, cooldown

In [5]:
# Holds an auction, choosing the best K arms to explore depending on their bids during the action and
# depending on expected_rewards, visits, and the current round. Returns the ucb rewards of each arm, a new_visits
# one hot encoded vector for whether or not each arm is chosen, and the bids of each arm
def auction_results(arms, expected_rewards, visits, cooldown, K, t):
    bids = arms.generate_bids()
    ucb_rewards = compute_ucb_rewards(expected_rewards, visits, cooldown, K, t)
    score_to_index = sorted([(ucb_rewards[i], i) for i in range(len(ucb_rewards))], reverse=True)
    new_visits = np.zeros(len(ucb_rewards))
    for i in range(K):
        new_visits[score_to_index[i][1]] = 1
    return ucb_rewards, new_visits, bids, score_to_index[K][1]

In [6]:
# Calculates the cost of this round
def compute_cost(arms, ucb_rewards, new_visits, bids, best_unchosen_idx):
    best_unchosen_bid = bids[best_unchosen_idx]
    best_unchosen_ucb = ucb_rewards[best_unchosen_idx]
    
    # Zero out unchosen arms
    chosen_ucb = ucb_rewards * new_visits

    payments = chosen_ucb * best_unchosen_bid / best_unchosen_ucb
    payments = np.minimum(payments, np.array([1]))

    cost = sum(payments)
    return cost

In [7]:
# Returns a vector of the rewards of the chosen arms and 0 for unchosen arms
def get_rewards(arms, new_visits):
    rewards = arms.generate_rewards()
    
    # Zero out unchosen arms
    return rewards * new_visits

In [8]:
# Updates the average reward of each arm using the most recent rewards
def update_expectations(expected_rewards, new_rewards, visits):
    # If new_rewards[i] is zeroed out, replace the zero with the average, so it doesn't change the average
    new_rewards = np.array([new_rewards[i] if new_rewards[i] != 0
                                           else expected_rewards[i] for i in range(len(new_rewards))])

    expected_rewards = ((expected_rewards * visits) + new_rewards) / (visits + 1)
    return expected_rewards

In [9]:
# Performs one round. Returns vectors of the visits from that round, rewards for that round,
# and the float cost of the round
def round(arms, expected_rewards, visits, cooldown, K, t):
    ucb_rewards, new_visits, bids, best_unchosen = auction_results(arms, expected_rewards, visits, cooldown, K, t)
    new_rewards = get_rewards(arms, new_visits)
    new_cost = compute_cost(arms, ucb_rewards, new_visits, bids, best_unchosen)
    return new_visits, new_rewards, new_cost

In [10]:
# Updates the cooldowns for the next round, returning a vector containing for each arm a 1
# if that arm is available, 0 if that arm is on cooldown
def update_cooldown(cooldown_queue, cooldown, new_visits, N):
    for i in range(N):
        if new_visits[i] == 1:
            if cooldown_queue.full():
                ready = cooldown_queue.get()
                cooldown[ready] = 1
            cooldown_queue.put(i)
            cooldown[i] = 0
    return cooldown

In [11]:
# Runs the simulation
def AUCB(arms, N, K, B):
    cooldown_queue = Queue(maxsize=COOLDOWNTIMER)
    total_reward = 0
    visits, expected_rewards, cost, cooldown = first_round(arms, N)
    t = 1
    while True:
        new_visits, new_rewards, cost = round(arms, expected_rewards, visits, cooldown, K, t)
        
        if B < cost:
            return total_reward, expected_rewards

        cooldown = update_cooldown(cooldown_queue, cooldown, new_visits, N)
        expected_rewards = update_expectations(expected_rewards, new_rewards, visits)
        visits = visits + new_visits
        total_reward = total_reward + new_rewards
        B = B - cost
        t = t + 1

In [12]:
COOLDOWNTIMER = 5
N = 30  # Number of arms
K = 1  # Number of arms to choose per round
B = 1200  # Budget
arms = Arms(N)
reward_by_arm, expected_rewards = AUCB(arms, N, K, B)
total_reward = sum(reward_by_arm)

print("Total reward:")
print(total_reward, "\n")

print("Total reward gained from each arm:")
print(reward_by_arm, "\n")

print("Expected reward of each arm:")
print(expected_rewards)

Average reward by arm:
[0.32651249 0.19893439 0.11583326 0.02635703 0.01169328 0.40128286
 0.3201941  0.15283144 0.05229151 0.42692083 0.44357561 0.04712348
 0.34696753 0.34829345 0.2329581  0.24846068 0.34167092 0.4065
 0.29655711 0.04400164 0.39469137 0.38032819 0.48615409 0.03421361
 0.4445384  0.24200362 0.23041956 0.42750157 0.25340096 0.00867684] 

Average bid by arm:
[0.34715267 0.00864131 0.09430332 0.33554694 0.1463674  0.33694516
 0.27489336 0.02001804 0.32038167 0.2384153  0.34792449 0.45809519
 0.10585933 0.21202217 0.016764   0.48041595 0.40684215 0.25902905
 0.4438605  0.46834887 0.09287509 0.22309207 0.3580702  0.28363907
 0.07482802 0.20235008 0.07975426 0.21975258 0.30530666 0.0187977 ] 

Total reward:
1864.6413641018364 

Total reward gained from each arm:
[2.40226811e+01 7.97916567e+00 2.82107153e+00 5.42001061e-01
 2.52566205e-01 6.55998887e+01 2.61761619e+01 3.34510321e+00
 9.61365065e-01 1.47241085e+02 2.60574118e+02 8.27153117e-01
 4.84631104e+01 5.44445335e+01 1