In [1]:
import numpy as np
import random
import math

In [None]:
class DuelingBandits:
    def __init__(self, num_arms, win_matrix, best_arm, time_horizon=10^10):
        self.best_arm = best_arm
        self.total_pulls = 0
        self.history = []
        self.time_horizon = time_horizon
        self.regret = 0
        self.num_arms = num_arms
        self.win_matrix = win_matrix  # Win matrix. win_matrix[i][j] is the probability that arm i beats arm j
        self.arms = {i:{'history':[], 'num_pulls':0, 'num_wins':0} for i in range(num_arms)}  # Set of arms
    
    def pull_arms(self, arm_1, arm_2):
        """Simulate pulling the selected arm and receiving a reward."""
        # Updates Globals:
        self.total_pulls += 1

        # Simulate pulling the arms
        if (np.random.binomial(1, self.win_matrix[arm_1, arm_2]+0.5)):
            winning_arm = arm_1
            losing_arm = arm_2
        else: 
            winning_arm = arm_2
            losing_arm = arm_1
        
        # Updates the number of pulls for each arm
        self.arms[arm_1]['num_pulls'] += 1
        self.arms[arm_2]['num_pulls'] += 1

        # Updates the number of wins for each arm
        self.arms[winning_arm]['num_wins'] += 1
        self.arms[losing_arm]['num_wins'] += 0

        # Add to the history of each of the arms:
        self.arms[arm_1]['history'].append({"winner": winning_arm, "loser": losing_arm, "round": self.total_pulls})
        self.arms[arm_2]['history'].append({"winner": winning_arm, "loser": losing_arm, "round": self.total_pulls})

        # Add to the global history
        self.history.append({"winner": winning_arm, "loser": losing_arm, "round": self.total_pulls})
        
        return winning_arm
    

In [2]:
# e_i,j = 0.1 for all b_i > b_j (beat the mean example 1)
K = 5 # number of bandits
T = 10^10 # time horizon

Eps = np.zeros((K, K))
for i in range(K):
    for j in range(i+1, K):
        Eps[i, j] = -0.1
        Eps[j, i] = 0.1

In [3]:
def round(b_1, b_2, eps):
    # returns the result for draw b_1 vs. b_2
    return np.random.binomial(1, eps[b_1, b_2]+0.5)

In [4]:
res = np.empty((1000, 1))
for i in range(1000):
    res[i] = round(1, 1, Eps)

print(sum(res))

[506.]


In [5]:
import pdb
def IF1(T, K, Eps):
    # runs interleaved filter 1 over K dueling-bandits with time-horizon T
    # win probability matrix Eps
    wins = np.zeros((K, K))
    P_hat = np.ones((K, K))*1/2
    n_rounds = np.zeros((K, K))
    c_hat = np.empty((K, K))
    delta = 1/(T*K^2)
    T_hat = 0 # total comparisons made

    def update(b_1, b_2, res):
        n_rounds[b_1, b_2] = n_rounds[b_1, b_2] + 1
        n_rounds[b_2, b_1] = n_rounds[b_2, b_1] + 1
        wins[b_1, b_2] += res
        wins[b_2, b_1] += (1-res)
        P_hat[b_1, b_2] = wins[b_1, b_2]/n_rounds[b_1, b_2]
        P_hat[b_2, b_1] = wins[b_2, b_1]/n_rounds[b_2, b_1]
        c_hat[b_1, b_2] = math.sqrt(math.log(1/delta)/n_rounds[b_1, b_2])
        c_hat[b_2, b_1] = math.sqrt(math.log(1/delta)/n_rounds[b_2, b_1])

    W = np.arange(0, K)
    b_hat = np.random.choice(K)
    W = np.array(W[~np.isin(W, b_hat)])
    while len(W) > 0:
        print(W)
        for b in W:
            res = round(b_hat, b, Eps)
            T_hat += 1
            update(b_hat, b, res)
        b_to_remove = np.where(np.logical_and(P_hat[b_hat, :] > 1/2, 
            (P_hat[b_hat, :] - c_hat[b_hat, :]) > 1/2))
        W = np.array(W[~np.isin(W, b_to_remove)])
        b_win = np.where(np.logical_and(P_hat[b_hat, :] < 1/2, 
            (P_hat[b_hat, :] + c_hat[b_hat, :]) < 1/2))[0]
        if len(b_win) > 0:
            b_hat = b_win[np.random.choice(len(b_win))]
            W = np.array(W[(~(np.isin(W, b_hat)))])
            wins = np.zeros((K, K))
            P_hat = np.ones((K, K))*1/2
            n_rounds = np.zeros((K, K))
            c_hat = np.empty((K, K))
    
    return (b_hat, T_hat)



In [6]:
print(IF1(T, K, Eps))

[0 1 2 4]
[0 1 2 4]
[0 1 2 4]
[0 2]
[0 2]
[0 2]
(4, 18)
