In [None]:
import numpy as np
from random import choices

# Fictitious play

In [None]:
class Agent:
    """A player in a two-player matrix game that can learn by fictitious play.

    Attributes:
        best_response: when False, actions are sampled from the fixed mixed
            strategy in ``actions``; when True, the agent plays the action that
            maximises expected reward against the observed opponent frequencies.
        actions: dict mapping each own action to its sampling probability.
        rewards: nested mapping ``rewards[own_action][opponent_action] -> payoff``.
        opp_count: dict mapping each opponent action to the number of times it
            has been observed.
    """

    def __init__(self, actions, rewards, probs=None):
        """Create an agent.

        Args:
            actions: sequence of the agent's own action labels.
            rewards: nested mapping ``rewards[own_action][opponent_action]``.
            probs: optional probabilities, one per action; uniform when None.
        """
        self.best_response = False
        self.actions = self.create_actions(actions, probs)
        self.rewards = rewards
        # Start with no observations so that best-response code never touches
        # a missing attribute; set_opp_count() fills in the opponent's actions.
        self.opp_count = {}

    @staticmethod
    def create_actions(actions, probs):
        """Build the ``action -> probability`` mapping for the mixed strategy.

        Args:
            actions: sequence of action labels.
            probs: matching probabilities, or None for a uniform distribution.

        Returns:
            dict mapping each action to its probability.

        Raises:
            RuntimeError: if ``actions`` and ``probs`` differ in length.
            ValueError: if a probability is negative or they do not sum to 1.
        """
        if probs is None:
            return {a: 1/len(actions) for a in actions}
        if len(actions) != len(probs):
            raise RuntimeError("Actions and probabilities should have the same length")
        if any(p < 0 for p in probs):
            raise ValueError("Probabilities should be non-negative")
        # Compare with a tolerance: an exact float comparison rejects valid
        # distributions such as [0.1] * 10, whose sum is 0.9999999999999999.
        if abs(sum(probs) - 1) > 1e-9:
            raise ValueError("Probabilities should add up to 1")
        return dict(zip(actions, probs))

    def set_opp_count(self):
        """Reset the observation counts to zero for every opponent action.

        The opponent's actions are read from the reward row of the agent's
        first action, so no random draw (and no best-response computation on
        possibly stale counts) is needed just to discover the labels.
        """
        if not self.actions:
            self.opp_count = {}
            return
        first_action = next(iter(self.actions))
        self.opp_count = {a: 0 for a in self.rewards[first_action].keys()}

    def record_opponent_action(self, opp_action):
        """Increment the count of an observed opponent action.

        Raises:
            KeyError: if ``opp_action`` is not one of the known opponent actions.
        """
        if opp_action not in self.opp_count:
            raise KeyError("Opponent action different from the list of possible actions")
        self.opp_count[opp_action] += 1

    def select_action(self):
        """Return the next action: a best response if enabled, otherwise a random draw."""
        if self.best_response:
            return self.select_best_response()
        return self._sample_mixed_strategy()

    def _sample_mixed_strategy(self):
        """Draw one action according to the probabilities stored in ``actions``."""
        return choices(list(self.actions.keys()), weights=list(self.actions.values()))[0]

    def select_best_response(self):
        """Return the action with the highest expected reward against the
        empirical distribution of the opponent's observed actions.

        With no observations yet there is no empirical distribution, so the
        agent falls back to sampling from its mixed strategy instead of
        dividing by zero.
        """
        total = sum(self.opp_count.values())
        if total == 0:
            return self._sample_mixed_strategy()
        utilities = {
            a: sum(self.rewards[a][a2] * c / total for a2, c in self.opp_count.items())
            for a in self.actions
        }
        # Ties are resolved in favour of the action that comes first in `actions`.
        return max(utilities, key=utilities.get)


class Game:
    """Repeated two-player game in which both agents learn by fictitious play.

    Attributes:
        agents: sequence of exactly two agents (player 1, player 2).
        n_initial_games: number of warm-up rounds played with random actions.
        total_games: total number of rounds, warm-up included.
    """

    def __init__(self, agents, n_initial_games=1, total_games=10000):
        self.n_initial_games = n_initial_games
        self.total_games = total_games
        self.agents = agents

    def play_game(self):
        """Play one round and let each agent record the other's action.

        NOTE: player 2's action is drawn and recorded by player 1 first, and
        player 1's action is drawn afterwards. When player 1 plays a best
        response, its counts therefore already include player 2's move from
        this round.
        """
        a1, a2 = self.agents
        a1.record_opponent_action(a2.select_action())
        a2.record_opponent_action(a1.select_action())

    def play(self):
        """Run the whole simulation: reset counts, warm up, then best-respond."""
        for agent in self.agents:
            agent.set_opp_count()
            agent.best_response = False
        # Play some initial games randomly to accrue some data
        for _ in range(self.n_initial_games):
            self.play_game()
        # Switch to playing best responses
        for agent in self.agents:
            agent.best_response = True
        for _ in range(self.total_games - self.n_initial_games):
            self.play_game()

    def summary(self):
        """Print the empirical action frequencies of both players.

        A player's frequencies are read from the *other* agent's ``opp_count``,
        because each agent only counts what its opponent played. The data comes
        from this instance (``self``), not from a notebook-level variable.
        """
        for label, observer in (("Player 1", self.agents[1]), ("Player 2", self.agents[0])):
            counts = observer.opp_count
            print(f"{label} plays the actions {tuple(counts.keys())} with probabilities " +
                  f"{tuple(round(c / self.total_games, 3) for c in counts.values())}")

In [None]:
# Action labels: A1 indexes the rows of the payoff array, A2 its columns
A1, A2 = ['A', 'B', 'C'], ['W', 'X', 'Y', 'Z']
# rewards[i][j] is the payoff pair for the action profile (A1[i], A2[j]);
# entry 0 of each pair goes into R1 and entry 1 into R2
rewards = np.asarray([
    [[1,5], [2,2], [3,4], [3,1]],
    [[3,0], [4,1], [2,5], [4,2]],
    [[1,3], [2,6], [5,2], [2,3]]
])
# R1[own][opp]: first payoff entry, keyed by the row action then the column action
R1 = {own: dict(zip(A2, payoffs[:, 0])) for own, payoffs in zip(A1, rewards)}
# R2[own][opp]: second payoff entry, keyed by the column action then the row action
R2 = {own: dict(zip(A1, payoffs[:, 1])) for own, payoffs in zip(A2, rewards.transpose(1, 0, 2))}
game = Game([Agent(A1, R1), Agent(A2, R2)])

In [None]:
# Run the simulation (random warm-up rounds followed by best-response rounds),
# then print how often each player chose each of its actions
game.play()
game.summary()

# Monte Carlo sampling

In [None]:
# Monte Carlo estimate of E[cos(X)^2] for X drawn from a standard normal
size = 10000
sample = np.square(np.cos(np.random.randn(size)))
estimate, spread = sample.mean(), sample.var()
print(f"Expected value:\t{estimate:.3}\nVariance:\t{spread:.3}")

# 3.291 is the two-sided z-score for a 99.9% confidence level;
# the half-width of the interval is z * (standard deviation / sqrt(n))
z_score = 3.291
half_width = z_score * sample.std() / np.sqrt(size)
print(f"The 99.9% confidence interval is {estimate:.3f}±{half_width:.3f}")

# Thompson sampling

In [None]:
import matplotlib.pyplot as plt
from random import uniform

class KArmedBandit:
    """Bernoulli bandit: each arm pays a reward of 1 with its own fixed probability.

    Attributes:
        arms: list with the success probability of every arm.
    """

    def __init__(self, k=1, probs=None):
        """Create the bandit.

        Args:
            k: number of arms to create with random probabilities drawn from
                uniform(0, 1); ignored when ``probs`` is given.
            probs: optional iterable of success probabilities, one per arm.
        """
        self.arms = [uniform(0, 1) for _ in range(k)] if probs is None else list(probs)

    def __call__(self, arm):
        """Shorthand for ``pull(arm)``."""
        return self.pull(arm)

    def __getitem__(self, i):
        """Return the success probability of arm ``i``."""
        return self.arms[i]

    def __repr__(self):
        lines = [f"{len(self.arms)}-Armed Bandit with:"]
        lines.extend(f"  p{i+1}={p:.3f}" for i, p in enumerate(self.arms))
        return "\n".join(lines)

    def pull(self, arm):
        """Pull one arm and return the reward (1 or 0).

        Args:
            arm: zero-based arm index. Any integer-like value is accepted,
                including numpy integers such as the result of ``np.argmax``;
                booleans are rejected.

        Raises:
            IndexError: if ``arm`` is not an integer or is out of range.
        """
        from operator import index

        # bool is an int subclass, but an arm index of True/False is almost
        # certainly a mistake, so keep rejecting it.
        if isinstance(arm, bool):
            raise IndexError("Invalid arm selected")
        try:
            arm = index(arm)
        except TypeError:
            raise IndexError("Invalid arm selected") from None
        if not 0 <= arm < len(self.arms):
            raise IndexError("Invalid arm selected")
        return int(self.arms[arm] > uniform(0, 1))

In [None]:
kab = KArmedBandit(probs=[0.2, 0.5, 0.9])
samples = 1200
plt.figure(figsize=(16, 6))

for arm in range(len(kab.arms)):
    # Pull the same arm `samples` times; a and b are the running counts of
    # rewards and non-rewards after each pull
    outcomes = np.asarray([kab(arm) for _ in range(samples)])
    a, b = outcomes.cumsum(), (1-outcomes).cumsum()
    # Mean and standard deviation of a Beta(a, b) distribution
    mean = a/(a+b)
    std = np.sqrt(a*b/((a+b)**2*(a+b+1)))

    plt.plot(mean, label=f"Arm with p={kab[arm]}")
    plt.fill_between(range(samples), mean + std, mean - std, alpha=0.6)
# Dashed reference lines at the true success probabilities
plt.hlines(kab.arms, 0, samples, linestyle="--", colors='dimgray')
plt.legend()
plt.yticks([0, 0.4, 0.6, 0.8, 1] + kab.arms)
plt.title("Convergence to true probability", fontsize=18)
plt.ylabel("Probability p of getting reward r", fontsize=14)
plt.xlabel("Iteration", fontsize=14)
# tight_layout only adjusts what is already drawn, so call it after plotting
plt.tight_layout()
# plt.savefig("convergence.png")
plt.show()

In [None]:
k = 3
kab = KArmedBandit(k=k)
# One [alpha, beta] row per arm; [1, 1] is the uniform Beta(1, 1) prior
params = np.asarray([[1, 1] for _ in range(k)])
print(kab)

# Regret of pulling each arm once: gap between the best arm's success
# probability and that arm's own
regret = np.max(kab.arms) - np.asarray(kab.arms)
arms = list()

iterations = 1000 * k
for i in range(iterations):
    # Thompson sampling: draw one value per arm from its Beta distribution
    # and play the arm with the largest draw
    arm = np.argmax([np.random.beta(*ab) for ab in params])
    r = kab(int(arm))
    # A reward increments alpha, a miss increments beta
    params[arm] += [r, 1-r]
    arms.append(arm)

plt.figure(figsize=(16, 6))
plt.plot(regret[arms].cumsum(), label="Thompson sampling")

# Repeat the experiment with UCB for several exploration constants c
for c, style in zip(np.logspace(-3, 0, 4), [(0, (1, 10)), '--', ':', '-.']):
    counts, reward_sums = [0] * k, [0] * k
    arms = list()
    for i in range(iterations):
        # Unplayed arms score infinity, so every arm is tried once first; the
        # log term is only evaluated once some arm has been played (i >= 1).
        # np.inf is used because the np.infty alias was removed in NumPy 2.0.
        arm = np.argmax([np.inf if count == 0 else total/count + c*np.sqrt(np.log(i)/count)
                         for total, count in zip(reward_sums, counts)])
        r = kab(int(arm))
        counts[arm] += 1
        reward_sums[arm] += r
        arms.append(arm)
    plt.plot(regret[arms].cumsum(), label=f"UCB with c={c:.0e}", linestyle=style)
plt.legend()
plt.title("Thompson sampling vs UCB", fontsize=18)
plt.ylabel("Regret", fontsize=14)
plt.xlabel("Iteration", fontsize=14)
# tight_layout only adjusts what is already drawn, so call it after plotting
plt.tight_layout()
# plt.savefig("thompson_ucb.png")
plt.show()