In [2]:
import numpy as np

In [48]:
class Non_stationary_environment():
  def __init__(self, bid, ec, en, conversion_funcs, N, C,min_phase_duration= 10, max_phase_duration=20):
    self.min_phase_duration=min_phase_duration
    self.max_phase_duration=max_phase_duration
    self.bid= bid
    self.ec=ec
    self.en=en
    self.conversion_funcs= conversion_funcs
    self.t_end_phase=np.random.randint(min_phase_duration, max_phase_duration)
    self.actual_phase=0
    self.num_phases=conversion_funcs.shape[0]
    self.t=0
    self.N=N
    self.C=C

  def perform_day(self, price: float):
        """
        :param price: the price selected for the day
        :param t: the number of the day from the start
        :return: (num_clicks,num_conversions,adv_cost)
        num_clicks: int - the number of clicks
        num_conversions: int - the number of conversions
        adv_cost: float - the advertising costs
        """
        num_clicks = int(self.N(self.bid) + self.en())

        conversion_samples = np.random.binomial(n=1, p=self.conversion_funcs[self.actual_phase](price), size=num_clicks)
        num_conversions = np.sum(conversion_samples)

        adv_cost = self.C(self.bid) + self.ec()

        if(self.t==self.t_end_phase):
          self.actual_phase+=1
          self.t_end_phase+=np.random.randint(self.min_phase_duration, self.max_phase_duration)

        self.t+=1
        return num_clicks, num_conversions, adv_cost


In [46]:
from numpy.core.multiarray import ndarray
class NonStationaryClassEnvironmentHistory:
    """
    History of all the steps performed by an environment
    Observe that it is not stored inside the environment itself, but by the learner
    """

    def __init__(self, N, conversion_funcs, best_prices: ndarray, bid):
        """
        Parameters
        ----------
        N : callable
            N:bid->E[number of clicks] must be applicable to arrays
        C : callable
            C:bid->E[payment for clicks] must be applicable to arrays
        conversion_funcs : array of functions, one conversion for each phase
            conversion_funcs[phase](p) = E[conversion rate at price p at phase "phase"]
        best_price : float
            the price (among the available ones) that maximizes the reward

        Returns
        -------
        None.

        """
        self.N = N
        self.conversion_funcs = conversion_funcs
        self.best_prices = best_prices
        self.chosen_prices_per_round = []
        self.ns = []
        self.qs = []
        self.cs = []
        self.phase_per_round=[]
        self.bid=bid

    def add_step(self, p: float, n: int, q: int, phase: int):
        """
        Memorizes a new step (i.e., day) that has been performed

        Parameters
        ----------
        x : float
            the chosen bid
        p : float
            the chosen price
        n : int
            the number of clicks achieved
        q : int
            the number of conversions achieved
        phase: int
            the phase between the 5 that we can have

        Returns
        -------
        None.

        """
        self.chosen_prices_per_round.append(p)
        self.ns.append(n)
        self.qs.append(q)
        self.phase_per_round.append(phase)

    def reward_stats(self):
        """
        These stats are computed with the expected rewards
        (without noise)

        Returns
        -------
        instantaneous_rewards : numpy array
            the instantaneous rewards
        instantaneous_regrets : numpy array
            the instantaneous regrets
        cumulative_rewards : numpy array
            the cumulative rewards
        cumulative_regrets : numpy array
            the cumulative regrets

        """
        chosen_prices_per_round  = np.array(self.chosen_prices_per_round )
        return self.compute_reward_stats(chosen_prices_per_round, self.N, self.best_prices, self.conversion_funcs, self.phase_per_round, self.bid)

    @staticmethod
    def compute_reward_stats(chosen_prices_per_round : np.ndarray,N: callable,
                            best_price: float,conversion_funcs: np.ndarray, phase_per_round: np.ndarray, bid):
        """
        :return:
        instantaneous_rewards : np.ndarray
            the instantaneous rewards for each time step
        instantaneous_regrets : np.ndarray
            the instantaneous regrets for each time step
        cumulative_rewards : np.ndarray
            the cumulative rewards for each time step
        cumulative_regrets : np.ndarray
            the cumulative regrets for each time step
        """

        alphas=np.zeros(len(chosen_prices_per_round))
        for i in range(0,len(chosen_prices_per_round )):
          alphas[i] = conversion_funcs[phase_per_round[i]](chosen_prices_per_round[i])

        # here maybe I should use the actual number of conversions and advertising costs with the noise?
        instantaneous_rewards = alphas * chosen_prices_per_round

        best_reward=np.zeros(len(chosen_prices_per_round))
        for i in range(0,len(chosen_prices_per_round )):
            best_reward[i] = conversion_funcs[phase_per_round[i]](best_price[phase_per_round[i]]) * best_price[phase_per_round[i]]

        for phase in range(len(conversion_funcs)):
          print(str(phase)+". "+str(best_price[phase]))
        instantaneous_regrets = best_reward - instantaneous_rewards

        for round in range(len(phase_per_round)):
          print(str(round)+"-> phase: "+str(phase_per_round[round])+", arm chosen: "+ str(chosen_prices_per_round[round]))

        return instantaneous_rewards, instantaneous_regrets, np.cumsum(instantaneous_rewards), np.cumsum(instantaneous_regrets)

    def played_rounds(self):
        return len(self.chosen_prices_per_round)



In [34]:
def find_best_price_per_phase(prices, estimated_alphas):
  """
  prices: np.array
    all the prices(arms) which can be chosen
  estimated_alphas: np.array
    the conversion rate estimation for each phase possible

  return
    best_price_per_phase: np.array that contains the best possible price for each phase
  """
  best_price_per_phase=np.array([])
  for alphas_phase in estimated_alphas:
    earnings = alphas_phase * prices
    i = np.argmax(earnings)
    best_price_per_phase=np.append(best_price_per_phase,prices[i])
  return best_price_per_phase

In [5]:
class Exp3Learner():
  def __init__(self, environment: Non_stationary_environment, bid: np.ndarray, prices: np.ndarray):
     self.env = environment
     self.prices = prices
     self.bid = bid
     self._prepare_history()
     self.estimator = BeExp3Estimator(self.prices)

  def play_and_save(self, price):
        n, q, c = self.env.perform_day(price)
        self.history.add_step(price, n, q, self.env.actual_phase)
        return n, q, c



  def play_round(self):
    if self.history.played_rounds() < self.prices.shape[0]:
      p_t = self.prices[self.history.played_rounds()]
      p_t_ind = self.history.played_rounds()
    else:
      p_t, p_t_ind = self.estimator.provide_arm()

    n, q, c = self.play_and_save(p_t)
    self.estimator.update_estimations(p_t_ind, q/n)


  def _prepare_history(self):
    alphas_est = np.array([[self.env.conversion_funcs[i](p) for p in self.prices ] for i in range(self.env.num_phases)])
    n_est = self.env.N(self.bid)

    #c_est = self.env.C(self.xs)
    best_prices = find_best_price_per_phase(self.prices, alphas_est)
    self.history = NonStationaryClassEnvironmentHistory(self.env.N, self.env.conversion_funcs, best_prices, self.bid)




In [8]:
import math
class BeExp3Estimator():
  def __init__(self, prices, gamma=0.0):
    self.gamma=gamma
    self.weights = [1.0] * len(prices)
    self.arm_prices=prices
    self.probability_distribution=self.distr()

  def distr(self):
    weight_sum = float(sum(self.weights))
    return tuple((1.0 - self.gamma) * (w / weight_sum) + (self.gamma / len(self.weights)) for w in self.weights)

  def provide_arm(self):
    drawn_price = np.random.choice(self.arm_prices, size=1, p=self.probability_distribution, replace=False)[0]
    price_idx = np.where(self.arm_prices == drawn_price)[0]
    return drawn_price, int(price_idx)


  def update_estimations(self,price_idx, reward):

    estimatedReward = float(reward*self.arm_prices[price_idx] / self.probability_distribution[price_idx])
    self.weights[price_idx] = self.weights[price_idx] * math.exp(estimatedReward * self.gamma / self.arm_prices.shape[0])
    self.probability_distribution = self.distr()

Main

In [52]:
bid=10
prices=np.array([5,10,20,30,40,50])

C= lambda x:x**1/2
N=lambda x: x**2
ec= lambda: np.random.normal(0, 1)
conv_funcs = np.array([lambda x: 1/np.sqrt(x),
                       lambda x: 1/np.log(x+1),
                       lambda x: np.e**(-x),
                       lambda x: 1/(1+np.e**(-x)),
                       lambda x:  1 / (1 + x**2)])



env=Non_stationary_environment(bid, ec,ec,conv_funcs, N,C)
learner=Exp3Learner(env, bid, prices)

n_rounds=50
for i in range(0,n_rounds):
  learner.play_round()

instantaneous_rewards, instantaneous_regrets, cumsum_instantaneous_rewards, cum_sum_instantaneous_regrets=learner.history.reward_stats()
#print("cumulative regret: "+str(cum_sum_instantaneous_regrets))

0. 50.0
1. 50.0
2. 5.0
3. 50.0
4. 5.0
0-> phase: 0, arm chosen: 5
1-> phase: 0, arm chosen: 10
2-> phase: 0, arm chosen: 20
3-> phase: 0, arm chosen: 30
4-> phase: 0, arm chosen: 40
5-> phase: 0, arm chosen: 50
6-> phase: 0, arm chosen: 10
7-> phase: 0, arm chosen: 40
8-> phase: 0, arm chosen: 20
9-> phase: 0, arm chosen: 5
10-> phase: 1, arm chosen: 10
11-> phase: 1, arm chosen: 30
12-> phase: 1, arm chosen: 40
13-> phase: 1, arm chosen: 10
14-> phase: 1, arm chosen: 10
15-> phase: 1, arm chosen: 50
16-> phase: 1, arm chosen: 40
17-> phase: 1, arm chosen: 40
18-> phase: 1, arm chosen: 40
19-> phase: 1, arm chosen: 40
20-> phase: 1, arm chosen: 30
21-> phase: 1, arm chosen: 5
22-> phase: 1, arm chosen: 40
23-> phase: 1, arm chosen: 40
24-> phase: 1, arm chosen: 50
25-> phase: 2, arm chosen: 50
26-> phase: 2, arm chosen: 50
27-> phase: 2, arm chosen: 5
28-> phase: 2, arm chosen: 50
29-> phase: 2, arm chosen: 5
30-> phase: 2, arm chosen: 10
31-> phase: 2, arm chosen: 50
32-> phase: 2, ar

Carlos code

In [None]:
import math
import sys
import numpy as np
import numpy.random as random
import matplotlib as mpl

mpl.use('pgf')
from numpy.random import choice
from matplotlib import pyplot as plt


# Utility functions for managing matrices and probability distributions
def column(A, j):
    return [A[i][j] for i in range(len(A))]


def transpose(A):
    return [column(A, j) for j in range(len(A[0]))]


def distr(weights, gamma=0.0):
    weight_sum = float(sum(weights))
    return tuple((1.0 - gamma) * (w / weight_sum) + (gamma / len(weights)) for w in weights)


def draw(probability_distribution, arms):
    arm = choice(arms, size=1, p=probability_distribution, replace=False)[0]
    return arm


# Implementation of vanilla Exp3
def exp3(prices, rewards, gamma, rewardMin=0, rewardMax=1):
    weights = [1.0] * numActions
    arms = np.array([i for i in range(numActions)])
    t = 0
    while True:
        probabilityDistribution = distr(weights, gamma)
        arm = draw(probabilityDistribution, arms)
        # Pull arm
        reward = rewards(arm, t)
        # Scaled to rewards between 0 and 1
        normalizedReward = (reward - rewardMin) / (rewardMax - rewardMin)
        estimatedReward = float(normalizedReward / probabilityDistribution[arm])
        # We update the weight of the chosen arm
        weights[arm] = weights[arm] * math.exp(estimatedReward * gamma / numActions)
        yield arm, reward, estimatedReward, weights
        t = t + 1


def runExp3Example():
    numActions = 10
    numRounds = 100000
    rewardVector = []
    with open("../rewards.txt", "r") as file:
        for line in file:
            line = line.strip().replace("[", "").replace("]", "")
            if line:
                row = [int(num) for num in line.split(",")]
                rewardVector.append(row)

    rewards = lambda arm, t: rewardVector[t][arm]
    cumulativeRewards = [sum([rewardVector[t][arm] for t in range(numRounds)]) for arm in range(numActions)]
    bestArm = max(range(numActions), key=lambda action: sum([rewardVector[t][action] for t in range(numRounds)]))

    # Exact value of Gmax
    gMax = cumulativeRewards[bestArm]

    # Optimal theoretical gamma
    gamma = math.sqrt(numActions * math.log(numActions) / ((math.e - 1) * gMax))

    cumulativeReward = 0
    bestArmCumulativeReward = 0

    # Upper bound of expected regret at each round
    regretUpperBound = (math.e - 1) * gamma * gMax + numActions * math.log(numActions) / gamma

    with open('exp3results.txt', 'w') as f:
        original_stdout = sys.stdout
        sys.stdout = f
        t = 0
        for (arm, reward, est, weights) in exp3(numActions, rewards, gamma):
            cumulativeReward += reward
            bestArmCumulativeReward += rewardVector[t][bestArm]

            weakRegret = (bestArmCumulativeReward - cumulativeReward)
            regretBound = 2 * math.sqrt(math.e - 1) * \
                          math.sqrt(bestArmCumulativeReward * numActions * math.log(numActions))
            print("regret: %d\tmaxRegret: %.2f\tweights: (%s)" % (
                weakRegret, regretBound, ', '.join(["%.3f" % weight for weight in distr(weights)])))
            t = t + 1
            if t >= numRounds:
                break
    sys.stdout = original_stdout
    print("Cumulative reward: ", cumulativeReward)
    print("Best arm reward: ", bestArmCumulativeReward)
    print("Regret:", cumulativeRewards[bestArm] - cumulativeReward)
    print("Regret upper bound:", regretUpperBound)
    print("Gamma: ", gamma)

def regretWeightsGraph(filename, title):
    with open(filename, 'r') as infile:
        lines = infile.readlines()
    lines = [[eval(x.split(": ")[1]) for x in line.split('\t')] for line in lines]
    data = transpose(lines)
    regret = np.array(data[0])
    regretBound = np.array(data[1])
    weights = np.array(transpose(data[2]))

    # Number of rounds
    xs = np.array(range(len(data[0])))
    ax1 = plt.subplot(211)
    plt.ylabel('Cumulative (weak) Regret')
    ax1.plot(xs, regret, label="Regret ")
    ax1.plot(xs, regretBound, label="Regret bound")
    plt.legend(loc="upper left")
    plt.title(title)

    ax2 = plt.subplot(212)
    plt.ylabel('Weight')

    for w in weights:
        ax2.plot(xs, w)
    # plt.show()
    # plt.legend(loc="upper left")
    plt.savefig('exp3optimal.png', dpi=200)
    # plt.savefig('exp3optimal.pgf', format='pgf')


if __name__ == '__main__':
    runExp3Example()
    regretWeightsGraph("exp3results.txt", "Exp3")