In [None]:
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
from scipy import optimize
from matplotlib import cm
from math import isinf
from scipy.optimize import linprog

In [None]:
import numpy as np
from scipy import stats

class MultiProductPricingEnvironment:
    """Simulated posted-price market for M product types.

    Each round a batch of customers is drawn from ``valuation_sampler``.
    A customer buys one unit of a product whenever their valuation for that
    product is at least the posted price.
    """

    def __init__(self, costs, prices, valuation_sampler):
        """
        costs : array-like, shape (M,)
            Unit cost for each product type.
        prices : array-like, shape (M, K)
            Grid of K prices for each of M product types.
        valuation_sampler : callable
            A function that returns an array of shape (n_customers, M)
            giving the valuations for each product type jointly.
            This can encode correlations.

        Raises:
            ValueError if ``prices`` is not a 2-D grid with one row per
            entry of ``costs``.
        """
        self.costs = np.array(costs)
        self.prices = np.array(prices)
        self.M = len(costs)  # number of product types
        self.valuation_sampler = valuation_sampler

        # Fail at construction rather than with an obscure indexing error
        # in the middle of a simulation.
        if self.prices.ndim != 2 or self.prices.shape[0] != self.M:
            raise ValueError(
                "prices must have shape (M, K) with M == len(costs); "
                f"got shape {self.prices.shape} for M={self.M}"
            )

    def round(self, price_indices, n_t):
        """
        price_indices : array-like of length M
            Chosen price index for each product type. A negative index
            (e.g. a ``-1`` "no price" sentinel) means the product is not
            offered this round: it sells zero units and contributes zero
            profit, instead of wrapping around to the last grid price.
        n_t : int
            Number of customers this round.
        Returns:
            d_t : array of shape (M,)  # number of units sold per product
            r_t : float               # total reward (profit) this round
        Raises:
            ValueError if ``price_indices`` does not have length M.
            IndexError if an index is non-integer or beyond the price grid.
        """
        chosen = np.asarray(price_indices)
        if chosen.shape != (self.M,):
            raise ValueError(
                f"price_indices must have length {self.M}; "
                f"got shape {chosen.shape}"
            )

        # Sample joint valuations for all customers, shape (n_t, M)
        valuations = np.asarray(self.valuation_sampler(n_t))

        # Products with a negative index are withheld. Substitute index 0
        # for the lookup only; their sales are masked out below.
        offered = chosen >= 0
        lookup = np.where(offered, chosen, 0)
        chosen_prices = self.prices[np.arange(self.M), lookup]

        # Determine purchases: valuation >= price, and the product is on offer
        purchases = (valuations >= chosen_prices) & offered  # (n_t, M)
        d_t = purchases.sum(axis=0)  # units sold per product

        # Withheld products have d_t == 0, so they add nothing to profit.
        r_t = float(np.sum((chosen_prices - self.costs) * d_t))

        return d_t, r_t


In [None]:
import numpy as np
from scipy import optimize

class CombinatorialUCB:
    """UCB-style agent choosing one price per product under a budget.

    The agent keeps running averages of the observed reward and cost of
    every (product, price index) arm. After an initial sweep over all arms
    it picks prices by solving a linear program on optimistic reward
    estimates and optimistic (lower-bound) cost estimates.
    """

    def __init__(self, M, K, B, T, exploration_scale=1.0):
        """
        M : int
            Number of product types
        K : int
            Number of discrete price levels per product
        B : float
            Total budget (inventory cost limit)
        T : int
            Horizon (total number of rounds)
        exploration_scale : float
            Multiplier for UCB exploration term
        """
        self.M = M
        self.K = K
        self.B = B
        self.T = T
        self.exploration_scale = exploration_scale

        self.t = 0  # number of completed update() calls
        self.budget_remaining = B

        # Track averages and pulls, one entry per (product, price index)
        self.avg_reward = np.zeros((M, K))
        self.avg_cost = np.zeros((M, K))
        self.num_pulls = np.zeros((M, K))

    def select_prices(self):
        """Choose a price index for every product for the next round.

        Returns:
            list of length M. Every entry is ``-1`` once the budget is
            exhausted (no price is posted); otherwise each entry is an
            index in ``[0, K)``.
        """
        if self.budget_remaining <= 0:
            return [-1] * self.M  # No prices (out of budget)

        if self.t < self.M * self.K:
            # Initial exploration: go through all arms. Round t forces arm
            # (t // K, t % K); the other products get a random price.
            prod = self.t // self.K
            price_idx = self.t % self.K
            choice = [np.random.randint(0, self.K) for _ in range(self.M)]
            choice[prod] = price_idx
            return choice

        # UCB estimates. The max(1, .) guards against division by zero for
        # arms that have not been pulled yet.
        exploration = self.exploration_scale * np.sqrt(
            (2 * np.log(self.T)) / np.maximum(1, self.num_pulls)
        )
        reward_ucb = self.avg_reward + exploration
        cost_lcb = self.avg_cost - exploration

        # Solve LP to choose price indices under budget
        return self._solve_optimization(reward_ucb, cost_lcb)

    def _solve_optimization(self, reward_ucb, cost_lcb):
        """Pick one price index per product from an LP relaxation.

        reward_ucb: MxK matrix of UCB reward estimates
        cost_lcb:   MxK matrix of LCB cost estimates

        Returns:
            list of M price indices. If the LP solver does not report
            success, a uniformly random index per product is returned.

        NOTE(review): the LP is a fractional relaxation and the result is
        rounded with a per-product argmax, so the rounded choice is not
        guaranteed to satisfy the per-round cost constraint.
        """
        # Flatten into MK arms (row-major: product m owns slots m*K..(m+1)*K)
        rewards = np.asarray(reward_ucb).flatten()
        costs = np.asarray(cost_lcb).flatten()

        # Variables: relaxed (0..1) choice weight for each arm
        n_vars = self.M * self.K
        c = -rewards  # maximize reward -> minimize negative

        # Constraints:
        # 1) Total cost within per-round budget allowance
        rho = self.B / self.T
        A_ub = [costs]
        b_ub = [rho]

        # 2) Exactly one price per product
        A_eq = np.zeros((self.M, n_vars))
        for m in range(self.M):
            A_eq[m, m * self.K:(m + 1) * self.K] = 1
        b_eq = np.ones(self.M)

        bounds = [(0, 1)] * n_vars
        res = optimize.linprog(c, A_ub=A_ub, b_ub=b_ub,
                               A_eq=A_eq, b_eq=b_eq,
                               bounds=bounds, method='highs')

        if res.success:
            x = res.x.reshape(self.M, self.K)
            return [int(np.argmax(x[m])) for m in range(self.M)]
        return [np.random.randint(0, self.K) for _ in range(self.M)]

    def update(self, price_indices, rewards, costs):
        """Record the outcome of one round.

        price_indices : sequence of length M
            Price index played for each product. Negative entries (the
            ``-1`` sentinel returned by ``select_prices`` when the budget
            is exhausted) mean no price was posted for that product; no
            arm statistics are updated for it.
        rewards : array-like, shape (M,)
            Reward observed for each product this round (per product, not
            a single total).
        costs : array-like, shape (M,)
            Cost incurred for each product this round. Their sum is
            deducted from the remaining budget.

        Raises:
            ValueError if any argument does not have length M. Validation
            happens before any state is modified.
        """
        rewards = np.asarray(rewards, dtype=float)
        costs = np.asarray(costs, dtype=float)
        if rewards.shape != (self.M,) or costs.shape != (self.M,):
            raise ValueError(
                f"rewards and costs must be per-product arrays of shape "
                f"({self.M},); got {rewards.shape} and {costs.shape}"
            )
        if len(price_indices) != self.M:
            raise ValueError(
                f"price_indices must have length {self.M}; "
                f"got {len(price_indices)}"
            )

        for m, idx in enumerate(price_indices):
            if idx < 0:
                # No price posted: a negative index would otherwise wrap
                # around and corrupt the statistics of the last arm.
                continue
            self.num_pulls[m, idx] += 1
            pulls = self.num_pulls[m, idx]
            # Incremental running means
            self.avg_reward[m, idx] += (
                (rewards[m] - self.avg_reward[m, idx]) / pulls
            )
            self.avg_cost[m, idx] += (
                (costs[m] - self.avg_cost[m, idx]) / pulls
            )
        self.budget_remaining -= np.sum(costs)
        self.t += 1
