In [None]:
import numpy as np
import random
import matplotlib.pyplot as plt
import scipy.stats as stats
from scipy import optimize 
import sys
sys.path.append('../utils')

from UCB1_constrained import UCBLikeAgent 



# Environment Setup

In [None]:
class StochasticPricingEnvironment:
    """
    Single-product pricing environment with random customer valuations.

    Each round models one customer: a valuation is drawn from the supplied
    distribution and the customer buys exactly when that valuation is at
    least the posted price.
    """

    def __init__(self, valuation_distribution):
        """
        Remember the distribution customer valuations are drawn from.

        Args:
            valuation_distribution: Object providing ``cdf(price)`` and
                ``rvs()``, e.g. a frozen scipy.stats distribution.
        """
        self.valuation_dist = valuation_distribution

    def demand_probability(self, price):
        """
        Theoretical purchase probability at ``price``.

        Computed as ``1 - F(price)``, with ``F`` the CDF of the valuation
        distribution.

        Args:
            price: The price to evaluate.

        Returns:
            The value ``1 - cdf(price)``.
        """
        cdf_at_price = self.valuation_dist.cdf(price)
        return 1 - cdf_at_price

    def simulate_round(self, price):
        """
        Play one round: draw a customer and offer them ``price``.

        Args:
            price: The price offered to the customer.

        Returns:
            tuple: ``(sale_made, revenue)`` where ``sale_made`` is 1 when the
            drawn valuation is >= ``price`` (0 otherwise) and ``revenue`` is
            ``sale_made * price``.
        """
        customer_value = self.valuation_dist.rvs()

        # Purchase indicator as a plain int so it can be multiplied and summed.
        bought = int(bool(customer_value >= price))

        return bought, bought * price

# Theoretical Optimal


In [None]:
#define the optimal solution
def compute_clairvoyant(prices, environment, T, P):
    """
    Compute the optimal (clairvoyant) pricing strategy with full information.

    Solves the linear program an oracle knowing the true demand probabilities
    would solve:

    maximize:   sum_i gamma_i * price_i * demand_prob_i
    subject to: sum_i gamma_i * demand_prob_i <= P/T  (inventory constraint)
                sum_i gamma_i = 1                     (probability constraint)
                0 <= gamma_i <= 1

    Args:
        prices: Non-empty sequence (list or array) of available prices.
        environment: Object exposing ``demand_probability(price)``; it is
            queried once per price.
        T: Time horizon (must be positive).
        P: Total inventory.

    Returns:
        tuple: (optimal_distribution, optimal_value, demand_probabilities,
        expected_revenues) where ``optimal_value`` is the LP optimum, i.e.
        the expected revenue per round.

    Raises:
        ValueError: If ``T`` is not positive, ``prices`` is empty, or the
            linear program cannot be solved (for instance when every price
            consumes inventory faster than ``P / T``, making it infeasible).
    """
    if T <= 0:
        raise ValueError(f"T must be positive, got {T}")

    prices = np.asarray(prices, dtype=float)
    if prices.size == 0:
        raise ValueError("prices must contain at least one price")

    # True demand probability for each candidate price
    buying_probabilities = np.array(
        [environment.demand_probability(p) for p in prices]
    )

    # Expected revenue per selection for each price
    exp_reward = prices * buying_probabilities

    # linprog minimizes, so negate the objective to maximize revenue
    c = -exp_reward

    # Inventory constraint: expected consumption rate <= inventory rate
    A_ub = [buying_probabilities]
    b_ub = [P / T]

    # Probability constraint: the price distribution sums to one
    A_eq = [np.ones(len(prices))]
    b_eq = [1]

    res = optimize.linprog(
        c,
        A_ub=A_ub,
        b_ub=b_ub,
        A_eq=A_eq,
        b_eq=b_eq,
        method='highs',
        bounds=[(0, 1)] * len(prices),
    )

    # On failure res.x / res.fun are not usable; fail loudly with the solver's
    # own diagnosis instead of crashing later on a None value.
    if not res.success:
        raise ValueError(
            f"Clairvoyant LP could not be solved (status {res.status}): "
            f"{res.message}"
        )

    gamma = res.x  # Optimal price distribution
    optimal_value = -(res.fun)  # Optimal expected revenue per round

    return gamma, optimal_value, buying_probabilities, exp_reward

# Simulation

In [None]:
def run_simulator(
    T,
    valuation_dist,
    env_config,
    agent_params,
    n_simulations=1,
    verbose=True
):
    """
    Run the inventory-constrained UCB pricing agent for ``n_simulations`` runs.

    For every run a fresh environment and a fresh ``UCBLikeAgent`` are
    created and played for ``T`` rounds. Per-round records (prices, sales,
    revenues, cumulative revenue) are appended to shared lists, so with
    several simulations the returned lists hold all runs back to back and
    ``total_revenue`` is the sum over all runs.

    The run header, the inventory-depleted notice and the per-run summary are
    always printed; ``verbose`` only controls the periodic progress lines and
    the final price-frequency table.

    Args:
        T: Number of rounds per simulation (must be >= 1).
        valuation_dist: Valuation distribution handed to
            ``StochasticPricingEnvironment``.
        env_config: Accepted for interface compatibility; not read here.
        agent_params: Keyword arguments for ``UCBLikeAgent``. The keys
            ``'prices'``, ``'T'`` and ``'P'`` are also used to compute the
            clairvoyant benchmark.
        n_simulations: Number of independent runs (must be >= 1).
        verbose: Whether to print progress lines and the frequency table.

    Returns:
        dict with keys ``price_counts``, ``selected_prices``, ``revenues``,
        ``sales``, ``cumulative_revenue``, ``best_prices``,
        ``first_inventory_empty`` (round index at which inventory was first
        seen below 1, or None), ``total_revenue``, ``opt_dist``,
        ``optimal_idx``, ``optimal_revenue`` and ``agent`` (the agent of the
        last run).

    Raises:
        ValueError: If ``T`` or ``n_simulations`` is smaller than 1.
    """
    if T < 1:
        raise ValueError(f"T must be at least 1, got {T}")
    if n_simulations < 1:
        raise ValueError(f"n_simulations must be at least 1, got {n_simulations}")

    selected_prices = []
    revenues = []
    sales = []
    cumulative_revenue = []
    total_revenue = 0
    best_prices = []
    first_inventory_empty = None

    # About ten progress lines per run; clamp to 1 so horizons shorter than
    # ten rounds do not trigger a modulo-by-zero.
    progress_every = max(1, T // 10)

    # === Compute theoretical optimum once before all simulations ===
    env_theoretical = StochasticPricingEnvironment(
        valuation_distribution=valuation_dist
    )
    opt_dist, opt_value, _, _ = compute_clairvoyant(
        prices=agent_params['prices'],
        environment=env_theoretical,
        T=agent_params['T'],
        P=agent_params['P']
    )
    optimal_idx = np.argmax(opt_dist)
    optimal_revenue = opt_value

    for sim in range(n_simulations):

        print(f"\n=== Running UCB1 Pricing Simulation #{sim + 1} for {T} rounds ===")

        # Create environment and agent
        env = StochasticPricingEnvironment(
            valuation_distribution=valuation_dist
        )
        agent = UCBLikeAgent(**agent_params)
        #agent = ThompsonSamplingAgent(P =agent_params['P'], T=T, prices=agent_params['prices'], rho_penalty=agent_params['rho_penalty'])

        # Revenue of this run only, so the per-run summary is not polluted
        # by earlier runs.
        sim_revenue = 0

        for t in range(T):
            if agent.remaining_inventory < 1 and first_inventory_empty is None:
                first_inventory_empty = t
                print(f"Inventory empty for the first time at round {t}")
                print("No more products in the inventory")

            price = agent.select_price()

            # With no stock left nothing can be sold, whatever the price.
            if agent.remaining_inventory <= 0:
                sale_made = False
                revenue = 0
            else:
                sale_made, revenue = env.simulate_round(price)

            agent.update(revenue, sale_made)

            selected_prices.append(price)
            revenues.append(revenue)
            sales.append(sale_made)
            total_revenue += revenue
            sim_revenue += revenue
            cumulative_revenue.append(total_revenue)

            # Print progress occasionally
            if verbose and (t + 1) % progress_every == 0:
                remaining_inventory = np.sum(agent.remaining_inventory)
                print(f"Round {t + 1:4d}: Revenue = {revenue:6.2f}, "
                    f"Cumulative = {total_revenue:8.2f}, "
                    f"Remaining inventory = {remaining_inventory:.0f}")

        best_price, best_avg_revenue = agent.get_best_price()
        best_prices.append(best_price)

        print("\nSimulation completed!")
        print(f"Total revenue: {sim_revenue:.2f}")
        print(f"Average revenue per round: {sim_revenue / T:.2f}")
        print(f"Agent's best price: {best_price} (avg revenue: {best_avg_revenue:.2f})")
        print(f"Theoretical optimal: {opt_dist} (expected revenue: {optimal_revenue:.2f})")

    # Map every played price back to its index; NaN entries are skipped.
    price_to_idx = {p: i for i, p in enumerate(agent_params['prices'])}
    price_indices = [price_to_idx[p] for p in selected_prices if not np.isnan(p)]
    price_counts = np.bincount(
        np.asarray(price_indices, dtype=int),
        minlength=len(agent_params['prices'])
    )

    if verbose:
        # Counts cover all runs, so normalise by the total number of rounds.
        total_rounds = T * n_simulations
        print("\n=== Simulation Results ===")
        print("\nPrice selection frequency:")
        for i, (price, count) in enumerate(zip(agent_params['prices'], price_counts)):
            percentage = 100 * count / total_rounds
            marker = " ← OPTIMAL" if i == optimal_idx else ""
            print(f"  Price {price:2}: {count:4d} times ({percentage:5.1f}%){marker}")

    return {
        'price_counts': price_counts,
        'selected_prices': selected_prices,
        'revenues': revenues,
        'sales': sales,
        'cumulative_revenue': cumulative_revenue,
        'best_prices': best_prices,
        'first_inventory_empty': first_inventory_empty,
        'total_revenue': total_revenue,
        'opt_dist': opt_dist,
        'optimal_idx': optimal_idx,
        'optimal_revenue': optimal_revenue,
        'agent': agent  # Agent of the last simulation, in its final state
    }


# Plot the results

In [None]:
T=10000  # Horizon: number of pricing rounds, passed as T to run_simulator and the agent

# Low Budget 20%

In [None]:
budget = 0.2  # Inventory as a fraction of the horizon (inventory = T*budget)

In [None]:
# --- Experiment setup -------------------------------------------------------
# Customer valuations follow Normal(valuation_mean, valuation_std).
env_config = dict(
    valuation_mean=0.5,      # average customer valuation
    valuation_std=0.1,       # standard deviation of customer valuations
    demand_noise_std=0.005,  # noise in demand probability
)

valuation_dist = stats.norm(
    loc=env_config['valuation_mean'],
    scale=env_config['valuation_std'],
)
# Alternative valuation model: stats.uniform(0, 1)

# Units available over the whole horizon.
inventory = T * budget

n_prices = 4

# Exploration epsilon; computed for reference, not passed to the agent below.
epsilon = inventory ** (-1 / 3)

# Keyword arguments for UCBLikeAgent. The price grid has n_prices + 2 evenly
# spaced points from 0.1 to 0.9 (both endpoints included).
agent_params = dict(
    P=inventory,            # inventory constraint
    T=T,                    # number of rounds
    prices=np.linspace(0.1, 0.9, n_prices + 2),
    confidence_bound=1,     # UCB exploration parameter
    rho_penalty=1,          # penalty factor for the inventory constraint
    use_pen_rho=False,      # whether to apply the rho penalty
)

agent = UCBLikeAgent(**agent_params)

print(f"Number of price options: {len(agent_params['prices'])}")
print(f"UCB confidence bound: {agent_params['confidence_bound']}")
print('Inventory contrain: ', agent_params['P'])
print('Number of rounds:', agent_params['T'])

# Fix the NumPy seed so the run is reproducible.
np.random.seed(42)

results = run_simulator(
    T=T,
    valuation_dist=valuation_dist,
    env_config=env_config,
    agent_params=agent_params,
    n_simulations=1,
    verbose=True,
)

# Pull the individual series out of the result dictionary.
(
    selected_prices,
    revenues,
    sales,
    cumulative_revenue,
    best_prices,
    first_inventory_empty,
    total_revenue,
    price_counts,
) = (
    results[key]
    for key in (
        'selected_prices',
        'revenues',
        'sales',
        'cumulative_revenue',
        'best_prices',
        'first_inventory_empty',
        'total_revenue',
        'price_counts',
    )
)
agent = results['agent']  # the agent actually used in the simulation

In [None]:
# Clairvoyant benchmark: optimal expected revenue per round for this setup.
_, opt_value, _, _ = compute_clairvoyant(
    prices=agent_params['prices'],
    environment=StochasticPricingEnvironment(valuation_distribution=valuation_dist),
    T=agent_params['T'],
    P=agent_params['P'],
)

# Benchmark cumulative reward after rounds 1..T and the resulting regret.
baseline_reward_invetory = [opt_value * n for n in range(1, T + 1)]
regret_invetory = np.asarray(baseline_reward_invetory) - np.asarray(cumulative_revenue)

# Reference curve sqrt(t * log t) for t = 1..T.
t_vals = np.arange(1, T + 1)
theoretical_bound = np.sqrt(t_vals * np.log(t_vals))

fig, (reward_ax, regret_ax) = plt.subplots(1, 2, figsize=(12, 5))

# Left panel: realised vs. benchmark cumulative reward with a sqrt(t) band.
reward_ax.plot(cumulative_revenue, label="Cumulative Reward")
reward_ax.plot(baseline_reward_invetory, label="Baseline Reward", linestyle="--")
t_values = np.arange(T)
band_halfwidth = np.sqrt(t_values)
reward_ax.fill_between(
    t_values,
    baseline_reward_invetory - band_halfwidth,
    baseline_reward_invetory + band_halfwidth,
    color='gray',
    alpha=0.3,
    label="Regret area",
)
reward_ax.set_xlabel("Round")
reward_ax.set_ylabel("Reward")
reward_ax.set_title(f"Reward Over Time (UCB1 Agent), Budget: {budget*100}%")
reward_ax.legend()

# Right panel: regret against the reference curve.
regret_ax.plot(regret_invetory, label="Regret", color='red')
regret_ax.axhline(y=0, color='black', linestyle='--', linewidth=0.5)
regret_ax.plot(theoretical_bound, label="Theoretical Regret Bound", linestyle="--")
regret_ax.set_xlabel("Round")
regret_ax.set_ylabel("Regret")
regret_ax.set_title(f"Regret Over Time (UCB1 Agent), Budget: {budget*100}%")
regret_ax.legend()

plt.tight_layout()
plt.show()

# Mid budget 50%

In [None]:
budget=0.5  # Inventory as a fraction of the horizon (inventory = T*budget)

In [None]:
# --- Experiment setup -------------------------------------------------------
# Customer valuations follow Normal(valuation_mean, valuation_std).
env_config = dict(
    valuation_mean=0.5,      # average customer valuation
    valuation_std=0.1,       # standard deviation of customer valuations
    demand_noise_std=0.005,  # noise in demand probability
)

valuation_dist = stats.norm(
    loc=env_config['valuation_mean'],
    scale=env_config['valuation_std'],
)
# Alternative valuation model: stats.uniform(0, 1)

# Units available over the whole horizon.
inventory = T * budget

n_prices = 4

# Exploration epsilon; computed for reference, not passed to the agent below.
epsilon = inventory ** (-1 / 3)

# Keyword arguments for UCBLikeAgent. The price grid has n_prices + 2 evenly
# spaced points from 0.1 to 0.9 (both endpoints included).
agent_params = dict(
    P=inventory,            # inventory constraint
    T=T,                    # number of rounds
    prices=np.linspace(0.1, 0.9, n_prices + 2),
    confidence_bound=1,     # UCB exploration parameter
    rho_penalty=1,          # penalty factor for the inventory constraint
    use_pen_rho=False,      # whether to apply the rho penalty
)

agent = UCBLikeAgent(**agent_params)

print(f"Number of price options: {len(agent_params['prices'])}")
print(f"UCB confidence bound: {agent_params['confidence_bound']}")
print('Inventory contrain: ', agent_params['P'])
print('Number of rounds:', agent_params['T'])

# Fix the NumPy seed so the run is reproducible.
np.random.seed(42)

results = run_simulator(
    T=T,
    valuation_dist=valuation_dist,
    env_config=env_config,
    agent_params=agent_params,
    n_simulations=1,
    verbose=True,
)

# Pull the individual series out of the result dictionary.
(
    selected_prices,
    revenues,
    sales,
    cumulative_revenue,
    best_prices,
    first_inventory_empty,
    total_revenue,
    price_counts,
) = (
    results[key]
    for key in (
        'selected_prices',
        'revenues',
        'sales',
        'cumulative_revenue',
        'best_prices',
        'first_inventory_empty',
        'total_revenue',
        'price_counts',
    )
)
agent = results['agent']  # the agent actually used in the simulation

In [None]:
# Clairvoyant benchmark: optimal expected revenue per round for this setup.
_, opt_value, _, _ = compute_clairvoyant(
    prices=agent_params['prices'],
    environment=StochasticPricingEnvironment(valuation_distribution=valuation_dist),
    T=agent_params['T'],
    P=agent_params['P'],
)

# Benchmark cumulative reward after rounds 1..T and the resulting regret.
baseline_reward_invetory = [opt_value * n for n in range(1, T + 1)]
regret_invetory = np.asarray(baseline_reward_invetory) - np.asarray(cumulative_revenue)

# Reference curve sqrt(t * log t) for t = 1..T.
t_vals = np.arange(1, T + 1)
theoretical_bound = np.sqrt(t_vals * np.log(t_vals))

fig, (reward_ax, regret_ax) = plt.subplots(1, 2, figsize=(12, 5))

# Left panel: realised vs. benchmark cumulative reward with a sqrt(t) band.
reward_ax.plot(cumulative_revenue, label="Cumulative Reward")
reward_ax.plot(baseline_reward_invetory, label="Baseline Reward", linestyle="--")
t_values = np.arange(T)
band_halfwidth = np.sqrt(t_values)
reward_ax.fill_between(
    t_values,
    baseline_reward_invetory - band_halfwidth,
    baseline_reward_invetory + band_halfwidth,
    color='gray',
    alpha=0.3,
    label="Regret area",
)
reward_ax.set_xlabel("Round")
reward_ax.set_ylabel("Reward")
reward_ax.set_title(f"Reward Over Time (UCB1 Agent), Budget: {budget*100}%")
reward_ax.legend()

# Right panel: regret against the reference curve.
regret_ax.plot(regret_invetory, label="Regret", color='red')
regret_ax.axhline(y=0, color='black', linestyle='--', linewidth=0.5)
regret_ax.plot(theoretical_bound, label="Theoretical Regret Bound", linestyle="--")
regret_ax.set_xlabel("Round")
regret_ax.set_ylabel("Regret")
regret_ax.set_title(f"Regret Over Time (UCB1 Agent), Budget: {budget*100}%")
regret_ax.legend()

plt.tight_layout()
plt.show()

# High Budget 80%


In [None]:
budget=0.8  # Inventory as a fraction of the horizon (inventory = T*budget)

In [None]:
# --- Experiment setup -------------------------------------------------------
# Customer valuations follow Normal(valuation_mean, valuation_std).
env_config = dict(
    valuation_mean=0.5,      # average customer valuation
    valuation_std=0.1,       # standard deviation of customer valuations
    demand_noise_std=0.005,  # noise in demand probability
)

valuation_dist = stats.norm(
    loc=env_config['valuation_mean'],
    scale=env_config['valuation_std'],
)
# Alternative valuation model: stats.uniform(0, 1)

# Units available over the whole horizon.
inventory = T * budget

n_prices = 4

# Exploration epsilon; computed for reference, not passed to the agent below.
epsilon = inventory ** (-1 / 3)

# Keyword arguments for UCBLikeAgent. The price grid has n_prices + 2 evenly
# spaced points from 0.1 to 0.9 (both endpoints included).
agent_params = dict(
    P=inventory,            # inventory constraint
    T=T,                    # number of rounds
    prices=np.linspace(0.1, 0.9, n_prices + 2),
    confidence_bound=1,     # UCB exploration parameter
    rho_penalty=1,          # penalty factor for the inventory constraint
    use_pen_rho=False,      # whether to apply the rho penalty
)

agent = UCBLikeAgent(**agent_params)

print(f"Number of price options: {len(agent_params['prices'])}")
print(f"UCB confidence bound: {agent_params['confidence_bound']}")
print('Inventory contrain: ', agent_params['P'])
print('Number of rounds:', agent_params['T'])

# Fix the NumPy seed so the run is reproducible.
np.random.seed(42)

results = run_simulator(
    T=T,
    valuation_dist=valuation_dist,
    env_config=env_config,
    agent_params=agent_params,
    n_simulations=1,
    verbose=True,
)

# Pull the individual series out of the result dictionary.
(
    selected_prices,
    revenues,
    sales,
    cumulative_revenue,
    best_prices,
    first_inventory_empty,
    total_revenue,
    price_counts,
) = (
    results[key]
    for key in (
        'selected_prices',
        'revenues',
        'sales',
        'cumulative_revenue',
        'best_prices',
        'first_inventory_empty',
        'total_revenue',
        'price_counts',
    )
)
agent = results['agent']  # the agent actually used in the simulation

In [None]:
# Clairvoyant benchmark: optimal expected revenue per round for this setup.
_, opt_value, _, _ = compute_clairvoyant(
    prices=agent_params['prices'],
    environment=StochasticPricingEnvironment(valuation_distribution=valuation_dist),
    T=agent_params['T'],
    P=agent_params['P'],
)

# Benchmark cumulative reward after rounds 1..T and the resulting regret.
baseline_reward_invetory = [opt_value * n for n in range(1, T + 1)]
regret_invetory = np.asarray(baseline_reward_invetory) - np.asarray(cumulative_revenue)

# Reference curve sqrt(t * log t) for t = 1..T.
t_vals = np.arange(1, T + 1)
theoretical_bound = np.sqrt(t_vals * np.log(t_vals))

fig, (reward_ax, regret_ax) = plt.subplots(1, 2, figsize=(12, 5))

# Left panel: realised vs. benchmark cumulative reward with a sqrt(t) band.
reward_ax.plot(cumulative_revenue, label="Cumulative Reward")
reward_ax.plot(baseline_reward_invetory, label="Baseline Reward", linestyle="--")
t_values = np.arange(T)
band_halfwidth = np.sqrt(t_values)
reward_ax.fill_between(
    t_values,
    baseline_reward_invetory - band_halfwidth,
    baseline_reward_invetory + band_halfwidth,
    color='gray',
    alpha=0.3,
    label="Regret area",
)
reward_ax.set_xlabel("Round")
reward_ax.set_ylabel("Reward")
reward_ax.set_title(f"Reward Over Time (UCB1 Agent), Budget: {budget*100}%")
reward_ax.legend()

# Right panel: regret against the reference curve.
regret_ax.plot(regret_invetory, label="Regret", color='red')
regret_ax.axhline(y=0, color='black', linestyle='--', linewidth=0.5)
regret_ax.plot(theoretical_bound, label="Theoretical Regret Bound", linestyle="--")
regret_ax.set_xlabel("Round")
regret_ax.set_ylabel("Regret")
regret_ax.set_title(f"Regret Over Time (UCB1 Agent), Budget: {budget*100}%")
regret_ax.legend()

plt.tight_layout()
plt.show()