## Problem 1

In [1]:
import pandas as pd
import numpy as np
from scipy import optimize
import matplotlib.pyplot as plt

# Read the data
# data = pd.read_csv('/Users/helmadevina/Desktop/Pricing-Analytics-and-Revenue-Management/data.csv')
data = pd.read_csv("data.csv")
print(f"Data loaded, shape: {data.shape}")

# Group by search_id
search_groups = data.groupby('srch_id')

# Features to use in the MNL model
features = [
    'prop_starrating',
    'prop_review_score',
    'prop_brand_bool',
    'prop_location_score',
    'prop_accesibility_score',
    'prop_log_historical_price',
    'price_usd',
    'promotion_flag'
]

# Function to calculate MNL log-likelihood
def mnl_log_likelihood(beta, data, search_groups, features):
    # Initialize log-likelihood
    log_likelihood = 0
    
    # Extract beta values
    beta0 = beta[0]  # intercept
    beta_features = beta[1:].copy()  # coefficients for features
    
    # Get price index to use for scaling
    price_index = features.index('price_usd')
    
    # Loop through each search session
    for search_id, group in search_groups:
        # Get the hotels in this search
        hotels = group.copy()
        
        # Scale price for numerical stability (but don't modify the beta)
        X = hotels[features].copy()
        X['price_usd'] = X['price_usd'] / 100.0  # Scale price to prevent overflow
        
        # Calculate utilities
        utilities = beta0 + X.dot(beta_features)
        hotels['utility'] = utilities
        
        # Calculate exp(utility) with numerical stability
        max_utility = utilities.max()
        hotels['exp_utility'] = np.exp(utilities - max_utility)
        
        # Calculate denominator (sum of exp(utility) + no-purchase option)
        denominator = hotels['exp_utility'].sum() + np.exp(-max_utility)  # no-purchase has utility 0
        
        # If there was a booking
        if (hotels['booking_bool'] == 1).any():
            # Get the booked hotel
            booked_hotel = hotels[hotels['booking_bool'] == 1].iloc[0]
            
            # Add to log-likelihood: log(P(j|S))
            log_likelihood += (booked_hotel['utility'] - max_utility) - np.log(denominator)
        else:
            # No booking - customer chose outside option
            log_likelihood += (-max_utility) - np.log(denominator)
    
    return -log_likelihood  # Negative because we're minimizing

# Function to fit MNL model
def fit_mnl_model(data, search_groups, features):
    # Initial beta values (all zeros)
    initial_beta = np.zeros(len(features) + 1)  # +1 for intercept
    
    # Define the objective function for optimization
    def objective(beta):
        return mnl_log_likelihood(beta, data, search_groups, features)
    
    # Optimize using L-BFGS-B algorithm
    print("Starting optimization...")
    result = optimize.minimize(
        objective,
        initial_beta,
        method='L-BFGS-B',
        options={'disp': True, 'maxiter': 100}
    )
    
    print(f"Optimization completed: {result.success}")
    print(f"Final negative log-likelihood: {result.fun}")
    
    # Extract optimal beta values
    beta_opt = result.x
    
    return beta_opt

# Fit the MNL model
beta_opt = fit_mnl_model(data, search_groups, features)

# Print results
print("\nMNL Model Parameters:")
print(f"Intercept (β₀): {beta_opt[0]:.4f}")

feature_descriptions = {
    'prop_starrating': 'Star Rating (higher = better hotel quality)',
    'prop_review_score': 'Review Score (higher = better guest reviews)',
    'prop_brand_bool': 'Brand Flag (1 = branded hotel, 0 = independent)',
    'prop_location_score': 'Location Score (higher = better location)',
    'prop_accesibility_score': 'Accessibility Score (higher = better accessibility)',
    'prop_log_historical_price': 'Log Historical Price',
    'price_usd': 'Price in USD (per $1)',
    'promotion_flag': 'Promotion Flag (1 = has promotion, 0 = no promotion)'
}

for i, feature in enumerate(features):
    if feature == 'price_usd':
        # For price_usd, we need to adjust the interpretation because we scaled during calculation
        # The current coefficient is "per $100" because we divided price by 100 in calculations
        # To get "per $1", we divide by 100
        per_dollar_coef = beta_opt[i+1] / 100.0
        print(f"{feature_descriptions.get(feature, feature)} (β{i+1}): {per_dollar_coef:.6f}")
    else:
        print(f"{feature_descriptions.get(feature, feature)} (β{i+1}): {beta_opt[i+1]:.6f}")



Data loaded, shape: (153009, 15)
Starting optimization...


KeyboardInterrupt: 

## Problem 2

### Assumption:
- We use the $\beta$ from problem 1
- The only identifier for all the hotels in `data1.csv` is price

In [3]:
import pandas as pd
import numpy as np
df1 = pd.read_csv("data1.csv")
df2 = pd.read_csv("data2.csv")
df3 = pd.read_csv("data3.csv")
df4 = pd.read_csv("data4.csv")
betas = [ -2.8155,0.476158, 0.119901,  0.229825, 0.016356, 0.562822 , -0.037346, -0.007323, 0.454005]
# betas = []
# for i, feature in enumerate(features):
#     if feature == 'price_usd':
#         per_dollar_coef = beta_opt[i+1] / 100.0
#         print(f"{feature_descriptions.get(feature, feature)} (β{i+1}): {per_dollar_coef:.6f}")
#         betas.append(per_dollar_coef)
#     else:
#         print(f"{feature_descriptions.get(feature, feature)} (β{i+1}): {beta_opt[i+1]:.6f}")
#         betas.append(beta_opt[i+1])

In [9]:

def compute_preference_weight(betas, assortment):
    """
    Inputs:
        betas (List[int]): a list of 9 betas, include beta0
        assortment (DataFrame)
    Outputs:
        pref_weight (List[float]): a list of v_j for all the hotel in the assortment S 
    """
    pref_weights=[]
    for j, row in assortment.iterrows():
        hotel_features = row.tolist() # a list of x_ji values
        hotel_features.insert(0,1) # augmented by 1
        u_j = np.dot(hotel_features, betas)
        v_j = np.exp(u_j)
        pref_weights.append(v_j)
    return pref_weights

def expected_rev(assortment,betas):
    """
    Inputs:
        assortment (DataFrame)
    Outputs:
        expected_rev (float)
    """
    expected_rev = 0
    pref_weights = compute_preference_weight(betas, assortment)
    print(f"pref_weights: {len(pref_weights)}")
    for j,row in assortment.iterrows():
        pj = row["price_usd"]
        print(f"j={j}")
        expected_rev += pj * pref_weights[j]/(1+np.sum(pref_weights))
    return expected_rev

def optim_assortment(df, betas):
    """
    Inputs:
        df (DataFrame): all hotels and features 
        pref_weights (List[float]): a list of v_j for all the hotels in the assortment
    Outputs:
        sorted_assortment_rev (Dict[Str:Float]): a dict of assortment with revenue sorted by revenue
        
    """
    # sort by price
    df_sorted = df.sort_values(by="price_usd", ascending=False)

    assortment_rev = dict() # {"j": rev}
    for j, row in df_sorted.iterrows():
        # each hotel j
        new_assortment = df_sorted.nlargest(j, "price_usd")
        assortment_rev[str(j)] = expected_rev(new_assortment, betas)
    
    sorted_assortment_rev = dict(sorted(assortment_rev.items(), key=lambda x: x[1], reverse=True))
    opt_assortment, opt_rev = sorted_assortment_rev[0]
    print(f"include the first {opt_assortment} hotels, the optimal revenue is {opt_rev}")
    return sorted_assortment_rev
    
# compute the optimal assortment for each doc

optim_assortment(df1, betas)
# optim_assortment(df2, betas)
# optim_assortment(df3, betas)
# optim_assortment(df4, betas)




pref_weights: 23
j=23


IndexError: list index out of range