# Problem Set 2
D200, ML in Economics
Benjamin L. Brückner

In [42]:
# 0. Load packages
from choice_learn.datasets import load_expedia
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from scikeras.wrappers import KerasClassifier

### 1. Problem

- **1.a.** Reading up if necessary, provide a brief explanation of the Conditional Logit model and
its use in discrete choice modeling.


*Answer.*

In a conditional logit model, a number ($J$) of different options are given, each yielding a utility ($U_{j}$):

$$
U_{i,j} = X_{i,j}\beta + \epsilon_{i,j}
$$

where $X$ is a vector of unobserved characteristics and $\beta$ their contribution to the utility ($U$). The goal of the conditional logit model is to choose the alternative yielding the highest utility.

$$
y = arg.max_{j \in J}  U_{i,j}
$$

The resulting probability for choosing a given option is given as:

$$
P(y_{i} = j) = \frac{exp(X_{i,j}\beta)}{\sum_{k=1}^{J}exp(X_{i,k}\beta)}
$$

Interestingly, the probability of choosing a given alternative is independent of irrelevant alternatives. The conditional logit model thus helps us model which discrete choice maximises one's utility based on its underlying contributing factors ($X$).




**1.b.** 
- Load the Expedia dataset using choice_learn.datasets.load_expedia(preprocessing="rumnet").


In [43]:
data = pd.read_csv("expedia.csv") # I had to improvise here, as load_expedia() was not working
print(data.columns)

Index(['srch_id', 'date_time', 'site_id', 'visitor_location_country_id',
       'visitor_hist_starrating', 'visitor_hist_adr_usd', 'prop_country_id',
       'prop_id', 'prop_starrating', 'prop_review_score', 'prop_brand_bool',
       'prop_location_score1', 'prop_location_score2',
       'prop_log_historical_price', 'position', 'price_usd', 'promotion_flag',
       'srch_destination_id', 'srch_length_of_stay', 'srch_booking_window',
       'srch_adults_count', 'srch_children_count', 'srch_room_count',
       'srch_saturday_night_bool', 'srch_query_affinity_score',
       'orig_destination_distance', 'random_bool', 'comp1_rate', 'comp1_inv',
       'comp1_rate_percent_diff', 'comp2_rate', 'comp2_inv',
       'comp2_rate_percent_diff', 'comp3_rate', 'comp3_inv',
       'comp3_rate_percent_diff', 'comp4_rate', 'comp4_inv',
       'comp4_rate_percent_diff', 'comp5_rate', 'comp5_inv',
       'comp5_rate_percent_diff', 'comp6_rate', 'comp6_inv',
       'comp6_rate_percent_diff', 'comp7_rate'

Discard all but the first 5000 choices (for computing efficiency), and split the data into a
training and test set.

In [44]:
# Select the first 5000 rows
df = data.copy()

# Select the columns (for 1.c.)
columns = [
    "click_bool",
    "price_usd",
    "prop_starrating",
    "prop_review_score",
    "prop_brand_bool",
    "prop_location_score1"
]

# Select the columns
df = df[columns]
df["log_price_usd"] = np.log(df["price_usd"])
df = df.drop(columns=["price_usd"])
df = df.dropna()
df = df.iloc[:5000]

# Define the features and target
features = ["log_price_usd", "prop_starrating", "prop_review_score", "prop_brand_bool", "prop_location_score1"]
target = "click_bool"

# Set a seed for the RNG
np.random.seed(0)

# Shuffle the data
df = df.sample(frac=1).reset_index(drop=True)

# Create a train/test split
train = df.iloc[:4000]
test = df.iloc[4000:]

X_train = train[features]
y_train = train[target]
X_test = test[features]
y_test = test[target]

print(X_train)
print(y_train)

  result = getattr(ufunc, method)(*inputs, **kwargs)


      log_price_usd  prop_starrating  prop_review_score  prop_brand_bool  \
0          3.332919                3                3.5                0   
1          4.007333                3                3.5                1   
2          6.061457                4                4.0                1   
3          4.844187                3                4.0                1   
4          4.884089                4                4.5                1   
...             ...              ...                ...              ...   
3995       5.003946                4                4.5                0   
3996       4.442651                2                5.0                1   
3997       5.602119                4                4.0                1   
3998       4.382027                3                3.5                1   
3999       5.823046                3                3.5                0   

      prop_location_score1  
0                     4.47  
1                     1.10  


Look at the dataset and its documentation online and describe the
dataset’s structure and the variables it contains.

*Answer.*

A full documentation can be found here... that should be fairly comprehensive.

https://www.kaggle.com/c/expedia-personalized-sort/data

- **1.c.**  Write down a sensible model specification for the Conditional Logit model for the Expedia
dataset, for examples using the hotel features.

*Answer.*

A sensible model specification may be:

$$
U_{i,j} = X_{i,j}\beta + \epsilon_{i,j}
$$

where $X$ contains the columns log(price), star rating, review, whether the hotel is a brand (binary) and a location desirability score. The target variable ($y$) is the $click bool$, indicating whether the customer clicked on a hotel.


- **1.d.** Fit your conditional logit model to the Expedia data and report the cross-entropy loss
on the test data using TensorFlow’s tf.keras.losses.CategoricalCrossentropy loss function.

In [47]:
# Function to create a Conditional Logit Model
def build_model():
    model = tf.keras.Sequential([
        tf.keras.layers.Dense(y_train.shape[1], activation="softmax")  # Softmax for multinomial choice
    ])
    model.compile(optimizer="adam", loss=tf.keras.losses.CategoricalCrossentropy(), metrics=["accuracy"])
    return model

# Wrap the model using KerasClassifier
model_wrapper = KerasClassifier(build_fn=build_model, epochs=50, batch_size=32, verbose=1)

# Define a Pipeline with StandardScaler and the model
pipeline = Pipeline([
    ("scaler", StandardScaler()),  # Standardizes the features
    ("model", model_wrapper)       # Keras model as the final step
])

In [None]:

# Train the pipeline
pipeline.fit(X_train, y_train)

# Predict probabilities
y_pred = pipeline.predict_proba(X_test)

# Compute cross-entropy loss
loss_fn = tf.keras.losses.CategoricalCrossentropy()
loss = loss_fn(y_test, y_pred).numpy()

print(f"Cross-Entropy Loss on Test Data: {loss:.4f}")

  X, y = self._initialize(X, y)


ImportError: cannot import name 'clip_to_image_size' from 'keras.src.layers.preprocessing.image_preprocessing.bounding_boxes.converters' (/Users/benjamin/opt/anaconda3/envs/d200/lib/python3.12/site-packages/keras/src/layers/preprocessing/image_preprocessing/bounding_boxes/converters.py)