In [1]:
import pandas as pd

chunsize = 10**6
# Load the training data with chunk size to handle large dataset
Train_data = pd.read_csv('../data/train.csv',chunksize=chunsize)
Train_data = next(Train_data)
# Remove unnecessary columns for our model 
drop_columns = ['date_time', 'site_name', 'user_id', 'is_package', 'channel', 'cnt', 'srch_destination_id', 'srch_destination_type_id']
Train_data = Train_data.drop(drop_columns, axis=1)

# Filter data to only include actual bookings
Train_data = Train_data[Train_data['is_booking'] == 1]

# Define user-related feature columns
user_columns = ['posa_continent', 'user_location_country', 'user_location_region', 'user_location_city', 
                'orig_destination_distance', 'is_mobile', 'srch_ci', 'srch_co', 'srch_adults_cnt', 'srch_children_cnt', 
                'srch_rm_cnt']

# Define hotel-related feature columns
hotel_columns = ['hotel_continent', 'hotel_country', 'hotel_market', 'hotel_cluster']

# Split data into user and hotel features
User_data = Train_data[user_columns]
Hotel_data = Train_data[hotel_columns]

# Define categorical columns for encoding
cat_cols_user = ['posa_continent', 'user_location_country', 'user_location_region', 'user_location_city']
cat_cols_hotel = ['hotel_continent', 'hotel_country', 'hotel_market']

# Convert categorical columns to numeric codes
for col in cat_cols_user:
    User_data[col] = User_data[col].astype('category').cat.codes
for col in cat_cols_hotel:
    Hotel_data[col] = Hotel_data[col].astype('category').cat.codes


# Convert check-in and check-out dates to datetime format
User_data['srch_ci'] = pd.to_datetime(User_data['srch_ci'])
User_data['srch_co'] = pd.to_datetime(User_data['srch_co'])

# Calculate reservation duration in days
User_data['Reservation_Time'] = (User_data['srch_ci'] - User_data['srch_co']).dt.days

# Drop the original date columns after extraction
User_data = User_data.drop(['srch_ci', 'srch_co'], axis=1)

# Remove rows with missing values
User_data.dropna(inplace=True)
Hotel_data.dropna(inplace=True)

# Ensure both datasets have matching indices
common_index = User_data.index.intersection(Hotel_data.index)

# Extract common data points using the aligned indices
User_data_common = User_data.loc[common_index]
Hotel_data_common = Hotel_data.loc[common_index]

User_data = User_data_common
Hotel_data = Hotel_data_common


import numpy as np
# Create a test dataset by randomly sampling 10,000 records
random_indices = np.random.choice(len(User_data), size=10000, replace=False)
test_User_data = User_data.iloc[random_indices]
test_Hotel_data = Hotel_data.iloc[random_indices]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  User_data[col] = User_data[col].astype('category').cat.codes
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Hotel_data[col] = Hotel_data[col].astype('category').cat.codes
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  User_data['srch_ci'] = pd.to_datetime(User_data['srch_ci'])
A value is trying to 

In [2]:

import torch
# Load pretrained two-tower model components
user_tower = torch.load("user_tower.pth", weights_only=False)
hotel_tower = torch.load("hotel_tower.pth", weights_only=False)

# Set device for model inference (GPU if available, otherwise CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
user_tower.eval()
user_tower.to(device)
hotel_tower.eval()
hotel_tower.to(device)

DeepHotelTower(
  (fc1): Linear(in_features=4, out_features=64, bias=True)
  (bn1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (fc2): Linear(in_features=64, out_features=256, bias=True)
  (bn2): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (dropout): Dropout(p=0.1, inplace=False)
  (fc3): Linear(in_features=256, out_features=128, bias=True)
  (bn3): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (fc4): Linear(in_features=128, out_features=64, bias=True)
  (bn4): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (fc5): Linear(in_features=64, out_features=32, bias=True)
)

In [3]:

def get_topk_similar_hotels(user_vec, encoded_hotel, topk=20):
    """
    Find the top-k most similar hotels to a given user vector.
    
    Parameters:
    user_vec: numpy vector of shape [32] representing encoded user features
    encoded_hotel: DataFrame or numpy array of shape [M, 32] representing encoded hotel features
    topk: number of top similar hotels to return
    
    Returns:
    List of (hotel_idx, similarity) tuples, sorted by similarity in descending order
    """
    # Convert hotel data to numpy array for faster computation
    hotel_array = encoded_hotel.values  # shape = (M, 32)
    
    # Calculate dot product between user vector and all hotel vectors
    dot_products = hotel_array.dot(user_vec)  # shape = (M, )
    similarities = dot_products

    # Get indices of top-k hotels with highest similarity scores
    # np.argsort returns indices in ascending order, so we negate similarities to get descending order
    topk_idx = np.argsort(-similarities)[:topk]
    topk_sims = similarities[topk_idx]
    
    # Return list of (hotel_idx, similarity) tuples
    return list(zip(topk_idx, topk_sims))

In [4]:

# Define columns to identify unique hotel combinations
columns_to_check = ['hotel_continent', 'hotel_country', 'hotel_market', 'hotel_cluster']

# Get unique hotel feature combinations
Hotel_data_unique = Hotel_data[columns_to_check].drop_duplicates()

import numpy as np
# Encode hotel features using the hotel tower model
batch_size = 256
encoded_list = []

hotel_tower.eval()
hotel_tower.to(device)
# Convert DataFrame to NumPy array for batch processing
data_array = Hotel_data_unique.values.astype(np.float32)  # shape = (n, 4)
n_rows = data_array.shape[0]

with torch.no_grad():  # Disable gradient calculation to save memory
    for start_idx in range(0, n_rows, batch_size):
        end_idx = start_idx + batch_size
        batch_data = data_array[start_idx:end_idx]  # shape = (batch_size, 4)
        
        # Convert to PyTorch tensor and move to device
        batch_tensor = torch.from_numpy(batch_data)
        batch_tensor = batch_tensor.to(device)
        # Forward pass -> output shape (batch_size, 32)
        batch_output = hotel_tower(batch_tensor)
        
        # Convert back to NumPy and collect
        encoded_list.append(batch_output.cpu().numpy())

# Concatenate all batch outputs -> shape = (n, 32)
encoded_array = np.concatenate(encoded_list, axis=0)

# Convert to DataFrame with 32 embedding dimensions
encoded_hotel_data = pd.DataFrame(encoded_array)

# Save encoded hotel features for future use
encoded_hotel_data.to_csv('encoded_hotel_data.csv', index=False)

# Create mapping from hotel feature tuples to their indices in the encoded data
feature_to_idx = {}
for i in range(len(Hotel_data_unique)):
    row = Hotel_data_unique.iloc[i]  # Get the i-th row
    feats = (row["hotel_continent"], row["hotel_country"], row["hotel_market"], row["hotel_cluster"])
    feature_to_idx[feats] = i

In [5]:

# Begin model evaluation
# test_User_data: User validation data
# test_Hotel_data: Hotel validation data
# Hotel_data_unique: Unique hotel attribute combinations
# encoded_hotel_data: Encoded hotel embeddings
user_tower.eval()
hotel_tower.eval()
scores = []
correlation_rates = []  # Track correlation rates for all users

with torch.no_grad():
    for i in range(len(test_User_data)):
        # Step 1: Get user features and forward through the user tower
        user_feats = torch.tensor(test_User_data.iloc[i].values, dtype=torch.float32).unsqueeze(0).to(device)  # (1,4)
        user_vec = user_tower(user_feats).squeeze(0).cpu().numpy()
        
        # Step 2: Calculate similarity with encoded hotels and find top 20 matches
        top20 = get_topk_similar_hotels(user_vec, encoded_hotel_data, topk=20)  # [(h_idx, sim), ...]
        
        # Step 3: Get the user's actual hotel features from test data
        real_feat_tuple = tuple(test_Hotel_data.iloc[i].values)  # (feat1, feat2, feat3, feat4)
        
        # Step 4: Find the index of the actual hotel in our unique hotels dataset
        real_hotel_idx = feature_to_idx.get(real_feat_tuple, None)
        
        # Step 5: Check if the actual hotel appears in the top 20 list
        score = 0.0
        if real_hotel_idx is not None:
            # Iterate through top20 to find the rank of the actual hotel
            rank = None
            for rank_idx, (h_idx, sim) in enumerate(top20):
                # Check if this is the actual hotel
                if h_idx == real_hotel_idx:
                    rank = rank_idx
                    break
            if rank is not None:
                position = rank + 1  # rank starts from 0, so position is rank+1
                score = 1.0 / (np.log2(1/top20[rank][1]))  # Score based on similarity value
        
        # Calculate correlation between user vector and top20 hotel vectors
        correlated_count = 0
        top20_indices = [idx for idx, _ in top20]
        top20_hotel_vectors = encoded_hotel_data.iloc[top20_indices].values
        
        # Compute correlation with each hotel
        for hotel_vec in top20_hotel_vectors:
            # Calculate cosine similarity
            similarity = np.dot(user_vec, hotel_vec) / (np.linalg.norm(user_vec) * np.linalg.norm(hotel_vec))
            if similarity > 0.999995:
                correlated_count += similarity
        
        # Calculate correlation rate (average correlation across top 20)
        correlation_rate = correlated_count / 20
        correlation_rates.append(correlation_rate)
        scores.append(score)

# Calculate and display average correlation rate across all test users
average_correlation_rate = sum(correlation_rates) / len(correlation_rates)
print(f"Mean Average Precision: {average_correlation_rate:.32f}")

Mean Average Precision: 0.95305478572845458984375000000000
