In [37]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from tqdm import tqdm  # Import tqdm for progress bar

# Load the dataset
file_path = '/Users/arjunathreya/Projects/airbnb_similar_listings/ml/data/dataset/cleaned_listings.csv'
df = pd.read_csv(file_path)

# Display initial data information
print("Initial Data Overview:")
df.info()

Initial Data Overview:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37542 entries, 0 to 37541
Data columns (total 25 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   id                            37542 non-null  object 
 1   listing_url                   37542 non-null  object 
 2   price                         37542 non-null  float64
 3   property_type                 37542 non-null  object 
 4   room_type                     37542 non-null  object 
 5   neighborhood_overview         20892 non-null  object 
 6   bathrooms_text                37510 non-null  object 
 7   bedrooms                      37542 non-null  float64
 8   beds                          37541 non-null  float64
 9   accommodates                  37541 non-null  float64
 10  latitude                      37541 non-null  float64
 11  longitude                     37541 non-null  float64
 12  neighbourhood_group_cleansed  37541 n

  df = pd.read_csv(file_path)


In [38]:
def create_overview(row):
    # Extract relevant fields
    property_type = row['property_type']
    room_type = row['room_type']
    price = row['price']
    bathrooms = row['bathrooms_text']
    bedrooms = int(row['bedrooms']) if not pd.isnull(row['bedrooms']) else 'N/A'
    beds = int(row['beds']) if not pd.isnull(row['beds']) else 'N/A'
    accommodates = int(row['accommodates']) if not pd.isnull(row['accommodates']) else 'N/A'
    neighborhood = row['neighbourhood_cleansed']
    review_rating = row['review_scores_rating']
    
    # Construct the description
    description_text = (
        f"This {bedrooms}-bedroom {property_type} is located in {neighborhood}. "
        f"The {room_type} accommodates {accommodates} guests with {beds} bed(s) and "
        f"{bathrooms}. "
    )
    
    # Adding price details
    if not pd.isnull(price):
        description_text += f"The price per night is ${price:.2f}. "

    # Add review information if available
    if not pd.isnull(review_rating):
        description_text += (
            f"The property has a review rating of {review_rating:.1f}/5"
        )
    
    return description_text.strip()

# Apply the function to the DataFrame and create a new column 'description_summary'
df['description_summary'] = df.apply(create_overview, axis=1)

# View the updated DataFrame with the new column
print(df[['id', 'description_summary']].head())
print(df['description_summary'].iloc[0])

                    id                                description_summary
0   739333866230665371  This 1-bedroom Private room in rental unit is ...
1   572612125615500056  This 1-bedroom Private room in rental unit is ...
2             45267941  This 1-bedroom Private room in rental unit is ...
3   838141198693830649  This 3-bedroom Entire rental unit is located i...
4  1082660771919357919  This 4-bedroom Entire home is located in South...
This 1-bedroom Private room in rental unit is located in Fort Hamilton. The Private room accommodates 1 guests with 1 bed(s) and 1 shared bath. The price per night is $89.00. The property has a review rating of 4.7/5


In [41]:
def evaluate_listing(row):
    # Extract relevant fields
    overall_rating = row['review_scores_rating']
    cleanliness = row['review_scores_cleanliness']
    checkin = row['review_scores_checkin']
    communication = row['review_scores_communication']
    location = row['review_scores_location']
    value = row['review_scores_value']
    description = row['description'] if not pd.isnull(row['description']) else "No detailed description available."

    # Start the outline with the property description
    outline = f"Property Description: {description} Review Overview:"
    
    # Evaluate cleanliness
    if cleanliness >= 4.5:
        outline += "• The property is consistently rated highly for cleanliness."
    elif cleanliness >= 3.5:
        outline += "• The property is generally clean but could use some improvement."
    else:
        outline += "• Cleanliness is often highlighted as a concern by guests."
    
    # Evaluate the check-in experience
    if checkin >= 4.5:
        outline += "• The check-in process is rated as smooth and easy by most guests."
    elif checkin >= 3.5:
        outline += "• The check-in process is generally okay, but there might be occasional issues."
    else:
        outline += "• Guests frequently report issues with the check-in process."
    
    # Evaluate communication
    if communication >= 4.5:
        outline += "• The host is highly responsive and easy to communicate with."
    elif communication >= 3.5:
        outline += "• Communication with the host is generally fine, with some areas for improvement."
    else:
        outline += "• Guests have often faced difficulties in communicating with the host."
    
    # Evaluate the location
    if location >= 4.5:
        outline += "• The location is highly rated by guests, with many finding it convenient."
    elif location >= 3.5:
        outline += "• The location is generally good but may not be ideal for everyone."
    else:
        outline += "• The location might not be convenient or desirable for many guests."
    
    # Evaluate the value for money
    if value >= 4.5:
        outline += "• Guests believe the property offers excellent value for money."
    elif value >= 3.5:
        outline += "• The property offers reasonable value, though some guests may feel it's a bit pricey."
    else:
        outline += "• Guests feel the property does not offer good value for the price."
    
    # Compare overall rating and scores
    if (overall_rating >= 4.5 and cleanliness >= 4.5 and checkin >= 4.5 
        and communication >= 4.5 and location >= 4.5 and value >= 4.5):
        outline += "Overall, the reviews suggest that the property consistently meets or exceeds expectations."
    else:
        outline += "There are some areas where guest experiences may not fully align with the description, particularly in the aspects highlighted above."
    
    return outline

# Apply the function to the DataFrame and create a new column 'property_outline'
df['property_outline'] = df.apply(evaluate_listing, axis=1)

# View the updated DataFrame with the new column
print(df[['id', 'property_outline']].head())
print(df['property_outline'].iloc[0])

                    id                                   property_outline
0   739333866230665371  Property Description: Lovely vocation room, ha...
1   572612125615500056  Property Description: Cozy room in a charming ...
2             45267941  Property Description: No detailed description ...
3   838141198693830649  Property Description: No detailed description ...
4  1082660771919357919  Property Description: 425 10th Street is what ...
Property Description: Lovely vocation room, has work desk , tv, 2 windows , drawer, closet. Shared bathroom and kitchen. Kitchen includes everything a kitchen needs. Close to transportation and the park/ocean. Bars restaurants are walking distance 5 minutes Review Overview:• The property is consistently rated highly for cleanliness.• The check-in process is rated as smooth and easy by most guests.• The host is highly responsive and easy to communicate with.• The location is highly rated by guests, with many finding it convenient.• Guests believe the 

In [40]:
# Sample function to construct a full description
def construct_high_level_overview(row):
    """
    Construct a full host description by combining property type, description, 
    and neighborhood overview.
    :param row: A row from the DataFrame.
    :return: A natural language description.
    """
    # Start with property type
    property_desc = f"This is a {row['property_type']}."

    # Add property description
    if pd.notnull(row['description']):
        property_desc += f" {row['description']}"
    
    # Add neighborhood overview if available
    if pd.notnull(row['neighborhood_overview']):
        property_desc += f" The neighborhood is described as: {row['neighborhood_overview']}"
    
    return property_desc

# Assuming df is your DataFrame
df['high_level_overview'] = df.apply(construct_high_level_overview, axis=1)
print(df['high_level_overview'].iloc[1])

print(df.iloc[0])

This is a Private room in rental unit. Cozy room in a charming Sunset Park apartment. Room has a full bed (always) fresh sheets, 4 pillows, clothing rack, desk, nightstand, iron, towels, TV, air conditioning, free access to WiFi, shared kitchen and living room. Images have more details.<br /><br />Neighborhood has great spanish and asian restaurants. A short walk to Bay Ridge, wonderful middle eastern food. N/R Subway one block away. Close distance to the Bay Ridge Promenade & Industry City. <br />Cheers!<br />LGBT+
id                                                             739333866230665371
listing_url                       https://www.airbnb.com/rooms/739333866230665371
price                                                                        89.0
property_type                                         Private room in rental unit
room_type                                                            Private room
neighborhood_overview                                               

In [42]:
import torch

# Load the Sentence Transformer model
model = SentenceTransformer('distilbert-base-nli-stsb-mean-tokens') 

# for m1 mac
device = torch.device('mps' if torch.backends.mps.is_available() else 'cpu')
model.to(device)

# Function to generate embeddings in batches with a progress bar
def generate_embeddings_with_progress(df, column_name, batch_size=32):
    embeddings = []
    tqdm.pandas(desc=f"Generating embeddings for {column_name}")  # Setup progress bar description

    # Generate embeddings in batches
    for i in tqdm(range(0, len(df), batch_size), desc="Batch Encoding"):
        batch = df[column_name].iloc[i:i + batch_size].tolist()  # Select a batch of texts
        batch_embeddings = model.encode(batch, convert_to_tensor=True)  # Encode the batch
        embeddings.extend(batch_embeddings.tolist())  # Add embeddings to the list

    df[f'{column_name}_embedding'] = embeddings
    return df

# Generate embeddings for property_outline and description_summary
df = generate_embeddings_with_progress(df, 'property_outline')
df = generate_embeddings_with_progress(df, 'description_summary')
df = generate_embeddings_with_progress(df, 'high_level_overview')

# Function to compute weighted average of embeddings
def weighted_average_embedding(embeddings, weights):
    """
    Calculate the weighted average of a list of embeddings.
    :param embeddings: A list of embeddings (each embedding should be a numpy array or list).
    :param weights: A list of weights corresponding to each embedding.
    :return: The weighted average embedding as a numpy array.
    """
    # Convert the embeddings to numpy arrays
    embeddings = [np.array(embedding) for embedding in embeddings]
    
    # Compute the weighted sum of embeddings
    weighted_sum = np.sum([embedding * weight for embedding, weight in zip(embeddings, weights)], axis=0)
    
    # Normalize by dividing by the sum of the weights
    weighted_avg_embedding = weighted_sum / np.sum(weights)
    
    return weighted_avg_embedding

# Function to apply the weighted average embedding calculation for each row in the DataFrame
def apply_weighted_average(df, weight_outline=0.5, weight_description=0.3, weight_overview=0.2):
    """
    Apply weighted average embedding calculation for each row of the DataFrame.
    :param df: DataFrame containing embeddings columns.
    :param weight_outline: Weight for the 'property_outline_embedding'.
    :param weight_description: Weight for the 'description_summary_embedding'.
    :param weight_overview: Weight for the 'high_level_overview_embedding'.
    :return: The DataFrame with a new column 'average_embedding'.
    """
    weights = [weight_outline, weight_description, weight_overview]

    # Progress bar for calculating average embeddings
    tqdm.pandas(desc="Calculating weighted average embeddings")
    
    # Apply the weighted average embedding calculation row by row
    df['average_embedding'] = df.progress_apply(
        lambda row: weighted_average_embedding(
            [row['property_outline_embedding'], row['description_summary_embedding'], row['high_level_overview_embedding']],
            weights
        ), axis=1
    )
    
    return df

# Apply the function to calculate the weighted average of embeddings
df = apply_weighted_average(df, weight_outline=0.5, weight_description=0.3, weight_overview=0.2)

# View the DataFrame with the new 'average_embedding' column
print(df[['id', 'property_outline_embedding', 'description_summary_embedding', 'high_level_overview_embedding', 'average_embedding']].head())


Batch Encoding: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1174/1174 [07:27<00:00,  2.62it/s]
Batch Encoding: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1174/1174 [03:26<00:00,  5.70it/s]
Batch Encoding:  10%|██████████████████▊                                                                                                                                                                      | 119/1174 [00:54<08:01,  2.19it/s]


KeyboardInterrupt: 

In [18]:
# Save the DataFrame to a CSV file
df.to_csv('/Users/arjunathreya/Projects/airbnb_similar_listings/notebooks/listing_embeddings.csv', index=False)


In [23]:
from sklearn.cluster import DBSCAN

# Function to perform DBSCAN clustering on preprocessed data
def perform_clustering_dbscan(valid_vectors_df, similarity_threshold, min_samples):
    # Convert similarity threshold to DBSCAN's eps (epsilon) parameter 
    eps_value = 1 - similarity_threshold

    # Initialize DBSCAN with specified parameters
    db = DBSCAN(eps=eps_value, min_samples=min_samples, metric='cosine')
    
    # Fit the DBSCAN model and assign cluster labels
    valid_vectors_df['cluster'] = db.fit_predict(np.stack(valid_vectors_df['average_embedding'].values))

    # Extract the unique cluster labels
    labels = valid_vectors_df['cluster']
    n_clusters = len(set(labels)) - (1 if -1 in labels else 0)

    # Output the results of clustering
    print(f"Similarity Threshold: {similarity_threshold:.3f}, Min Samples: {min_samples} -> Number of clusters: {n_clusters}")

    # Create a mapping of cluster labels to listings
    cluster_groups = valid_vectors_df.groupby('cluster')['id'].apply(list).reset_index(name='listings_in_cluster')
    
    # Merge the cluster groups back to the valid_vectors_df
    valid_vectors_df = valid_vectors_df.merge(cluster_groups, on='cluster', how='left')

    return valid_vectors_df


min_samples = 2  # Define the minimum number of samples in a neighborhood for point classification

# Assume `listing_df` is your existing DataFrame with the `average_embedding` column
listing_df = ...  # Replace this with your actual DataFrame

# Iterate over a range of similarity thresholds to find the optimal clustering configuration
for threshold in np.arange(0.980, 0.800, -0.005):  # Adjust the range as needed
    # Filter out rows with null embeddings
    valid_vectors_df = df[df['average_embedding'].notnull()]
    result_df = perform_clustering_dbscan(valid_vectors_df, threshold, min_samples)

# Output the final DataFrame with clusters and their corresponding listings
print(result_df[['id', 'cluster', 'listings_in_cluster']].head())

Similarity Threshold: 0.950, Min Samples: 2 -> Number of clusters: 2127
Similarity Threshold: 0.945, Min Samples: 2 -> Number of clusters: 1985
Similarity Threshold: 0.940, Min Samples: 2 -> Number of clusters: 1817
Similarity Threshold: 0.935, Min Samples: 2 -> Number of clusters: 1655
Similarity Threshold: 0.930, Min Samples: 2 -> Number of clusters: 1463
                    id  cluster  \
0   739333866230665371        0   
1   572612125615500056        1   
2             45267941        2   
3   838141198693830649        2   
4  1082660771919357919       -1   

                                 listings_in_cluster  
0                     [739333866230665371, 33908121]  
1                     [572612125615500056, 45690330]  
2  [45267941, 838141198693830649, 53190949, 10202...  
3  [45267941, 838141198693830649, 53190949, 10202...  
4  [1082660771919357919, 13234457, 76008653636827...  


In [29]:
from sklearn.decomposition import PCA

import hdbscan

METRIC = 'euclidean'  # Metric used for HDBSCAN
# Function to reduce dimensionality of vector data using PCA
def apply_pca(valid_vectors_df, n_components):
    """
    Apply PCA to reduce the dimensionality of the vector data.
    :param valid_vectors_df: DataFrame containing the high-dimensional vectors.
    :param n_components: Number of principal components to use for PCA.
    :return: The dimensionality reduced data.
    """
    pca = PCA(n_components = 'mle', svd_solver = 'full')
    reduced_data = pca.fit_transform(np.stack(valid_vectors_df['average_embedding'].values))
    return reduced_data

# Function to perform clustering using HDBSCAN
def perform_clustering_hdbscan(reduced_data):
    """
    Perform clustering on the dimensionality reduced data using HDBSCAN.
    :param reduced_data: The dimensionality reduced data.
    :return: Cluster labels and the HDBSCAN clusterer object.
    """
    clusterer = hdbscan.HDBSCAN(
        min_cluster_size=2,
        min_samples=1,
        metric=METRIC,
        cluster_selection_epsilon=0.0  # Use the updated epsilon configuration
    )
    labels = clusterer.fit_predict(reduced_data)
    return labels, clusterer

n_components = 10  # Adjust based on your needs
    
# Apply PCA to reduce dimensionality
print("Applying PCA...")
reduced_data = apply_pca(df, n_components)

# Perform HDBSCAN clustering
print("Performing HDBSCAN clustering...")
labels, clusterer = perform_clustering_hdbscan(reduced_data)

valid_vectors_df_pca = df
# Add the cluster labels to the original DataFrame
valid_vectors_df_pca['cluster'] = labels

# Group listings by cluster
cluster_groups = valid_vectors_df_pca.groupby('cluster')['id'].apply(list).reset_index(name='listings_in_cluster')

# Merge cluster groups back to original DataFrame if needed
valid_vectors_df_pca = valid_vectors_df_pca.merge(cluster_groups, on='cluster', how='left')

# Output the results
print(valid_vectors_df_pca[['id', 'cluster', 'listings_in_cluster']].head())

Applying PCA...
Performing HDBSCAN clustering...
                    id  cluster  \
0   739333866230665371     2975   
1   572612125615500056     1224   
2             45267941       -1   
3   838141198693830649     5692   
4  1082660771919357919       -1   

                                 listings_in_cluster  
0                     [739333866230665371, 33908121]  
1                     [572612125615500056, 45690330]  
2  [45267941, 1082660771919357919, 10296660925320...  
3  [838141198693830649, 1127753693824936309, 8651...  
4  [45267941, 1082660771919357919, 10296660925320...  
