Embedding the "places.csv" file (each place gets one vector, computed with the sentence-transformers library)

In [None]:
import pandas as pd
from sentence_transformers import SentenceTransformer

# Read the places table from disk (one row per location)
places_csv = 'files/p2/datathon_p2/places.csv'
data = pd.read_csv(places_csv)

# Preprocessing: Combine columns to form one string per location
def preprocess_row(row):
    """Build the single text string that gets embedded for one place.

    The name, short description, tags, neighborhood and emoji of ``row`` are
    joined with single spaces, in that order. Surrounding braces are removed
    from the tags value, and its underscores and commas become spaces
    (e.g. ``"{coffee_shop,cafe}"`` -> ``"coffee shop cafe"``).

    Missing values (``None``/NaN) in any field are skipped: a missing ``tags``
    cell does not raise, and a missing name or description does not inject the
    literal text "nan" into the string that is embedded.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) providing the keys
            ``name``, ``short_description``, ``tags``, ``neighborhood`` and
            ``emoji``.

    Returns:
        The combined text with no leading or trailing whitespace.
    """
    def _clean(value):
        # NaN is a float, so check for it before using any string method.
        return str(value).strip() if pd.notnull(value) else ''

    tags = _clean(row['tags']).strip('{}').replace('_', ' ').replace(',', ' ')
    parts = (
        _clean(row['name']),
        _clean(row['short_description']),
        tags.strip(),
        _clean(row['neighborhood']),
        _clean(row['emoji']),
    )
    # Joining only the non-empty parts avoids doubled spaces for absent fields.
    return ' '.join(part for part in parts if part)


# Build the text to embed for every place (one string per row)
data['preprocessed_text'] = data.apply(preprocess_row, axis='columns')

# Load the sentence-embedding model. 'all-MiniLM-L6-v2' is only the DEFAULT
# choice; another model may capture semantic/contextual meaning better.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Encode all place texts in one call
place_texts = data['preprocessed_text'].tolist()
embeddings = model.encode(place_texts)

# Optionally: store the vectors side by side with the original columns
import numpy as np
embedding_df = pd.DataFrame(embeddings)
data_with_embeddings = pd.concat((data, embedding_df), axis='columns')
data_with_embeddings.to_csv('places_with_embeddings.csv', index=False)

# The embeddings are now available for similarity search


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


Embedding all the Review Data:

In [None]:
import pandas as pd
from sentence_transformers import SentenceTransformer
import numpy as np

# 1. Read the reviews table from disk
reviews_csv = 'files/p2/datathon_p2/reviews.csv'
reviews_df = pd.read_csv(reviews_csv)

# 2. Create the sentence-embedding model used for every review
model_name = 'all-MiniLM-L6-v2'
model = SentenceTransformer(model_name)

# 3. Mean review embedding for one place
def get_place_review_embedding(group):
    """Return one averaged review embedding for a single place.

    Every non-null entry of ``group['review_text']`` is encoded with the
    module-level ``model`` (``normalize_embeddings=True``) and the resulting
    vectors are averaged element-wise, giving one vector for the location.
    If no review text remains after dropping nulls, a zero vector of the
    model's embedding dimension is returned instead.
    """
    texts = group['review_text'].dropna().tolist()
    if texts:
        vectors = model.encode(texts, normalize_embeddings=True)
        return np.mean(vectors, axis=0)
    # No usable reviews for this place: fall back to an all-zero vector
    return np.zeros(model.get_sentence_embedding_dimension())

# 4. Group by place_id and compute the mean embedding of each group.
# Only 'review_text' is selected before apply(): the helper reads nothing
# else, and keeping the grouping column out of the frames handed to apply()
# avoids the pandas warning about apply() operating on grouping columns.
place_review_embeddings = (
    reviews_df.groupby('place_id')[['review_text']]
    .apply(get_place_review_embedding)
)

# 5. (Optional) Convert to a DataFrame for easier handling:
# one row per place_id, one column per embedding dimension
place_review_embeddings_df = pd.DataFrame(
    place_review_embeddings.tolist(),
    index=place_review_embeddings.index,
)

# 6. Save to a file if you want (the place_id index is written as the first column)
place_review_embeddings_df.to_csv('place_review_embeddings.csv')

# print("Done! Now you have 1 review vector per place.")


  place_review_embeddings = reviews_df.groupby('place_id').apply(get_place_review_embedding)


Done! Now you have 1 review vector per place.


Sorting the review embedding vectors by place_id (numeric order)

In [6]:
import pandas as pd

# Load the current review embeddings file; the first CSV column becomes the index
review_embeddings_df = pd.read_csv('files/p2/datathon_p2/place_review_embeddings.csv', index_col=0)

# Sort the index (place_id) by the number it contains, not lexicographically.
# The pattern is a raw string so that '\d' is not treated as an (invalid)
# string escape sequence, and expand=False makes extract() return the
# captured digits directly instead of a one-column DataFrame.
review_embeddings_df = review_embeddings_df.sort_index(
    key=lambda idx: idx.str.extract(r'(\d+)', expand=False).astype(int)
)

# Save the sorted table under a new file name
review_embeddings_df.to_csv('place_review_embeddings_sorted.csv')


Reducing the place data to the embedding columns only (no text)

In [7]:
import pandas as pd

# Load the places table that has the embedding columns appended
df = pd.read_csv('files/p2/datathon_p2/places_with_embeddings.csv')

# Keep only the embedding columns (the ones whose header is a plain number).
# They are detected from the header instead of assuming 384 dimensions, so
# this keeps working if a model with a different vector size is used.
embedding_columns = [col for col in df.columns if str(col).isdigit()]
if not embedding_columns:
    # Fail loudly rather than silently writing an empty file.
    raise ValueError('no numerically named embedding columns found in places_with_embeddings.csv')
embeddings_only = df[embedding_columns]

# Save to a new CSV
embeddings_only.to_csv('embeddings_only.csv', index=False)
