Embedding the "places.csv" file (each place gets a vector using sentence-transformers library)

In [2]:
import pandas as pd
from sentence_transformers import SentenceTransformer

# Load data
data = pd.read_csv('files/p2/datathon_p2/places.csv')

# Preprocessing: Combine columns to form one string per location
def preprocess_row(row):
    tags = row['tags'].strip('{}').replace('_', ' ').replace(',', ' ')
    neighborhood = row['neighborhood'] if pd.notnull(row['neighborhood']) else ''
    emoji = row['emoji'] if pd.notnull(row['emoji']) else ''
    text = f"{row['name']} {row['short_description']} {tags} {neighborhood} {emoji}"
    return text.strip()


data['preprocessed_text'] = data.apply(preprocess_row, axis=1)

# Initialize model, the model being used is a DEFAULT (this model can be changed)
# Other models may provide better semantic/contextual meaning to each sentence
model = SentenceTransformer('all-MiniLM-L6-v2')

# Embed the preprocessed text
embeddings = model.encode(data['preprocessed_text'].tolist())

# Optionally: Save the embeddings alongside original data
import numpy as np
embedding_df = pd.DataFrame(embeddings)
data_with_embeddings = pd.concat([data, embedding_df], axis=1)
data_with_embeddings.to_csv('places_with_embeddings.csv', index=False)

# Now you have embeddings that you can use for similarity search


RuntimeError: Failed to import transformers.modeling_utils because of the following error (look up to see its traceback):
Traceback (most recent call last):
  File "c:\Users\ajits\AppData\Local\Programs\Python\Python311\Lib\site-packages\tensorflow\python\pywrap_tensorflow.py", line 73, in <module>
    from tensorflow.python._pywrap_tensorflow_internal import *
ImportError: DLL load failed while importing _pywrap_tensorflow_internal: A dynamic link library (DLL) initialization routine failed.


Failed to load the native TensorFlow runtime.
See https://www.tensorflow.org/install/errors for some common causes and solutions.
If you need help, create an issue at https://github.com/tensorflow/tensorflow/issues and include the entire stack trace above this error message.

Embedding all the Review Data:

In [None]:
import pandas as pd
from sentence_transformers import SentenceTransformer
import numpy as np

# 1. Load the reviews CSV
reviews_df = pd.read_csv('files/p2/datathon_p2/reviews.csv')

# 2. Initialize the embedding model
model = SentenceTransformer('all-MiniLM-L6-v2')

# 3. Function to get mean review embedding for a place
# Embeds each review for each place, and then takes the avg vector from all reviews
# End result: One embedded vector for each location
def get_place_review_embedding(group):
    review_texts = group['review_text'].dropna().tolist()
    if len(review_texts) == 0:
        return np.zeros(model.get_sentence_embedding_dimension())  # Handle case with no reviews
    embeddings = model.encode(review_texts, normalize_embeddings=True)
    return np.mean(embeddings, axis=0)

# 4. Group by place_id and compute mean embedding
place_review_embeddings = reviews_df.groupby('place_id').apply(get_place_review_embedding)

# 5. (Optional) Convert to a DataFrame for easier handling
place_review_embeddings_df = pd.DataFrame(place_review_embeddings.tolist(), index=place_review_embeddings.index)

# 6. Save to a file if you want
place_review_embeddings_df.to_csv('place_review_embeddings.csv')

# print("Done! Now you have 1 review vector per place.")


  place_review_embeddings = reviews_df.groupby('place_id').apply(get_place_review_embedding)


Done! Now you have 1 review vector per place.


Sorting the embedded review vectors in order

In [6]:
import pandas as pd

# Load your current review embeddings file
review_embeddings_df = pd.read_csv('files/p2/datathon_p2/place_review_embeddings.csv', index_col=0)

# Sort the index (place_id) numerically
review_embeddings_df = review_embeddings_df.sort_index(key=lambda x: x.str.extract('(\d+)').astype(int)[0])

# Save it back (overwriting the file or saving as a new one)
review_embeddings_df.to_csv('place_review_embeddings_sorted.csv')


making place data only embeddings, no text,

In [7]:
import pandas as pd

# Load your data
df = pd.read_csv('files/p2/datathon_p2/places_with_embeddings.csv')

# Keep only the embedding columns (which are the ones named with numbers)
embedding_columns = [str(i) for i in range(384)]
embeddings_only = df[embedding_columns]

# Save to a new CSV
embeddings_only.to_csv('embeddings_only.csv', index=False)


In [10]:
import pandas as pd

# Step 1: Load your structured embeddings
structured = pd.read_csv('files/p2/datathon_p2/place_structured.csv')

# Step 2: Load the place_ids from your review_embeddings.csv (or wherever you have them)
reviews = pd.read_csv('files/p2/datathon_p2/place_review_embeddings_sorted.csv')
place_ids = reviews['place_id']

# Step 3: Add the place_id column
structured['place_id'] = place_ids.values

# Step 4 (optional): Move place_id to be the first column
cols = ['place_id'] + [col for col in structured.columns if col != 'place_id']
structured = structured[cols]

# Step 5: Save the new CSV if you want
structured.to_csv('place_structured_with_ids.csv', index=False)


Combining structured and unstructured (reviews)

In [None]:
import pandas as pd
import numpy as np

# Load your CSVs (replace with actual paths)
reviews_df = pd.read_csv("files\p2\datathon_p2\combined_embeddings.csv")
locations_df = pd.read_csv("files\p2\datathon_p2\kruncated_media_embeddings.csv")

# Ensure both DataFrames are sorted by place_id
reviews_df = reviews_df.sort_values('place_id')
locations_df = locations_df.sort_values('place_id')

# Extract the columns that correspond to the embeddings (skip the 'place_id' column)
review_vectors = reviews_df.drop(columns='place_id').values
location_vectors = locations_df.drop(columns='place_id').values

# Ensure the vectors are of the same shape
assert review_vectors.shape == location_vectors.shape, "Embeddings have different dimensions!"

# Average the two vectors for each place_id
combined_vectors = (review_vectors + location_vectors) / 2

# Add the 'place_id' column back
combined_df = pd.DataFrame(combined_vectors, columns=[f'feature_{i}' for i in range(combined_vectors.shape[1])])
combined_df['place_id'] = reviews_df['place_id']

cols = ['place_id'] + [col for col in combined_df.columns if col != 'place_id']
combined_df = combined_df[cols]

# Save the combined DataFrame to a new CSV
combined_df.to_csv('FINAL_COMBINED.csv', index=False)



OSError: [Errno 22] Invalid argument: 'files\\p2\\datathon_p2\truncated_media_embeddings.csv'

Image embedding

In [18]:
import pandas as pd
from transformers import CLIPProcessor, CLIPModel
from PIL import Image
import torch, requests, io
from pathlib import Path

CSV_IN  = Path("files\\p2\\datathon_p2\\media.csv")
CSV_OUT = Path("places_with_image_embeddings.csv")

df = pd.read_csv(CSV_IN)

# Load CLIP
clip_model     = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
clip_model.eval()

def get_image_emb(url: str):
    """Given an image URL, fetch and embed the image using CLIP"""
    resp = requests.get(url, timeout=10)
    resp.raise_for_status()
    img = Image.open(io.BytesIO(resp.content)).convert("RGB")
    inputs = clip_processor(images=img, return_tensors="pt")
    with torch.no_grad():
        emb = clip_model.get_image_features(**inputs)
    return emb[0].cpu().numpy()

# Function to process images for each place_id
def get_first_two_image_embeddings(group):
    """Embeds at most the first two images for each place_id."""
    image_embeddings = []
    for idx, row in group.iterrows():
        if len(image_embeddings) >= 2:
            break
        try:
            emb = get_image_emb(row["media_url"])
            image_embeddings.append(emb)
        except Exception as e:
            print(f"Error embedding image {row['media_url']}: {e}")
            continue
    return image_embeddings

print("About to start embedding images")

# Group by place_id and apply image embedding function
all_embeddings = []
for place_id, group in df.groupby("place_id"):
    image_embeddings = get_first_two_image_embeddings(group)
    
    # If embeddings exist, add to the result list
    if image_embeddings:
        embeddings_flat = [emb.tolist() for emb in image_embeddings]  # Flatten the embeddings
        # Pad to 2 images (with None or zeros) if fewer than 2
        while len(embeddings_flat) < 2:
            embeddings_flat.append([None] * len(image_embeddings[0]))  # Padding with None or zeros
        all_embeddings.append([place_id] + embeddings_flat)

# Convert all the embeddings to DataFrame
emb_df = pd.DataFrame(
    all_embeddings,
    columns=["place_id", "img_emb_0", "img_emb_1"]
)

# Merge original df (with only place_id and media_url) with image embeddings
out = pd.merge(df[["place_id", "media_url"]], emb_df, on="place_id", how="right")

# Save to CSV
out.to_csv(CSV_OUT, index=False)
print(f"Wrote {CSV_OUT}")


About to start embedding images


KeyboardInterrupt: 

ALL combined embeddings

In [26]:
import pandas as pd

# Load the media_embeddings CSV file into a Pandas DataFrame
media_embeddings_df = pd.read_csv("files\p2\datathon_p2\media_embeddings.csv")

# Strip any leading or trailing spaces from column names
media_embeddings_df.columns = media_embeddings_df.columns.str.strip()

# Remove the 'place_' prefix and convert to integer type
media_embeddings_df['place_id'] = media_embeddings_df['place_id'].str.replace('place_', '').astype(int)

# Extract place_ids from the DataFrame
place_ids_media = media_embeddings_df['place_id'].values

# Expected place_ids range from 1 to 1500
expected_place_ids = set(range(1, 1501))

# Actual place_ids in media embeddings
actual_place_ids = set(place_ids_media)

# Find the missing place_id(s)
missing_place_ids = expected_place_ids - actual_place_ids

print("Missing place_id(s):", missing_place_ids)


Missing place_id(s): {1170}


In [29]:
import pandas as pd

# Load the media_embeddings CSV file into a Pandas DataFrame
media_embeddings_df = pd.read_csv("files\p2\datathon_p2\media_embeddings.csv")

# Strip any leading or trailing spaces from column names
media_embeddings_df.columns = media_embeddings_df.columns.str.strip()

# Remove the 'place_' prefix and convert to integer type for place_id
media_embeddings_df['place_id'] = media_embeddings_df['place_id'].str.replace('place_', '').astype(int)

# Drop the 'media_url' column
media_embeddings_df = media_embeddings_df.drop(columns=['media_url'])

# Save the updated DataFrame to a new CSV file without the 'media_url' column
media_embeddings_df.to_csv('media_embeddings_no_url.csv', index=False)

# Optionally, print the first few rows of the new DataFrame to confirm
print(media_embeddings_df.head())



   place_id  img_emb_0  img_emb_1  img_emb_2  img_emb_3  img_emb_4  img_emb_5  \
0         1  -0.081985   0.172941   0.041037   0.176305   0.160374  -0.208215   
1         2   0.049195   0.326742   0.302598   0.125595  -0.136061   0.116439   
2         3  -0.479146   0.542403  -0.465966   0.295334   0.247133   0.062442   
3         4   0.291114  -0.070939   0.313817   0.026109  -0.069203  -0.023226   
4         5  -0.699818   0.156307   0.039564   0.249764   0.676135  -0.315518   

   img_emb_6  img_emb_7  img_emb_8  ...  img_emb_502  img_emb_503  \
0   0.219407   0.459320   0.010887  ...     0.011966     0.242671   
1   0.151134   0.097144  -0.118955  ...     0.061003     0.031141   
2   0.389605   0.492652  -0.219052  ...     0.025372     0.197852   
3   0.155666   0.401505  -0.339249  ...    -0.185449    -0.000370   
4   0.414011   0.533526  -0.256194  ...     0.138759     0.397147   

   img_emb_504  img_emb_505  img_emb_506  img_emb_507  img_emb_508  \
0     0.314136     0.034457 

In [33]:
import pandas as pd

# Load the media embeddings
media_embeddings_df = pd.read_csv("files/p2/datathon_p2/media_embeddings_no_url.csv")

# Assuming 'place_id' is the first column, and the embeddings start from the second column
media_embeddings_df.set_index('place_id', inplace=True)

# Check the original shape
print("Original Media Embeddings Shape: ", media_embeddings_df.shape)

# Truncate the media embeddings from 512D to 384D by selecting only the first 384 dimensions
truncated_media_embeddings_df = media_embeddings_df.iloc[:, :384]

# Check the new shape
print("Truncated Media Embeddings Shape: ", truncated_media_embeddings_df.shape)

# Save the truncated embeddings to a new CSV
truncated_media_embeddings_df.to_csv("truncated_media_embeddings.csv")
print("Truncated embeddings saved to 'truncated_media_embeddings.csv'")


Original Media Embeddings Shape:  (1499, 512)
Truncated Media Embeddings Shape:  (1499, 384)
Truncated embeddings saved to 'truncated_media_embeddings.csv'


In [35]:
import pandas as pd
import numpy as np

# Load the media embeddings and combined embeddings
media_embeddings_df = pd.read_csv("files/p2/datathon_p2/truncated_media_embeddings.csv")
combined_embeddings_df = pd.read_csv("files/p2/datathon_p2/combined_embeddings.csv")

# Assuming 'place_id' is the first column, and the embeddings start from the second column
media_embeddings_df.set_index('place_id', inplace=True)
combined_embeddings_df.set_index('place_id', inplace=True)

# Check the shapes of the dataframes
print("Media Embeddings Shape: ", media_embeddings_df.shape)
print("Combined Embeddings Shape: ", combined_embeddings_df.shape)

# Ensure both have the same number of rows (place_ids)
assert media_embeddings_df.shape[0] == combined_embeddings_df.shape[0], "Number of place_ids don't match!"

# Now calculate the average of the embeddings for each place_id
# Element-wise sum of the two embeddings and divide by 2 for average
final_embeddings_df = pd.DataFrame(index=media_embeddings_df.index)

# Element-wise sum and averaging
final_embeddings_df['combined_embedding'] = (media_embeddings_df.values + combined_embeddings_df.values) / 2

# Save the final averaged embeddings to a CSV
final_embeddings_df.to_csv("final_combined_embeddings.csv")
print("Final averaged embeddings saved to 'final_combined_embeddings.csv'")


Media Embeddings Shape:  (1499, 384)
Combined Embeddings Shape:  (1499, 384)


ValueError: Expected a 1D array, got an array with shape (1499, 384)