In [5]:
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
from ast import literal_eval
from matplotlib.colors import ListedColormap, BoundaryNorm, Normalize
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import umap.umap_ as umap
import matplotlib

matplotlib.rc('font', family='Heiti TC')

In [2]:
# read data from csv
df = pd.read_csv('test.csv')

In [None]:
## convert embeddings as numpy array
df["embeddings"] = df.embeddings.apply(literal_eval).apply(np.array)

In [19]:
## calculate median of embeddings among the same place_id
median = df.groupby('place_id')['embeddings'].agg(lambda x: np.median(np.array(list(x)), axis=0)).reset_index()


In [20]:
median

Unnamed: 0,place_id,embeddings
0,ChIJ-4CmT6WuQjQRBRe2tNOILh0,"[-0.008874298073351383, -0.013414854183793068,..."
1,ChIJ-4eST8arQjQR4NqXbdGb_aU,"[0.015194365754723549, -0.02273879013955593, -..."
2,ChIJ-QdKpOurQjQRbHjcDMWkdL0,"[0.0029082802357152104, -0.007561947451904416,..."
3,ChIJ-RKj6ralQjQRG7Yjm1KoVHY,"[-0.0019662256818264723, -0.010632794350385666..."
4,ChIJ-SQqF82rQjQRN0ZxOki-HWU,"[-0.009688245598226786, -0.01176051003858447, ..."
...,...,...
622,ChIJzQozOQUXbjQR3bU10EyZrTQ,"[-0.0021252273581922054, -0.006608151132240891..."
623,ChIJzRhSxFypQjQR6SbAuChqYXQ,"[-0.0036726826801896095, -0.011941013857722282..."
624,ChIJzcy5eRSpQjQRLU57xTHyiME,"[0.0005860806413693354, -0.01316858734935522, ..."
625,ChIJzxnCGWCpQjQRtvEHWFzNEz0,"[0.001134225050918758, -0.013333929236978292, ..."


In [26]:
## create a place-to-place similarity DF
from sklearn.metrics.pairwise import cosine_similarity

# Step 1: Create a matrix of median embeddings
embeddings_matrix = np.vstack(median['embeddings'].values)

# Step 2: Calculate the cosine similarity matrix
similarity_matrix = cosine_similarity(embeddings_matrix)

# Step 3: Convert the similarity matrix to a DataFrame
place_ids = median['place_id'].tolist()
np.fill_diagonal(similarity_matrix, -np.inf)

# get the indices of the top 5 elements, excluding the diagonal
top_5_indices = np.argpartition(similarity_matrix, -5, axis=1)[:, -5:]

# create a list for the new DataFrame
rows_list = []

# iterate through each place_id and its top 5 indices
for idx, indices in enumerate(top_5_indices):
    place_id1 = place_ids[idx]
    for index in indices:
        place_id2 = place_ids[index]
        similarity = similarity_matrix[idx][index]
        # Create a dictionary for the row and append to the list
        row = {'place_id1': place_id1, 'place_id2': place_id2, 'similarity': similarity}
        rows_list.append(row)

# create a DataFrame from the list of rows
similarity_df = pd.DataFrame(rows_list)

# sort by place_id1 and similarity score to get the top 5 per place_id1
similarity_df = similarity_df.sort_values(by=['place_id1', 'similarity'], ascending=[True, False])

top_5_similarity_df = similarity_df.groupby('place_id1').head(5).reset_index(drop=True)

top_5_similarity_df

Unnamed: 0,place_id1,place_id2,similarity
0,ChIJ-4CmT6WuQjQRBRe2tNOILh0,ChIJT2HPWp89aTQRdEhufJsz38o,0.972897
1,ChIJ-4CmT6WuQjQRBRe2tNOILh0,ChIJKb-sdjDJbjQRXC10O4l5MNw,0.972789
2,ChIJ-4CmT6WuQjQRBRe2tNOILh0,ChIJvwHLW409aTQR6sc-hS3hG5I,0.969981
3,ChIJ-4CmT6WuQjQRBRe2tNOILh0,ChIJZxZhc3ypQjQRu17X6qIea1g,0.969182
4,ChIJ-4CmT6WuQjQRBRe2tNOILh0,ChIJbdWdGTunQjQR5NWpzoTWfuA,0.969055
...,...,...,...
3130,ChIJzy7rpAcdaDQRjCFbE88ktT0,ChIJS_b4z7MPaTQRkptGlM4_PLQ,0.984700
3131,ChIJzy7rpAcdaDQRjCFbE88ktT0,ChIJidktxYQXaTQR8w8yfVwCQyU,0.982228
3132,ChIJzy7rpAcdaDQRjCFbE88ktT0,ChIJqTxNMRCnQjQR3bkYmKwA3T4,0.982146
3133,ChIJzy7rpAcdaDQRjCFbE88ktT0,ChIJczEQFT5OXTQRtFZzyAk44SI,0.981643


In [27]:
from Database import MongoDBConnection
## connect to cloud MongoDB
try:
    connection = MongoDBConnection()
    collection = connection.get_collection('ramen_info')
except Exception as e:
    print(e)

In [29]:
from pymongo import UpdateOne
operations = []
for place_id1, group_df in top_5_similarity_df.groupby('place_id1'):
    # Extract the top 5 place_id2 as a list
    top_places = group_df['place_id2'].tolist()
    # Prepare the update operation
    operation = UpdateOne(
        {'place_id': place_id1},
        {'$set': {'top_similar_places': top_places}},
        upsert=True
    )
    # Add the operation to the list
    operations.append(operation)


In [30]:
operations

[UpdateOne({'place_id': 'ChIJ-4CmT6WuQjQRBRe2tNOILh0'}, {'$set': {'top_similar_places': ['ChIJT2HPWp89aTQRdEhufJsz38o', 'ChIJKb-sdjDJbjQRXC10O4l5MNw', 'ChIJvwHLW409aTQR6sc-hS3hG5I', 'ChIJZxZhc3ypQjQRu17X6qIea1g', 'ChIJbdWdGTunQjQR5NWpzoTWfuA']}}, True, None, None, None),
 UpdateOne({'place_id': 'ChIJ-4eST8arQjQR4NqXbdGb_aU'}, {'$set': {'top_similar_places': ['ChIJecbRWuaVbjQR6RZ8DcottTU', 'ChIJtS7PPxA9aTQRtHuysWag42M', 'ChIJ16nndKKrQjQREv9guZ5wBWY', 'ChIJG391PgAfaDQRUfWYYZensnQ', 'ChIJP9viC28DaDQRTBokHHuK9kc']}}, True, None, None, None),
 UpdateOne({'place_id': 'ChIJ-QdKpOurQjQRbHjcDMWkdL0'}, {'$set': {'top_similar_places': ['ChIJ11E1o3OrQjQRSWCo0KZNkxo', 'ChIJHxY9HQmpQjQRm2UjwSxxR1U', 'ChIJ4QFGQPypQjQRiaC4DF4togs', 'ChIJh4d5b7CuQjQRvEGhf7bj0oQ', 'ChIJsckYCqqpQjQRCI1PN8UrXFY']}}, True, None, None, None),
 UpdateOne({'place_id': 'ChIJ-RKj6ralQjQRG7Yjm1KoVHY'}, {'$set': {'top_similar_places': ['ChIJe67raCx3bjQRAhj4qrkZHqo', 'ChIJ07iGYR4CaDQRNeguVrgLMkU', 'ChIJGSBm_z0iaDQROevH6vvEytA', 'C

In [31]:
## upsert top_similar_places into MongoDB by their place_id
result = collection.bulk_write(operations,ordered=False)