In [43]:
import json
import numpy as np
import pandas as pd
import re

In [35]:
# Parse the JSON file of precomputed embeddings into memory.
# Later cells look entries up with string keys (embeddings[str(...)]).
with open('../data/embedding_v1.2_short.json', 'r') as f:
    embeddings = json.loads(f.read())

In [36]:
# Recipe table, indexed by RecipeId so a row can be fetched with df.loc[<recipe id>, ...].
df = pd.read_csv(
    '../data/recipes_selected_summarized.csv',
    index_col='RecipeId',
)

In [56]:
def calc_similarity(query_emb, embeddings):
    """
    Rank stored embeddings by cosine similarity to a query embedding.

    Input:
        query_emb: a 1-D sequence of numbers representing the query embedding
        embeddings: either a dict mapping key -> embedding, or a matrix
            (2-D array / list of lists) whose rows are embeddings
    Output:
        returns a tuple (emb_keys, similarity):
            emb_keys: keys of `embeddings` ordered from most to least similar;
                for matrix input the keys are the integer row indices
            similarity: list of floats, the cosine similarity of the query and
                the corresponding entry of emb_keys, in descending order
        Entries with equal similarity keep their original relative order.
        If the query or a row has zero norm, its similarity is reported as 0.0.
        Empty `embeddings` yields ([], []).
    """
    if isinstance(embeddings, dict):
        emb_keys = list(embeddings.keys())
        matrix = np.array([embeddings[k] for k in emb_keys], dtype=float)
    else:
        matrix = np.asarray(embeddings, dtype=float)
        # Matrix input carries no labels, so rows are identified by position.
        emb_keys = list(range(len(matrix)))

    if matrix.size == 0:
        return [], []

    query = np.asarray(query_emb, dtype=float)

    # cosine = (row . query) / (|row| * |query|), computed for all rows at once.
    dots = matrix @ query
    norms = np.linalg.norm(matrix, axis=1) * np.linalg.norm(query)
    # Avoid division by zero: rows (or a query) with zero norm get similarity 0.
    similarity = np.divide(dots, norms, out=np.zeros_like(dots), where=norms > 0)

    # Stable sort on the negated scores gives descending order without
    # ever comparing the keys themselves.
    order = np.argsort(-similarity, kind='stable')

    return [emb_keys[i] for i in order], similarity[order].tolist()

def str2list(s: str) -> list:
    """Collect every double-quoted substring of `s`, in order of appearance.

    The surrounding quotes are dropped. Matching is non-greedy, so each
    opening quote pairs with the nearest following quote. Returns an empty
    list when `s` contains no quoted substring.
    """
    quoted = re.compile(r'"(.*?)"')
    return [match.group(1) for match in quoted.finditer(s)]

def printkwords(id):
    """Print the recipe id, the keyword count and the keyword list of one recipe.

    `id` is converted with int() before use, so a numeric string is accepted.
    The keywords are read from the 'Keywords' column of the module-level `df`
    and split into a list with str2list.
    """
    recipe_id = int(id)
    raw_keywords = df.loc[recipe_id, 'Keywords']
    keywords = str2list(raw_keywords)

    print('RecipeId:', recipe_id)
    print('num keywords:', len(keywords))
    print(keywords)
    

In [57]:
# Show the keywords stored for recipe 45809.
printkwords(45809)

RecipeId: 45809
num keywords: 9
['Chicken', 'Poultry', 'Meat', 'Chinese', 'Asian', 'High Protein', 'High In...', '< 60 Mins', 'Easy']


In [54]:
# Rank every stored embedding against one query recipe.
# NOTE(review): this cell originally used a bare `id`, which is never assigned in
# the notebook; on a fresh kernel it resolves to the builtin function and the
# lookup fails with KeyError. 45809 is assumed from the printkwords(45809) cell
# above - confirm this is the intended query recipe.
query_id = 45809
emb_keys, similarity = calc_similarity(embeddings[str(query_id)], embeddings)

In [63]:
# Inspect the entry at index 10 of the similarity-ranked keys returned by
# calc_similarity (ranking is descending, so lower indices are closer matches).
printkwords(emb_keys[10])

RecipeId: 277167
num keywords: 5
['Poultry', 'Meat', 'High Protein', 'High In...', 'Easy']
