**Author :  Ong Cheng Kei TP055620** <br>
**Description :**
<br>This file contains code that generates text embeddings used to compute the similarity between ingredient names in two databases: Nutrition5k and USDA-FNDDS.<br>The module exposes functions that can be called to find the most similar ingredient in either database.

In [1]:
from pathlib import Path
from pprint import pprint
from types import SimpleNamespace
from IPython.display import display
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub

In [2]:
# Setting root directory to FoodNet
# Look at the current working directory first, then at each ancestor from the
# nearest to the farthest, and keep the first directory named "FoodNet".
dir_parents = Path.cwd().parents
root_dir = next(
    (
        candidate
        for candidate in (Path.cwd(), *dir_parents)
        if candidate.name == "FoodNet"
    ),
    None,
)
# When nothing matches, fail with the explanatory message below rather than
# with an IndexError raised by indexing past the last ancestor.
assert (
    root_dir is not None
), "Unable to find FoodNet root directory. Please change the root directory or set the working directory under the FoodNet root directory."

### Google Universal Sentence Encoder for text embeddings

In [3]:
# TF-Hub handle of the Universal Sentence Encoder (version 4).
module_url = "https://tfhub.dev/google/universal-sentence-encoder/4"
# Load the encoder once at module level; embed() reuses this object on every call.
model = hub.load(module_url)


def embed(input):
    """Encode a batch of text with the module-level sentence encoder.

    Args:
        input: Batch of strings to encode, e.g. a list of ingredient names.
            The parameter name shadows the ``input`` builtin; it is kept so
            that existing keyword callers continue to work.

    Returns:
        The value produced by ``model`` for the batch. For the two-string
        sample call in this notebook that is a float32 tensor of shape (2, 512).
    """
    embeddings = model(input)
    return embeddings


# Confirmation message printed only when the notebook is executed directly
# (i.e. __name__ is "__main__").
if __name__ == "__main__":
    print("module %s loaded" % module_url)

module https://tfhub.dev/google/universal-sentence-encoder/4 loaded


### Create embeddings for all items in nutrition database

In [4]:
# Smoke test: embed two sample phrases and show the resulting tensor.
if __name__ == "__main__":
    display(embed(["poultry and chicken", "i love milk"]))

<tf.Tensor: shape=(2, 512), dtype=float32, numpy=
array([[ 0.00842835,  0.0064201 ,  0.05519093, ...,  0.00923467,
         0.04580159, -0.04145261],
       [ 0.0491173 , -0.00836498,  0.02530873, ..., -0.03663804,
         0.028909  , -0.07283901]], dtype=float32)>

In [5]:
fndds_dir = root_dir / "Food Datasets" / "USDA-FNDDS"
# The category file holds one FNDDS category name per line.
# resolve(strict=True) raises immediately if the file does not exist.
with open((fndds_dir / "cleaned_food_category.txt").resolve(strict=True), "r") as file:
    # Keep every non-empty line. Unconditionally popping the last element
    # would silently drop a real category whenever the file does not end
    # with a newline.
    fndds_category = [line for line in file.read().split("\n") if line]

In [6]:
# Show the loaded category names when the notebook is run directly.
if __name__ == "__main__":
    pprint(fndds_category)

['dips and gravies and other sauces',
 'meat mixed dishes',
 'tomato-based pasta sauces',
 'nonfat flavored milk',
 'egg rolls and dumplings and sushi',
 'mustard and other condiments',
 'cheese sandwiches',
 'pretzels or snack mix',
 'ice cream and frozen dairy desserts',
 'vegetable juice',
 'tortilla and corn and other chips',
 'other mexican mixed dishes',
 'candy not containing chocolate',
 'cream cheese and sour cream and whipped cream',
 'soy-based condiments',
 'olives and pickles and pickled vegetables',
 'fish',
 'reduced fat milk',
 'lowfat milk',
 'milk substitutes',
 'bananas',
 'poultry mixed dishes',
 'burritos and tacos',
 'pasta and noodles and cooked grains',
 'salad dressings and vegetable oils',
 'fried vegetables',
 'soft drinks',
 'beef but excludes ground',
 'gelatins and ices and sorbets',
 'sausages',
 'cabbage',
 'cakes and pies',
 'processed soy products',
 'chicken in whole pieces',
 'nonfat milk',
 'cheese',
 'corn',
 'macaroni and cheese',
 'margarine',
 '

In [7]:
# The cleaned FNDDS nutrient table is tab-separated. resolve(strict=True)
# raises immediately if the file is missing.
fndds_nutrient_csv = fndds_dir / "cleaned_fndds_nutrient_values.csv"
df_fndds_nutrient_values = pd.read_csv(
    fndds_nutrient_csv.resolve(strict=True), sep="\t"
)

In [8]:
# Show the FNDDS nutrient table when the notebook is run directly.
if __name__ == "__main__":
    display(df_fndds_nutrient_values)

Unnamed: 0,Main food description,WWEIA Category description,Energy (kcal),Protein (g),Carbohydrate (g),Total Fat (g)
0,"milk, not further specified",reduced fat milk,0.51,0.0334,0.0487,0.0199
1,"milk, whole",whole milk,0.60,0.0328,0.0467,0.0320
2,"milk, low sodium, whole",whole milk,0.61,0.0310,0.0446,0.0346
3,"milk, calcium fortified, whole",whole milk,0.60,0.0328,0.0467,0.0320
4,"milk, calcium fortified, low fat",lowfat milk,0.43,0.0338,0.0519,0.0095
...,...,...,...,...,...,...
6085,gin,liquor and cocktails,2.31,0.0000,0.0000,0.0000
6086,rum,liquor and cocktails,2.31,0.0000,0.0000,0.0000
6087,rum cooler,liquor and cocktails,0.68,0.0000,0.1007,0.0000
6088,vodka,liquor and cocktails,2.31,0.0000,0.0000,0.0000


In [9]:
nutrition5k_dir = root_dir / "Food Datasets" / "nutrition5k"
# resolve(strict=True) raises immediately if the metadata file is missing.
ingredients_metadata_csv = nutrition5k_dir / "metadata" / "ingredients_metadata.csv"
df_nutrition5k_nutrient_values = pd.read_csv(
    ingredients_metadata_csv.resolve(strict=True)
)

In [10]:
# Preview the first rows of the Nutrition5k ingredient table when run directly.
if __name__ == "__main__":
    display(df_nutrition5k_nutrient_values.head())

Unnamed: 0,ingr,id,cal/g,fat(g),carb(g),protein(g)
0,cottage cheese,1,0.98,0.043,0.034,0.11
1,strawberries,2,0.33,0.003,0.08,0.007
2,garden salad,3,0.646,0.034,0.032,0.061
3,bacon,4,5.41,0.42,0.014,0.37
4,potatoes,5,0.77,0.001,0.17,0.02


In [14]:
# Embed every FNDDS category name in a single batch. The top-k index computed
# over this result is later used to index fndds_category, so the order matters.
fndds_category_embeddings = embed(fndds_category)

In [15]:
# Per-category lookup tables, both keyed by FNDDS category name:
#   fndds_description[category]            -> list of food descriptions
#   fndds_description_embeddings[category] -> embeddings of that list (same order)
fndds_description_embeddings = {}
fndds_description = {}
category_column = df_fndds_nutrient_values["WWEIA Category description"]
for category in fndds_category:
    all_food_in_category = df_fndds_nutrient_values.loc[
        category_column == category, "Main food description"
    ].tolist()
    fndds_description_embeddings[category] = embed(all_food_in_category)
    fndds_description[category] = all_food_in_category

In [16]:
# Ingredient names in table order; list position i is row position i of
# df_nutrition5k_nutrient_values.
nutrition5k_ingredient = df_nutrition5k_nutrient_values["ingr"].tolist()
# Embed all ingredient names in a single batch, in the same order as the list.
nutrition5k_ingredient_embeddings = embed(nutrition5k_ingredient)

In [17]:
# Print the Nutrition5k embedding tensor when the notebook is run directly.
# NOTE(review): the saved output of this cell reports 100-dimensional rows,
# whereas the encoder demo earlier in the notebook yields 512 — the stored
# output looks stale; re-run on a fresh kernel to confirm.
if __name__ == "__main__":
    print("Below is an overview of Nutrition5k ingredient embeddings\n")
    pprint(nutrition5k_ingredient_embeddings)

Below is an overview of Nutrition5k ingredient embeddings

<tf.Tensor: shape=(555, 100), dtype=float32, numpy=
array([[-0.8001881 ,  0.30579656,  0.40404803, ..., -0.71067655,
        -0.18292719, -0.23615468],
       [ 1.4764557 , -0.76835614, -0.2763842 , ...,  0.5071353 ,
        -0.46569014,  0.4369496 ],
       [ 0.2516177 ,  0.30057588,  0.46248987, ...,  0.18631147,
        -0.41534242,  0.27663916],
       ...,
       [-0.9923823 , -0.01477999,  0.6320943 , ..., -1.3864421 ,
        -0.6416989 , -0.56492907],
       [-0.9059994 ,  0.27303943,  1.391297  , ..., -1.4968715 ,
        -0.51653385,  0.25581005],
       [ 0.9568592 ,  0.9946021 ,  0.01179673, ..., -1.1186261 ,
         0.49599853, -0.6433233 ]], dtype=float32)>


In [18]:
# Print the FNDDS category embedding tensor when the notebook is run directly.
# NOTE(review): the saved output of this cell reports 100-dimensional rows,
# whereas the encoder demo earlier in the notebook yields 512 — the stored
# output looks stale; re-run on a fresh kernel to confirm.
if __name__ == "__main__":
    print("Below is an overview of FNDDS category embeddings\n")
    pprint(fndds_category_embeddings)

Below is an overview of FNDDS category embeddings

<tf.Tensor: shape=(134, 100), dtype=float32, numpy=
array([[ 0.0736658 , -0.01528643,  0.04836228, ...,  0.01976526,
         0.10618042,  0.11976459],
       [ 0.09960538, -0.00351433,  0.08277968, ..., -0.01303185,
        -0.04615907,  0.00976438],
       [ 0.03241002,  0.02663724,  0.09724882, ..., -0.00637689,
         0.04267838,  0.12252326],
       ...,
       [ 1.6494421 ,  0.23510456,  0.4491682 , ...,  1.2877258 ,
        -0.2865674 ,  1.1195595 ],
       [ 0.04707468,  0.00412078,  0.07282151, ...,  0.06196108,
         0.10353179,  0.10708564],
       [ 0.8192975 , -0.48433325,  0.89459574, ...,  0.00987275,
         0.51702917,  1.7644773 ]], dtype=float32)>


In [87]:
def get_cosine_similarity(matrix_embedding, target_vector_embedding):
    """Cosine similarity between each row of a matrix and one target vector.

    Args:
        matrix_embedding: 2-D float tensor holding one embedding per row.
        target_vector_embedding: 1-D float tensor whose length equals the
            row length of ``matrix_embedding``.

    Returns:
        1-D tensor whose element ``i`` is the cosine similarity between row
        ``i`` of ``matrix_embedding`` and ``target_vector_embedding``.
    """
    # A plain inner product equals the cosine only when both operands have unit
    # length, so scale them explicitly instead of relying on the encoder.
    unit_rows = tf.math.l2_normalize(matrix_embedding, axis=-1)
    unit_target = tf.math.l2_normalize(target_vector_embedding, axis=-1)
    similarity = tf.linalg.matvec(unit_rows, unit_target)
    return similarity


def get_most_similar_from_nutrition5k(target_vector_embedding):
    """Find the Nutrition5k ingredient closest to a target embedding.

    Args:
        target_vector_embedding: Embedding vector to compare against every
            row of ``nutrition5k_ingredient_embeddings``.

    Returns:
        Tuple ``(value, index)``: the highest similarity score and the
        position of the matching row, both as NumPy scalars.
    """
    scores = get_cosine_similarity(
        nutrition5k_ingredient_embeddings, target_vector_embedding
    )
    best = tf.math.top_k(scores, k=1)
    # top_k returns length-1 tensors; unwrap each into a NumPy scalar.
    best_score = tf.reshape(best.values, [1])[0].numpy()
    best_index = tf.reshape(best.indices, [1])[0].numpy()
    return (best_score, best_index)


def get_most_similar_from_fndds(target_vector_embedding):
    """Find the FNDDS food closest to a target embedding in two stages.

    Stage 1 picks the most similar category; stage 2 picks the most similar
    food description among the foods of that category only.

    Args:
        target_vector_embedding: Embedding vector to match.

    Returns:
        Tuple ``(value, index, category)``: the similarity score of the best
        food, its position inside ``fndds_description[category]``, and the
        name of the chosen category.
    """
    # Stage 1: best category (its score is not needed, only its position).
    category_scores = get_cosine_similarity(
        fndds_category_embeddings, target_vector_embedding
    )
    _, top_category = tf.math.top_k(category_scores, k=1)
    category = get_category_from_fndds(tf.reshape(top_category, [1])[0].numpy())

    # Stage 2: best food description inside that category.
    food_scores = get_cosine_similarity(
        fndds_description_embeddings[category], target_vector_embedding
    )
    best = tf.math.top_k(food_scores, k=1)
    best_score = tf.reshape(best.values, [1])[0].numpy()
    best_index = tf.reshape(best.indices, [1])[0].numpy()
    return (best_score, best_index, category)


def get_ingredient_nutrient_from_nutrition5k(index):
    """Return the Nutrition5k nutrient row at a given position.

    Args:
        index: Integer position as returned by
            ``get_most_similar_from_nutrition5k``. ``nutrition5k_ingredient``
            is the ``ingr`` column converted to a list, so this is also the
            row position in ``df_nutrition5k_nutrient_values``.

    Returns:
        pandas Series holding that single row.

    Raises:
        IndexError: If ``index`` is out of range.
    """
    # Select by position instead of matching on the name again: a duplicated
    # ingredient name would match several rows and squeeze() would then hand
    # back a DataFrame rather than a single row.
    return df_nutrition5k_nutrient_values.iloc[index]


def get_category_from_fndds(index):
    """Return the FNDDS category name stored at position ``index``.

    Args:
        index: Position in ``fndds_category``.

    Returns:
        The category name at that position.
    """
    category_name = fndds_category[index]
    return category_name


def get_ingredient_nutrient_from_fndds(category, index):
    """Return the FNDDS nutrient row for a food identified by category and position.

    Args:
        category: FNDDS category name (a key of ``fndds_description``).
        index: Position of the food inside ``fndds_description[category]``,
            as returned by ``get_most_similar_from_fndds``.

    Returns:
        pandas Series holding that single row.

    Raises:
        KeyError: If ``category`` is not a key of ``fndds_description``.
        IndexError: If ``index`` is out of range for that category.
    """
    # Validate the pair against the lookup table first so an invalid category
    # or position fails exactly as it did before.
    _ = fndds_description[category][index]
    # fndds_description[category] was built from this same category filter, in
    # row order, so the position selects the matching row directly. Matching on
    # the description text across the whole table could hit several rows when a
    # description is duplicated, and squeeze() would then return a DataFrame.
    category_rows = df_fndds_nutrient_values.loc[
        df_fndds_nutrient_values["WWEIA Category description"] == category
    ]
    return category_rows.iloc[index]

In [88]:
# Functions offered to importers of this notebook, keyed by function name.
exported = dict(
    (fn.__name__, fn)
    for fn in (
        get_ingredient_nutrient_from_fndds,
        get_ingredient_nutrient_from_nutrition5k,
        get_most_similar_from_fndds,
        get_most_similar_from_nutrition5k,
        embed,
    )
)

In [89]:
# Wrap the name -> function mapping so callers can use attribute access
# (e.g. exported.embed). The isinstance guard keeps this cell re-runnable:
# unpacking an already-wrapped SimpleNamespace with ** raises TypeError.
if not isinstance(exported, SimpleNamespace):
    exported = SimpleNamespace(**exported)

In [90]:
# Printed only when __name__ is not "__main__", i.e. when this notebook is
# loaded by another module rather than executed directly.
if __name__ != "__main__":
    print("Module ingredient_embeddings_similarity.ipynb is loaded")