**Author :  Ong Cheng Kei TP055620** <br>
**Description :**
<br>This file contains code that generates text embeddings for computing the similarity of ingredient names across two databases: Nutrition5k and USDA-FNDDS.<br>This module exposes functions that can be called to get the most similar ingredient in either database.

In [7]:
from pathlib import Path
from pprint import pprint
from types import SimpleNamespace

import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub
from absl import logging
from tensorflow import keras

In [79]:
# Root folder that holds the datasets; the dataset paths below are built from it.
root_dir = Path("/School Materials/FoodNet")
# TF-Hub handle of the Universal Sentence Encoder (v4) used to embed food names.
module_url = "https://tfhub.dev/google/universal-sentence-encoder/4"
# Load the encoder once at module level; `embed` calls this object.
model = hub.load(module_url)


def embed(input):
    """Return whatever the loaded encoder `model` produces for `input`.

    `input` is handed to the model unchanged. Elsewhere in this notebook it is
    always a list of strings, and the displayed results are 2-D float tensors.
    """
    embeddings = model(input)
    return embeddings


# Only report the load when this notebook is run directly, not when it is imported.
if __name__ == "__main__":
    print("module %s loaded" % module_url)

module https://tfhub.dev/google/universal-sentence-encoder/4 loaded


In [80]:
fndds_dir = root_dir / "Food Datasets" / "USDA-FNDDS"
# resolve(strict=True) raises FileNotFoundError up front if the file is missing.
with open((fndds_dir / "cleaned_food_category.txt").resolve(strict=True), "r") as file:
    # One category name per line.
    fndds_category = file.read().split("\n")
    # A trailing newline leaves an empty string at the end of the list. Remove it
    # only when it is really there, so a file that does not end with a newline
    # keeps its last category instead of silently losing it.
    if fndds_category and fndds_category[-1] == "":
        fndds_category.pop()

In [81]:
# Inspection only: print the loaded category names when run directly.
if __name__ == "__main__":
    pprint(fndds_category)

['other sandwiches',
 'spinach',
 'apples',
 'cheese sandwiches',
 'eggs and omelets',
 'bean and pea and legume dishes',
 'grapes',
 'sausages',
 'cookies and brownies',
 'vegetables on a sandwich',
 'mustard and other condiments',
 'strawberries',
 'corn',
 'reduced fat flavored milk',
 'potato chips',
 'pancakes and waffles and french toast',
 'fried rice and lo mein or chow mein',
 'lowfat flavored milk',
 'melons',
 'pizza',
 'processed soy products',
 'pretzels or snack mix',
 'cream cheese and sour cream and whipped cream',
 'bacon',
 'cream and cream substitutes',
 'onions',
 'dips and gravies and other sauces',
 'peanut butter and jelly sandwiches',
 'pork',
 'nuts and seeds',
 'tortillas',
 'other dark green vegetables',
 'peaches and nectarines',
 'vegetable dishes',
 'chicken in whole pieces',
 'frankfurter sandwiches',
 'whole milk',
 'cheese',
 'poultry mixed dishes',
 'biscuits and muffins and quick breads',
 'doughnuts and sweet rolls and pastries',
 'other fruits and f

In [82]:
# FNDDS food table. Despite the .csv extension the file is read as tab-separated.
# resolve(strict=True) raises FileNotFoundError up front if the file is missing.
df_fndds_nutrient_values = pd.read_csv(
    (fndds_dir / "cleaned_fndds_nutrient_values.csv").resolve(strict=True), sep="\t"
)

In [83]:
# Inspection only. `display` is not imported in this file; presumably it is the
# builtin that IPython/Jupyter injects, so this runs only inside a notebook session.
if __name__ == "__main__":
    display(df_fndds_nutrient_values)

Unnamed: 0,Main food description,WWEIA Category description,Energy (kcal),Protein (g),Carbohydrate (g),Total Fat (g)
0,"milk, not further specified",reduced fat milk,0.51,0.0334,0.0487,0.0199
1,"milk, whole",whole milk,0.60,0.0328,0.0467,0.0320
2,"milk, low sodium, whole",whole milk,0.61,0.0310,0.0446,0.0346
3,"milk, calcium fortified, whole",whole milk,0.60,0.0328,0.0467,0.0320
4,"milk, calcium fortified, low fat",lowfat milk,0.43,0.0338,0.0519,0.0095
...,...,...,...,...,...,...
5808,"wine, light",wine,0.49,0.0007,0.0117,0.0000
5809,wine cooler,wine,0.68,0.0000,0.1007,0.0000
5810,"sangria, red",wine,0.96,0.0004,0.0826,0.0004
5811,"sangria, white",wine,0.94,0.0004,0.0826,0.0004


In [84]:
nutrition5k_dir = root_dir / "Food Datasets" / "nutrition5k"
# Nutrition5k ingredient table (comma-separated, pandas default).
# resolve(strict=True) raises FileNotFoundError up front if the file is missing.
df_nutrition5k_nutrient_values = pd.read_csv(
    (nutrition5k_dir / "metadata" / "ingredients_metadata.csv").resolve(strict=True)
)

In [85]:
# Inspection only: preview the first rows. `display` is not imported in this file;
# presumably it is the builtin that IPython/Jupyter injects.
if __name__ == "__main__":
    display(df_nutrition5k_nutrient_values.head())

Unnamed: 0,ingr,id,cal/g,fat(g),carb(g),protein(g)
0,cottage cheese,1,0.98,0.043,0.034,0.11
1,strawberries,2,0.33,0.003,0.08,0.007
2,garden salad,3,0.646,0.034,0.032,0.061
3,bacon,4,5.41,0.42,0.014,0.37
4,potatoes,5,0.77,0.001,0.17,0.02


In [86]:
# Embed every FNDDS category name in one call; row i corresponds to fndds_category[i].
fndds_category_embeddings = embed(fndds_category)

In [87]:
# For each FNDDS category keep (a) the list of food descriptions that belong to
# it and (b) the embeddings of those descriptions, both keyed by category name.
fndds_description = {}
fndds_description_embeddings = {}
for category in fndds_category:
    # Select the description column for the rows of this category in one step.
    all_food_in_category = df_fndds_nutrient_values.loc[
        df_fndds_nutrient_values["WWEIA Category description"] == category,
        "Main food description",
    ].tolist()
    fndds_description[category] = all_food_in_category
    fndds_description_embeddings[category] = embed(all_food_in_category)

In [88]:
# Ingredient names in table row order; position i here is row i of the table.
nutrition5k_ingredient = df_nutrition5k_nutrient_values["ingr"].tolist()
# Embed all ingredient names in one call; row i corresponds to nutrition5k_ingredient[i].
nutrition5k_ingredient_embeddings = embed(nutrition5k_ingredient)

In [89]:
# Inspection only: print the Nutrition5k embedding tensor when run directly.
if __name__ == "__main__":
    print("Below is an overview of Nutrition5k ingredient embeddings\n")
    pprint(nutrition5k_ingredient_embeddings)

Below is an overview of Nutrition5k ingredient embeddings

<tf.Tensor: shape=(555, 512), dtype=float32, numpy=
array([[-0.04213651,  0.02428615,  0.03386059, ...,  0.02607825,
         0.02512663, -0.05266538],
       [ 0.01148291,  0.00819834,  0.03557928, ...,  0.00613294,
         0.03251711, -0.06750827],
       [-0.01347014, -0.01774764,  0.01833388, ..., -0.04396154,
         0.0723353 , -0.0649578 ],
       ...,
       [ 0.04466048,  0.01656266,  0.02949798, ...,  0.00699597,
        -0.03023825, -0.07513158],
       [ 0.03270737, -0.01751684,  0.00517269, ..., -0.03976529,
        -0.01495524, -0.07337392],
       [-0.05237373, -0.04956251,  0.03297004, ..., -0.04588483,
        -0.04796786, -0.0534832 ]], dtype=float32)>


In [90]:
# Inspection only: print the FNDDS category embedding tensor when run directly.
if __name__ == "__main__":
    print("Below is an overview of FNDDS category embeddings\n")
    pprint(fndds_category_embeddings)

Below is an overview of FNDDS category embeddings

<tf.Tensor: shape=(128, 512), dtype=float32, numpy=
array([[-0.02977386,  0.02504751, -0.00446226, ...,  0.01189842,
         0.05323031, -0.05012281],
       [ 0.01926548, -0.02953166,  0.03697063, ..., -0.03237338,
         0.05260122, -0.06955921],
       [-0.02790156,  0.04283864,  0.02754697, ...,  0.02965838,
        -0.0168196 , -0.03757589],
       ...,
       [-0.00339281,  0.01031392,  0.02377785, ...,  0.02528976,
         0.05466637, -0.04295544],
       [ 0.01500979,  0.00660691, -0.02712986, ...,  0.03685194,
         0.06616919, -0.03567617],
       [ 0.0076906 , -0.02501074,  0.02715279, ...,  0.05840044,
         0.06280483, -0.07194165]], dtype=float32)>


In [1]:
def get_cosine_similarity(matrix_embedding, target_vector_embedding):
    """Cosine similarity between each row of a matrix and one target vector.

    Both inputs are L2-normalised along their last axis before the inner
    product is taken, so every score is a true cosine rather than a raw dot
    product whose size also depends on the embedding norms. For inputs that
    are already unit length the scores are unchanged up to rounding.

    Args:
        matrix_embedding: embeddings to compare against, one per row.
        target_vector_embedding: the query embedding.

    Returns:
        The result of `tf.linalg.matvec` on the normalised inputs: one
        similarity score per row of `matrix_embedding`.
    """
    unit_rows = tf.math.l2_normalize(matrix_embedding, axis=-1)
    unit_target = tf.math.l2_normalize(target_vector_embedding, axis=-1)
    return tf.linalg.matvec(unit_rows, unit_target)


def get_most_similar_from_nutrition5k(target_vector_embedding):
    """Find the Nutrition5k ingredient whose embedding best matches the target.

    Returns:
        A `(value, index)` tuple: the highest similarity score and the row
        position of that ingredient in `nutrition5k_ingredient_embeddings`,
        both as numpy scalars.
    """
    scores = get_cosine_similarity(
        nutrition5k_ingredient_embeddings, target_vector_embedding
    )
    best_score, best_position = tf.math.top_k(scores, k=1)
    # Collapse each single-element top-1 result to a plain numpy scalar.
    value = tf.reshape(best_score, [1])[0].numpy()
    index = tf.reshape(best_position, [1])[0].numpy()
    return (value, index)


def get_most_similar_from_fndds(target_vector_embedding):
    """Find the FNDDS food description that best matches the target embedding.

    The search runs in two stages: first the best-matching category is picked
    from `fndds_category_embeddings`, then the best-matching food description
    is picked among that category's entries in `fndds_description_embeddings`.

    Returns:
        A `(value, index, category)` tuple: the similarity score of the chosen
        description, its position within the category, and the category name.
    """
    # Stage 1: choose the category. Only its position is needed, not its score.
    category_scores = get_cosine_similarity(
        fndds_category_embeddings, target_vector_embedding
    )
    _, best_category_position = tf.math.top_k(category_scores, k=1)
    category = get_category_from_fndds(
        tf.reshape(best_category_position, [1])[0].numpy()
    )

    # Stage 2: choose the description inside that category.
    description_scores = get_cosine_similarity(
        fndds_description_embeddings[category], target_vector_embedding
    )
    best_score, best_position = tf.math.top_k(description_scores, k=1)
    # Collapse each single-element top-1 result to a plain numpy scalar.
    value = tf.reshape(best_score, [1])[0].numpy()
    index = tf.reshape(best_position, [1])[0].numpy()
    return (value, index, category)


def get_ingredient_nutrient_from_nutrition5k(index):
    """Return the Nutrition5k table row for the ingredient at position `index`.

    `index` is a position in `nutrition5k_ingredient`, which is the "ingr"
    column of `df_nutrition5k_nutrient_values` in row order, so the same
    position selects the matching row directly. Selecting by position rather
    than re-matching on the ingredient name always yields exactly one row,
    even when two ingredients share a name (a name match would then return a
    multi-row DataFrame instead of a Series).

    Args:
        index: integer position of the ingredient.

    Returns:
        A pandas Series holding that row's columns.

    Raises:
        IndexError: if `index` is out of range.
    """
    return df_nutrition5k_nutrient_values.iloc[index]


def get_category_from_fndds(index):
    """Return the FNDDS category name stored at position `index` of `fndds_category`."""
    category_name = fndds_category[index]
    return category_name


def get_ingredient_nutrient_from_fndds(category, index):
    """Return the FNDDS table row for entry `index` of the given category.

    `fndds_description[category]` lists the descriptions of the rows whose
    "WWEIA Category description" equals `category`, in table order. Applying
    the same filter and selecting by position therefore returns exactly the
    row that entry came from. Matching on the description text alone ignores
    the category and returns a multi-row DataFrame whenever a description
    occurs more than once in the table.

    Args:
        category: FNDDS category name (a key of `fndds_description`).
        index: integer position of the food within that category.

    Returns:
        A pandas Series holding that row's columns.

    Raises:
        KeyError: if `category` is not a known category.
        IndexError: if `index` is out of range for the category.
    """
    if category not in fndds_description:
        # Keep the KeyError that the dictionary lookup raised for unknown categories.
        raise KeyError(category)
    foods_in_category = df_fndds_nutrient_values.loc[
        df_fndds_nutrient_values["WWEIA Category description"] == category
    ]
    return foods_in_category.iloc[index]

In [5]:
# Functions this notebook offers to importers, keyed by their own names.
exported = dict(
    (function.__name__, function)
    for function in (
        get_ingredient_nutrient_from_fndds,
        get_ingredient_nutrient_from_nutrition5k,
        get_most_similar_from_fndds,
        get_most_similar_from_nutrition5k,
        embed,
    )
)

In [9]:
# Re-wrap the dict so the functions are reachable as attributes (e.g. exported.embed).
# NOTE: this rebinds `exported` from a dict to a SimpleNamespace, so re-running this
# cell without first re-running the cell that builds the dict raises TypeError.
exported = SimpleNamespace(**exported)

In [92]:
# Printed only when this notebook is imported by other code, not when run directly.
if __name__ != "__main__":
    print("Module ingredient_embeddings_similarity.ipynb is loaded")