In [14]:
# TODO: Add usda food ids to table
# Imports.
import pandas as pd
import csv
import re
import ast
import math
import numpy as np
from collections import Counter
from tqdm import tqdm
from multiprocessing import Pool, cpu_count

In [2]:
# Get datasets.
grams = pd.read_csv("../data/grams.csv")
print("Length of USDA portions dataset:", len(grams))
print("")

ingr = pd.read_csv("../data/ingr.csv")
print("Lenght of recipes dataset:", len(ingr))
print("")

Length of USDA portions dataset: 32614

Lenght of recipes dataset: 30925



In [3]:
# Formatted food items from ingredients dataset will be appended to this list.
ts = []

# Food items in the USDA portions dataset will be appended to this list.
comp = []

def _ingr_list_parser(x):
    if "with" not in x:
        return ast.literal_eval(x)

# This while loop iterates through ingredients dataset and formats each food item to resemble the formatting on the USDA dataset.
# Formatted items are appended to ts list.
ts = ingr.food.apply(_ingr_list_parser).tolist()

ts = [np.array(item) for item in ts if item is not None]

ts = np.hstack(ts)


ts = [item for item in ts if type(item) is str]

# Duplicate food items are removed.
ts = list(set(ts))


# This loop iterates through the USDA portions dataset and appends each food item to comp list.
for element in grams.Main_Food_Description:
    comp.append(element)

# Duplicate food items are removed.
comp = list(set(comp))

In [4]:
# A regex pattern is set to be used in text to vector function.
WORD = re.compile(r"\w+")

# This function turns inputted text to vector by counting individual words in it.
def text_to_vector(text):
    words = WORD.findall(text)
    return Counter(words)

In [5]:
# This function gives the cosine similarity score between two vectors.
def get_cosine(vec1, vec2):
    intersection = set(vec1.keys()) & set(vec2.keys())
    numerator = sum([vec1[x] * vec2[x] for x in intersection])

    sum1 = sum([vec1[x] ** 2 for x in list(vec1.keys())])
    sum2 = sum([vec2[x] ** 2 for x in list(vec2.keys())])
    denominator = math.sqrt(sum1) * math.sqrt(sum2)

    if not denominator:
        return 0.0
    else:
        return float(numerator) / denominator

In [6]:
# Get the most similar usda food name for given food name
def get_cosine_list(foodName):
    maxScore = 0
    maxScore_usda = ""
    for usda in comp:
        cosine = get_cosine(text_to_vector(foodName), text_to_vector(usda))
        if cosine > maxScore:
            maxScore = cosine
            maxScore_usda = usda
    return {
        "Ingr": foodName,
        "Match": maxScore_usda,
        "Score": maxScore
    }


In [7]:
pool = Pool(processes=cpu_count())
results = []
for result in tqdm(pool.imap_unordered(get_cosine_list, ts), total=len(ts)):
    results.append(result)

100%|██████████| 18322/18322 [07:52<00:00, 38.75it/s]


In [9]:
linked_df = pd.DataFrame(results)
linked_df

Unnamed: 0,Ingr,Match,Score
0,Lemons,,0.000000
1,taleggio cheese,"cheese, processed cheese food",0.577350
2,beef filets,beef jerky,0.500000
3,rapid-rise yeast,yeast,0.577350
4,Crab boil,,0.000000
...,...,...,...
18317,peppermint patties,"double hamburger, from fast food, 2 medium pat...",0.250000
18318,whitewine vinegar,vinegar,0.707107
18319,seaweed snacks,seaweed soup,0.500000
18320,mushroom or cheese tortelloni,"cheese, blue or roquefort",0.500000


In [10]:
linked_df.to_csv("../data/linked_df.csv", index=False)

In [13]:
linked_df.drop(linked_df[linked_df.Match == ""].index,).to_csv("../data/linked_df_dropped.csv", index=False)