In [3]:
import ast

import pandas as pd
import spacy

In [4]:
# Read the processed-ingredients CSV and preview its first five rows
processed_csv = "../data/SpacyProcessing/v1/foodkg_spacy_processed.csv"
df = pd.read_csv(processed_csv)
df.head()

Unnamed: 0,recipe_id,original_ingredients,processed_ingredients
0,0,"['1 c. firmly packed brown sugar', '1/2 c. eva...","['brown sugar', 'milk', 'vanilla', 'nut', 'but..."
1,1,"['1 small jar chipped beef, cut up', '4 boned ...","['beef', 'chicken breast', 'cream mushroom sou..."
2,2,"['2 (16 oz.) pkg. frozen corn', '1 (8 oz.) pkg...","['corn', 'cream cheese', 'butter', 'garlic', '..."
3,3,"['1 large whole chicken', '2 (10 1/2 oz.) cans...","['chicken', 'chicken gravy', 'cream mushroom s..."
4,4,"['1 c. peanut butter', '3/4 c. graham cracker ...","['peanut butter', 'graham cracker crumb', 'but..."


In [5]:
def _parse_ingredients(cell):
    """Convert a stringified list literal to a list; return non-strings unchanged."""
    if not isinstance(cell, str):
        return cell
    return ast.literal_eval(cell)


# CSV round-tripping stores each list as its string repr, so parse it back
df["ingredients_list"] = df["processed_ingredients"].apply(_parse_ingredients)
df["ingredients_list"].head()

0    [brown sugar, milk, vanilla, nut, butter marga...
1    [beef, chicken breast, cream mushroom soup, ca...
2    [corn, cream cheese, butter, garlic, salt, pep...
3    [chicken, chicken gravy, cream mushroom soup, ...
4    [peanut butter, graham cracker crumb, butter, ...
Name: ingredients_list, dtype: object

In [6]:
# Collapse every per-recipe list into one set (drops duplicates),
# then sort it to get a stable, alphabetically ordered list.
ingredients_set = {
    ingredient
    for recipe_ingredients in df["ingredients_list"]
    for ingredient in recipe_ingredients
}
unique_ingredients = sorted(ingredients_set)

In [7]:
# Save to a text file, one ingredient per line.
# The path is kept in a single variable so the confirmation message below
# always names the file that was actually written.
output_path = "spacy_unique_ingredients.txt"
with open(output_path, "w", encoding="utf-8") as f:
    f.write("\n".join(unique_ingredients))

print(f"Saved {len(unique_ingredients)} unique ingredients to {output_path}")

Saved 430310 unique ingredients to unique_ingredients.txt


## Calculate the rough number of tokens in the ingredients

In [8]:
# Only tokenization is needed to count tokens. The texts are streamed through
# the tokenizer alone, so any pipeline components that are still enabled after
# the `disable` list below are not run on every ingredient string.
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner", "tagger"])
total_tokens = sum(
    len(doc) for doc in nlp.tokenizer.pipe(unique_ingredients, batch_size=1000)
)

print(f"Approximate total token count: {total_tokens:,}")



Approximate total token count: 1,249,649
