#### spacy_unique_ingredients
Identifies unique ingredients from spaCy-processed data, counts their occurrences, and applies a fixed count cutoff (`COUNT_CUTOFF`) chosen so that roughly 100 ingredients remain.
These most common ingredients are saved to `spacy_unique_ingredients.txt` and appended to the LLM system prompt in `process_products.py`.

In [None]:
import ast
from collections import Counter

import pandas as pd

COUNT_CUTOFF = 30000  # ingredients with this many occurrences or fewer are discarded (the filter keeps count > COUNT_CUTOFF)

In [None]:
# Read the spaCy-processed FoodKG CSV and preview its first five rows.
source_csv = "../data/SpacyProcessing/foodkg_spacy_processed.csv"
df = pd.read_csv(source_csv)
df.head()

In [12]:
# The CSV stores each list as its string repr; parse those back into Python
# objects and let any non-string cell pass through unchanged.
def _parse_ingredients(cell):
    """Return the literal-evaluated value of a string cell, or the cell itself otherwise."""
    if isinstance(cell, str):
        return ast.literal_eval(str(cell))
    return cell


df["ingredients_list"] = df["processed_ingredients"].apply(_parse_ingredients)
df["ingredients_list"].head()

0    [brown sugar, milk, vanilla, nut, butter marga...
1    [beef, chicken breast, cream mushroom soup, ca...
2    [corn, cream cheese, butter, garlic, salt, pep...
3    [chicken, chicken gravy, cream mushroom soup, ...
4    [peanut butter, graham cracker crumb, butter, ...
Name: ingredients_list, dtype: object

In [13]:
# Tally every ingredient across all rows; the Counter keys are the unique ingredients.
counter = Counter()
for row_ingredients in df["ingredients_list"]:
    counter.update(row_ingredients)

# Plain dict of ingredient -> count, ordered from most to least frequent.
ranked_counts = counter.most_common()
ingredient_counts = dict(ranked_counts)

In [24]:
# Keep only the (ingredient, count) pairs whose count is strictly above the
# cutoff, preserving the most-frequent-first order of most_common().
filtered = [
    pair
    for pair in counter.most_common()
    if pair[1] > COUNT_CUTOFF
]
print(len(filtered))
filtered

103


[('salt', 888745),
 ('sugar', 577971),
 ('egg', 517357),
 ('onion', 487377),
 ('butter', 419128),
 ('garlic', 402863),
 ('water', 396995),
 ('pepper', 341221),
 ('flour', 335132),
 ('milk', 319474),
 ('olive oil', 205232),
 ('vanilla', 198152),
 ('brown sugar', 186386),
 ('cinnamon', 180451),
 ('tomato', 175215),
 ('soda', 163642),
 ('lemon juice', 155174),
 ('black pepper', 142072),
 ('allpurpose flour', 141563),
 ('margarine', 122634),
 ('carrot', 112995),
 ('cream cheese', 111381),
 ('sour cream', 111354),
 ('celery', 110423),
 ('parsley', 109995),
 ('oil', 109153),
 ('vegetable oil', 101458),
 ('beef', 93784),
 ('vanilla extract', 89973),
 ('green onion', 86189),
 ('parmesan cheese', 85009),
 ('mayonnaise', 84026),
 ('pecan', 83316),
 ('cheddar cheese', 81506),
 ('unsalted butter', 79754),
 ('mushroom', 76213),
 ('cream', 74740),
 ('nutmeg', 74198),
 ('kosher salt', 74070),
 ('vinegar', 73783),
 ('green pepper', 72739),
 ('ginger', 72379),
 ('nut', 71890),
 ('basil', 69427),
 ('hon

In [25]:
# Save the retained ingredient names to a text file, one ingredient per line.
# The path is held in one variable so the confirmation message below always
# reports the file that was actually written.
output_path = "../data/SpacyProcessing/spacy_unique_ingredients.txt"
ingredients_only = [item for item, _ in filtered]
with open(output_path, "w", encoding="utf-8") as f:
    f.write("\n".join(ingredients_only))

print(f"Saved {len(ingredients_only)} unique ingredients to {output_path}")

Saved 103 unique ingredients to unique_ingredients.txt


## Calculate rough number of tokens of ingredients
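This section was used to approximate the number of tokens the unique ingredients would add to the LLM prompt. The code is kept commented out for reference; the output shown below it comes from an earlier run.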

In [26]:
# NOTE: this cell is intentionally disabled (every line is commented out); it counts
# spaCy tokens over the saved ingredient list to estimate the prompt-size impact.
# NOTE(review): re-enabling it requires `import spacy`, which is not in the visible import cell.
# NOTE(review): the path below lacks the `data/` segment used when the file is written
# ("../data/SpacyProcessing/spacy_unique_ingredients.txt") — confirm before re-running.
# # Read ingredients from file
# with open("../SpacyProcessing/spacy_unique_ingredients.txt", "r", encoding="utf-8") as f:
#     unique_ingredients = [line.strip() for line in f if line.strip()]
# # Load spaCy model
# nlp = spacy.load(
#     "en_core_web_sm",
#     disable=["parser", "ner", "tagger", "lemmatizer", "attribute_ruler", "morphologizer"],
# )
# # Count tokens
# total_tokens = 0
# for doc in nlp.pipe(unique_ingredients, batch_size=1000):
#     total_tokens += len(doc)
# print(f"Approximate total token count: {total_tokens:,}")

Approximate total token count: 148
