Goal: split each free-text `ingredients` entry in the FoodKG table into a normalized list of base ingredients

In [1]:
import json
import os

import pandas as pd
from dotenv import load_dotenv

load_dotenv()  # Take environment variables from .env
# Read the key from the environment so it never appears in the notebook itself;
# os.getenv returns None when the variable is not set.
api_key = os.getenv("OPENAI_API_KEY")

from openai import OpenAI

# Shared client used by the chat-completion requests in extract_ingredients.
client = OpenAI(
    api_key=api_key
)

In [2]:
# Load the cleaned FoodKG export. dropna(axis=1) discards every COLUMN that
# contains at least one missing value (rows are never dropped).
food_kg_df = pd.read_csv('clean/nourish_public_FoodKG.csv').dropna(axis=1)
# Display the names of the columns that survived.
food_kg_df.columns.to_list()

['id', 'title', 'ingredients', 'directions', 'link', 'source', 'ner']

In [3]:
# Raw ingredient strings, one entry per row of the table; preview the first five.
ingredients = food_kg_df['ingredients']
ingredients.head(5)

0    {2 c. cubed red potatoes,2 Tbsp. finely choppe...
1    {12 oz. pkg. rotini,6 oz. pkg. Good Seasons Ze...
2    {1 1/2 lb. yams,1 egg,1 chopped onion,1 choppe...
3    {pork chops,apples,brown sugar,cinnamon,"flour...
4    {1 lb. linguini,2 Tbsp. oil,3 Tbsp. peanut but...
Name: ingredients, dtype: object

In [4]:
# System prompt sent with every request made by extract_ingredients. It defines
# the normalization rules and asks for a bare JSON array as the only output.
# The text is part of the program's behavior: editing it changes model output.
SYSTEM_MSG = """You are a culinary data annotator. Convert free-text ingredient strings into a deduplicated list of base ingredients.

Normalization rules:
- Keep only ingredient names (no quantities, units, sizes, brands, preparation steps, or descriptors).
- Singular, lowercase (e.g., "tomatoes" → "tomato").
- Strip vague phrases like "to taste", "as needed".
- Collapse obvious synonyms to common pantry terms (e.g., "margarine" → "butter", "self-rising flour" → "flour").
- Preserve distinct *product variants* when the modifier changes the flavor type or product identity (e.g., "ranch dressing", "italian dressing", "soy sauce", "beef broth", "smoked paprika").
- But IGNORE purely descriptive or marketing adjectives such as “zesty”, “creamy”, “original”, “classic”, “deluxe”, “fancy”, “reduced-fat”, etc.
- If an item is a compound product (e.g., "cream of chicken soup"), keep the product name intact ("cream of chicken soup").
- Deduplicate. Output the canonical short form that captures the true flavor/type distinction but omits marketing fluff.

Output format:
Return ONLY valid JSON. No prose. No code fences.
For a single input: ["ingredient1", "ingredient2", ...]
"""


In [5]:
def _deduplicate_preserve_order(items):
    # Post processing step to remove duplicate entries
    # ["salt", "salt", "SALT"] => ["salt"]
    seen, out = set(), []
    for x in items:
        x = (x or "").strip().lower()
        if x and x not in seen:
            seen.add(x)
            out.append(x)
    return out


def _request_completion(user_msg, model="gpt-4o-mini", max_tokens=400):
    """Send one system+user exchange to the chat model and return the reply text.

    The return value is whatever the API put in ``message.content``; it may be
    ``None`` when the model produced no text.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": SYSTEM_MSG},
            {"role": "user", "content": user_msg},
        ],
        max_tokens=max_tokens,
        temperature=0,  # deterministic output for repeatable annotation
    )
    return response.choices[0].message.content


def _parse_json_reply(content):
    """Parse a model reply as JSON, tolerating a surrounding Markdown code fence.

    Raises:
        ValueError: if ``content`` is not a string or is not valid JSON
            (``json.JSONDecodeError`` is a subclass of ``ValueError``).
    """
    if not isinstance(content, str):
        raise ValueError("model reply has no text content")
    text = content.strip()
    if text.startswith("```"):
        # Drop the opening fence line (``` or ```json) ...
        newline_at = text.find("\n")
        text = text[newline_at + 1:] if newline_at != -1 else text[3:]
        text = text.strip()
        # ... and the closing fence, if present.
        if text.endswith("```"):
            text = text[:-3]
    return json.loads(text)


def extract_ingredients(description):
    """Turn one free-text ingredient string into a list of base ingredient names.

    The text is sent to the chat model together with ``SYSTEM_MSG``. If the
    reply cannot be parsed as JSON, the request is repeated once with a
    stricter instruction before giving up.

    Args:
        description: Raw ingredient text for a single recipe. ``None``, a
            float NaN (the pandas missing-value marker) and blank strings are
            treated as "no ingredients" and never reach the API.

    Returns:
        A list of lowercase, de-duplicated ingredient names in the order the
        model returned them. Returns ``[]`` for missing/blank input, when both
        replies fail to parse, or when the parsed JSON is neither a list nor
        a dict.

    Raises:
        Errors raised by the API client itself (network, authentication,
        rate limiting) are not caught here and propagate to the caller.
    """
    # Guard for missing values: None, or a float NaN (NaN is the only value
    # that is not equal to itself). Without this, NaN would be sent as "nan".
    if description is None or (isinstance(description, float) and description != description):
        return []
    # guard for empty
    text = str(description).strip()
    if not text:
        return []

    # reinforce JSON-only on the user message
    user_msg = f"""Input: {text}
Return ONLY a JSON array of deduplicated base ingredients in lowercase, e.g. ["chicken","butter"]."""

    # try to parse; try to repair with one retry if not valid JSON
    try:
        parsed = _parse_json_reply(_request_completion(user_msg))
    except ValueError:
        repair_msg = f"""Your previous response was not valid JSON.
Return ONLY a JSON array of strings, no explanation.

Input: {text}"""
        try:
            parsed = _parse_json_reply(_request_completion(repair_msg))
        except ValueError:
            # give up gracefully
            return []

    # normalize shape
    if isinstance(parsed, list):
        candidates = parsed
    elif isinstance(parsed, dict):
        # if model returned an object (shouldn't for single row), flatten list values
        candidates = [item for value in parsed.values() if isinstance(value, list) for item in value]
    else:
        return []

    # Keep only real strings: str() on a JSON null or a nested value would
    # otherwise invent ingredients such as "none" or "['a', 'b']".
    return _deduplicate_preserve_order([item for item in candidates if isinstance(item, str)])


# apply function to ingredients column
# NOTE: this issues one API request per row (two when the first reply is not
# valid JSON), one row after another, and adds the result to food_kg_df in place.
food_kg_df["ingredients_normalized"] = food_kg_df["ingredients"].apply(extract_ingredients)

In [6]:
# A CSV cell can only hold text, so each ingredient list is serialised to a
# JSON string before the table is written.
normalized_as_json = food_kg_df["ingredients_normalized"].apply(json.dumps)
ingredients_table = pd.DataFrame(
    {
        "ingredients_raw": food_kg_df["ingredients"],
        "ingredients_normalized": normalized_as_json,
    }
)

# Make sure the output folder is there (no error if it already is), then
# write the table without the row index.
os.makedirs("ingredients", exist_ok=True)
ingredients_table.to_csv("ingredients/ingredients.csv", index=False)

In [7]:
# Preview the table; the normalized column holds JSON strings at this point.
ingredients_table.head(5)

Unnamed: 0,ingredients_raw,ingredients_normalized
0,"{2 c. cubed red potatoes,2 Tbsp. finely choppe...","[""potato"", ""onion"", ""vinegar"", ""chive"", ""olive..."
1,"{12 oz. pkg. rotini,6 oz. pkg. Good Seasons Ze...","[""rotini"", ""italian dressing"", ""cucumber"", ""to..."
2,"{1 1/2 lb. yams,1 egg,1 chopped onion,1 choppe...","[""yam"", ""egg"", ""onion"", ""tomato"", ""green chile..."
3,"{pork chops,apples,brown sugar,cinnamon,""flour...","[""pork"", ""apple"", ""brown sugar"", ""cinnamon"", ""..."
4,"{1 lb. linguini,2 Tbsp. oil,3 Tbsp. peanut but...","[""linguini"", ""oil"", ""peanut butter"", ""chili po..."


In [8]:
# Test load to get Python lists
df = pd.read_csv("ingredients/ingredients.csv")
# Each cell is read back as a JSON string; json.loads restores the Python list.
df["ingredients_normalized"] = df["ingredients_normalized"].apply(json.loads)
df.head()

Unnamed: 0,ingredients_raw,ingredients_normalized
0,"{2 c. cubed red potatoes,2 Tbsp. finely choppe...","[potato, onion, vinegar, chive, olive oil, sug..."
1,"{12 oz. pkg. rotini,6 oz. pkg. Good Seasons Ze...","[rotini, italian dressing, cucumber, tomato, b..."
2,"{1 1/2 lb. yams,1 egg,1 chopped onion,1 choppe...","[yam, egg, onion, tomato, green chile, bread c..."
3,"{pork chops,apples,brown sugar,cinnamon,""flour...","[pork, apple, brown sugar, cinnamon, flour, sa..."
4,"{1 lb. linguini,2 Tbsp. oil,3 Tbsp. peanut but...","[linguini, oil, peanut butter, chili powder, t..."
