Goal: split the raw ingredient strings in the FoodKG table into a normalized list of base ingredients for each recipe.

In [16]:
import os

import pandas as pd
from dotenv import load_dotenv

from openai import OpenAI

# Read variables from .env into the process environment before looking up the key.
load_dotenv()

# os.getenv yields None when OPENAI_API_KEY is not set.
api_key = os.getenv("OPENAI_API_KEY")

# Client object used by the request code further down.
client = OpenAI(api_key=api_key)

In [13]:
# Load the FoodKG export, then discard every column that contains a missing value.
food_kg_df = pd.read_csv('clean/nourish_public_FoodKG.csv')
food_kg_df = food_kg_df.dropna(axis=1)

# Show which columns survived the drop.
list(food_kg_df.columns)

['id', 'title', 'ingredients', 'directions', 'link', 'source', 'ner']

In [14]:
# Raw ingredient strings, one entry per row of food_kg_df.
ingredients = food_kg_df.loc[:, 'ingredients']

# Preview the first five entries.
ingredients.head(5)

0    {2 c. cubed red potatoes,2 Tbsp. finely choppe...
1    {12 oz. pkg. rotini,6 oz. pkg. Good Seasons Ze...
2    {1 1/2 lb. yams,1 egg,1 chopped onion,1 choppe...
3    {pork chops,apples,brown sugar,cinnamon,"flour...
4    {1 lb. linguini,2 Tbsp. oil,3 Tbsp. peanut but...
Name: ingredients, dtype: object

In [17]:
import json
import pandas as pd

# System prompt passed as the "system" message on every request made by
# extract_ingredients. It states the normalization rules and asks for JSON-only
# replies (a list for a single input, an object keyed by id for batched inputs).
SYSTEM_MSG = """You are a culinary data annotator. Convert free-text ingredient strings into a deduplicated list of base ingredients.

Normalization rules:
- Keep only ingredient names (no quantities, units, sizes, brands, preparation steps, or descriptors).
- Singular, lowercase (e.g., "tomatoes" → "tomato").
- Strip vague phrases like "to taste", "as needed".
- Collapse synonyms to common pantry terms (e.g., "margarine" → "butter", "self-rising flour" → "flour").
- If an item is a compound product (e.g., "cream of chicken soup"), keep the product name ("chicken soup" is acceptable; do not decompose into sub-ingredients).
- Deduplicate.

Output format:
Return ONLY valid JSON. No prose. No code fences.
For a single input: ["ingredient1", "ingredient2", ...]
For batched inputs keyed by id: {"<id1>":[...], "<id2>":[...]}
"""


def _dedupe_preserve_order(items):
    seen, out = set(), []
    for x in items:
        x = (x or "").strip().lower()
        if x and x not in seen:
            seen.add(x)
            out.append(x)
    return out


def _request_ingredient_json(user_msg):
    """Send one chat-completion request (SYSTEM_MSG + user_msg) and return the reply text.

    Returns "" when the reply carries no text, so callers always get a str.
    """
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": SYSTEM_MSG},
            {"role": "user", "content": user_msg},
        ],
        max_tokens=400,
        temperature=0,
    )
    # message.content is optional in the API response; json.loads(None) would raise
    # TypeError rather than JSONDecodeError, so normalize to an empty string here.
    return response.choices[0].message.content or ""


def _parse_json_reply(content):
    """Decode a reply as JSON, tolerating a surrounding Markdown code fence.

    Raises json.JSONDecodeError when the (unfenced) text is not valid JSON.
    """
    text = content.strip()
    if text.startswith("```"):
        # drop the opening fence line (``` or ```json) ...
        text = text.split("\n", 1)[1] if "\n" in text else ""
        # ... and everything from the closing fence onward
        text = text.rsplit("```", 1)[0].strip()
    return json.loads(text)


def extract_ingredients(description):
    """Turn one free-text ingredient description into a list of base ingredients.

    Args:
        description: Raw ingredient text for a single recipe. None, NaN, and
            blank values short-circuit to an empty list without calling the API.

    Returns:
        list[str]: Lowercased, trimmed, deduplicated ingredient names in the
        order the model returned them. Empty when the input is blank or when
        the model fails to produce valid JSON twice in a row.

    Exceptions raised by the client call itself are not caught here.
    """
    # guard for None / NaN (NaN is the only float that is not equal to itself);
    # without the NaN check, str(nan) == "nan" would be sent to the model
    if description is None or (isinstance(description, float) and description != description):
        return []
    text = str(description).strip()
    if not text:
        return []

    # reinforce JSON-only on the user message
    user_msg = f"""Input: {text}
Return ONLY a JSON array of deduplicated base ingredients in lowercase, e.g. ["chicken","butter"]."""

    # try to parse; one repair retry if the reply is not valid JSON
    try:
        parsed = _parse_json_reply(_request_ingredient_json(user_msg))
    except json.JSONDecodeError:
        repair_msg = f"""Your previous response was not valid JSON.
Return ONLY a JSON array of strings, no explanation.

Input: {text}"""
        try:
            parsed = _parse_json_reply(_request_ingredient_json(repair_msg))
        except json.JSONDecodeError:
            # give up gracefully
            return []

    # normalize shape
    if isinstance(parsed, list):
        candidates = parsed
    elif isinstance(parsed, dict):
        # if model returned an object (shouldn't for single row), flatten list values
        candidates = [item for value in parsed.values() if isinstance(value, list) for item in value]
    else:
        return []

    # keep only string items: str() on a JSON null or nested value would create
    # bogus ingredients such as "none"
    return _dedupe_preserve_order([item for item in candidates if isinstance(item, str)])


# ---- normalize every recipe ----
# Needs food_kg_df with an "ingredients" column; extract_ingredients is called once per row.
food_kg_df["ingredients_normalized"] = food_kg_df["ingredients"].apply(extract_ingredients)

# ---- export ----
# Each normalized list is serialized to a JSON string before being written to the CSV.
ingredients_table = (
    food_kg_df[["ingredients", "ingredients_normalized"]]
    .rename(columns={"ingredients": "ingredients_raw"})
    .assign(ingredients_normalized=lambda frame: frame["ingredients_normalized"].apply(json.dumps))
)
ingredients_table.to_csv("ingredients.csv", index=False)


In [20]:
# Spot-check the first five rows: raw text next to its JSON-encoded normalized list.
ingredients_table.iloc[:5]

Unnamed: 0,ingredients_raw,ingredients_normalized
0,"{2 c. cubed red potatoes,2 Tbsp. finely choppe...","[""potato"", ""onion"", ""vinegar"", ""chive"", ""olive..."
1,"{12 oz. pkg. rotini,6 oz. pkg. Good Seasons Ze...","[""rotini"", ""salad mix"", ""dressing"", ""cucumber""..."
2,"{1 1/2 lb. yams,1 egg,1 chopped onion,1 choppe...","[""yam"", ""egg"", ""onion"", ""tomato"", ""green chile..."
3,"{pork chops,apples,brown sugar,cinnamon,""flour...","[""apples"", ""brown sugar"", ""cinnamon"", ""flour"",..."
4,"{1 lb. linguini,2 Tbsp. oil,3 Tbsp. peanut but...","[""linguini"", ""oil"", ""peanut butter"", ""chili po..."
