Goal: split the raw ingredient strings in the FoodKG table into a normalized list of base ingredients for each recipe

In [20]:
import json
import os

import pandas as pd
from dotenv import load_dotenv

# Read variables from a local .env file into the process environment so the
# API key never has to be hardcoded in the notebook.
load_dotenv()  # Take environment variables from .env
# os.getenv returns None when OPENAI_API_KEY is not set.
api_key = os.getenv("OPENAI_API_KEY")

from openai import OpenAI

# Shared client used by every chat completion request further down.
client = OpenAI(
    api_key=api_key
)

In [21]:
# Load the cleaned FoodKG export. dropna(axis=1) works column-wise: any column
# that contains at least one missing value is removed entirely (rows are kept).
food_kg_df = pd.read_csv('clean/nourish_public_FoodKG.csv').dropna(axis=1)
# Display the columns that survived the drop.
food_kg_df.columns.to_list()

['id', 'title', 'ingredients', 'directions', 'link', 'source', 'ner']

In [22]:
# Preview the 'ner' column, which holds the ingredient names of each recipe as
# a brace-delimited string. NOTE: this variable is only inspected here; the
# extraction step reads the raw "ingredients" column instead.
ingredients = food_kg_df['ner']
ingredients.head(5)

0    {potatoes,purple onion,white wine vinegar,chiv...
1    {rotini,Italian salad mix,Italian dressing,cuc...
2    {yams,egg,onion,tomato,green chile,bread crumb...
3       {pork chops,apples,brown sugar,cinnamon,flour}
4    {linguini,oil,peanut butter,chili powder,Tabas...
Name: ner, dtype: object

In [23]:
# Load Prompts
# The system prompt for the ingredient-extraction requests lives in a text file
# so it can be edited without touching the notebook. The encoding is given
# explicitly: without it, open() falls back to the platform's locale encoding,
# which can garble non-ASCII characters in the prompt (e.g. on Windows).
with open("prompts/prompt_ingredients.txt", "r", encoding="utf-8") as f:
    SYSTEM_MSG_INGREDIENTS = f.read()

In [24]:
def _deduplicate_preserve_order(items):
    # Post processing step to remove duplicate entries
    # ["salt", "salt", "SALT"] => ["salt"]
    seen, out = set(), []
    for x in items:
        x = (x or "").strip().lower()
        if x and x not in seen:
            seen.add(x)
            out.append(x)
    return out


def _request_ingredients(user_msg):
    """Send one chat completion request and return the raw reply text (may be None)."""
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": SYSTEM_MSG_INGREDIENTS},
            {"role": "user", "content": user_msg},
        ],
        max_tokens=400,
        temperature=0,
    )
    return response.choices[0].message.content


def _strip_code_fence(content):
    """Return the reply text without a surrounding Markdown code fence; None becomes ""."""
    text = (content or "").strip()
    if text.startswith("```"):
        # Drop the opening fence line (it may carry a language tag such as
        # ```json), then the closing fence if present.
        newline_at = text.find("\n")
        text = text[newline_at + 1:] if newline_at != -1 else text[3:]
        text = text.rstrip()
        if text.endswith("```"):
            text = text[:-3]
    return text.strip()


def _parse_reply(content):
    """Parse a model reply as JSON; raises json.JSONDecodeError when it is not valid JSON."""
    return json.loads(_strip_code_fence(content))


def extract_ingredients(description):
    """Extract a de-duplicated list of base ingredients from a raw description.

    The description is sent to the chat model, which is asked to answer with a
    JSON array. If the first reply is not valid JSON, one repair request is
    made; if that also fails, an empty list is returned.

    Args:
        description: Raw ingredient text for one recipe. ``None``, float NaN
            and empty/whitespace-only values short-circuit to ``[]`` without
            calling the API.

    Returns:
        list[str]: Normalized ingredient names produced by
        ``_deduplicate_preserve_order``; ``[]`` when nothing could be
        extracted.

    Raises:
        Errors raised by the OpenAI client itself (network, auth, rate limit)
        are not caught here and propagate to the caller.
    """
    # guard for None
    if description is None:
        return []
    # guard for NaN: pandas represents missing values as float NaN, which is
    # the only float not equal to itself. Without this check str(nan) == "nan"
    # would be sent to the model as if it were an ingredient list.
    if isinstance(description, float) and description != description:
        return []
    # guard for empty
    text = str(description).strip()
    if not text:
        return []
    # reinforce JSON-only on the user message
    user_msg = f"""Input: {text}
Return ONLY a JSON array of deduplicated base ingredients in lowercase, e.g. ["chicken","butter"]."""
    # try to parse; try to repair with one retry if not valid JSON
    # (a None reply is treated as empty text, so it takes the retry path
    # instead of raising TypeError out of json.loads)
    try:
        parsed = _parse_reply(_request_ingredients(user_msg))
    except json.JSONDecodeError:
        repair_msg = f"""Your previous response was not valid JSON.
Return ONLY a JSON array of strings, no explanation.

Input: {text}"""
        try:
            parsed = _parse_reply(_request_ingredients(repair_msg))
        except json.JSONDecodeError:
            # give up gracefully
            return []
    # normalize shape; JSON null items are skipped so they do not turn into
    # the literal ingredient "none" via str(None)
    if isinstance(parsed, list):
        return _deduplicate_preserve_order([str(x) for x in parsed if x is not None])
    # if model returned an object (shouldn't for single row), flatten values
    if isinstance(parsed, dict):
        flat = []
        for v in parsed.values():
            if isinstance(v, list):
                flat.extend(str(x) for x in v if x is not None)
        return _deduplicate_preserve_order(flat)
    return []


# apply function to ingredients column
# NOTE: extract_ingredients issues at least one chat completion request per
# row, so this cell's run time and API cost scale with the number of recipes.
food_kg_df["ingredients_normalized"] = food_kg_df["ingredients"].apply(extract_ingredients)

In [25]:
# CSV can't store native Python lists (they are objects), so serialize each
# list to a JSON string with json.dumps before writing.
ingredients_table = pd.DataFrame({
    "ingredients_raw": food_kg_df["ingredients"],
    "ingredients_normalized": food_kg_df["ingredients_normalized"].apply(json.dumps),
    # "ner": food_kg_df["ner"],
})
# Create directory if it doesn't exist and save
# (index=False keeps the DataFrame index out of the file).
os.makedirs("ingredients", exist_ok=True)
ingredients_table.to_csv("ingredients/ingredients.csv", index=False)

In [26]:
# Preview the first rows; ingredients_normalized holds JSON strings at this point.
ingredients_table.head(5)

Unnamed: 0,ingredients_raw,ingredients_normalized
0,"{2 c. cubed red potatoes,2 Tbsp. finely choppe...","[""potato"", ""onion"", ""vinegar"", ""chive"", ""olive..."
1,"{12 oz. pkg. rotini,6 oz. pkg. Good Seasons Ze...","[""rotini"", ""salad mix"", ""italian dressing"", ""c..."
2,"{1 1/2 lb. yams,1 egg,1 chopped onion,1 choppe...","[""yam"", ""egg"", ""onion"", ""tomato"", ""green chile..."
3,"{pork chops,apples,brown sugar,cinnamon,""flour...","[""apples"", ""brown sugar"", ""cinnamon"", ""flour"",..."
4,"{1 lb. linguini,2 Tbsp. oil,3 Tbsp. peanut but...","[""linguini"", ""oil"", ""peanut butter"", ""chili po..."


In [27]:
# Test load to get Python lists
# Round-trip check: read the saved CSV back and decode each JSON string into a
# Python list with json.loads.
df = pd.read_csv("ingredients/ingredients.csv")
df["ingredients_normalized"] = df["ingredients_normalized"].apply(json.loads)
df.head()

Unnamed: 0,ingredients_raw,ingredients_normalized
0,"{2 c. cubed red potatoes,2 Tbsp. finely choppe...","[potato, onion, vinegar, chive, olive oil, sug..."
1,"{12 oz. pkg. rotini,6 oz. pkg. Good Seasons Ze...","[rotini, salad mix, italian dressing, cucumber..."
2,"{1 1/2 lb. yams,1 egg,1 chopped onion,1 choppe...","[yam, egg, onion, tomato, green chile, bread c..."
3,"{pork chops,apples,brown sugar,cinnamon,""flour...","[apples, brown sugar, cinnamon, flour, pork, s..."
4,"{1 lb. linguini,2 Tbsp. oil,3 Tbsp. peanut but...","[linguini, oil, peanut butter, chili powder, t..."
