Goal: separate the ingrdients in the FoodKG table into a list of ingredients

In [1]:
!pip install --upgrade pip -q
!pip install psycopg2-binary -q
!pip install SQLAlchemy -q
!pip install spacy -q
!python -m spacy download en_core_web_sm -q

[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [2]:
import json
import os

import pandas as pd
from dotenv import load_dotenv
from sqlalchemy import create_engine

from util import process_foodkg_in_batches

load_dotenv()  # Take environment variables from .env
api_key = os.getenv("OPENAI_API_KEY")

from openai import OpenAI

client = OpenAI(
    api_key=api_key
)

In [3]:
# Credentials
host = os.getenv("POSTGRES_HOST")
port = os.getenv("POSTGRES_PORT")
database = os.getenv("POSTGRES_DATABASE")
user = os.getenv("POSTGRES_USER")
password = os.getenv("POSTGRES_PASSWORD")
# Create the connection engine
engine = create_engine(f"postgresql+psycopg2://{user}:{password}@{host}:{port}/{database}")
# Test the connection
try:
    with engine.connect() as conn:
        print("✅ Connected successfully!")
except Exception as e:
    print("❌ Connection failed:", e)

✅ Connected successfully!


In [4]:
# Test: query 5 rows
query = "SELECT * FROM \"FoodKG\" LIMIT 5;"  # quotes protect uppercase table names
df = pd.read_sql(query, engine)
# display nicely
pd.set_option('display.max_columns', None)  # show all columns
pd.set_option('display.max_rows', 5)  # show up to 5 rows
display(df)

Unnamed: 0,id,title,ingredients,directions,link,source,ner
0,2229479,Double Chocolate Chewy Cookies,"[3/4 cup softened salted butter, 1 1/2 cup sug...","[Heat oven to 350F., Cream butter and sugar., ...",cookpad.com/us/recipes/339167-double-chocolate...,Recipes1M,"[butter, sugar, eggs, vanilla, flour, cocoa, b..."
1,2229480,"""Old Fashioned Potato Soup - ""Homemade"" Potato...","[6 Potatoes, peeled, cubes, 2 Leeks, washed, c...",[Put all ingredients except milk and chives in...,online-cookbook.com/goto/cook/rpage/0003A6,Recipes1M,"[Potatoes, Leeks, Onions, Ham bone, Carrot, Ce..."
2,2229492,Persian pasta with spiced chickpeas and pistac...,"[250 g (8.8oz) spaghetti, 1 tin of chickpeas, ...","[Boil a large pot of water for your pasta., Di...",www.lovefood.com/guide/recipes/17869/persian-p...,Recipes1M,"[chickpeas, onion, tomatoes, clove of garlic, ..."
3,2229494,Shawn & Lauras Kitchen Sink Feijoada,"[1 12 tablespoons garlic (chopped), 1 shallot,...","[Chop garlic, shallot, green onions, parsley &...",www.food.com/recipe/shawn-lauras-kitchen-sink-...,Recipes1M,"[garlic, shallot, Italian parsley, oregano, ba..."
4,2229498,The 4:2:1 Golden Ratio for Always Delicious Ta...,"[4 Eggs (large), 2 tbsp Sugar, 1 tsp Salt, 1 t...","[Break the eggs into a bowl., Put all the ingr...",cookpad.com/us/recipes/145525-the-421-golden-r...,Recipes1M,"[Eggs, Sugar, Salt, Mayonnaise]"


In [5]:
process_foodkg_in_batches(engine, batch_size=1000, CUTOFF=1000, process_fn=lambda: print("processing"))

TypeError: <lambda>() takes 0 positional arguments but 1 was given

In [5]:
bifff

NameError: name 'bifff' is not defined

In [None]:
food_kg_df = pd.read_csv('clean/nourish_public_FoodKG.csv').dropna(axis=1)
food_kg_df.columns.to_list()

In [None]:
# Load System Prompt
with open("../prompts/system_message_ingredients.txt", "r") as f:
    SYSTEM_MSG_INGREDIENTS = f.read()

In [None]:
def _deduplicate_preserve_order(items):
    # Post processing step to remove duplicate entries
    # ["salt", "salt", "SALT"] => ["salt"]
    seen, out = set(), []
    for x in items:
        x = (x or "").strip().lower()
        if x and x not in seen:
            seen.add(x)
            out.append(x)
    return out


def extract_ingredients(description):
    if description is None: return []  # guard for NaN
    text = str(description).strip()
    if not text: return []  # guard for empty
    # reinforce JSON-only on the user message
    user_msg = f"""Input: {text}
Return ONLY a valid JSON array of lowercase ingredient names, no explanations, no code fences.
Example: ["chicken","butter"]"""

    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": SYSTEM_MSG_INGREDIENTS},
            {"role": "user", "content": user_msg},
        ],
        max_tokens=600,
        temperature=0,
    )
    content = response.choices[0].message.content
    # try to parse; try to repair with one retry if not valid JSON
    try:
        parsed = json.loads(content)
    except json.JSONDecodeError:
        repair_msg = f"""Your previous response was not valid JSON.
Return ONLY a valid JSON array of strings, no explanations, no code fences.
Example: ["chicken","butter"]

Input: {text}"""
        response2 = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": SYSTEM_MSG_INGREDIENTS},
                {"role": "user", "content": repair_msg},
            ],
            max_tokens=600,
            temperature=0,
        )
        content = response2.choices[0].message.content
        try:
            parsed = json.loads(content)
        except json.JSONDecodeError:
            return []  # give up gracefully if still invalid

    # Normalize shape
    if isinstance(parsed, list):
        return _deduplicate_preserve_order([str(x) for x in parsed])
    # # Flatten if model returned an object (shouldn’t for single row)
    if isinstance(parsed, dict):
        flat = []
        for v in parsed.values():
            if isinstance(v, list):
                flat.extend([str(x) for x in v])
        return _deduplicate_preserve_order(flat)
    return []


# apply function to ingredients column
food_kg_df["ingredients_normalized"] = food_kg_df["ingredients"].apply(extract_ingredients)

In [None]:
# CSV can’t store native Python lists (they are objects), so use JSON dump
ingredients_table = pd.DataFrame({
    "id": food_kg_df["id"],
    # "ingredients_raw": food_kg_df["ingredients"],
    "ingredients_normalized": food_kg_df["ingredients_normalized"].apply(json.dumps),
})
# Create directory if it doesn't exist and save
os.makedirs("ingredients", exist_ok=True)
ingredients_table.to_csv("ingredients/ingredients.csv", index=False)

In [None]:
ingredients_table.head(5)

In [None]:
# Test load to get Python lists
df = pd.read_csv("ingredients/ingredients.csv")
df["ingredients_normalized"] = df["ingredients_normalized"].apply(json.loads)
df.head()