Goal: separate the ingredients in the FoodKG table into a list of ingredients

In [1]:
import json
import os
import time

import pandas as pd
from dotenv import load_dotenv

from llm_util import process_branded_food_experimental_df, assemble_branded_food_experimental_df

from openai import OpenAI

# Pull variables from a local .env file into the process environment so the
# key never has to be hard-coded in the notebook.
load_dotenv()
api_key = os.getenv("OPENAI_API_KEY")  # None when the variable is not set

# Client handle that is passed to the processing step further down.
client = OpenAI(
    # base_url="http://localhost:1234/v1",  # local model
    api_key=api_key
)

In [2]:
# Read the raw description-only CSV into a DataFrame.
df = pd.read_csv(
    filepath_or_buffer="data/raw/usda_2022_food_branded_experimental_DESCRIPTION_ONLY.csv"
)

In [3]:
# Preview the first five rows as a quick sanity check of the load.
df.head(n=5)

Unnamed: 0,fdc_id,description
0,167512,"Pillsbury Golden Layer Buttermilk Biscuits, Ar..."
1,167513,"Pillsbury, Cinnamon Rolls with Icing, refriger..."
2,167514,"Kraft Foods, Shake N Bake Original Recipe, Coa..."
3,167515,"George Weston Bakeries, Thomas English Muffins"
4,167516,"Waffles, buttermilk, frozen, ready-to-heat"


In [8]:
# Model used for the mapping run; the commented lines are alternatives that
# were tried against a local endpoint.
model = "gpt-5-nano-2025-08-07"
# model = "qwen/qwen3-4b-2507"
# model="openai/gpt-oss-20b",

# NOTE(review): the recorded output of this cell keeps writing batches past
# row 1000 (e.g. "1000:1100") even though stop_at=1000 is passed — confirm how
# llm_util interprets stop_at.

# perf_counter is a monotonic clock intended for measuring elapsed time, so the
# reading cannot be skewed by system clock adjustments during a long run.
start = time.perf_counter()
try:
    process_branded_food_experimental_df(
        df=df,
        model=model,
        client=client,
        restart=True,
        batch_size=100,
        stop_at=1000,
    )
finally:
    # Report the duration even when the run is interrupted or raises; the
    # exception itself still propagates after the message is printed.
    end = time.perf_counter()
    print(f"Total time elapsed: {end - start:.4f} seconds")

Wrote part_167512_167611.csv (0:100) — checkpoint=167611
Wrote part_167612_167711.csv (100:200) — checkpoint=167711
Wrote part_167712_167811.csv (200:300) — checkpoint=167811
Wrote part_167812_167911.csv (300:400) — checkpoint=167911
Wrote part_167912_168011.csv (400:500) — checkpoint=168011
Wrote part_168012_168111.csv (500:600) — checkpoint=168111
Wrote part_168112_168211.csv (600:700) — checkpoint=168211
Wrote part_168212_168311.csv (700:800) — checkpoint=168311
Wrote part_168312_168411.csv (800:900) — checkpoint=168411
Wrote part_168412_168511.csv (900:1000) — checkpoint=168511
Wrote part_168512_168611.csv (1000:1100) — checkpoint=168611
Wrote part_168612_168711.csv (1100:1200) — checkpoint=168711
Wrote part_168712_168811.csv (1200:1300) — checkpoint=168811
Wrote part_168812_168911.csv (1300:1400) — checkpoint=168911
Wrote part_168912_169011.csv (1400:1500) — checkpoint=169011
Wrote part_169012_169111.csv (1500:1600) — checkpoint=169111
Wrote part_169112_169211.csv (1600:1700) — ch

KeyboardInterrupt: 

In [9]:
# Build the combined result frame via the llm_util helper.
# NOTE(review): presumably this stitches together the part_*.csv files written
# by process_branded_food_experimental_df — confirm in llm_util.
final_df = assemble_branded_food_experimental_df()

In [10]:
# Row count of the assembled frame.
len(final_df.index)

23800

In [11]:
# A CSV cell cannot hold a native Python object, so every mapped_ingredient
# value is serialized to a JSON string before the table is written out.
product_table = (
    final_df[["fdc_id", "description", "mapped_ingredient"]]
    .assign(
        mapped_ingredient=lambda frame: frame["mapped_ingredient"].apply(json.dumps)
    )
    .sort_values("fdc_id")
)

In [12]:
product_table.head(5)

Unnamed: 0,fdc_id,description,mapped_ingredient
0,167512,"Pillsbury Golden Layer Buttermilk Biscuits, Ar...","""biscuit"""
1,167513,"Pillsbury, Cinnamon Rolls with Icing, refriger...","""cinnamon roll"""
2,167514,"Kraft Foods, Shake N Bake Original Recipe, Coa...","""breadcrumb"""
3,167515,"George Weston Bakeries, Thomas English Muffins","""muffin"""
4,167516,"Waffles, buttermilk, frozen, ready-to-heat","""waffle"""


In [13]:
# Make sure the output folder is present (no error if it already exists), then
# write the table without the DataFrame index column.
os.makedirs(name="data/output", exist_ok=True)
product_table.to_csv(path_or_buf="data/output/processed_products.csv", index=False)