Goal: separate the ingredients in the FoodKG table into a list of ingredients

In [1]:
import json
import os
import time

import pandas as pd
from dotenv import load_dotenv

from llm_util import process_branded_food_experimental_df, assemble_branded_food_experimental_df

# Pull settings from the .env file into the process environment, then read the key
load_dotenv()
api_key = os.environ.get("OPENAI_API_KEY")

from openai import OpenAI

# Client object handed to the processing call further down
client = OpenAI(api_key=api_key)

In [2]:
# Read the raw input CSV into a DataFrame (path is relative to the working directory)
df = pd.read_csv("data/raw/usda_2022_food_branded_experimental_DESCRIPTION_ONLY.csv")

In [3]:
# Preview the first five rows of the loaded table
df.head(5)

Unnamed: 0,fdc_id,description
0,167512,"Pillsbury Golden Layer Buttermilk Biscuits, Ar..."
1,167513,"Pillsbury, Cinnamon Rolls with Icing, refriger..."
2,167514,"Kraft Foods, Shake N Bake Original Recipe, Coa..."
3,167515,"George Weston Bakeries, Thomas English Muffins"
4,167516,"Waffles, buttermilk, frozen, ready-to-heat"


In [None]:
# Run the llm_util processing step over `df` and report how long it took.
# NOTE(review): the meaning of restart / batch_size / stop_at is defined in
# llm_util, not here — confirm there before changing these values.
model = "gpt-5-nano-2025-08-07"
# perf_counter() is a monotonic clock, so the measured duration cannot be
# skewed by system clock adjustments the way time.time() can.
start = time.perf_counter()
process_branded_food_experimental_df(
    df=df,
    model=model,
    client=client,
    restart=True,
    batch_size=100,
    stop_at=100,
)
end = time.perf_counter()
print(f"Total time elapsed: {end - start:.4f} seconds")

In [9]:
# Build the results frame via the llm_util helper.
# NOTE(review): presumably this gathers what the processing step produced — confirm in llm_util.
final_df = assemble_branded_food_experimental_df()

In [10]:
# Number of rows in the assembled frame
len(final_df)

23800

In [11]:
# A CSV cell can't hold a native Python object such as a list, so every
# mapped_ingredient value is serialised to a JSON string first.
ingredient_json = final_df["mapped_ingredient"].apply(json.dumps)
product_table = (
    final_df[["fdc_id", "description"]]
    .assign(mapped_ingredient=ingredient_json)
    .sort_values("fdc_id")
)

In [12]:
# Preview the first five rows of the table that will be exported
product_table.head(5)

Unnamed: 0,fdc_id,description,mapped_ingredient
0,167512,"Pillsbury Golden Layer Buttermilk Biscuits, Ar...","""biscuit"""
1,167513,"Pillsbury, Cinnamon Rolls with Icing, refriger...","""cinnamon roll"""
2,167514,"Kraft Foods, Shake N Bake Original Recipe, Coa...","""breadcrumb"""
3,167515,"George Weston Bakeries, Thomas English Muffins","""muffin"""
4,167516,"Waffles, buttermilk, frozen, ready-to-heat","""waffle"""


In [13]:
# Make sure the output folder is present (no error if it already exists),
# then write the table to CSV without the DataFrame index column.
output_dir = "data/output"
os.makedirs(output_dir, exist_ok=True)
product_table.to_csv("data/output/processed_products.csv", index=False)