Goal: separate the ingredients in the FoodKG table into a list of ingredients

In [1]:
import json
import os
import time

import pandas as pd
from dotenv import load_dotenv
from openai import OpenAI

from llm_util import process_branded_food_experimental_df, assemble_branded_food_experimental_df

# Load variables from the local .env file into the process environment.
load_dotenv()
api_key = os.getenv("OPENAI_API_KEY")
if not api_key:
    # Fail here with an actionable message rather than deeper inside the client.
    raise RuntimeError(
        "OPENAI_API_KEY is not set; add it to your .env file or environment."
    )

# OpenAI client that is passed to the llm_util processing helper.
client = OpenAI(api_key=api_key)

In [2]:
# Load the raw product-description CSV into a DataFrame.
raw_csv_path = "data/raw/usda_2022_food_branded_experimental_DESCRIPTION_ONLY.csv"
df = pd.read_csv(raw_csv_path)

In [3]:
# Peek at the first five rows to check the columns that were loaded.
df.head(n=5)

Unnamed: 0,fdc_id,description
0,167512,"Pillsbury Golden Layer Buttermilk Biscuits, Ar..."
1,167513,"Pillsbury, Cinnamon Rolls with Icing, refriger..."
2,167514,"Kraft Foods, Shake N Bake Original Recipe, Coa..."
3,167515,"George Weston Bakeries, Thomas English Muffins"
4,167516,"Waffles, buttermilk, frozen, ready-to-heat"


In [4]:
# Timed run of the LLM mapping helper from llm_util.
# NOTE(review): restart/batch_size/stop_at are interpreted inside llm_util;
# the values below look like a one-item smoke test — confirm before a full run.
model = "gpt-5-nano-2025-08-07"

# perf_counter() is monotonic, so the measured interval is not distorted by
# system clock adjustments the way a time.time() difference can be.
start = time.perf_counter()
process_branded_food_experimental_df(
    df=df,
    model=model,
    client=client,
    restart=True,
    batch_size=1,
    stop_at=1,
)
end = time.perf_counter()
print(f"Total time elapsed: {end - start:.4f} seconds")

Response:
 Response(id='resp_0714bfa5c2dfed8600692fc0ed8c688196af1be6617c8b2ba8', created_at=1764737261.0, error=None, incomplete_details=None, instructions=None, metadata={}, model='gpt-5-nano-2025-08-07', object='response', output=[ResponseReasoningItem(id='rs_0714bfa5c2dfed8600692fc0edeff08196be89fa5b010994ff', summary=[], type='reasoning', content=None, encrypted_content=None, status=None), ResponseOutputMessage(id='msg_0714bfa5c2dfed8600692fc0f97b7c8196a04d2fd2be85810d', content=[ResponseOutputText(annotations=[], text='buttermilk biscuit dough', type='output_text', logprobs=[])], role='assistant', status='completed', type='message')], parallel_tool_calls=True, temperature=1.0, tool_choice='auto', tools=[], top_p=1.0, background=False, conversation=None, max_output_tokens=None, max_tool_calls=None, previous_response_id=None, prompt=None, prompt_cache_key=None, reasoning=Reasoning(effort='medium', generate_summary=None, summary=None), safety_identifier=None, service_tier='default',

In [5]:
# Build the combined result frame via the llm_util helper (called with no arguments).
# NOTE(review): presumably this collects what process_branded_food_experimental_df
# produced — confirm in llm_util.
final_df = assemble_branded_food_experimental_df()

In [6]:
# Number of rows in the assembled frame.
final_df.shape[0]

101

In [7]:
# CSV cells are plain text, so each mapped ingredient is stored as a JSON string.
# Rows without a mapping are written as JSON null: json.dumps() on a float NaN
# would emit the bare token NaN, which is not valid JSON.
def _to_json_cell(value):
    """Serialize one mapped_ingredient value to JSON, mapping missing scalars to null."""
    # is_scalar guards pd.isna, which returns an array (not a bool) for list-like values.
    if pd.api.types.is_scalar(value) and pd.isna(value):
        return json.dumps(None)
    return json.dumps(value)


product_table = pd.DataFrame({
    "fdc_id": final_df["fdc_id"],
    "description": final_df["description"],
    "mapped_ingredient": final_df["mapped_ingredient"].apply(_to_json_cell),
}).sort_values("fdc_id")

In [8]:
# Show the first five rows of the table that will be exported.
product_table.head(n=5)

Unnamed: 0,fdc_id,description,mapped_ingredient
0,167512,"Pillsbury Golden Layer Buttermilk Biscuits, Ar...","""buttermilk biscuit dough"""
1,167512,"Pillsbury Golden Layer Buttermilk Biscuits, Ar...",
2,167513,"Pillsbury, Cinnamon Rolls with Icing, refriger...",
3,167514,"Kraft Foods, Shake N Bake Original Recipe, Coa...",
4,167515,"George Weston Bakeries, Thomas English Muffins",


In [9]:
# Create a directory if it doesn't exist and save
os.makedirs("data/output", exist_ok=True)
product_table.to_csv("data/output/processed_products.csv", index=False)