Goal: separate the ingredients in the FoodKG table into a list of ingredients

In [1]:
# Optional package installs, left commented out; uncomment to run them from the notebook.
# NOTE(review): versions are not pinned, and `%pip` (rather than `!pip`) would install
# into the running kernel's environment.
# !pip install --upgrade pip -q
# !pip install psycopg2-binary -q
# !pip install SQLAlchemy -q
# !pip install spacy -q
# !python -m spacy download en_core_web_sm -q

In [2]:
import json
import os

import pandas as pd
from dotenv import load_dotenv

from util import process_branded_food_experimental_df, assemble_branded_food_experimental_df

# from sqlalchemy import create_engine

# Load variables from a local .env file into the process environment.
load_dotenv()

# The OpenAI client raises at construction time when no API key is available.
# The endpoint configured below is a local server, so fall back to a placeholder
# when OPENAI_API_KEY is not set instead of failing before any request is made.
# NOTE(review): assumes the local server does not validate the key — confirm.
api_key = os.getenv("OPENAI_API_KEY", "not-needed")

from openai import OpenAI

# Client pointed at the local OpenAI-compatible endpoint rather than the hosted API.
client = OpenAI(
    base_url="http://localhost:1234/v1",  # local model
    api_key=api_key,
)

In [3]:
# Disabled: PostgreSQL connection setup driven by POSTGRES_* environment variables.
# Left commented out; this notebook reads its input from a CSV file instead.
# Re-enabling it also requires the commented-out `create_engine` import in the imports cell.
# # Credentials
# host = os.getenv("POSTGRES_HOST")
# port = os.getenv("POSTGRES_PORT")
# database = os.getenv("POSTGRES_DATABASE")
# user = os.getenv("POSTGRES_USER")
# password = os.getenv("POSTGRES_PASSWORD")
# # Create the connection engine
# engine = create_engine(f"postgresql+psycopg2://{user}:{password}@{host}:{port}/{database}")
# # Test the connection
# try:
#     with engine.connect() as conn:
#         print("✅ Connected successfully!")
# except Exception as e:
#     print("❌ Connection failed:", e)

In [4]:
# Disabled: sample query against the "FoodKG" table.
# It depends on the `engine` object created in the (also disabled) connection cell.
# # Test: query 5 rows
# query = "SELECT * FROM \"FoodKG\" LIMIT 5;"  # quotes protect uppercase table names
# df = pd.read_sql(query, engine)
# # display nicely
# pd.set_option('display.max_columns', None)  # show all columns
# pd.set_option('display.max_rows', 5)  # show up to 5 rows
# display(df)

In [5]:
# Read the branded-food descriptions CSV into `df`; the path is named so it is easy to change.
DESCRIPTIONS_CSV = "data/usda_2022_food_branded_experimental_DESCRIPTION_ONLY.csv"
df = pd.read_csv(DESCRIPTIONS_CSV)

In [6]:
# Peek at the first five rows to confirm the expected columns were loaded.
df.head(5)

Unnamed: 0,fdc_id,description
0,167512,"Pillsbury Golden Layer Buttermilk Biscuits, Ar..."
1,167513,"Pillsbury, Cinnamon Rolls with Icing, refriger..."
2,167514,"Kraft Foods, Shake N Bake Original Recipe, Coa..."
3,167515,"George Weston Bakeries, Thomas English Muffins"
4,167516,"Waffles, buttermilk, frozen, ready-to-heat"


In [7]:
# Settings for the ingredient-mapping run, named so they are easy to spot and adjust.
MODEL_NAME = "qwen/qwen3-4b-2507"
BATCH_SIZE = 100

# NOTE(review): the saved output under this cell lists 10-row parts (0:10, 10:20, ...),
# which does not match batch_size=100 — the output may be from an earlier run; confirm.
process_branded_food_experimental_df(
    df=df,
    client=client,
    model=MODEL_NAME,
    batch_size=BATCH_SIZE,
    restart=True,
)

Wrote part_167512_167521.csv (0:10) — checkpoint=167521
Wrote part_167522_167531.csv (10:20) — checkpoint=167531
Wrote part_167532_167541.csv (20:30) — checkpoint=167541
Done.


In [8]:
# Build the combined result via the project helper imported from `util`.
# NOTE(review): presumably this gathers the part_*.csv files written by the
# processing step into one frame — confirm against the helper in util.
final_df = assemble_branded_food_experimental_df()

In [9]:
# Row count of the assembled result.
# NOTE(review): the saved output (60) does not match the three 10-row parts logged by
# the processing cell above — the outputs may come from different runs; confirm.
len(final_df)

60

In [10]:
def _to_json_cell(value):
    """Serialise one ``mapped_ingredient`` value to JSON text for a CSV cell.

    Missing scalars (None, NaN, pd.NA, NaT) become ``null``. Plain
    ``json.dumps`` would write the non-standard token ``NaN`` for a float NaN
    and raise ``TypeError`` for ``pd.NA``. Every other value is passed to
    ``json.dumps`` unchanged.
    """
    # The is_scalar guard keeps pd.isna from being applied element-wise to lists.
    if pd.api.types.is_scalar(value) and pd.isna(value):
        return "null"
    return json.dumps(value)


# Output location, defined once so the directory and the file path cannot drift apart.
OUTPUT_DIR = "sample"
OUTPUT_CSV = os.path.join(OUTPUT_DIR, "food_branded_experimental_normalized.csv")

# CSV can’t store native Python lists (they are objects), so use JSON dump
product_table = pd.DataFrame({
    "fdc_id": final_df["fdc_id"],
    "description": final_df["description"],
    "mapped_ingredient": final_df["mapped_ingredient"].apply(_to_json_cell),
}).sort_values("fdc_id")

# Create directory if it doesn't exist and save
os.makedirs(OUTPUT_DIR, exist_ok=True)
product_table.to_csv(OUTPUT_CSV, index=False)

In [11]:
# Preview the exported table; `mapped_ingredient` holds JSON text produced in the previous cell.
product_table.head(5)

Unnamed: 0,fdc_id,description,mapped_ingredient
0,167512,"Pillsbury Golden Layer Buttermilk Biscuits, Ar...","""biscuit"""
1,167513,"Pillsbury, Cinnamon Rolls with Icing, refriger...","""cinnamon rolls"""
2,167514,"Kraft Foods, Shake N Bake Original Recipe, Coa...","""bread crumbs"""
3,167515,"George Weston Bakeries, Thomas English Muffins","""english muffin"""
4,167516,"Waffles, buttermilk, frozen, ready-to-heat","""waffles"""
