Goal: separate the ingrdients in the FoodKG table into a list of ingredients

In [1]:
# !pip install --upgrade pip -q
# !pip install psycopg2-binary -q
# !pip install SQLAlchemy -q
# !pip install spacy -q
# !python -m spacy download en_core_web_sm -q

In [2]:
import json
import os

import pandas as pd
from dotenv import load_dotenv

from util import process_food_kg_df, assemble_food_kg_df

# from sqlalchemy import create_engine

load_dotenv()  # Take environment variables from .env
api_key = os.getenv("OPENAI_API_KEY")

from openai import OpenAI

client = OpenAI(
    base_url="http://localhost:1234/v1",  # local model
    api_key=api_key
)

In [3]:
# # Credentials
# host = os.getenv("POSTGRES_HOST")
# port = os.getenv("POSTGRES_PORT")
# database = os.getenv("POSTGRES_DATABASE")
# user = os.getenv("POSTGRES_USER")
# password = os.getenv("POSTGRES_PASSWORD")
# # Create the connection engine
# engine = create_engine(f"postgresql+psycopg2://{user}:{password}@{host}:{port}/{database}")
# # Test the connection
# try:
#     with engine.connect() as conn:
#         print("✅ Connected successfully!")
# except Exception as e:
#     print("❌ Connection failed:", e)

In [4]:
# # Test: query 5 rows
# query = "SELECT * FROM \"FoodKG\" LIMIT 5;"  # quotes protect uppercase table names
# df = pd.read_sql(query, engine)
# # display nicely
# pd.set_option('display.max_columns', None)  # show all columns
# pd.set_option('display.max_rows', 5)  # show up to 5 rows
# display(df)

In [5]:
df = pd.read_csv("data/FoodKG_ingredients.csv")

In [6]:
df.head()

Unnamed: 0,id,ingredients
0,0,"['1 c. firmly packed brown sugar', '1/2 c. eva..."
1,1,"['1 small jar chipped beef, cut up', '4 boned ..."
2,2,"['2 (16 oz.) pkg. frozen corn', '1 (8 oz.) pkg..."
3,3,"['1 large whole chicken', '2 (10 1/2 oz.) cans..."
4,4,"['1 c. peanut butter', '3/4 c. graham cracker ..."


In [7]:
process_food_kg_df(
    df=df,
    model="qwen/qwen3-4b-2507",
    client=client,
)

Wrote part_0_99.csv (0:100) — checkpoint=99
Wrote part_100_199.csv (100:200) — checkpoint=199
Wrote part_200_299.csv (200:300) — checkpoint=299
Wrote part_300_399.csv (300:400) — checkpoint=399
Wrote part_400_499.csv (400:500) — checkpoint=499
Wrote part_500_599.csv (500:600) — checkpoint=599
Wrote part_600_699.csv (600:700) — checkpoint=699
Wrote part_700_799.csv (700:800) — checkpoint=799
Wrote part_800_899.csv (800:900) — checkpoint=899
Wrote part_900_999.csv (900:1000) — checkpoint=999
Wrote part_1000_1099.csv (1000:1100) — checkpoint=1099
Wrote part_1100_1199.csv (1100:1200) — checkpoint=1199
Wrote part_1200_1299.csv (1200:1300) — checkpoint=1299
Wrote part_1300_1399.csv (1300:1400) — checkpoint=1399
Wrote part_1400_1499.csv (1400:1500) — checkpoint=1499
Wrote part_1500_1599.csv (1500:1600) — checkpoint=1599
Wrote part_1600_1699.csv (1600:1700) — checkpoint=1699
Wrote part_1700_1799.csv (1700:1800) — checkpoint=1799
Wrote part_1800_1899.csv (1800:1900) — checkpoint=1899
Wrote part

KeyboardInterrupt: 

In [8]:
final_df = assemble_food_kg_df()

In [9]:
len(final_df)

12300

In [13]:
# CSV can’t store native Python lists (they are objects), so use JSON dump
ingredients_table = pd.DataFrame({
    "id": final_df["id"],
    "ingredients_normalized": final_df["ingredients_normalized"].apply(json.dumps),
}).sort_values("id")
# Create directory if it doesn't exist and save
os.makedirs("sample", exist_ok=True)
ingredients_table.to_csv("sample/FoodKG_ingredients_normalized.csv", index=False)

In [12]:
ingredients_table.tail()

Unnamed: 0,id,ingredients_normalized
2795,12295,"""['butter', 'sugar', 'egg', 'flour', 'salt', '..."
2796,12296,"""['rice', 'salt', 'taco seasoning', 'cheese', ..."
2797,12297,"""['eggplant', 'bread crumb', 'm.s.g.', 'butter..."
2798,12298,"""['flour', 'baking powder', 'salt', 'shortenin..."
2799,12299,"""['brown rice', 'broccoli', 'mushroom soup', '..."
