In [1]:
from dotenv import load_dotenv

load_dotenv()  # Take environment variables from .env
import pandas as pd
import json
from openai import OpenAI
import os

# Maximum number of requests allowed in one batch input file.
BATCH_SIZE_MAX = 50000
# Backward-compatible alias: the constant was originally published under this
# misspelled name, so keep it defined for any code that still refers to it.
BATH_SIZE_MAX = BATCH_SIZE_MAX

# Vocabulary of known ingredients, appended verbatim to the system prompt below.
with open("SpacyProcessing/spacy_unique_ingredients.txt", "r", encoding="utf-8") as f:
    UNIQUE_INGREDIENTS = f.read()

# System prompt = instruction text immediately followed by the ingredient vocabulary.
with open("prompts/system_message_products.txt", "r", encoding="utf-8") as f:
    SYSTEM_MSG_PRODUCTS = f.read() + UNIQUE_INGREDIENTS

In [2]:
# --- Configuration ---
INPUT_CSV = "data/raw/usda_2022_food_branded_experimental_DESCRIPTION_ONLY.csv"  # product descriptions to map
BATCH_FILE_NAME = "batch_input.jsonl"  # JSONL request file that gets uploaded to OpenAI
MODEL_NAME = "gpt-5-nano-2025-08-07"  # model named in every batch request
API_KEY = os.getenv("OPENAI_API_KEY")  # None when the environment variable is unset

# Shared API client, reused by the submission and retrieval cells.
client = OpenAI(api_key=API_KEY)

# --- Load data ---
# Order the products by fdc_id and keep only the first 100 rows.
print(f"Loading data from {INPUT_CSV}...")
df = (
    pd.read_csv(INPUT_CSV)
    .sort_values(by="fdc_id", ascending=True)
    .head(100)
)
print(len(df))
df.head(3)

Loading data from data/raw/usda_2022_food_branded_experimental_DESCRIPTION_ONLY.csv...
100


Unnamed: 0,fdc_id,description
0,167512,"Pillsbury Golden Layer Buttermilk Biscuits, Ar..."
1,167513,"Pillsbury, Cinnamon Rolls with Icing, refriger..."
2,167514,"Kraft Foods, Shake N Bake Original Recipe, Coa..."


In [3]:
# Show the assembled system prompt (instruction text followed by the ingredient vocabulary).
print(SYSTEM_MSG_PRODUCTS)

**Role:** Culinary Data Annotator
**Task:** Map product names to **one** canonical ingredient label (lowercase string only).

# INSTRUCTIONS
1.  **Normalize:** Remove brands, quantities, packaging (can, box), marketing (organic, premium), and prep descriptors (sliced, spicy).
    * *Keep* identity terms (e.g., *olive oil, taco seasoning, pudding mix*).
2.  **Vocabulary:** Use `ALLOWED_INGREDIENTS` as a **priority list**, not a hard limit. If no match fits, output the best real ingredient name.
3.  **Heuristics:**
    * **Proteins:** Keep meat cuts (*ground beef*) and processed meats (*bacon*). Generalize seafood (*salmon fillet* → *salmon*).
    * **Pantry:** Keep explicit types (*tomato paste, olive oil*). Infer context (*soda* → *baking soda*, *coating* → *breadcrumb*).
    * **Mixes:** Keep canonical mixes (*cake mix, taco seasoning*).
    * **Formatting:** Singularize nouns. Drop "for/with X". No dish names.

# EXAMPLES
kraft shake ’n bake coating for pork → breadcrumb
mccormick ta

In [4]:
# Build one chat-completion request per product, write them to a JSONL file,
# upload that file and start a batch job over it.
batch_requests = []
skipped_rows = 0

print("Preparing batch requests...")
for _, row in df.iterrows():
    # A missing description is a float NaN; json.dumps would emit the bare
    # token NaN, which is not valid JSON, so leave such rows out of the batch.
    if pd.isna(row['description']):
        skipped_rows += 1
        continue

    # We use fdc_id as the custom_id to map it back later
    custom_id = str(row['fdc_id'])

    request_body = {
        "custom_id": custom_id,
        "method": "POST",
        "url": "/v1/chat/completions",
        "body": {
            "model": MODEL_NAME,
            "messages": [
                {"role": "system", "content": SYSTEM_MSG_PRODUCTS},
                # str() is a no-op for text and guards against non-string cells
                {"role": "user", "content": str(row['description'])}
            ],
        }
    }
    batch_requests.append(request_body)

if skipped_rows:
    print(f"Skipped {skipped_rows} rows with a missing description")

# Fail before uploading anything if the batch cannot be accepted.
if not batch_requests:
    raise ValueError("No requests to submit: the input has no usable descriptions.")
if len(batch_requests) > BATH_SIZE_MAX:
    raise ValueError(
        f"{len(batch_requests)} requests exceed the limit of {BATH_SIZE_MAX} per batch; "
        "split the input into several batches."
    )

# 4. Save to JSONL (explicit UTF-8 so the file does not depend on the platform default)
with open(BATCH_FILE_NAME, "w", encoding="utf-8") as f:
    for req in batch_requests:
        f.write(json.dumps(req) + "\n")

print(f"Saved {len(batch_requests)} requests to {BATCH_FILE_NAME}")

# 5. Upload File to OpenAI (context manager closes the handle once the upload returns)
print("Uploading file to OpenAI...")
with open(BATCH_FILE_NAME, "rb") as upload_handle:
    batch_file = client.files.create(
        file=upload_handle,
        purpose="batch"
    )
print(f"File uploaded. File ID: {batch_file.id}")

# 6. Create the Batch Job
print("Starting batch job...")
batch_job = client.batches.create(
    input_file_id=batch_file.id,
    endpoint="/v1/chat/completions",
    completion_window="24h"
)

print("-" * 30)
print("SUCCESS! Batch Job Created.")
print(f"Batch ID: {batch_job.id}")
print("-" * 30)
print("SAVE THIS BATCH ID. You will need it for Script 2.")

Preparing batch requests...
Saved 100 requests to batch_input.jsonl
Uploading file to OpenAI...
File uploaded. File ID: file-4uteefXLiudkR6cU76ioAC
Starting batch job...
------------------------------
SUCCESS! Batch Job Created.
Batch ID: batch_692fce7942c48190913fce8d55e48ee5
------------------------------
SAVE THIS BATCH ID. You will need it for Script 2.


In [6]:
# CONFIGURATION
BATCH_ID = batch_job.id  # Taken from the batch created by the submission cell; in a fresh kernel, replace with the saved batch ID string
OUTPUT_JSONL = "batch_output.jsonl"  # Where to save raw AI results
FINAL_CSV = "mapped_ingredients_only.csv"  # Final CSV holding only fdc_id and mapped_ingredient

API_KEY = os.getenv("OPENAI_API_KEY")  # Ensure your env var is set


def _extract_ingredient(record):
    """Return the mapped ingredient held in one parsed batch-output record.

    Args:
        record: Dict decoded from one line of the batch output file.

    Returns:
        The stripped message content when the request succeeded (HTTP 200)
        and the content is text; otherwise the marker ``"ERROR_API_FAILURE"``.
    """
    # A failed request can carry a null "response", so never index it blindly.
    response = record.get("response") or {}
    if response.get("status_code") != 200:
        return "ERROR_API_FAILURE"

    try:
        content = response["body"]["choices"][0]["message"]["content"]
    except (KeyError, IndexError, TypeError):
        return "ERROR_API_FAILURE"

    # The content can be null (no text returned); treat that as a failure too.
    if not isinstance(content, str):
        return "ERROR_API_FAILURE"
    return content.strip()


def get_results():
    """Check the batch job and, once it has completed, save its results as CSV.

    Reads the module-level ``client``, ``BATCH_ID``, ``OUTPUT_JSONL`` and
    ``FINAL_CSV``. When the batch is completed, the raw output is written to
    ``OUTPUT_JSONL`` and a two-column table (``fdc_id``, ``mapped_ingredient``)
    is written to ``FINAL_CSV``. Requests without a usable answer get the
    value ``"ERROR_API_FAILURE"``.

    Returns:
        None. Progress and problems are reported with ``print``; the function
        returns early when the batch cannot be retrieved, has not completed,
        or produced no output file.
    """
    # 1. Check Status
    print(f"Checking status for {BATCH_ID}...")
    try:
        batch_job = client.batches.retrieve(BATCH_ID)
        print(f"Current Status: {batch_job.status}")
    except Exception as e:
        # Best-effort by design: report the problem and let the user retry.
        print(f"Error retrieving batch: {e}")
        return

    if batch_job.status == "failed":
        print(f"Batch Failed. Error: {batch_job.errors}")
        return

    # These states are final; telling the user to "try again later" would be wrong.
    if batch_job.status in ("expired", "cancelled"):
        print(f"Batch ended with status '{batch_job.status}' and will not complete. Submit a new batch.")
        return

    if batch_job.status != "completed":
        print("Job is not finished yet. Try again later.")
        return

    # 2. Download Results
    print("Job completed! Downloading results...")
    output_file_id = batch_job.output_file_id
    if not output_file_id:
        # No output file exists to download (e.g. when no request succeeded).
        error_file_id = getattr(batch_job, "error_file_id", None)
        print(f"Batch produced no output file. Error file ID: {error_file_id}")
        return

    file_response = client.files.content(output_file_id)
    content = file_response.content

    with open(OUTPUT_JSONL, "wb") as f:
        f.write(content)
    print(f"Raw results saved to {OUTPUT_JSONL}")

    # 3. Parse Results
    print("Processing results...")
    data_list = []

    # The download is UTF-8 bytes; decode it explicitly rather than with the
    # platform default encoding.
    with open(OUTPUT_JSONL, "r", encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue  # tolerate blank / trailing lines
            data = json.loads(line)
            data_list.append({
                "fdc_id": data['custom_id'],  # custom_id carries the fdc_id
                "mapped_ingredient": _extract_ingredient(data)
            })

    # 4. Save New CSV (Only fdc_id and mapped_ingredient)
    # Naming the columns keeps the header present even when there are no rows.
    df_new = pd.DataFrame(data_list, columns=["fdc_id", "mapped_ingredient"])

    df_new.to_csv(FINAL_CSV, index=False)

    print("-" * 30)
    print(f"DONE! Saved {len(df_new)} rows to {FINAL_CSV}")
    print(df_new.head())


# Run the retrieval; re-run this cell until the reported status is "completed".
get_results()

Checking status for batch_692fce7942c48190913fce8d55e48ee5...
Current Status: completed
Job completed! Downloading results...
Raw results saved to batch_output.jsonl
Processing results...
------------------------------
DONE! Saved 100 rows to mapped_ingredients_only.csv
   fdc_id mapped_ingredient
0  167512        buttermilk
1  167513             dough
2  167514        breadcrumb
3  167515  allpurpose flour
4  167516        buttermilk
