In [7]:
from dotenv import load_dotenv

load_dotenv()  # Take environment variables from .env
import pandas as pd
import json
from openai import OpenAI
import os

# Ingredient vocabulary text that gets appended to the end of the system prompt.
with open("SpacyProcessing/spacy_unique_ingredients.txt", "r", encoding="utf-8") as ingredients_file:
    UNIQUE_INGREDIENTS = ingredients_file.read()

# Full system message = prompt file contents immediately followed by the
# ingredient vocabulary (no separator is inserted between the two).
with open("prompts/system_message_products.txt", "r", encoding="utf-8") as prompt_file:
    SYSTEM_MSG_PRODUCTS = prompt_file.read() + UNIQUE_INGREDIENTS

In [6]:
# --- Configuration -----------------------------------------------------------
# CSV holding the product rows to label (fdc_id + description columns are used).
INPUT_CSV = "data/raw/usda_2022_food_branded_experimental_DESCRIPTION_ONLY.csv"
# JSONL file that is written locally and then uploaded as the batch input.
BATCH_FILE_NAME = "batch_input.jsonl"
# Model that every request in the batch targets.
MODEL_NAME = "gpt-5-nano-2025-08-07"
# API key comes from the environment (load_dotenv() runs in the imports cell).
API_KEY = os.environ.get("OPENAI_API_KEY")

# One client instance, reused by the upload and batch-creation steps.
client = OpenAI(api_key=API_KEY)

# 1. Load the source data and preview the first few rows
print(f"Loading data from {INPUT_CSV}...")
df = pd.read_csv(INPUT_CSV)
df.head(3)

Loading data from data/raw/usda_2022_food_branded_experimental_DESCRIPTION_ONLY.csv...


Unnamed: 0,fdc_id,description
0,167512,"Pillsbury Golden Layer Buttermilk Biscuits, Ar..."
1,167513,"Pillsbury, Cinnamon Rolls with Icing, refriger..."
2,167514,"Kraft Foods, Shake N Bake Original Recipe, Coa..."


In [9]:
# Sanity check: show the assembled system message (prompt file text followed
# by the ingredient vocabulary) before it is used in any request.
print(SYSTEM_MSG_PRODUCTS)

You are a culinary data annotator.
Your job is to map a retail food product name to exactly one canonical ingredient label.

Return **only** a plain lowercase string — no JSON, no lists, no quotes, no commentary.

---

# TASK
Given a product name from a grocery or restaurant context, output the **single best ingredient label** it represents.

Use the ALLOWED_INGREDIENTS list as your **reference vocabulary**, **not a restriction**.
These ingredients are *examples* drawn from a prior extraction process.
If none match well, you may output **any realistic ingredient name**, even if it is not in ALLOWED_INGREDIENTS.

---

# RULES (perform mentally)

## 1. Normalize text
Lowercase everything; remove punctuation, symbols, brand names, marketing language (organic, classic, premium), sizes/quantities (oz, pack, 12-ct), and prep descriptors (sliced, shredded, spicy).
Preserve identity-defining terms (olive oil, soy sauce, apple cider vinegar, taco seasoning, pudding mix).

## 2. Remove usage phr

In [None]:
# 2. Define the System Prompt
# Use the system message assembled in the first cell (prompt file + ingredient
# vocabulary) rather than a hand-pasted copy, so the batch sends exactly the
# text that was loaded and printed above and no template placeholder text can
# leak into the requests.
system_prompt = SYSTEM_MSG_PRODUCTS

# An empty prompt would silently produce a batch of unguided completions;
# fail here, before any file is written or uploaded.
if not system_prompt.strip():
    raise ValueError("System prompt is empty; check prompts/system_message_products.txt")

# 3. Prepare the Batch Requests
# Rows with a missing description cannot be labelled, and a NaN value would be
# serialised by json.dumps as the bare token `NaN`, which is not valid JSON.
# Filter into a new frame so `df` itself is left untouched.
rows_to_label = df.dropna(subset=["description"])
skipped_rows = len(df) - len(rows_to_label)

print("Preparing batch requests...")
batch_requests = [
    {
        # fdc_id doubles as the custom_id so results can be mapped back later
        "custom_id": str(fdc_id),
        "method": "POST",
        "url": "/v1/chat/completions",
        "body": {
            "model": MODEL_NAME,
            "messages": [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": str(description)},
            ],
            # gpt-5 family models reject `max_tokens` and any non-default
            # `temperature` on chat completions; the cap must be given as
            # `max_completion_tokens`, which also counts reasoning tokens.
            # NOTE(review): 200 is a conservative cap for a one-label answer;
            # confirm on a small sample that outputs are not truncated.
            "max_completion_tokens": 200,
            # Keep hidden reasoning (and its cost) to a minimum for this
            # simple extraction task.
            "reasoning_effort": "minimal",
        },
    }
    for fdc_id, description in zip(rows_to_label["fdc_id"], rows_to_label["description"])
]

if skipped_rows:
    print(f"Skipped {skipped_rows} rows with no description")

# 4. Save to JSONL
# One JSON object per line, each terminated by a newline.
with open(BATCH_FILE_NAME, "w") as jsonl_file:
    jsonl_file.writelines(json.dumps(request) + "\n" for request in batch_requests)

request_count = len(batch_requests)
print(f"Saved {request_count} requests to {BATCH_FILE_NAME}")

# 5. Upload File to OpenAI
print("Uploading file to OpenAI...")
# Open the JSONL inside a context manager so the file handle is closed as soon
# as the upload returns or raises, instead of staying open in the kernel.
with open(BATCH_FILE_NAME, "rb") as batch_input:
    batch_file = client.files.create(
        file=batch_input,
        purpose="batch",
    )
print(f"File uploaded. File ID: {batch_file.id}")

# 6. Create the Batch Job
# Submit the uploaded file as a chat-completions batch with a 24h window.
print("Starting batch job...")
batch_job = client.batches.create(
    input_file_id=batch_file.id,
    endpoint="/v1/chat/completions",
    completion_window="24h",
)

# Print the batch id between two divider lines so it is easy to spot and copy.
divider = "-" * 30
print(divider)
print("SUCCESS! Batch Job Created.")
print(f"Batch ID: {batch_job.id}")
print(divider)
print("SAVE THIS BATCH ID. You will need it for Script 2.")