In [7]:
from dotenv import load_dotenv

load_dotenv()  # Take environment variables from .env
import pandas as pd
import json
from openai import OpenAI
import os
from math import ceil

start_offset = 50000

BATCH_SIZE = 25000

with open("SpacyProcessing/spacy_unique_ingredients.txt", "r", encoding="utf-8") as f:
    UNIQUE_INGREDIENTS = f.read()

with open("prompts/system_message_products.txt", "r", encoding="utf-8") as f:
    SYSTEM_MSG_PRODUCTS = f.read()
    SYSTEM_MSG_PRODUCTS += UNIQUE_INGREDIENTS

# CONFIGURATION
INPUT_CSV = "data/raw/usda_2022_food_branded_experimental_DESCRIPTION_ONLY.csv"  # Your source file
TRACKER_FILE = "batch_tracker.json"  # Where we save Batch IDs
BATCH_FILE_NAME = "batch_input.jsonl"  # The file we will send to OpenAI
MODEL_NAME = "gpt-4o-mini-2024-07-18"  # this model does not think
API_KEY = os.getenv("OPENAI_API_KEY")  # Ensure your env var is set
# Initialize Client
client = OpenAI(api_key=API_KEY)

In [8]:
print(SYSTEM_MSG_PRODUCTS)

**Role:** Culinary Data Annotator
**Task:** Map product names to **one** canonical ingredient label (lowercase string only).

# INSTRUCTIONS
1.  **Normalize:** Remove brands, quantities, packaging (can, box), marketing (organic, premium), and prep descriptors (sliced, spicy).
    * *Keep* identity terms (e.g., *olive oil, taco seasoning, pudding mix*).
2.  **Vocabulary:** Use `ALLOWED_INGREDIENTS` as a **priority list**, not a hard limit. If no match fits, output the best real ingredient name.
3.  **Heuristics:**
    * **Proteins:** Keep meat cuts (*ground beef*) and processed meats (*bacon*). Generalize seafood (*salmon fillet* → *salmon*).
    * **Pantry:** Keep explicit types (*tomato paste, olive oil*). Infer context (*soda* → *baking soda*, *coating* → *breadcrumb*).
    * **Mixes:** Keep canonical mixes (*cake mix, taco seasoning*).
    * **Formatting:** Singularize nouns. Drop "for/with X". No dish names.

# EXAMPLES
kraft shake ’n bake coating for pork → breadcrumb
mccormick ta

In [9]:
# 1. Load Data
print("Loading dataset...")
df = pd.read_csv(INPUT_CSV, skiprows=start_offset)
total_rows = len(df)
num_batches = ceil(total_rows / BATCH_SIZE)
print(f"Total rows: {total_rows} | Will create {num_batches} batches.")

Loading dataset...
Total rows: 1716279 | Will create 69 batches.


In [10]:
df.head()

Unnamed: 0,371449,"LA ESTRELLA, WHOLE WHEAT TOASTS"
0,371450,WHOLE WHEAT TOAST
1,371451,"ANTIUXIXONA, 72% COCOA INTENSE DARK CHOCOLATE"
2,371452,"LINDT, PISTACHIO SWISS MILK CHOCOLATE"
3,371453,CONCON OF MILK
4,371454,"SOY DOMINICANO, PAPAYA SLICES IN HEAVY SYRUP"


In [5]:
# Initialize tracker list
batch_tracking_data = []

# 2. Loop through chunks
for i in range(num_batches):
    start_idx = i * BATCH_SIZE
    end_idx = min((i + 1) * BATCH_SIZE, total_rows)

    # Get the chunk
    df_chunk = df.iloc[start_idx:end_idx]

    print(f"\nProcessing Batch {i + 1}/{num_batches} (Rows {start_idx} to {end_idx})...")

    # Create JSONL content
    jsonl_filename = f"batch_input_part_{i + 1}.jsonl"
    requests = []

    for index, row in df_chunk.iterrows():
        req = {
            "custom_id": str(row['fdc_id']),  # OR fdc_id, whichever is your unique key
            "method": "POST",
            "url": "/v1/chat/completions",
            "body": {
                "model": MODEL_NAME,  # Optimized for cost
                "messages": [
                    {"role": "system", "content": SYSTEM_MSG_PRODUCTS},
                    {"role": "user", "content": row['description']}
                ],
                "max_tokens": 10,  # Optimized to stop "yapping", use for gpt4 only
            }
        }
        requests.append(req)

    # Write JSONL file
    with open(jsonl_filename, "w") as f:
        for req in requests:
            f.write(json.dumps(req) + "\n")

    # Upload File
    print(f"Uploading {jsonl_filename}...")
    batch_file = client.files.create(
        file=open(jsonl_filename, "rb"),
        purpose="batch"
    )

    # Create Batch Job
    print("Submitting Batch Job...")
    batch_job = client.batches.create(
        input_file_id=batch_file.id,
        endpoint="/v1/chat/completions",
        completion_window="24h"
    )

    # Add to tracker
    batch_tracking_data.append({
        "batch_index": i + 1,
        "batch_id": batch_job.id,
        "status": "submitted",  # Initial status
        "output_csv_name": f"results_part_{i + 1}.csv"
    })

    print(f"Batch {i + 1} Submitted! ID: {batch_job.id}")


Processing Batch 1/71 (Rows 0 to 25000)...
Uploading batch_input_part_1.jsonl...
Submitting Batch Job...
Batch 1 Submitted! ID: batch_692fe677d4cc81909f75995f869e934a

Processing Batch 2/71 (Rows 25000 to 50000)...
Uploading batch_input_part_2.jsonl...
Submitting Batch Job...
Batch 2 Submitted! ID: batch_692fe69614488190b69c402d1da9aa2c

Processing Batch 3/71 (Rows 50000 to 75000)...
Uploading batch_input_part_3.jsonl...
Submitting Batch Job...
Batch 3 Submitted! ID: batch_692fe6b364648190b4f91f9ecec96e1b

Processing Batch 4/71 (Rows 75000 to 100000)...
Uploading batch_input_part_4.jsonl...
Submitting Batch Job...
Batch 4 Submitted! ID: batch_692fe6cd47608190b8560e27d9cf7c24

Processing Batch 5/71 (Rows 100000 to 125000)...
Uploading batch_input_part_5.jsonl...
Submitting Batch Job...
Batch 5 Submitted! ID: batch_692fe6e67164819092fce113608448df

Processing Batch 6/71 (Rows 125000 to 150000)...
Uploading batch_input_part_6.jsonl...
Submitting Batch Job...
Batch 6 Submitted! ID: batch_

KeyboardInterrupt: 

In [7]:
with open(TRACKER_FILE, "w") as f:
    json.dump(batch_tracking_data, f, indent=4)

print(f"\nAll batches submitted. IDs saved to {TRACKER_FILE}")


All batches submitted. IDs saved to batch_tracker.json
