In [2]:
from dotenv import load_dotenv

load_dotenv()  # Take environment variables from .env
import pandas as pd
import json
from openai import OpenAI
import os

# Local JSON file listing each submitted batch and its last known status.
TRACKER_FILE = "batch_tracker.json"

# OPENAI_API_KEY must be present in the environment (load_dotenv() above reads .env).
API_KEY = os.environ.get("OPENAI_API_KEY")
client = OpenAI(api_key=API_KEY)

In [3]:
def _parse_batch_output(raw_bytes):
    """Turn a batch output JSONL payload into rows for the results CSV.

    Args:
        raw_bytes: UTF-8 encoded JSONL. Each non-blank line is a JSON object
            carrying a ``custom_id`` and a ``response`` entry.

    Returns:
        A list of ``{"fdc_id": ..., "mapped_ingredient": ...}`` dicts, one per
        non-blank line. ``mapped_ingredient`` is the stripped message content
        for responses with status code 200 and ``"ERROR"`` otherwise
        (including when the response or its message content is missing).

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a line has no ``custom_id``.
    """
    rows = []
    for line in raw_bytes.decode('utf-8').splitlines():
        if not line.strip():
            continue  # Tolerate blank lines instead of crashing json.loads

        data = json.loads(line)
        response = data.get('response') or {}

        val = "ERROR"
        if response.get('status_code') == 200:
            try:
                text = response['body']['choices'][0]['message']['content']
            except (KeyError, IndexError, TypeError):
                text = None
            # Content may be null; one bad row must not abort the whole batch.
            if isinstance(text, str):
                val = text.strip()

        rows.append({
            "fdc_id": data['custom_id'],
            "mapped_ingredient": val
        })
    return rows


def check_and_retrieve(tracker_file=None, api_client=None):
    """Poll every tracked batch once and download results of completed ones.

    Reads the tracker JSON (a list of entries with ``batch_index``,
    ``batch_id``, ``status`` and ``output_csv_name``), refreshes the status of
    each entry that is not yet ``"downloaded"``, writes the parsed results of
    completed batches to the entry's CSV file, and saves the updated tracker.
    Errors while handling one batch are printed and do not stop the others.

    Args:
        tracker_file: Path of the tracker JSON. Defaults to the module-level
            ``TRACKER_FILE``.
        api_client: Object exposing ``batches.retrieve(batch_id)`` and
            ``files.content(file_id)``. Defaults to the module-level ``client``.

    Returns:
        None. Progress is reported through ``print``.
    """
    tracker_path = TRACKER_FILE if tracker_file is None else tracker_file
    api = client if api_client is None else api_client

    # 1. Load Tracker
    if not os.path.exists(tracker_path):
        print("No tracker file found. Run submission script first.")
        return

    with open(tracker_path, "r") as f:
        batches = json.load(f)

    print(f"Checking status for {len(batches)} batches...\n")

    for entry in batches:
        # Skip if we already finished this one
        if entry['status'] == "downloaded":
            continue

        batch_id = entry['batch_id']

        try:
            # Check Status
            batch_job = api.batches.retrieve(batch_id)
            print(f"Batch {entry['batch_index']} ({batch_id}): {batch_job.status}")

            # Update status in our local memory
            entry['status'] = batch_job.status

            if batch_job.status == "completed":
                if not batch_job.output_file_id:
                    # Nothing to download; report it instead of failing on a
                    # files.content(None) call.
                    error_file_id = getattr(batch_job, "error_file_id", None)
                    print(
                        f"--> Batch {entry['batch_index']} completed without an "
                        f"output file (error file: {error_file_id})."
                    )
                    continue

                print(f"--> Downloading results for Batch {entry['batch_index']}...")

                # Download and parse in memory (avoids saving huge JSONL files)
                content = api.files.content(batch_job.output_file_id).content
                results_list = _parse_batch_output(content)

                # Save to individual CSV part
                df_res = pd.DataFrame(results_list)
                df_res.to_csv(entry['output_csv_name'], index=False)

                print(f"--> Saved {entry['output_csv_name']}")
                entry['status'] = "downloaded"  # Mark as done so we don't check again

            elif batch_job.status == "failed":
                print(f"--> Batch Failed! Error: {batch_job.errors}")

        except Exception as e:
            # Best effort per batch: report and move on to the next entry.
            print(f"Error checking batch {batch_id}: {e}")

    # 2. Save updated tracker
    with open(tracker_path, "w") as f:
        json.dump(batches, f, indent=4)

    # Decide from the final statuses so batches downloaded during this very
    # run count as done.
    all_complete = all(entry['status'] == "downloaded" for entry in batches)

    if all_complete:
        print("\nALL BATCHES COMPLETE AND DOWNLOADED.")
        # Optional: Merge all CSVs here if you want
    else:
        print("\nSome batches are still processing. Run this script again later.")

In [4]:
# Poll each tracked batch once; re-run this cell later if some are still processing.
check_and_retrieve()

Checking status for 5 batches...

Batch 1 (batch_692fdf4f6a0c81909a1b95b383372c00): in_progress
Batch 2 (batch_692fdf504a6881909161d33e1cb8ab1f): completed
--> Downloading results for Batch 2...
--> Saved results_part_2.csv
Batch 3 (batch_692fdf5103e08190aee3088cf060f486): completed
--> Downloading results for Batch 3...
--> Saved results_part_3.csv
Batch 4 (batch_692fdf520fd88190b45aa39242401ad4): completed
--> Downloading results for Batch 4...
--> Saved results_part_4.csv
Batch 5 (batch_692fdf52a6cc819085496e3b3ef98a33): completed
--> Downloading results for Batch 5...
--> Saved results_part_5.csv

Some batches are still processing. Run this script again later.
