In [1]:
import os
import time
import json
import openai
import pandas as pd
from tqdm import tqdm
from secret import OPENAI_API_KEY

import regex  # instead of re

REQUESTING_PROMPT = """Please extract answers for A1–A32 and B1–B24 from the uploaded MR study, using the definitions in RubricQ.pdf.
Output format:
Provide exactly two rows only inside a Markdown code block (triple backticks) so it appears as a copy‑pasteable grey‑cell table.
Row 1: Tab‑delimited answers only (Yes / No / Partial / Not applicable). The first field must be the StudyID in the format AuthorYear (retain all characters in the author’s name exactly as written in the paper). Followed by A1→A32 then B1→B24 (exactly 57 fields separated by 56 tab characters).
Row 2: Tab‑delimited reasons only. The first field must again be the same StudyID. Each reason must be one short quoted phrase (no line breaks). If information is missing, write “Not reported”. DO NOT leave any field blank. Must contain exactly 57 fields separated by 56 tab characters, aligned one-to-one with Row 1.
Rules:
- Do not include headers, numbering, bullet points, or extra text.
- Clearly separate Row 1 and Row 2 with one blank line. 
- Before returning, validate: Count = 57 fields for each row (if not, auto‑fill “Not reported” until 57).
"""

def extract_json_objects(text):
    # Try to load full string as a list of dicts
    try:
        data = json.loads(text)
        if isinstance(data, list):
            return data[0]
        else:
            return data
    except json.JSONDecodeError:
        print("Response:")  # Print first 500 chars for debugging
        print(text[:500])
        pass  # Try regex fallback below

    # Fallback regex in case it's not clean JSON
    pattern = r"\{(?:[^{}]|(?R))*\}"  # recursively match nested {}
    matches = regex.findall(pattern, text)

    objects = []
    for m in matches:
        try:
            obj = json.loads(m)
            objects.append(obj)
        except json.JSONDecodeError:
            continue

    return objects[0] if objects else None


# Set your OpenAI API key
openai.api_key = OPENAI_API_KEY  # Ensure this environment variable is set

# Paths
pdf_dir = "./documents"
output_csv = "chatgpt_new_extracted_mr_data.csv"

# Initialize OpenAI client
client = openai.OpenAI(
    api_key=openai.api_key,
)

# Upload the Word file (rubric)
with open("./RubricQ.pdf", "rb") as f:
    rubric_file = client.files.create(file=f, purpose="assistants")

# Create assistant WITHOUT attaching rubric file
assistant = client.beta.assistants.create(
    name="MR Study Extractor",
    instructions="You are a genetic epidemiologist and Methods Audit reviewer. Refer to the uploaded rubric for all scoring and extraction instructions.",
    model="gpt-4-turbo",
    tools=[{"type": "file_search"}],
)

# Create directory for responses
RESPONSE_DIR = "./responses"
os.makedirs(RESPONSE_DIR, exist_ok=True)


class GlobalDebug:
    pass

import random

def retry_with_backoff(func, *args, **kwargs):
    max_retries = 5
    delay = 10  # Start with 10 seconds
    for attempt in range(max_retries):
        try:
            return func(*args, **kwargs)
        except openai.RateLimitError as e:
            wait_time = getattr(e, "retry_after", delay)
            print(f"⚠️ Rate limit hit: retrying in {wait_time:.2f}s...")
            time.sleep(wait_time + random.uniform(0, 2))  # jitter
            delay = min(delay * 2, 60)  # exponential backoff, max 60s
        except openai.APIError as e:
            print(f"⚠️ API error: {e}. Retrying in {delay}s...")
            time.sleep(delay + random.uniform(0, 2))
            delay = min(delay * 2, 60)
        except Exception as e:
            print(f"❌ Unexpected error: {e}")
            raise
    raise RuntimeError("❌ Max retries exceeded.")


def process_pdf(file_path, index):
    file_name = os.path.basename(file_path)
    print(f"\n🔍 Processing [{index}] {file_name}")

    # Upload the PDF
    try:
        with open(file_path, "rb") as f:
            uploaded_pdf = client.files.create(file=f, purpose="assistants")
    except Exception as e:
        print(f"[ERROR][{index}] Failed to upload file {file_name}:\n{e}")
        return None

    # Create a thread
    try:
        thread = client.beta.threads.create()
    except Exception as e:
        print(f"[ERROR][{index}] Failed to create thread for {file_name}:\n{e}")
        return None

    def run_prompt(prompt, label):
        retry_with_backoff(
            client.beta.threads.messages.create,
            thread_id=thread.id,
            role="user",
            content=prompt,
            attachments=[
                {"file_id": uploaded_pdf.id, "tools": [{"type": "file_search"}]},
                {"file_id": rubric_file.id, "tools": [{"type": "file_search"}]},
            ],
        )

        run = retry_with_backoff(
            client.beta.threads.runs.create,
            thread_id=thread.id,
            assistant_id=assistant.id,
        )

        while True:
            run_status = client.beta.threads.runs.retrieve(
                thread_id=thread.id, run_id=run.id
            )

            GlobalDebug.run_status = run_status
            GlobalDebug.thread_id = thread.id
            GlobalDebug.run_id = run.id

            if run_status.status == "completed":
                break
            elif run_status.status == "failed":
                print(f"Status info: {run_status}")
                raise RuntimeError(f"Run failed on {label}")
            time.sleep(1)

        messages = client.beta.threads.messages.list(thread_id=thread.id)
        response_text = messages.data[0].content[0].text.value

        # Save raw response
        raw_path = os.path.join(RESPONSE_DIR, f"{index:03d}_{file_name}_{label}.txt")
        with open(raw_path, "w", encoding="utf-8") as f:
            f.write(response_text)

        return response_text

    # Run prompt and get data
    response_text = run_prompt(
        REQUESTING_PROMPT,
        "scoring",
    )

    if not response_text:
        print(f"[SKIP][{index}] Skipping {file_name} due to failed prompt.")
        return None

    combined = {"study_id": file_name, "response_text": response_text}
    return combined


# Process all PDFs in the directory
results = []
pdf_files = sorted([f for f in os.listdir(pdf_dir) if f.endswith(".pdf")])

for i, pdf_file in enumerate(tqdm(pdf_files, desc="Processing PDFs"), 1):
    pdf_path = os.path.join(pdf_dir, pdf_file)
    result = process_pdf(pdf_path, i)
    if result:
        results.append(result)
    time.sleep(90)


# create a file and save results in json
with open("extracted_data.json", "w", encoding="utf-8") as f:
    json.dump(results, f, ensure_ascii=False, indent=4)


Processing PDFs:   0%|          | 0/3 [00:00<?, ?it/s]


🔍 Processing [1] #1 - Ahn 2020.pdf


  thread = client.beta.threads.create()
  return func(*args, **kwargs)
  run_status = client.beta.threads.runs.retrieve(
  messages = client.beta.threads.messages.list(thread_id=thread.id)
Processing PDFs:  33%|███▎      | 1/3 [01:53<03:46, 113.07s/it]


🔍 Processing [2] #3 - Aoki 2022.pdf


Processing PDFs:  67%|██████▋   | 2/3 [03:51<01:56, 116.04s/it]


🔍 Processing [3] #4 - AuYeung 2023.pdf


Processing PDFs: 100%|██████████| 3/3 [05:48<00:00, 116.07s/it]
