In [1]:
import os
import time
import json
import openai
import pandas as pd
from tqdm import tqdm
from secret import OPENAI_API_KEY

import regex  # instead of re

REQUESTING_PROMPT = """You are a genetic epidemiologist extracting and scoring Mendelian randomization (MR) studies for a systematic review.

Follow these rules EXACTLY:
1. Use the scoring criteria provided in the uploaded rubric (I saw somewhere you uploaded a rubric)
2. Follow the extraction Instructions below verbatim
3. Return only a single valid JSON object in the schema described below.

Instructions:

You are a genetic epidemiologist and an expert in cross-ancestry, trans-ancestry, and multi-ancestry Mendelian randomization (MR) studies. 

You are one of two independent reviewers conducting a Methods Audit. This Methods Audit aims to evaluate the methodological quality of 300 published MR studies using a structured Scoring Rubric. The Scoring Rubric consists of two parts: 
- Part A includes 17 items covering Core MR Methodology.
- Part B includes 15 items covering Cross-Ancestry Extensions to MR Methodology. 

Your task:
Step 1: Assign a score for each rubric item using the following scale:
1 = Fully addressed
0 = Not addressed 
N/A = Item not applicable to study design 
Not assessed = If you find text but are uncertain how to assign 1, 0, or N/A

Step 2: Output two tab-delimited tables
Table 1 - Scoring Table
Output a single tab-delimited line containing exactly 32 fields in this column order:
StudyID, A1, A2a, A2b, A2c, A3a, A3b, A4, A5, A6, A7, A8, A9, A10a, A10b, A11a, A11b, A12, B1a, B1b, B2, B3a, B3b, B4, B5, B6, B7a, B7b, B8a, B8b, B9, B10, B11
Formatting rules: 
- Each field is separated by one tab character (\t).
- There must be exactly 31 tab characters per line.
- Valid values for each item: 1, 0, N/A, or "Not assessed". 
- If an item is Not Applicable, write N/A.
- If you cannot determine the score, write "Not assessed".
- Do not include column headers unless explicitly requested.
- No line breaks, extra tabs, or spaces within any field.
- Output must be plain UTF-8 text without BOM.

Table 2 - Evidence Table
Output one tab-delimited line per rubric item, containing exactly 5 fields in this column order:
StudyID, ItemCode, Score, QuotedEvidence, EvidenceSection
Field definitions:
- StudyID: The citation shorthand (e.g., Smith2022).
- ItemCode: The rubric item (e.g., A1, A2a, B5). Always include the full code for sub-items. 
- Score: 1, 0, N/A, or "Not assessed". 
- QuotedEvidence: The exact text excerpt from the study supporting the score.
- EvidenceSection: The section or subsection where the quote was found (e.g., Methods - Instrument Selection).
Formatting rules:
- Each field separated by one tab character (\t).
- If no evidence is found, write "Not reported" in QuotedEvidence and "Not reported" in EvidenceSection.
- No line breaks or extra tabs within any field.
- Create a separate line for each rubric item (i.e., 32 lines per study). 
- Always output 32 lines per study in Table 2, even if some items are N/A.
- Do not include column headers unless explicitly requested.
- Output must be plain UTF-8 text without BOM.

Additional Instructions
- If StudyID contains punctuation (e.g., O’Connor), retain it as-is.
- If the study includes multiple MR methodologies (e.g., one-sample MR and two-sample MR), create separate sets of outputs for each methodology (i.e., separate scoring and evidence tables), and append a suffix to StudyID (e.g., Smith2022_OneSampleMR, Smith2022_TwoSampleMR).
- Naming convention for StudyID: Use first author surname and year (e.g., Nguyen2021). If needed, append letters to disambiguate (e.g., Nguyen2021a, Nguyen2021b).
- Important: Do not summarize or interpret evidence—only quote exact text.

Handling Missing or Uncertainty Information
If information is not explicitly reported anywhere in the paper, do not infer. Instead:
- Assign the score 0 or N/A as appropriate.
- Write "Not reported" in QuotedEvidence and EvidenceSection.
If you find text but remain uncertain how to assign 1, 0, or N/A after consulting the rubric, do not guess. Instead:
- Assign the score "Not assessed".
- Quote the relevant text in Table 2 to support later review. 

Scoring Rubric Reference
The detailed criteria for scoring each item are provided in the file named: "Rubric_20250717.docx".  This file is uploaded in the Files section of this project. You must refer to this rubric when assigning scores and quoting evidence.

**Output Format:**  
Return only a single JSON object with the following structure:  

{
  "study_id": "Smith2022",  
  "scoring": {  
    "A1": "1",  
    "A2a": "0",  
    "A2b": "1",  
    ... (all rubric items A1–A12, B1–B12 with values 1, 0, N/A, or 'Not assessed')  
  },  
  "evidence": {  
    "A1": {  
      "score": "1",  
      "quote": "Exact quoted evidence from the paper supporting A1 score.",  
      "section": "Methods - Data Sources"  
    },  
    "A2a": {  
      "score": "0",  
      "quote": "Not reported",  
      "section": "Not reported"  
    },  
    ... (repeat for all rubric items A1–A12, B1–B12)  
  }  
}

**Important:**  
- Ensure the JSON is valid and can be parsed directly.  
- Do not include any additional text, explanations, or formatting outside of the JSON object.  
- For missing or unclear information, set the score to `"Not assessed"` and set `"quote"` and `"section"` to `"Not reported"`.  

Use the citation shorthand (first author + year, e.g., Smith2022) as the `study_id`.
"""

def extract_json_objects(text):
    # Try to load full string as a list of dicts
    try:
        data = json.loads(text)
        if isinstance(data, list):
            return data[0]
        else:
            return data
    except json.JSONDecodeError:
        print("Response:")  # Print first 500 chars for debugging
        print(text[:500])
        pass  # Try regex fallback below

    # Fallback regex in case it's not clean JSON
    pattern = r"\{(?:[^{}]|(?R))*\}"  # recursively match nested {}
    matches = regex.findall(pattern, text)

    objects = []
    for m in matches:
        try:
            obj = json.loads(m)
            objects.append(obj)
        except json.JSONDecodeError:
            continue

    return objects[0] if objects else None


# Set your OpenAI API key
openai.api_key = OPENAI_API_KEY  # Ensure this environment variable is set

# Paths
pdf_dir = "./documents"
output_csv = "chatgpt_new_extracted_mr_data.csv"

# Initialize OpenAI client
client = openai.OpenAI(
    api_key=openai.api_key,
)

# Upload the Word file (rubric)
with open("./Rubric_20250717.docx", "rb") as f:
    rubric_file = client.files.create(file=f, purpose="assistants")

# Create assistant WITHOUT attaching rubric file
assistant = client.beta.assistants.create(
    name="MR Study Extractor",
    instructions="You are a genetic epidemiologist and Methods Audit reviewer. Refer to the uploaded rubric for all scoring and extraction instructions.",
    model="gpt-4-turbo",
    tools=[{"type": "file_search"}],
)

# Create directory for responses
RESPONSE_DIR = "./responses"
os.makedirs(RESPONSE_DIR, exist_ok=True)


class GlobalDebug:
    pass

import random

def retry_with_backoff(func, *args, **kwargs):
    max_retries = 5
    delay = 10  # Start with 10 seconds
    for attempt in range(max_retries):
        try:
            return func(*args, **kwargs)
        except openai.RateLimitError as e:
            wait_time = getattr(e, "retry_after", delay)
            print(f"⚠️ Rate limit hit: retrying in {wait_time:.2f}s...")
            time.sleep(wait_time + random.uniform(0, 2))  # jitter
            delay = min(delay * 2, 60)  # exponential backoff, max 60s
        except openai.APIError as e:
            print(f"⚠️ API error: {e}. Retrying in {delay}s...")
            time.sleep(delay + random.uniform(0, 2))
            delay = min(delay * 2, 60)
        except Exception as e:
            print(f"❌ Unexpected error: {e}")
            raise
    raise RuntimeError("❌ Max retries exceeded.")


def process_pdf(file_path, index):
    file_name = os.path.basename(file_path)
    print(f"\n🔍 Processing [{index}] {file_name}")

    # Upload the PDF
    try:
        with open(file_path, "rb") as f:
            uploaded_pdf = client.files.create(file=f, purpose="assistants")
    except Exception as e:
        print(f"[ERROR][{index}] Failed to upload file {file_name}:\n{e}")
        return None

    # Create a thread
    try:
        thread = client.beta.threads.create()
    except Exception as e:
        print(f"[ERROR][{index}] Failed to create thread for {file_name}:\n{e}")
        return None

    def run_prompt(prompt, label):
        retry_with_backoff(
            client.beta.threads.messages.create,
            thread_id=thread.id,
            role="user",
            content=prompt,
            attachments=[
                {"file_id": uploaded_pdf.id, "tools": [{"type": "file_search"}]},
                {"file_id": rubric_file.id, "tools": [{"type": "file_search"}]},
            ],
        )

        run = retry_with_backoff(
            client.beta.threads.runs.create,
            thread_id=thread.id,
            assistant_id=assistant.id,
        )

        while True:
            run_status = client.beta.threads.runs.retrieve(
                thread_id=thread.id, run_id=run.id
            )

            GlobalDebug.run_status = run_status
            GlobalDebug.thread_id = thread.id
            GlobalDebug.run_id = run.id

            if run_status.status == "completed":
                break
            elif run_status.status == "failed":
                print(f"Status info: {run_status}")
                raise RuntimeError(f"Run failed on {label}")
            time.sleep(1)

        messages = client.beta.threads.messages.list(thread_id=thread.id)
        response_text = messages.data[0].content[0].text.value

        # Save raw response
        raw_path = os.path.join(RESPONSE_DIR, f"{index:03d}_{file_name}_{label}.txt")
        with open(raw_path, "w", encoding="utf-8") as f:
            f.write(response_text)

        print(response_text)

        return extract_json_objects(response_text)

    # Run prompt and get data
    data = run_prompt(
        REQUESTING_PROMPT,
        "scoring",
    )

    if not data:
        print(f"[SKIP][{index}] Skipping {file_name} due to failed prompt.")
        return None

    combined = {"study_id": file_name, **data}
    return combined


# Process all PDFs in the directory
results = []
pdf_files = sorted([f for f in os.listdir(pdf_dir) if f.endswith(".pdf")])

for i, pdf_file in enumerate(tqdm(pdf_files, desc="Processing PDFs"), 1):
    pdf_path = os.path.join(pdf_dir, pdf_file)
    result = process_pdf(pdf_path, i)
    if result:
        results.append(result)
    time.sleep(90)

# Save results to CSV
if results:
    df = pd.DataFrame(results)
    df.to_csv(output_csv, index=False)
    print(f"\n✅ Saved extracted data to: {output_csv}")
else:
    print("\n⚠️ No data extracted.")

Processing PDFs:   0%|          | 0/3 [00:00<?, ?it/s]


🔍 Processing [1] #1 - Ahn 2020.pdf
```json
{
  "study_id": "Ahn2020",
  "scoring": {
    "A1": "Not assessed",
    "A2a": "Not assessed",
    "A2b": "Not assessed",
    "A2c": "Not assessed",
    "A3a": "Not assessed",
    "A3b": "Not assessed",
    "A4": "Not assessed",
    "A5": "Not assessed",
    "A6": "Not assessed",
    "A7": "Not assessed",
    "A8": "Not assessed",
    "A9": "Not assessed",
    "A10a": "Not assessed",
    "A10b": "Not assessed",
    "A11a": "Not assessed",
    "A11b": "Not assessed",
    "A12": "Not assessed",
    "B1a": "Not assessed",
    "B1b": "Not assessed",
    "B2": "Not assessed",
    "B3a": "Not assessed",
    "B3b": "Not assessed",
    "B4": "Not assessed",
    "B5": "Not assessed",
    "B6": "Not assessed",
    "B7a": "Not assessed",
    "B7b": "Not assessed",
    "B8a": "Not assessed",
    "B8b": "Not assessed",
    "B9": "Not assessed",
    "B10": "Not assessed",
    "B11": "Not assessed"
  },
  "evidence": {
    "A1": {
      "score": "Not assess

Processing PDFs:  33%|███▎      | 1/3 [02:17<04:35, 137.72s/it]


🔍 Processing [2] #3 - Aoki 2022.pdf
{
  "study_id": "Aoki2022",
  "scoring": {
    "A1": "Not assessed",
    "A2a": "Not assessed",
    "A2b": "Not assessed",
    "A2c": "Not assessed",
    "A3a": "Not assessed",
    "A3b": "Not assessed",
    "A4": "Not assessed",
    "A5": "Not assessed",
    "A6": "Not assessed",
    "A7": "Not assessed",
    "A8": "Not assessed",
    "A9": "Not assessed",
    "A10a": "Not assessed",
    "A10b": "Not assessed",
    "A11a": "Not assessed",
    "A11b": "Not assessed",
    "A12": "Not assessed",
    "B1a": "Not assessed",
    "B1b": "Not assessed",
    "B2": "Not assessed",
    "B3a": "Not assessed",
    "B3b": "Not assessed",
    "B4": "Not assessed",
    "B5": "Not assessed",
    "B6": "Not assessed",
    "B7a": "Not assessed",
    "B7b": "Not assessed",
    "B8a": "Not assessed",
    "B8b": "Not assessed",
    "B9": "Not assessed",
    "B10": "Not assessed",
    "B11": "Not assessed"
  },
  "evidence": {
    "A1": {
      "score": "Not assessed",
 

Processing PDFs:  67%|██████▋   | 2/3 [04:34<02:16, 136.99s/it]


🔍 Processing [3] #4 - AuYeung 2023.pdf
```json
{
  "study_id": "AuYeung2023",
  "scoring": {
    "A1": "1",
    "A2a": "1",
    "A2b": "1",
    "A2c": "1",
    "A3a": "1",
    "A3b": "0",
    "A4": "1",
    "A5": "1",
    "A6": "1",
    "A7": "1",
    "A8": "1",
    "A9": "1",
    "A10a": "1",
    "A10b": "1",
    "A11a": "1",
    "A11b": "0",
    "A12": "Not assessed",
    "B1a": "1",
    "B1b": "1",
    "B2": "1",
    "B3a": "1",
    "B3b": "1",
    "B4": "1",
    "B5": "1",
    "B6": "1",
    "B7a": "1",
    "B7b": "1",
    "B8a": "1",
    "B8b": "1",
    "B9": "1",
    "B10": "1",
    "B11": "1"
  },
  "evidence": {
    "A1": {
      "score": "1",
      "quote": "We conducted a MR study using genetic predictors of alanine aminotransferase (ALT), liability to NAFLD, aspartate transaminase (AST), liver MRI corrected T1 and proton density fat fraction and combined them with genome-wide association studies (GWAS) summary statistics of CVD, T2D and glycaemic traits.",
      "section": 

Processing PDFs: 100%|██████████| 3/3 [07:14<00:00, 144.73s/it]


✅ Saved extracted data to: chatgpt_new_extracted_mr_data.csv





In [2]:
df

Unnamed: 0,study_id,scoring,evidence
0,Ahn2020,"{'A1': 'Not assessed', 'A2a': 'Not assessed', ...","{'A1': {'score': 'Not assessed', 'quote': 'Not..."
1,Aoki2022,"{'A1': 'Not assessed', 'A2a': 'Not assessed', ...","{'A1': {'score': 'Not assessed', 'quote': 'Not..."
2,AuYeung2023,"{'A1': '1', 'A2a': '1', 'A2b': '1', 'A2c': '1'...","{'A1': {'score': '1', 'quote': 'We conducted a..."
