In [None]:
import os
import time
import json
import openai
import pandas as pd
from tqdm import tqdm
from secret import OPENAI_API_KEY

import re
def extract_json_objects(text):
    """Return the first JSON value that can be recovered from *text*.

    The text is first parsed as a whole. If that succeeds, a non-empty list
    yields its first element and any other value is returned unchanged.
    Otherwise the text is scanned for embedded JSON objects (for example a
    reply wrapped in prose or a Markdown code fence) and the first one that
    decodes cleanly is returned.

    Args:
        text: The raw string to search.

    Returns:
        The first decoded JSON value (a dict when recovered by the scan).

    Raises:
        ValueError: If no JSON value can be recovered from *text*.
    """
    try:
        data = json.loads(text)
    except json.JSONDecodeError:
        pass  # Not clean JSON; fall through to the embedded-object scan.
    else:
        if not isinstance(data, list):
            return data
        if data:
            return data[0]
        # An empty list carries no object; handled by the final raise below.

    # Python's ``re`` has no recursive patterns such as ``(?R)``, so nested
    # braces cannot be matched with a regex. Instead, let the JSON decoder
    # try each "{" in turn; it consumes one complete (possibly nested)
    # object and ignores whatever text follows it.
    decoder = json.JSONDecoder()
    start = text.find("{")
    while start != -1:
        try:
            obj, _end = decoder.raw_decode(text, start)
        except json.JSONDecodeError:
            start = text.find("{", start + 1)
        else:
            return obj

    raise ValueError("No JSON object found in text")

# Set your OpenAI API key
openai.api_key = OPENAI_API_KEY  # imported from the local `secret` module, not read from the environment

# Paths
# pdf_dir: directory scanned for input PDFs
# prompt_path: text file whose contents become the assistant's instructions
# output_csv: destination file for the extracted rows
pdf_dir = "./documents"
prompt_path = "./templates/example.txt"
output_csv = "chatgpt_extracted_mr_data.csv"

# Load base prompt
with open(prompt_path, "r", encoding="utf-8") as f:
    base_prompt = f.read()

# Initialize OpenAI client
client = openai.OpenAI(
    api_key=openai.api_key,
)

# Create an assistant with file search capability
# NOTE: this runs at module level, so a new assistant is created on every
# execution of this code; the base prompt is passed as its instructions.
assistant = client.beta.assistants.create(
    name="MR Study Extractor",
    instructions=base_prompt,
    model="gpt-4-turbo",
    tools=[{"type": "file_search"}],
)


# Function to process a single PDF file
def process_pdf(file_path):
    """Upload one PDF, run the assistant on it, and return the parsed answers.

    The file is uploaded, attached to a fresh thread together with the
    extraction request, and the module-level assistant is run on that thread.
    The newest message in the thread is then parsed as JSON.

    Args:
        file_path: Path to the PDF file to process.

    Returns:
        The parsed answers as a dict with an added ``"source_file"`` key
        holding the file's base name, or ``None`` if the run did not
        complete or the reply could not be parsed into a JSON object.
    """
    file_name = os.path.basename(file_path)

    # Upload the PDF file
    with open(file_path, "rb") as f:
        uploaded_file = client.files.create(file=f, purpose="assistants")

    # Create a new thread
    thread = client.beta.threads.create()

    # Add a message to the thread with the uploaded file
    client.beta.threads.messages.create(
        thread_id=thread.id,
        role="user",
        content="Please extract the answers to Q1–Q20 from the attached MR study.",
        attachments=[
            {
                "file_id": uploaded_file.id,
                "tools": [{"type": "file_search"}]
            }
        ]
    )

    # Run the assistant
    run = client.beta.threads.runs.create(
        thread_id=thread.id,
        assistant_id=assistant.id,
    )

    # Statuses from which the run will never reach "completed" on its own.
    # "requires_action" is included because nothing here submits tool
    # outputs, so waiting on it would only end when the run expires.
    unrecoverable = {
        "failed",
        "cancelled",
        "expired",
        "incomplete",
        "requires_action",
    }

    # Wait for the run to complete
    while True:
        run_status = client.beta.threads.runs.retrieve(
            thread_id=thread.id, run_id=run.id
        )
        if run_status.status == "completed":
            break
        if run_status.status in unrecoverable:
            # For "failed" this prints the same message as before.
            print(f"[ERROR] Run {run_status.status} for {file_name}")
            return None
        time.sleep(1)

    # Retrieve the messages; the first entry is taken as the assistant reply
    messages = client.beta.threads.messages.list(thread_id=thread.id)
    response = messages.data[0].content[0].text.value

    # Parse the JSON response. The extractor signals unparseable text by
    # raising; report it and skip this file rather than aborting the batch.
    try:
        data = extract_json_objects(response)
    except (ValueError, IndexError, re.error) as exc:
        print(f"[ERROR] Could not parse response for {file_name}: {exc}")
        return None

    # Only a JSON object can take the extra key and become a CSV row.
    if not isinstance(data, dict):
        print(f"[ERROR] Response for {file_name} is not a JSON object")
        return None

    data["source_file"] = file_name
    return data


# Run the extractor over every ".pdf" in pdf_dir, in sorted name order,
# keeping only the files that produced a result.
results = []
pdf_files = sorted(name for name in os.listdir(pdf_dir) if name.endswith(".pdf"))

for pdf_file in tqdm(pdf_files, desc="Processing PDFs"):
    pdf_path = os.path.join(pdf_dir, pdf_file)
    result = process_pdf(pdf_path)
    if not result:
        continue
    results.append(result)

# Write one CSV row per successfully processed file, or report that
# nothing was extracted.
if not results:
    print("\n⚠️ No data extracted.")
else:
    df = pd.DataFrame(results)
    df.to_csv(output_csv, index=False)
    print(f"\n✅ Saved extracted data to: {output_csv}")

Processing PDFs:  33%|███▎      | 1/3 [00:28<00:57, 28.89s/it]

{'Q1': 'Ahn 2020', 'Q2': 'Causal Inference between Rheumatoid Arthritis and Breast Cancer in East Asian and European Population: A Two-Sample Mendelian Randomization', 'Q3': 'Cancers', 'Q4': '2020', 'Q5': 'Two-sample MR', 'Q6': 'SNPs from six genome-wide association studies (GWAS) targeting RA in an East Asian population, and GWAS-summary results from the BBJ, BCAC, and CIMBA for breast cancer', 'Q7': '25', 'Q8': 'Rheumatoid Arthritis', 'Q9': 'Breast Cancer', 'Q10': '1', 'Q11': '1', 'Q12': 'Six different GWAS targeting RA in an East Asian population', 'Q13': 'BBJ (BioBank Japan), BCAC (Breast Cancer Association Consortium), and CIMBA (Consortium of Investigators of Modifiers of BRCA1/2)', 'Q14': 'The East Asian RA GWAS included six different studies with sample sizes ranging from 100 to 7907 RA cases and controls varying proportionally .', 'Q15': 'BCAC: 122,977 cases and 105,974 controls; CIMBA: 15,252 BRCA1 and 8211 BRCA2 carriers', 'Q16': 'East Asian (Korean and Japanese)【4:6†source】

Processing PDFs:  67%|██████▋   | 2/3 [01:03<00:32, 32.01s/it]

{'Q1': 'Aoki 2022', 'Q2': 'Shared genetic components between metabolic syndrome and schizophrenia: Genetic correlation using multipopulation data sets', 'Q3': 'Psychiatry and Clinical Neurosciences', 'Q4': '2022', 'Q5': 'Two-sample MR', 'Q6': 'SNPs involved in the 1000 Genomes Project and HapMap Projects, with a minor allele frequency ≥ 1%, and INFO ≥0.9', 'Q7': 'NA', 'Q8': 'Metabolic syndrome traits (BMI, HDL-C, blood sugar, triglycerides, systolic blood pressure, CRP)', 'Q9': 'Schizophrenia (SCZ)', 'Q10': '1', 'Q11': '1', 'Q12': 'JENGER database provided by Biobank Japan', 'Q13': 'PGC for EUR samples, PGC Asia for EAS samples', 'Q14': 'EAS: 24,504 (SCZ patients) and 42,770 (controls), EUR: Varies with trait, up to 361,194 samples', 'Q15': 'EAS: 24,504 (SCZ patients) and 42,770 (controls), EUR: Varies with trait, up to 361,194 samples', 'Q16': 'East Asian', 'Q17': 'European', 'Q18': 'Yes', 'Q19': 'East Asia, Europe', 'Q20': 'TwoSample MR, LDSC software'}


Processing PDFs: 100%|██████████| 3/3 [01:45<00:00, 35.14s/it]

{'Q1': 'AuYeung 2023', 'Q2': 'Evaluating the role of non-alcoholic fatty liver disease in cardiovascular diseases and type 2 diabetes: a Mendelian randomization study in Europeans and East Asians', 'Q3': 'International Journal of Epidemiology', 'Q4': '2023', 'Q5': 'Two-sample MR', 'Q6': 'Genetic predictors of ALT, liability to NAFLD, AST, liver MRI cT1, proton density fat fraction, 195 ALT instruments, 228 AST instruments【4:9†source】', 'Q7': 'ALT: 195, AST: 228, cT1: 5, PDFF: 4【4:9†source】', 'Q8': 'ALT (alanine aminotransferase), liability to NAFLD (non-alcoholic fatty liver disease), AST (aspartate transaminase), liver MRI cT1, proton density fat fraction【4:0†source】', 'Q9': 'Cardiovascular diseases (CVD), type 2 diabetes (T2D), coronary artery disease (CAD) stroke, atrial fibrillation, heart failure【4:6†source】', 'Q10': '5', 'Q11': '6', 'Q12': 'Neale Lab in the UK Biobank, Million Veteran Program, UK Biobank for cT1 and PDFF【4:4†source】', 'Q13': 'DIAMANTE consortium, MAGIC, CARDIoGRA


