# In [None]:
import os
import time
import json
import anthropic
import pandas as pd
from tqdm import tqdm
from secret import ANTHROPIC_API_KEY

# Input directory scanned for PDFs, and the CSV file the results are written to
pdf_dir = "./documents"
output_csv = "claude_extracted_mr_data.csv"

# One entry is appended here for every PDF that yields parseable output
results = []

# Anthropic API client, authenticated with the key from the local `secret` module
client = anthropic.Anthropic(api_key=ANTHROPIC_API_KEY)

# Read the prompt template from disk once, up front
with open("./templates/example.txt", mode="r", encoding="utf-8") as template_file:
    base_prompt = template_file.read()

def extract_json_objects(text):
    """Return the first JSON value that can be recovered from *text*.

    The whole string is tried as JSON first. If that succeeds and the value
    is a list, its first element is returned (``None`` for an empty list);
    any other value is returned as-is.

    If the string as a whole is not valid JSON (for example the JSON is
    wrapped in prose or a Markdown code fence), the text is scanned for the
    first ``{`` from which a complete JSON object can be decoded, and that
    object is returned.

    Args:
        text: Raw text that is expected to contain JSON.

    Returns:
        The decoded JSON value, or ``None`` when nothing decodable is found.
    """
    try:
        data = json.loads(text)
    except json.JSONDecodeError:
        pass
    else:
        if isinstance(data, list):
            # An empty array has no first element to hand back.
            return data[0] if data else None
        return data

    # Python's ``re`` has no recursive-pattern support (``(?R)``), so balanced
    # braces cannot be matched with a regex. Let the JSON decoder itself find
    # where an object that starts at each "{" ends.
    decoder = json.JSONDecoder()
    start = text.find("{")
    while start != -1:
        try:
            obj, _end = decoder.raw_decode(text, start)
        except json.JSONDecodeError:
            # Not a valid object at this brace; try the next one.
            start = text.find("{", start + 1)
        else:
            return obj
    return None

def process_with_claude_file_api(pdf_path):
    """Upload one PDF, ask Claude to extract data from it, and parse the reply.

    The PDF is uploaded through the Anthropic Files API, then referenced by
    its file id in a single user message that also carries the module-level
    ``base_prompt``. The text of the reply is passed to
    ``extract_json_objects``.

    Args:
        pdf_path: Path to the PDF file to process.

    Returns:
        The parsed JSON object (a dict) with an extra ``"source_file"`` key
        holding the PDF's base name, or ``None`` when no non-empty JSON
        object could be parsed from the reply.
    """
    file_name = os.path.basename(pdf_path)

    # Step 1: Upload file.
    # The Anthropic SDK exposes uploads as ``client.beta.files.upload``; the
    # ``files.create(..., purpose=...)`` form belongs to a different SDK.
    with open(pdf_path, "rb") as f:
        uploaded_file = client.beta.files.upload(
            file=(file_name, f, "application/pdf"),
        )

    # Step 2: The prompt template is sent unchanged as the user text.
    user_message = base_prompt

    # Step 3: Send message with the uploaded file attached as a document
    # block. File references require the Files API beta flag.
    # NOTE(review): confirm this model ID is still available and accepts PDF
    # document blocks; switch to a PDF-capable model if it does not.
    message = client.beta.messages.create(
        model="claude-3-sonnet-20240229",
        max_tokens=4096,
        temperature=0,
        system="You are a genetic epidemiologist and expert in Mendelian Randomization (MR).",
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": user_message},
                    {
                        "type": "document",
                        "source": {"type": "file", "file_id": uploaded_file.id},
                    },
                ],
            }
        ],
        betas=["files-api-2025-04-14"],
    )

    # Collect every text block instead of assuming the first block exists and
    # is text; an empty reply then simply fails to parse below.
    content = "".join(
        block.text
        for block in message.content
        if getattr(block, "type", None) == "text"
    ).strip()

    parsed = extract_json_objects(content)
    # Only a non-empty dict can take the "source_file" key and become a row.
    if isinstance(parsed, dict) and parsed:
        parsed["source_file"] = file_name
        return parsed

    print(f"[⚠️] Could not parse JSON from Claude's response for {file_name}")
    return None

# Loop through PDFs
# The extension is matched case-insensitively so files such as "paper.PDF"
# are not silently skipped.
pdf_files = sorted(f for f in os.listdir(pdf_dir) if f.lower().endswith(".pdf"))

for pdf_file in tqdm(pdf_files, desc="Processing PDFs"):
    pdf_path = os.path.join(pdf_dir, pdf_file)
    try:
        result = process_with_claude_file_api(pdf_path)
    except (anthropic.APIError, OSError) as exc:
        # A failed upload/request or an unreadable file should cost only this
        # document, not every result gathered so far. Other exception types
        # (programming errors) still propagate.
        print(f"[⚠️] Skipping {pdf_file}: {exc}")
        continue
    if result:
        results.append(result)

# Save as CSV
if results:
    df = pd.DataFrame(results)
    df.to_csv(output_csv, index=False)
    print(f"\n✅ Saved extracted data to: {output_csv}")
else:
    print("\n⚠️ No results were saved.")
