In [1]:
import json
from tqdm import tqdm
from dotenv import load_dotenv
from openai import OpenAI
from pydantic import BaseModel

# Load variables from a local .env file (if present) before building the client.
load_dotenv()
# Client built with default arguments; presumably it picks its credentials up
# from the environment populated above -- confirm against the deployment setup.
client = OpenAI()
# Read the annotation file inside a context manager so the handle is closed
# deterministically, and decode as UTF-8 instead of the platform default.
with open("data/annotation_quiz_all.json", encoding="utf-8") as annotations_file:
    annotations = json.load(annotations_file)

In [2]:
# Structured-output schema for one radiology report. The class is handed to the
# chat completion call as `response_format`, and the parsed instance is stored
# on each validation example under the 'report' key.
# All fields are plain strings holding the findings text for one region.
class RadiologyReport(BaseModel):
    # Findings assigned to the lung region.
    lung: str
    # Findings assigned to the heart region.
    heart: str
    # Findings assigned to the mediastinal region.
    mediastinal: str
    # Findings assigned to the bone region.
    bone: str
    # Sentences the model could not assign to any of the four regions above.
    others: str

In [3]:
# System message sent with every request: one instruction per line, joined
# with newlines and carrying no leading or trailing whitespace.
system_prompt = "\n".join([
    "You are an expert at structured data extraction and medicine.",
    "You will be given unstructured findings of a x-ray radiology report and should separate the findings into four predefined anatomical regions: lung, heart, mediastinal, and bone.",
    "If you cannot assign the sentence to any anatomical region, put it in others.",
])

In [4]:
# Inference loop over all examples in the validation set.
# Every original report is sent to the model together with the system prompt;
# the structured answer is written back onto the same example under 'report'.
for i, data in tqdm(
    enumerate(annotations['val']),
    total=len(annotations['val']),
    desc='Processing',
    unit='examples'
    ):

    # Parse the radiology report
    completion = client.beta.chat.completions.parse(
        model="gpt-4o-2024-08-06",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": data['original_report']},
        ],
        response_format=RadiologyReport,
    )

    message = completion.choices[0].message
    output = message.parsed
    if output is None:
        # No structured object came back (e.g. the model refused). Fail with a
        # message that names the example instead of an AttributeError on None.
        reason = getattr(message, 'refusal', None) or 'no parsed content returned'
        raise RuntimeError(
            f"Could not parse validation example {i}: {reason}"
        )

    # Save the parsed report as a plain dict so it can be JSON-serialised.
    # dict(model) uses the model's public field iteration rather than __dict__.
    annotations['val'][i]['report'] = dict(output)

Processing: 100%|██████████| 296/296 [06:08<00:00,  1.24s/examples]


In [5]:
# Save annotations to disk.
# The context manager guarantees the file is flushed and closed once the dump
# finishes, rather than relying on garbage collection of an anonymous handle.
with open("data/annotation_quiz_all_with_val.json", "w", encoding="utf-8") as output_file:
    json.dump(annotations, output_file, indent=2)

In [6]:
# Spot-check one validation example: print the raw report text, a blank gap,
# then the structured report that was stored for it.
index = 23
sample = annotations['val'][index]
print(sample['original_report'], '\n\n')
print(sample['report'])

The XXXX examination consists of frontal and lateral radiographs of the chest. There has been interval CABG. Surgical clips are again seen in the epigastric region. The cardiomediastinal contours are within normal limits. Pulmonary vascularity is within normal limits. No focal consolidation, pleural effusion, or pneumothorax identified. The visualized osseous structures are unremarkable. 


{'lung': 'Pulmonary vascularity is within normal limits. No focal consolidation, pleural effusion, or pneumothorax identified.', 'heart': 'There has been interval CABG. The cardiomediastinal contours are within normal limits.', 'mediastinal': 'The cardiomediastinal contours are within normal limits.', 'bone': 'The visualized osseous structures are unremarkable.', 'others': 'Surgical clips are again seen in the epigastric region.'}
