In [1]:
import json

# Load the VQA-RAD question/answer records (explicit encoding so the read does
# not depend on the platform's default locale).
with open('/content/VQA_RAD Dataset Public.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

# Build (image name, case URL) pairs from the *same* records so the two lists
# stay index-aligned. Filtering each list on its own key would shift one list
# against the other as soon as a record lacks one field, and the later
# zip(image_names, urls) would then pair images with the wrong case page.
# Records with a missing or empty value for either field are skipped.
name_url_pairs = [
    (item['image_name'].split(".jpg")[0], item['image_case_url'])
    for item in data
    if item.get('image_name') and item.get('image_case_url')
]
image_names = [name for name, _url in name_url_pairs]
urls = [url for _name, url in name_url_pairs]

# Print the extracted URLs
print(urls)
print(len(urls))

print(image_names)
print(len(image_names))

['https://medpix.nlm.nih.gov/case?id=48e1dd0e-8552-46ad-a354-5eb55be86de6', 'https://medpix.nlm.nih.gov/case?id=b197277b-6960-4175-86ee-d2cb23e381b3', 'https://medpix.nlm.nih.gov/case?id=b197277b-6960-4175-86ee-d2cb23e381b3', 'https://medpix.nlm.nih.gov/case?id=19aa8a2b-35fb-4d90-973d-ccc3859df66e', 'https://medpix.nlm.nih.gov/case?id=b197277b-6960-4175-86ee-d2cb23e381b3', 'https://medpix.nlm.nih.gov/case?id=19aa8a2b-35fb-4d90-973d-ccc3859df66e', 'https://medpix.nlm.nih.gov/case?id=19aa8a2b-35fb-4d90-973d-ccc3859df66e', 'https://medpix.nlm.nih.gov/case?id=19aa8a2b-35fb-4d90-973d-ccc3859df66e', 'https://medpix.nlm.nih.gov/case?id=27003b0c-e836-4860-ad66-e29c9bba19e6', 'https://medpix.nlm.nih.gov/case?id=19aa8a2b-35fb-4d90-973d-ccc3859df66e', 'https://medpix.nlm.nih.gov/case?id=31ac9c0f-732a-4c51-9f33-1cdfae1b8728', 'https://medpix.nlm.nih.gov/case?id=31ac9c0f-732a-4c51-9f33-1cdfae1b8728', 'https://medpix.nlm.nih.gov/case?id=b197277b-6960-4175-86ee-d2cb23e381b3', 'https://medpix.nlm.nih.

# Fetch Radiological Reports
This section scrapes the report text from the case pages collected above. Because these pages rely on JavaScript to display their content, the script renders each page and waits for it to finish loading before extracting the required information.

In [2]:
!pip install requests-html




In [3]:
import os

from requests_html import AsyncHTMLSession

# Create the single asynchronous HTML session shared by the scraper:
# get_findings() issues every page request through this object.
asession = AsyncHTMLSession()

# Section headings looked up on each case page, in the order in which they are
# written to the report file.
REPORT_SECTIONS = (
    "History",
    "Exam",
    "Findings",
    "Differential Diagnosis",
    "Case Diagnosis",
    "Diagnosis by",
    "Treatment & Follow Up",
)


def _extract_report_text(html):
    """Return the report text built from the known sections found in ``html``.

    Each section that is found contributes ``"\\n<heading>:\\n<text>"`` and is
    also echoed to stdout; sections that are not found are skipped.
    """
    parts = []
    for label in REPORT_SECTIONS:
        # The section body is taken from the first <div> that follows an
        # element whose own text contains the heading.
        section = html.xpath(
            f"//*[contains(text(), '{label}')]/following-sibling::div", first=True
        )
        if section:
            section_text = section.text
            parts.append(f"\n{label}:\n{section_text}")
            print(f"{label}:")
            print(section_text)
    return "".join(parts)


async def get_findings(output_dir="/content/reports", render_timeout=20):
    """Fetch every case page, render its JavaScript, and save the report text.

    Iterates over the module-level ``image_names`` / ``urls`` lists in parallel
    and uses the module-level ``asession`` for the requests. For each page the
    sections listed in ``REPORT_SECTIONS`` are extracted and written to
    ``<output_dir>/<image_name>.txt``.

    Args:
        output_dir: Directory that receives the ``.txt`` reports. It is created
            if it does not exist yet, so the function works on a fresh kernel.
        render_timeout: Timeout, in seconds, passed to ``arender`` for each page.

    Raises:
        Any exception raised by the request, the rendering step, or the file
        write propagates to the caller and stops the loop.
    """
    os.makedirs(output_dir, exist_ok=True)

    # The dataset lists the same case URL many times; an identical
    # (image name, URL) pair would only re-render the same page and rewrite
    # the same file, so it is processed once.
    processed = set()

    for image_name, url in zip(image_names, urls):
        pair = (image_name, url)
        if pair in processed:
            continue
        processed.add(pair)

        # Send a GET request to the webpage
        response = await asession.get(url)

        # Render JavaScript (this step is crucial for pages that load content dynamically)
        await response.html.arender(timeout=render_timeout)

        text_to_save = _extract_report_text(response.html)

        file_name = os.path.join(output_dir, image_name + ".txt")

        # Explicit UTF-8: the reports contain non-ASCII characters such as
        # bullet points, which a non-UTF-8 default encoding could not write.
        with open(file_name, 'w', encoding='utf-8') as file:
            file.write(text_to_save)

        print(f"String saved to {file_name}.")

# Run the scraper. A bare top-level `await` relies on the notebook kernel
# (IPython/Jupyter) supporting it; in a plain Python script this line is a
# syntax error and the coroutine would have to be driven by an event loop.
await get_findings()


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
History:
27 year old man in car crash sustained right femur and pelvic fractures. Patient was awake and alert at scene of accident and complained of leg and hip pain. CT scan of brain and spine done in the ER was normal. 36 hours after surgical repair patient was noted to have altered mental status.
Exam:
36 hours after surgical repair of femur and pelvic fractures, the patient was noted to have altered mental status.
Findings:
Multiple punctate signal lesions (infarcts) in all three cerebral vascular territories
Differential Diagnosis:
• Cerebral Fat Embolism
• Hypotensive border-zone infarcts
• HT white matter changes (too young, not hypertensive)
• Demyelinating disease (MS, ADEM)
Case Diagnosis:
Fat Embolism with cerebral infarction
Treatment & Follow Up:
Supportive
String saved to /content/reports/synpic53574.txt.
History:
88 year old male presented to the ER with worsening SOB
Exam:
Decreased breast sounds in the ri

In [8]:
# Recursively archive the reports directory into reports.zip. Both paths are
# relative, so they resolve against the notebook's current working directory.
!zip -r reports.zip reports/

  adding: reports/ (stored 0%)
  adding: reports/synpic46539.txt (deflated 40%)
  adding: reports/synpic28210.txt (deflated 35%)
  adding: reports/synpic60423.txt (deflated 42%)
  adding: reports/synpic27985.txt (deflated 48%)
  adding: reports/synpic40314.txt (deflated 37%)
  adding: reports/synpic40464.txt (deflated 40%)
  adding: reports/synpic20260.txt (deflated 55%)
  adding: reports/synpic37605.txt (deflated 37%)
  adding: reports/synpic43433.txt (deflated 44%)
  adding: reports/synpic23989.txt (deflated 26%)
  adding: reports/synpic676.txt (deflated 41%)
  adding: reports/synpic23648.txt (deflated 37%)
  adding: reports/synpic27047.txt (deflated 40%)
  adding: reports/synpic32012.txt (deflated 42%)
  adding: reports/synpic23803.txt (deflated 38%)
  adding: reports/synpic22020.txt (deflated 48%)
  adding: reports/synpic24878.txt (deflated 40%)
  adding: reports/synpic27013.txt (deflated 40%)
  adding: reports/synpic51709.txt (deflated 39%)
  adding: reports/synpic26413.txt (defla