In [1]:
import pandas as pd
import secret
import json
import os
import pickle
import re

from graphrag_for_all.llm.openai import set_openai_api_key
from graphrag_for_all.llm.huggingface import set_hugging_face_token
from graphrag_for_all.llm.create import get_send_fn
from query import get_questions_by_lesion

# Register the credentials kept in the local `secret` module with the
# graphrag_for_all OpenAI and Hugging Face helpers.
set_openai_api_key(secret.OPENAI_API_KEY)
set_hugging_face_token(secret.HUGGINGFACE_TOKEN)
# Callable used for every LLM request in this notebook; it is later invoked as
# `send_fn(messages, {})` and the reply is read from its `.output` attribute.
send_fn = get_send_fn(source="huggingface", model_name="meta-llama/Meta-Llama-3.1-8B-Instruct")

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [2]:
# Root directory of the local MIMIC-Eye dataset. Set the MIMIC_EYE_PATH
# environment variable to point somewhere else without editing the notebook;
# the fallback is the location that used to be hard-coded here.
MIMIC_EYE_PATH = os.environ.get("MIMIC_EYE_PATH", "F:\\mimic-eye")

# Label columns that count as lesions when a case is summarised (this list is
# the default `label_cols` of `get_report`). Commented-out entries are
# currently disabled.
REFLACX_LESION_LABEL_COLS = [
    # "Fibrosis",
    # "Quality issue",
    # "Wide mediastinum",
    # "Fracture",
    # "Airway wall thickening",
    ######################
    # "Hiatal hernia",
    # "Acute fracture",
    # "Interstitial lung disease",
    # "Enlarged hilum",
    # "Abnormal mediastinal contour",
    # "High lung volume / emphysema",
    # "Pneumothorax",
    # "Lung nodule or mass",
    # "Groundglass opacity",
    ######################
    "Pulmonary edema",
    "Enlarged cardiac silhouette",
    "Consolidation",
    "Atelectasis",
    "Pleural abnormality",
    # "Support devices",
]

In [3]:
def remove_data_reports(text):
    """Return ``text`` with every ``[Data: Reports (...)]`` citation marker removed.

    Only the markers themselves are deleted; any whitespace that surrounded a
    marker is left exactly as it was.
    """
    citation_marker = r"\[Data: Reports \(.*?\)\]"
    return re.sub(citation_marker, "", text)


def get_diagnosis(data, label_cols):
    """Describe which of ``label_cols`` are positive (> 0) in ``data``.

    Returns a string with a leading space: the positive labels joined as
    `` A, B``, or `` No lesion found`` when none of the labels is positive.
    """
    positive_flags = dict(data[label_cols] > 0)
    found = [name for name, flag in positive_flags.items() if flag > 0]
    if not found:
        return " No lesion found"

    # Each label is rendered as " <label>,"; the final comma is then dropped.
    joined = "".join(f" {name}," for name in found)
    return joined[:-1]


def get_report(
    data,
    mimic_eye_path: str = MIMIC_EYE_PATH,
    label_cols: list = REFLACX_LESION_LABEL_COLS,
    report_format=True,
):
    """Load one radiology report and turn it into prompt text.

    The report is read from
    ``<mimic_eye_path>/patient_<subject_id>/CXR-DICOM/s<study_id>.txt``, its
    layout is flattened, and the diagnosed lesions, age and gender of the case
    are added.

    Args:
        data: A row-like object providing ``subject_id``, ``study_id``, ``age``,
            ``gender`` and the columns named in ``label_cols``.
        mimic_eye_path: Root directory that contains the per-patient folders.
        label_cols: Lesion label columns forwarded to ``get_diagnosis``.
        report_format: When true, return the report followed by
            ``DIAGNOSED LESIONS`` / ``AGE`` / ``GENDER`` lines; otherwise return
            a sentence-style description followed by the report.

    Returns:
        The assembled text with every character removed that is not a letter,
        digit, whitespace, ``.``, ``:`` or ``'`` (so commas and other
        punctuation are dropped as well).

    Raises:
        OSError: If the report file cannot be opened.
    """
    patient_id = data["subject_id"]
    study_id = data["study_id"]
    report_path = os.path.join(
        mimic_eye_path,
        f"patient_{patient_id}",
        "CXR-DICOM",
        f"s{study_id}.txt",
    )
    with open(report_path) as f:
        report = f.read()

    report = (
        report.strip()
        .replace("FINAL REPORT\n", "")
        # A blank (single-space) line separates sections: keep one newline.
        .replace("\n \n ", "\n")
        # Remaining "\n " sequences are wrapped lines inside a section. Join
        # them with a space; joining with "" fused the words on either side of
        # the wrap (e.g. "again\n noted" became "againnoted").
        .replace("\n ", " ")
        .strip()
    )

    age = data["age"]
    gender = "Female" if data["gender"] == "F" else "Male"
    diagnosis = get_diagnosis(data, label_cols)

    if report_format:
        text = f"{report}\nDIAGNOSED LESIONS:{diagnosis}.\nAGE: {age}.\nGENDER: {gender}."
    else:
        text = f"A {age} years old {gender} patient diagnosed with{diagnosis}. And, This patient has the radiology report: \n{report}"

    # Raw string: the pattern contains `\s` and `\:`, which are invalid escape
    # sequences in a normal string literal and trigger a warning on newer
    # Python versions. The compiled pattern is unchanged.
    return re.sub(r"[^0-9A-Za-z.\s\:']", "", text)

In [4]:
def build_prior_knowledge(keyword_extraction_output):
    """Format the stored answers as one prior-knowledge text block per lesion.

    For every lesion under ``keyword_extraction_output["responses"]`` the
    answers are paired, in order, with the questions returned by
    ``get_questions_by_lesion(lesion)``.

    Returns:
        dict mapping each lesion to a string that starts with
        ``## Lesion: <lesion>`` followed by one Question/Answer section per pair.
    """
    prior_knowledge = {}
    for lesion, answers in keyword_extraction_output["responses"].items():
        # A dict keeps one answer per distinct question, in first-seen order.
        answers_by_question = dict(zip(get_questions_by_lesion(lesion), answers))
        sections = "".join(
            f"\n#############################################\n**Question**: {question}\n**Answer**:\n{answer}\n"
            for question, answer in answers_by_question.items()
        )
        prior_knowledge[lesion] = f"## Lesion: {lesion}\n" + sections
    return prior_knowledge

In [5]:
# Worked example embedded in the system prompt: the schema the model receives
# (attribute name -> expected data type) ...
_EXAMPLE_SCHEMA = {
    "Chest pain": "boolean",
    "Weight loss": "boolean",
    "History of COPD": "boolean",
    "Heartrate": "numerical",
    "Age": "numerical",
    "Oxygen levels (%)": "numerical",
}

# ... and the filled-in answer it is expected to produce for that schema.
_EXAMPLE_VALUES = {
    "Chest pain": True,
    "Weight loss": False,
    "History of COPD": True,
    "Age": 69,
    "Heartrate": 90,
    "Oxygen levels (%)": 99.0,
}

EXAMPLE_JSON_STR = json.dumps(_EXAMPLE_SCHEMA)
EXAMPLE_OUTPUT_STR = json.dumps(_EXAMPLE_VALUES)


def get_system_message(prior_knowledge):
    """Build the system prompt: prior knowledge, task description, one worked example.

    ``prior_knowledge`` is interpolated verbatim near the top of the prompt;
    the worked example uses ``EXAMPLE_JSON_STR`` and ``EXAMPLE_OUTPUT_STR``.

    NOTE(review): the prompt wording contains typos ("sepculate", "give JSON
    Object"). They are kept as-is so generated prompts stay identical to
    earlier runs.
    """
    return f"""You are a clinical expert. With following extra knowledge in mind:\n
{prior_knowledge}

# Task you're going to perform

You will be given a report regarding a patient, and a json object with for you to sepculate the values of each attribute. 
You will return the speculated values according to the data type specified in the give JSON Object.
Please, only return the json object without additional text.
You need to speculate the value, and null value is not acceptable.


Following is an example for you:

## Report
INDICATION:  Central venous line placement.
TECHNIQUE:  Frontal chest radiograph.
COMPARISON:  Chest radiograph 12:42 today.
FINDINGS: 
A right subclavian catheter has been placed in the interim. The catheterterminates at the confluence of the brachiocephalic vein and superior venacava and if indicated could be advanced 3.7 cm for termination within thelow SVC.
There is no pleural effusion or pneumothorax. The cardiac silhouette remainsmildly enlarged. There is no focal airspace consolidation worrisome forpneumonia.
High density material is again seen in the paritally imaged colon in the leftabdomen. Cholecystectomy clips are noted. There are carotid calcificationsleft greater than right.
DIAGNOSED LESIONS: Enlarged cardiac silhouette.
AGE: 69.
GENDER: Female.

## Json Object 
{EXAMPLE_JSON_STR}

## Expected Output
{EXAMPLE_OUTPUT_STR}
"""

def get_prompt(report, json_object):
    """Build the user prompt for one case.

    The prompt holds a reminder of the output rules, the report text, the
    attribute schema serialised with ``json.dumps``, and a trailing
    ``## Expected Output`` header for the model to complete.
    """
    prompt_lines = [
        "",
        "",
        "IMPORTANT: You don't need to return Report and Json Object section again. You only need to return the speculated value in the required json format without additional text. You need to speculate the value, and null value is not acceptable.",
        "",
        "## Report",
        f"{report}",
        "",
        # The trailing space after "Object" is part of the original prompt text.
        "## Json Object ",
        json.dumps(json_object),
        "",
        "## Expected Output",
    ]
    return "\n".join(prompt_lines)

In [6]:
# Load the pickled keyword-extraction output; its "responses" and "keywords"
# entries are read by the cells below.
# NOTE(review): pickle.load can execute arbitrary code, so only load a file
# that was produced by a trusted run of this project.
with open("./llama3_index_results/graphrag/extracted_keywords.pkl", "rb") as f:
    keyword_extraction_output = pickle.load(f)

In [7]:
# Per-lesion prior-knowledge text (lesion -> formatted Q&A block); one entry is
# inserted into the system prompt for each lesion processed below.
prior_knowledge = build_prior_knowledge(keyword_extraction_output)

In [8]:
# lesion_qa_pairs = {
#     k: {q: a for q, a in zip(get_questions_by_lesion(k), v)}
#     for k, v in keyword_extraction_output["responses"].items()
# }

# prior_knowledge = {}
# for lesion, q_a in lesion_qa_pairs.items():
#     q_a_section = ""
#     for q, a in q_a.items():
#         q_a_section += (
#             f"\n#############################################\n**Question**: {q}\n**Answer**:\n{a}\n"
#         )
#     lesion_content = f"## Lesion: {lesion}\n" + q_a_section
#     prior_knowledge[lesion] = lesion_content

In [9]:
# Lesions to generate augmented values for. Each name is used as a key into
# `prior_knowledge` and `keyword_extraction_output["keywords"]`.
# NOTE(review): despite the name, only four lesions are active because
# "pulmonary edema" is commented out.
top_5_lesions = [
    # "pulmonary edema",
    "enlarged cardiac silhouette",
    "pulmonary consolidation",
    "atelectasis",
    "pleural abnormality",
]
# Case table; each row is passed to `get_report` in the generation loop.
sample_df = pd.read_csv('./spreadsheets/reflacx_clinical.csv') 

In [10]:
# Display the whole lesion -> prior-knowledge mapping for a manual check.
prior_knowledge



In [11]:
from tenacity import (
    retry,
    stop_after_attempt,
    wait_random_exponential,
)


def extract_json_string(text):
    """Return the single ``{...}`` span found in ``text``, braces included.

    ``None`` is returned unless ``text`` contains exactly one ``{`` and exactly
    one ``}`` with the opening brace before the closing one. Nested objects or
    several objects in one reply are therefore rejected.
    """
    if text.count("{") != 1 or text.count("}") != 1:
        return None

    open_at = text.find("{")
    close_at = text.find("}")
    if close_at < open_at:
        # The only closing brace comes before the only opening brace.
        return None
    return text[open_at : close_at + 1]


@retry(
    wait=wait_random_exponential(multiplier=1, max=60),
    stop=stop_after_attempt(10),
)
def get_and_parse_json(send_fn, input_messages, keywords):
    """Query the LLM and return its reply parsed as a JSON object.

    The reply must contain exactly one ``{...}`` span (see
    ``extract_json_string``) that parses as a JSON object whose keys are
    exactly the keys of ``keywords``. Any failure raises, and the ``retry``
    decorator then calls the LLM again: up to 10 attempts with randomised
    exponential back-off capped at 60 seconds.

    Args:
        send_fn: Callable invoked as ``send_fn(input_messages, {})``; the reply
            text is read from the ``.output`` attribute of its result.
        input_messages: Chat messages forwarded unchanged to ``send_fn``.
        keywords: Mapping whose keys are the attributes the reply must contain.

    Returns:
        dict: The parsed JSON object.

    Raises:
        ValueError: Inside a single attempt, when no JSON span can be
            extracted, the span is not valid JSON (``json.JSONDecodeError`` is
            a ``ValueError``), the parsed value is not an object, or its keys
            differ from ``keywords``.

    NOTE(review): the prompts say null values are not acceptable, but values
    are not checked here; only the key set is validated.
    """
    res = send_fn(
        input_messages,
        {},
    )

    cleaned = extract_json_string(res.output)
    if cleaned is None:
        raise ValueError(f"JSON Object can't be extracted from output: {res.output}")

    json_obj = json.loads(cleaned)

    # Defensive: the key comparison below needs a mapping.
    if not isinstance(json_obj, dict):
        raise ValueError("Retrieved JSON value is not an object")

    required_keywords = set(keywords)
    res_keywords = set(json_obj)
    if res_keywords != required_keywords:
        raise ValueError(f"Required keywords are [{required_keywords}], while the retrieved keywords given are [{res_keywords}].")

    return json_obj

In [12]:
# input_messages = [
#             {
#                 "role": "system",
#                 "content": get_system_message(prior_knowledge=prior_knowledge[l]),
#             },
#             {
#                 "role": "user",
#                 "content": get_prompt(report, keyword_extraction_output["keywords"][l]),
#             },
#         ]

# res = send_fn(
#         input_messages,
#         {},
#     )

In [13]:
# from graphrag_for_all.llm.huggingface import pipe
from tqdm import tqdm

In [16]:
# Preview the system prompt for one lesion.
# NOTE(review): in the visible notebook `l` is only assigned by the generation
# loop cell further down, so this cell fails when the notebook is run top to
# bottom on a fresh kernel.
print(get_system_message(prior_knowledge=prior_knowledge[l]))

You are a clinical expert. With following extra knowledge in mind:

## Lesion: pulmonary consolidation

#############################################
**Question**: What are the symptoms associated with pulmonary consolidation?
**Answer**:
**Symptoms Associated with Pulmonary Consolidation**

Pulmonary consolidation is a condition characterized by the accumulation of fluid, pus, or other substances in the lungs, leading to impaired gas exchange and potentially life-threatening complications. Based on the analysis of the dataset, the following symptoms are associated with pulmonary consolidation:

### Primary Symptoms

* **Breath Sounds**: Abnormal breath sounds, such as crackles or wheezes, are a common sign of pulmonary consolidation [Data: Reports (0, 5, 11, +more)].
* **Percussion Note**: A dull or decreased percussion note is often associated with pulmonary consolidation, indicating the presence of fluid or solid material in the lungs [Data: Reports (0, 10, 14, +more)].
* **Pleural 

In [18]:
# Preview the user prompt for one case.
# NOTE(review): in the visible notebook `report` and `l` are only assigned by
# the generation loop cell further down, so this cell fails when the notebook
# is run top to bottom on a fresh kernel.
print(get_prompt(report, keyword_extraction_output["keywords"][l]))



IMPORTANT: You don't need to return Report and Json Object section again. You only need to return the speculated value in the required json format without additional text. You need to speculate the value, and null value is not acceptable.

## Report
INDICATION:  yearold male with cough and fever.
COMPARISON:  .
TECHNIQUE:  Frontal and lateral chest radiographs were obtained.
FINDINGS:  No focal consolidation pleural effusion or pneumothorax is seen. Heart and mediastinal contours are within normal limits.  Lungs are againnoted to be hyperinflated.
IMPRESSION:  Stable chest radiographs without acute change.
DIAGNOSED LESIONS: No lesion found.
AGE: 58.
GENDER: Male.

## Json Object 
{"Smoking History": "boolean", "Chronic Obstructive Pulmonary Disease (COPD)": "boolean", "Heart Failure": "boolean", "Pneumonia or Respiratory Infections": "boolean", "Blood from Bronchial Tree": "boolean", "Inflammatory Exudate": "boolean", "Pus": "boolean", "Pulmonary Edema": "boolean", "Breath Sounds": 

In [14]:
# Generate LLM-speculated attribute values for every case, one CSV per lesion.
# The loop variables `l` and `report` keep their names because the
# prompt-preview cells read them after this cell has run.
for l in top_5_lesions:
    print(f"Generating values for lesion [{l}]")

    # Both depend only on the lesion, so build them once per lesion rather
    # than once per row.
    lesion_keywords = keyword_extraction_output["keywords"][l]
    system_message = get_system_message(prior_knowledge=prior_knowledge[l])

    augmented = []
    # `total` lets tqdm show a real progress bar; iterrows() has no length.
    for _idx, row in tqdm(sample_df.iterrows(), total=len(sample_df)):
        report = get_report(row)
        input_messages = [
            {
                "role": "system",
                "content": system_message,
            },
            {
                "role": "user",
                "content": get_prompt(report, lesion_keywords),
            },
        ]
        aug_values = get_and_parse_json(send_fn, input_messages, lesion_keywords)

        # Start from the CURRENT row. The previous `sample_df.iloc[0]` copied
        # the first case's columns into every augmented record.
        aug_instance = dict(row)
        aug_instance.update(
            {f"Augmented_{k}": v for k, v in aug_values.items()}
        )
        aug_instance.update(
            {
                "augmenting_prompt": json.dumps(input_messages),
                "augmenting_output": json.dumps(aug_values),
            }
        )
        augmented.append(aug_instance)

    augmented_df = pd.DataFrame(augmented)
    augmented_df.to_csv(f"{l}-augmented.csv")

  attn_output = torch.nn.functional.scaled_dot_product_attention(
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


KeyboardInterrupt: 