In [1]:
import pandas as pd
import secret
import json
import os
import pickle
import re

from graphrag_for_all.llm.openai import set_openai_api_key
from graphrag_for_all.llm.huggingface import set_hugging_face_token
from graphrag_for_all.llm.create import get_send_fn
from utils.query import get_questions_by_lesion
from graphrag_for_all.search.searcher import Searcher

# Register the OpenAI key and the Hugging Face token; both values are read from the local `secret` module.
set_openai_api_key(secret.OPENAI_API_KEY)
set_hugging_face_token(secret.HUGGINGFACE_TOKEN)
# Build the send function (later handed to the Searcher) for Llama 3.1 8B Instruct via the "huggingface" source.
send_fn = get_send_fn(source="huggingface", model_name="meta-llama/Meta-Llama-3.1-8B-Instruct")

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [2]:
# Load the pickled object stored in the file "combined_results" (path relative to the working directory).
# NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
with open("combined_results", "rb") as f:
    combined_results = pickle.load(f)

In [3]:
# Load the pickled keyword-extraction results; their 'responses' entry is what
# build_prior_knowledge consumes further down.
# NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
with open("./combined_index_results/graphrag/index_graphrag_llama3v1_combined_top_1/combined_extracted_keywords.pkl", "rb") as f:
    extracted_keywords_results = pickle.load(f)

In [4]:
# Root folder of the local MIMIC-Eye copy. Set the MIMIC_EYE_PATH environment
# variable to point somewhere else; the previous hard-coded location is kept
# as the fallback so existing setups keep working.
MIMIC_EYE_PATH = os.environ.get("MIMIC_EYE_PATH", "F:\\mimic-eye")

# Lesion label columns used by default in get_report / get_diagnosis.
# Commented-out entries are labels that are currently not used.
REFLACX_LESION_LABEL_COLS = [
    # "Fibrosis",
    # "Quality issue",
    # "Wide mediastinum",
    # "Fracture",
    # "Airway wall thickening",
    ######################
    # "Hiatal hernia",
    # "Acute fracture",
    # "Interstitial lung disease",
    # "Enlarged hilum",
    # "Abnormal mediastinal contour",
    # "High lung volume / emphysema",
    # "Pneumothorax",
    # "Lung nodule or mass",
    # "Groundglass opacity",
    ######################
    "Pulmonary edema",
    "Enlarged cardiac silhouette",
    "Consolidation",
    "Atelectasis",
    "Pleural abnormality",
    # "Support devices",
]

In [6]:
"./combined_index_results/graphrag/index_graphrag_llama3v1_combined_top_1/combined_extracted_keywords.pkl"

# Create the searcher over the index directory of the combined top-1 run,
# using the LLM send function built in the first cell.
searcher = Searcher(
    input_dir="./combined_index_results/graphrag/index_graphrag_llama3v1_combined_top_1/",
    send_to=send_fn,
    community_level=1,
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  entity_df["community"] = entity_df["community"].fillna(-1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  entity_df["community"] = entity_df["community"].astype(int)


In [13]:
def remove_data_reports(text):
    """Drop inline report citations such as "[Data: Reports (0, 5, 12, 18, 26)]".

    Only the bracketed citation itself is deleted; whitespace that surrounded
    it is left exactly as it was.
    """
    citation_pattern = re.compile(r"\[Data: Reports \(.*?\)\]")
    return citation_pattern.sub("", text)


def get_diagnosis(data, label_cols):
    """Render the positive lesion labels of one record as a comma-separated string.

    A label is considered present when its value in ``data`` is greater than
    zero. The returned text starts with a single space so it can be appended
    directly after a prefix; if no label is positive the fixed text
    ``" No lesion found"`` is returned instead.
    """
    is_positive = data[label_cols] > 0
    present = [name for name, flag in is_positive.items() if flag]

    if not present:
        return " No lesion found"

    # Each entry is rendered as " <label>,"; the trailing comma is cut off.
    return "".join(f" {name}," for name in present)[:-1]


def get_report(
    data,
    mimic_eye_path: str = MIMIC_EYE_PATH,
    label_cols: list = REFLACX_LESION_LABEL_COLS,
    report_format=True,
):
    """Assemble the sanitized text description of one study.

    Reads the radiology report stored at
    ``<mimic_eye_path>/patient_<subject_id>/CXR-DICOM/s<study_id>.txt``,
    removes the "FINAL REPORT" header and the newline-plus-space line
    wrapping, and combines the result with the diagnosed lesions, age and
    gender taken from ``data``.

    Args:
        data: Record indexable by "subject_id", "study_id", "age" and
            "gender", and by ``label_cols`` (forwarded to ``get_diagnosis``).
        mimic_eye_path: Root directory that contains the ``patient_*`` folders.
        label_cols: Lesion label columns forwarded to ``get_diagnosis``.
        report_format: If true, the report text comes first and is followed
            by "DIAGNOSED LESIONS", "AGE" and "GENDER" lines. Otherwise a
            narrative sentence about the patient precedes the report.

    Returns:
        The composed text with every character removed that is not an ASCII
        letter, a digit, whitespace, ".", ":" or "'".

    Raises:
        OSError: If the report file cannot be opened.
    """
    # TODO: Add existing clinical features into the report.
    patient_id = data["subject_id"]
    study_id = data["study_id"]
    report_path = os.path.join(
        mimic_eye_path,
        f"patient_{patient_id}",
        "CXR-DICOM",
        f"s{study_id}.txt",
    )
    with open(report_path) as f:
        report = f.read()

    report = (
        report.strip()
        .replace("FINAL REPORT\n", "")
        .replace("\n \n ", "\n")
        .replace("\n ", "")
        .strip()
    )

    age = data["age"]
    # Any gender value other than "F" is reported as "Male".
    gender = "Female" if data["gender"] == "F" else "Male"
    diagnosis = get_diagnosis(data, label_cols)

    if report_format:
        text = f"{report}\nDIAGNOSED LESIONS:{diagnosis}.\nAGE: {age}.\nGENDER: {gender}."
    else:
        text = f"A {age} years old {gender} patient diagnosed with{diagnosis}. And, This patient has the radiology report: \n{report}"

    # Raw string literal: "\s" and "\:" are invalid escape sequences in a
    # normal string and trigger a SyntaxWarning on recent Python versions.
    # Note that this filter also drops commas, including those between lesions.
    return re.sub(r"[^0-9A-Za-z.\s\:']", "", text)


from collections import OrderedDict
def build_prior_knowledge(extracted_keywords_results):
    """Turn per-lesion question/answer pairs into one text section per lesion.

    ``extracted_keywords_results['responses']`` must map each lesion name to a
    mapping of question -> answer. The result is an ``OrderedDict`` keyed by
    lesion (same order as the input) whose values start with a
    "## Lesion: <name>" heading followed by one delimited block per question.
    """
    sections = OrderedDict()
    for lesion_name, answers_by_question in extracted_keywords_results["responses"].items():
        qa_blocks = "".join(
            f"\n#############################################\n**Question**: {question}\n**Answer**:\n{answer}\n"
            for question, answer in answers_by_question.items()
        )
        sections[lesion_name] = f"## Lesion: {lesion_name}\n" + qa_blocks
    return sections

In [14]:
# Build one formatted question/answer text section per lesion, keyed by lesion name.
prior_knowledge = build_prior_knowledge(extracted_keywords_results)

In [19]:
# Concatenate every lesion section into a single string, joined by three newlines.
combined_prior_knowledge = "\n\n\n".join(prior_knowledge.values())

In [20]:
# Lesion names in lower case.
# NOTE(review): the name says five, but only four entries are active while
# "pulmonary edema" is commented out — confirm this is intended.
top_5_lesions = [
    # "pulmonary edema",
    "enlarged cardiac silhouette",
    "pulmonary consolidation",
    "atelectasis",
    "pleural abnormality",
]
# Load the clinical spreadsheet; presumably it holds the rows passed to get_report — TODO confirm.
sample_df = pd.read_csv('./spreadsheets/reflacx_clinical.csv') 

In [22]:
# Inspect the raw lesion -> {question: answer} mapping (this displays the full structure).
extracted_keywords_results['responses']

OrderedDict([('enlarged cardiac silhouette',
              OrderedDict([('What are the symptoms associated with enlarged cardiac silhouette?',
                           ('What can cause enlarged cardiac silhouette?',
                           ('What are the patient’s symptoms that are relevant for enlarged cardiac silhouette?',
                           ('What are the relevant clinical signs for the etiological diagnosis of enlarged cardiac silhouette?',
                           ('What are the relevant laboratory data for the etiological diagnosis of enlarged cardiac silhouette?',
                           ('What are the relevant clinical characteristics for the etiological diagnosis of enlarged cardiac silhouette?',
                           ('What are the patient’s personal relevant history for the etiological diagnosis of enlarged cardiac silhouette?',
             ('pulmonary consolidation',
              OrderedDict([('What are the symptoms associated with pulmonary consoli