In [1]:
import pandas as pd
from sklearn.metrics import mean_squared_error
from collections import OrderedDict
from aug.gpt import *

# Root directory of the MIMIC-Eye dataset; get_prompt resolves report files below it.
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
MIMIC_EYE_PATH = r"F:\mimic-eye"
# Show floating point values with two decimal places in pandas output.
pd.options.display.precision = 2

In [2]:
# Names of the clinical features handled by the augmentation
# (presumably vital signs; not referenced again in the cells shown here).
aug_features = ["temperature", "heartrate", "resprate", "o2sat", "sbp", "dbp"]

# Lesion labels passed to get_prompt when building the prompts.
# Labels that are currently disabled:
#   "Fibrosis", "Quality issue", "Wide mediastinum", "Fracture",
#   "Airway wall thickening", "Hiatal hernia", "Acute fracture",
#   "Interstitial lung disease", "Enlarged hilum",
#   "Abnormal mediastinal contour", "High lung volume / emphysema",
#   "Pneumothorax", "Lung nodule or mass", "Groundglass opacity",
#   "Support devices"
REFLACX_LESION_LABEL_COLS = [
    "Pulmonary edema",
    "Enlarged cardiac silhouette",
    "Consolidation",
    "Atelectasis",
    "Pleural abnormality",
]

# Human-readable description for each clinical feature key.
feature_to_name_map = dict(
    temperature_c="body temperature in degrees Celsius",
    heartrate="heart rate in beats per minute",
    resprate="respiratory rate in breaths per minute",
    o2sat="peripheral oxygen saturation (%)",
    sbp="systolic blood pressure (mmHg)",
    dbp="diastolic blood pressure (mmHg)",
)

# Column of `df` whose values the token statistics below are grouped by.
feature = "aug_temperature_c"

In [3]:
# Load the spreadsheet whose rows feed get_prompt and the token statistics below.
df = pd.read_csv("./spreadsheets/gpt4_aug_report.csv")

In [4]:
def get_prompt(
    mimic_eye_path,
    data,
    label_cols,
    report_format=False,
):
    """Build the text prompt for one study from its report file and metadata.

    The report is read from
    ``<mimic_eye_path>/patient_<subject_id>/CXR-DICOM/s<study_id>.txt``.

    Args:
        mimic_eye_path: Root directory that contains the ``patient_*`` folders.
        data: Row-like mapping providing ``subject_id``, ``study_id``, ``age``
            and ``gender``; it is also forwarded to ``get_diagnosis``.
        label_cols: Lesion label names forwarded to ``get_diagnosis``.
        report_format: If True, emit the report followed by ``LESIONS:``,
            ``AGE:`` and ``GENDER:`` fields; otherwise emit a sentence-style
            description followed by the report.

    Returns:
        The prompt string with every character removed that is not an ASCII
        letter, digit, ``.``, ``:``, ``'`` or whitespace.

    Raises:
        OSError: If the report file cannot be opened.
    """
    patient_id = data["subject_id"]
    study_id = data["study_id"]
    report_path = os.path.join(
        mimic_eye_path,
        f"patient_{patient_id}",
        "CXR-DICOM",
        f"s{study_id}.txt",
    )
    with open(report_path) as f:
        report = f.read()

    # Drop the report header and collapse the report onto a single line.
    # NOTE(review): line breaks are removed without inserting a space, so words
    # on adjacent lines fuse unless the lines carry their own surrounding
    # whitespace — confirm against the report files.
    report = report.strip().replace("FINAL REPORT\n", "").replace("\n", "").strip()

    age = data["age"]
    # Every value other than "F" is rendered as "Male".
    gender = "Female" if data["gender"] == "F" else "Male"
    # get_diagnosis is defined outside this cell; its text is inserted verbatim.
    diagnosis = get_diagnosis(data, label_cols)

    if report_format:
        prompt = f"{report} LESIONS:{diagnosis}. AGE: {age}. GENDER: {gender}."
    else:
        prompt = f"A {age} years old {gender} patient diagnosed with{diagnosis}. And, This patient has the radiology report: \n{report}"

    # Raw string: the former plain literal relied on the invalid escape
    # sequences "\s" and "\:" which newer Python versions warn about.
    return re.sub(r"[^0-9A-Za-z.\s\:']", "", prompt)


In [5]:
report_format = True
# Build one prompt per row of df and store the results in the 'prompt' column.
df['prompt'] = [
    get_prompt(
        MIMIC_EYE_PATH,
        row,
        REFLACX_LESION_LABEL_COLS,
        report_format=report_format,
    )
    for _, row in tqdm(df.iterrows(), total=df.shape[0])
]

100%|██████████| 799/799 [00:02<00:00, 366.42it/s] 


In [6]:
# word
# clinical features*1
# features values

df[feature].value_counts()

aug_temperature_c
37.5    627
36.5    109
38.5     38
37.2     14
37.0      8
38.9      1
38.4      1
38.3      1
Name: count, dtype: int64

In [7]:
# Keep the three most frequent values of the analysed column;
# the per-value word counts below only track these values.
unique_values = df[feature].value_counts().index[:3].tolist()

In [8]:
import nltk
from nltk.corpus import stopwords

# nltk.download('stopwords')
# English stop words from NLTK, de-duplicated (kept as a list under its original name).
sws = list(set(stopwords.words("english")))
# Frozen copy for constant-time membership tests inside to_remove.
_STOP_WORDS = frozenset(sws)

# Section headers and field labels that must not be counted as vocabulary.
# Tokens are compared after ":" has been stripped, so the labels carry no colon.
titles = [
    "",
    "GENDER",
    "AGE",
    "LESIONS",
    "IMPRESSION",
    "FINDINGS",
    "COMPARISON",
    ".",
    "TECHNIQUE",
    "INDICATION",
]


def to_remove(k):
    """Return True if token ``k`` should be excluded from the vocabulary.

    A token is excluded when its lower-cased form is an English stop word,
    when it matches one of ``titles`` exactly (case-sensitive), or when it is
    at most three characters long.
    """
    return (k.lower() in _STOP_WORDS) or (k in titles) or (len(k) <= 3)


def get_word_list(df):
    """Collect the lower-cased vocabulary of all prompts in ``df``.

    Each prompt is tokenized on its own: "." and ":" are replaced by spaces
    and the text is split on single spaces. Tokens rejected by ``to_remove``
    (checked on the original casing) are dropped; the rest are lower-cased.

    Tokenizing per prompt avoids the fused tokens that plain concatenation of
    consecutive prompts produced at prompt boundaries.

    Args:
        df: DataFrame with a string column ``"prompt"``.

    Returns:
        set[str]: The distinct lower-cased tokens that were kept.
    """
    vocabulary = set()
    for prompt in tqdm(
        df["prompt"],
        total=df.shape[0],
        desc="observing tokens\t",
    ):
        tokens = prompt.replace(".", " ").replace(":", " ").split(" ")
        # Filter before lower-casing so the case-sensitive title check applies.
        vocabulary.update(t.lower() for t in tokens if not to_remove(t))
    return vocabulary


def get_percentage_dict(df):
    """Compute, for every vocabulary word, the fraction of prompts containing it.

    Each prompt is tokenized on its own by deleting "." and ":" and splitting
    on single spaces. The vocabulary consists of the tokens not rejected by
    ``to_remove`` (checked on the original casing), lower-cased. A word counts
    once per prompt if it appears among that prompt's lower-cased tokens.

    The same tokenization is used for building the vocabulary and for
    counting, so words followed by punctuation are counted, and tokens of
    neighbouring prompts are never fused.

    Args:
        df: DataFrame with a string column ``"prompt"``.

    Returns:
        OrderedDict[str, float]: word -> share of prompts containing it,
        ordered by descending share; ties are ordered alphabetically.
    """
    vocabulary = set()
    doc_token_sets = []
    for prompt in tqdm(
        df["prompt"],
        total=df.shape[0],
        desc="observing tokens\t",
    ):
        tokens = prompt.replace(".", "").replace(":", "").split(" ")
        # Filter before lower-casing so the case-sensitive title check applies.
        vocabulary.update(t.lower() for t in tokens if not to_remove(t))
        # Tokenize every prompt once instead of once per vocabulary word.
        doc_token_sets.append({t.lower() for t in tokens})

    count_dict = {w: 0 for w in vocabulary}
    for doc_tokens in tqdm(doc_token_sets, desc="counting tokens\t\t"):
        for w in doc_tokens:
            if w in count_dict:
                count_dict[w] += 1

    df_len = len(df)
    # Secondary sort key makes the order independent of set iteration order.
    ranked = sorted(count_dict.items(), key=lambda item: (-item[1], item[0]))
    return OrderedDict((w, c / df_len) for w, c in ranked)


def get_top_n(sorted_dict, n):
    """Return the first ``n`` entries of ``sorted_dict`` as a new OrderedDict."""
    leading_items = list(sorted_dict.items())[:n]
    return OrderedDict(leading_items)

# For each word (token) collected here, we then count the documents in which it occurs.

In [9]:
# Vocabulary of filtered, lower-cased tokens gathered from the 'prompt' column of df.
words = get_word_list(df)

observing tokens	: 100%|██████████| 799/799 [00:00<00:00, 59298.40it/s]


In [10]:
len(words)

1874

In [11]:
# Per-word document counts, split by the tracked values of `feature`:
# word_uv_count[word][value] = number of prompts that contain `word`
# and whose `feature` equals `value`.
word_uv_count = { w: { u_v: 0 for u_v in unique_values} for w in words}
for i, d in tqdm(df.iterrows(), total=df.shape[0], desc="counting words for unique values..."):
    u_v = d[feature]
    # Tokenize each prompt once (not once per vocabulary word) and use a set
    # so that membership tests are constant time.
    prompt_tokens = set(d['prompt'].lower().replace(".", "").replace(":", "").split(" "))
    for w in prompt_tokens:
        counts = word_uv_count.get(w)
        # Skip tokens outside the vocabulary and rows whose value is not tracked.
        if counts is not None and u_v in counts:
            counts[u_v] += 1


counting words for unique values...: 100%|██████████| 799/799 [00:06<00:00, 117.35it/s]


In [12]:
# count_thres = 10

In [13]:
# all_totals = {}
# percentage_map = {}
# for w, u_v_c in word_uv_count.items():
#     total = sum(list(u_v_c.values()))
#     all_totals[w] = total
#     if total> count_thres:
#         if total == 0:
#             percentage_map[w] = { u_v: 0  for u_v, c in u_v_c.items()}
#         else:
#             percentage_map[w] = { u_v: c/total  for u_v, c in u_v_c.items()}

In [14]:
# import numpy as np
# np.array(list(all_totals.values())).min(), np.array(list(all_totals.values())).max(), np.array(list(all_totals.values())).mean()

In [15]:
# uv_word_percentage = {}

# for w, u_v_p in tqdm(percentage_map.items()):
#     for u_v, p in u_v_p.items():
#         if not u_v in uv_word_percentage:
#             uv_word_percentage[u_v] = {}
#         uv_word_percentage[u_v][w] = p

In [16]:
from collections import OrderedDict

In [17]:
def sort_dict(d):
    """Return the items of ``d`` as an OrderedDict sorted by value, largest first.

    Items with equal values keep their original relative order.
    """
    ranked = list(d.items())
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return OrderedDict(ranked)

In [18]:
# sorted_uv = sort_dict(uv_word_percentage[36.5])
# uv_word_percentage[36.5]

In [19]:
word_uv_count['fever']

{37.5: 38, 36.5: 0, 38.5: 35}

In [20]:
# Keep only words whose summed count over the tracked values exceeds the threshold.
count_thres = 20
word_uv_count = {
    word: counts_by_value
    for word, counts_by_value in word_uv_count.items()
    if sum(counts_by_value.values()) > count_thres
}

In [21]:
# Number of rows of df for each distinct value of `feature` (not limited to the tracked values).
value_counts_dict = dict(df[feature].value_counts())

In [22]:
# Turn raw counts into fractions: for each word and tracked value, the share of
# rows with that value whose prompt contains the word (a 0..1 ratio, not a percent).
word_uv_percentage = {}
for word, counts_by_value in word_uv_count.items():
    shares = {}
    for value, count in counts_by_value.items():
        shares[value] = count / value_counts_dict[value]
    word_uv_percentage[word] = shares

In [23]:
word_uv_percentage['fever']

{37.5: 0.06060606060606061, 36.5: 0.0, 38.5: 0.9210526315789473}

In [24]:
value_counts_dict

{37.5: 627, 36.5: 109, 38.5: 38, 37.2: 14, 37.0: 8, 38.9: 1, 38.4: 1, 38.3: 1}

In [25]:
# Normalise each word's per-value shares so that they sum to 1 across the
# tracked values; a word whose shares sum to 0 gets 0 for every value.
percentage_map = {}
for word, shares in word_uv_percentage.items():
    share_sum = sum(shares.values())
    if share_sum != 0:
        normalised = {value: share / share_sum for value, share in shares.items()}
    else:
        normalised = dict.fromkeys(shares, 0)
    percentage_map[word] = normalised

In [26]:
# Invert the nesting of percentage_map: value -> {word -> normalised share}.
uv_word_percentage = {}
for word, shares_by_value in tqdm(percentage_map.items()):
    for value, share in shares_by_value.items():
        uv_word_percentage.setdefault(value, {})[word] = share

100%|██████████| 269/269 [00:00<?, ?it/s]


In [27]:
word_uv_percentage['clear']

{37.5: 0.4082934609250399, 36.5: 0.7155963302752294, 38.5: 0.2894736842105263}

In [28]:
percentage_map['clear']

{37.5: 0.2888807217876979, 36.5: 0.5063073602261022, 38.5: 0.2048119179861999}

In [29]:
uv_word_percentage[38.5]['none']

0.16631987793036482

In [30]:
uv_word_percentage[36.5]['none']

0.4783603828547649

In [31]:
sort_dict(uv_word_percentage[36.5])

OrderedDict([('degenerative', 0.6874420173737034),
             ('displaced', 0.6425497028079524),
             ('along', 0.6331201615617638),
             ('question', 0.6247848537005164),
             ('fractures', 0.6150676868746322),
             ('bases', 0.6134279075455545),
             ('thoracic', 0.6126854401705606),
             ('available', 0.6107142857142858),
             ('fall', 0.6000957090445047),
             ('spine', 0.596637766969761),
             ('terminating', 0.5898400752587017),
             ('disease', 0.5760573301074939),
             ('please', 0.5703456640388114),
             ('pain', 0.5671454100205175),
             ('ventricle', 0.5610738255033557),
             ('anterior', 0.545861575772651),
             ('cardiopulmonary', 0.5408513880702158),
             ('changes', 0.5320720810662407),
             ('vascularity', 0.5228267667292058),
             ('visualized', 0.5198735554749444),
             ('found', 0.5177844629269156),
             ('l

In [37]:
# Words ranked by their normalised share for the feature value 38.5, highest first.
sd = sort_dict(uv_word_percentage[38.5])

In [46]:
# Write the 40 top-ranked tokens of `sd` together with their scores to a CSV file.
top_entries = get_top_n(sd, 40)
pd.DataFrame(
    [{"token": token, "formula-here": score} for token, score in top_entries.items()]
).to_csv("study-percentage.csv")

In [33]:
# uv_word_count = { w: { u_v: 0 for u_v in unique_values} for w in words}

In [34]:
# # clinical value
# # words
# uv_word_count = { u_v: { w: 0 for w in words} for u_v in unique_values}
# # word_value_count = { w: { u_v: 0 for u_v in unique_values} for w in words}
# for i, d in tqdm(df.iterrows(), total=df.shape[0], desc="counting words for unique values..."):
#     for w in words:
#         if w in d['prompt'].lower().split(" "):
#             uv_word_count[d[feature]][w] +=1

# uv_word_count