In [1]:
import pandas as pd
from sklearn.metrics import mean_squared_error
from collections import OrderedDict
from aug.gpt import *

MIMIC_EYE_PATH = "F:\\mimic-eye"
pd.set_option("display.precision", 2)

In [2]:
aug_features =  [
            "temperature",
            "heartrate",
            "resprate",
            "o2sat",
            "sbp",
            "dbp",
        ]

REFLACX_LESION_LABEL_COLS = [
    # "Fibrosis",
    # "Quality issue",
    # "Wide mediastinum",
    # "Fracture",
    # "Airway wall thickening",

    ######################
    # "Hiatal hernia",
    # "Acute fracture",
    # "Interstitial lung disease",
    # "Enlarged hilum",
    # "Abnormal mediastinal contour",
    # "High lung volume / emphysema",
    # "Pneumothorax",
    # "Lung nodule or mass",
    # "Groundglass opacity",
    ######################
    "Pulmonary edema",
    "Enlarged cardiac silhouette",
    "Consolidation",
    "Atelectasis",
    "Pleural abnormality",
    # "Support devices",
]

feature_to_name_map = {
    "temperature_c": "body temperature in degrees Celsius",
    "heartrate": "heart rate in beats per minute",
    "resprate": "respiratory rate in breaths per minute",
    "o2sat": "peripheral oxygen saturation (%)",
    "sbp": "systolic blood pressure (mmHg)",
    "dbp":"diastolic blood pressure (mmHg)",
}

In [3]:
df = pd.read_csv("./spreadsheets/gpt4_aug_report.csv")

In [4]:
def get_prompt(
    mimic_eye_path,
    data,
    label_cols,
    report_format=False,
):
    # reflacx_id = data['id']
    patient_id = data["subject_id"]
    study_id = data["study_id"]
    # dicom_id = data['dicom_id']
    report_path = os.path.join(
        mimic_eye_path,
        f"patient_{patient_id}",
        "CXR-DICOM",
        f"s{study_id}.txt",
    )
    with open(report_path) as f:
        report = f.read()

    report = report.strip().replace("FINAL REPORT\n", "").replace("\n", "").strip()

    age = data["age"]
    gender = "Female" if data["gender"] == "F" else "Male"
    if report_format:
        return re.sub(
            "[^0-9A-Za-z.\s\:']",
            "",
            f"{report} LESIONS:{get_diagnosis(data, label_cols)}. AGE: {age}. GENDER: {gender}.",
        )
    else:
        return re.sub(
            "[^0-9A-Za-z.\s\:']",
            "",
            f"A {age} years old {gender} patient diagnosed with{get_diagnosis(data, label_cols)}. And, This patient has the radiology report: \n{report}",
        )


In [5]:
report_format = True
df['prompt'] =None
for idx, data in tqdm(df.iterrows(), total=df.shape[0]):
    prompt = get_prompt(
                MIMIC_EYE_PATH,
                data,
                REFLACX_LESION_LABEL_COLS,
                report_format=report_format,
            )

    df.at[idx, 'prompt'] = prompt

100%|██████████| 799/799 [00:00<00:00, 2827.36it/s]


In [6]:
df[f'aug_temperature_c'].value_counts()

aug_temperature_c
37.5    627
36.5    109
38.5     38
37.2     14
37.0      8
38.9      1
38.4      1
38.3      1
Name: count, dtype: int64

In [47]:
import nltk
from nltk.corpus import stopwords

# nltk.download('stopwords')
sws = list(set(stopwords.words("english")))
titles = [
    "",
    "GENDER:",
    "AGE:",
    "LESIONS:",
    "IMPRESSION:",
    "FINDINGS:",
    "COMPARISON:",
    ".",
    "TECHNIQUE:",
    "INDICATION:",
]

titles = [
    "",
    "GENDER",
    "AGE",
    "LESIONS",
    "IMPRESSION",
    "FINDINGS",
    "COMPARISON",
    ".",
    "TECHNIQUE",
    "INDICATION",
]

to_remove = lambda k: (k.lower() in sws) or (k in titles) or (len(k) <= 3)


def get_percentage_dict(df):
    s = ""

    for i, d in tqdm(
        df.iterrows(),
        total=df.shape[0],
        desc="observing tokens\t",
    ):
        s += d["prompt"]

    words = s.replace(".", "").replace(":", "").split(" ")
    words = set(words)

    # remove stop words
    words = [w for w in words if not to_remove(w)]
    words = set([w.lower() for w in words])

    count_dict = {w: 0 for w in words}

    for w in tqdm(words, desc="counting tokens\t\t"):
        for i, d in df.iterrows():
            if w in d["prompt"].lower().split(" "):
                count_dict[w] += 1

    df_len = len(df)
    sorted_dict = OrderedDict(
        {
            k: v / df_len
            for k, v in reversed(sorted(count_dict.items(), key=lambda item: item[1]))
        }
    )
    return sorted_dict


def get_top_n(sorted_dict, n):
    return OrderedDict({k: sorted_dict[k] for k in list(sorted_dict.keys())[:n]})
    # return [(k, v) for k, v in sorted_dict.items()][:n]

In [48]:
sorted_dict_36_5 = get_percentage_dict(df[df[f"aug_temperature_c"] == 36.5])
# sorted_dict_36_5['fever']

sorted_dict_37_5 = get_percentage_dict(df[df[f"aug_temperature_c"] == 37.5])
print(sorted_dict_37_5['fever'])

sorted_dict_38_5 = get_percentage_dict(df[df[f"aug_temperature_c"] == 38.5])
print(sorted_dict_38_5['fever'])

observing tokens	: 100%|██████████| 109/109 [00:00<00:00, 54497.45it/s]
counting tokens		: 100%|██████████| 805/805 [00:01<00:00, 483.82it/s]
observing tokens	: 100%|██████████| 627/627 [00:00<00:00, 62908.54it/s]
counting tokens		: 100%|██████████| 1767/1767 [00:22<00:00, 79.94it/s]


0.03987240829346093


observing tokens	: 100%|██████████| 38/38 [00:00<00:00, 38148.29it/s]
counting tokens		: 100%|██████████| 378/378 [00:00<00:00, 1314.43it/s]

0.5





In [31]:
# sort_36_5_100 = get_top_n(sorted_dict_36_5, 100)
# sort_37_5_100 = get_top_n(sorted_dict_37_5, 100)
# sort_38_5_100 = get_top_n(sorted_dict_38_5, 100)

In [49]:
all_k = list(get_percentage_dict(df).keys())

observing tokens	: 100%|██████████| 799/799 [00:00<00:00, 69552.52it/s]
counting tokens		: 100%|██████████| 1941/1941 [00:30<00:00, 62.73it/s]


In [119]:
sorted_dict_38_5['fever']

0.5

In [120]:
sorted_dict_37_5['fever']

0.03987240829346093

In [121]:
sorted_dict_36_5['fever']

KeyError: 'fever'

In [33]:
# all_k = list(sorted_dict_38_5.keys()) + list(sorted_dict_37_5.keys()) + list(sorted_dict_36_5.keys())
# all_k = set(all_k)

In [52]:
# sorted_dict_36_5['female']

In [72]:
import numpy as np

In [74]:
np.min([abs(v_38_5 - v_37_5),abs(v_37_5 - v_36_5)])

0.0

In [135]:
follow_trend = []
inverse_trend = []

# for k in sorted_dict_38_5.keys():
for k in all_k:

    v_38_5 = sorted_dict_38_5[k] if k in sorted_dict_38_5 else 0
    v_37_5 = sorted_dict_37_5[k] if k in sorted_dict_37_5 else 0
    v_36_5 = sorted_dict_36_5[k] if k in sorted_dict_36_5 else 0

    if v_38_5 > v_37_5 and v_37_5 > v_36_5:
        follow_trend.append(
            {
                "token": k,
                38.5: v_38_5*100,
                37.5: v_37_5*100,
                36.5: v_36_5*100,
                "gap": np.min([abs(v_38_5 - v_37_5) ,abs(v_37_5 - v_36_5)]),
                "mean_gap": np.mean([abs(v_38_5 - v_37_5) ,abs(v_37_5 - v_36_5)]),

            }
        )
    elif v_38_5 < v_37_5 and v_37_5 < v_36_5:
        inverse_trend.append(
            {
                "token": k,
                38.5: v_38_5*100,
                37.5: v_37_5*100,
                36.5: v_36_5*100,
                "gap": np.min([abs(v_38_5 - v_37_5) ,abs(v_37_5 - v_36_5)]),
                "mean_gap": np.mean([abs(v_38_5 - v_37_5) ,abs(v_37_5 - v_36_5)]),
            }
        )

In [141]:
pd.options.display.float_format = '{:.2f}'.format

In [147]:
inverse_trend_df = pd.DataFrame(inverse_trend).sort_values('gap', ascending=False)
inverse_trend_df.head(20).to_csv("inverse_trend.csv")
#pneumothorax #pulmonary #clear #normal #unremarkable # cardiopulmonary

In [143]:
inverse_trend_df.head(30)

Unnamed: 0,token,38.5,37.5,36.5,gap,mean_gap
3,lesion,31.58,52.15,89.91,0.21,0.29
7,cardiopulmonary,15.79,30.94,55.05,0.15,0.2
5,views,28.95,44.5,55.96,0.11,0.14
0,lateral,63.16,73.52,83.49,0.1,0.1
12,thoracic,0.0,12.76,20.18,0.07,0.1
15,changes,2.63,8.13,15.6,0.06,0.06
20,degenerative,0.0,5.42,11.93,0.05,0.06
9,clear,13.16,17.86,33.03,0.05,0.1
21,question,0.0,6.06,10.09,0.04,0.05
24,along,0.0,4.78,8.26,0.03,0.04


In [148]:
follow_trend_df = pd.DataFrame(follow_trend).sort_values('gap', ascending=False)
follow_trend_df.head(20).to_csv("./follow_trend.csv")

In [149]:
follow_trend_df.head(20)

Unnamed: 0,token,38.5,37.5,36.5,gap,mean_gap
15,opacities,26.32,14.19,3.67,0.11,0.11
12,pneumonia,31.58,14.67,4.59,0.1,0.13
7,atelectasis,47.37,22.97,13.76,0.09,0.17
2,left,44.74,37.0,29.36,0.08,0.08
26,cardiomegaly,18.42,10.37,2.75,0.08,0.08
11,opacity,47.37,17.22,10.09,0.07,0.19
32,increased,15.79,8.13,1.83,0.06,0.07
22,mildly,18.42,10.53,4.59,0.06,0.07
20,cough,28.95,10.53,4.59,0.06,0.12
43,concerning,15.79,5.58,0.0,0.06,0.08


In [146]:
## Parr 1: count in every prompt.
# occuring_v = 38.5
# s = ""
# for i, d in tqdm(
#     df[df[f"aug_{f}"] == occuring_v].iterrows(),
#     total=df[df[f"aug_{f}"] == occuring_v].shape[0],
# ):
#     s += d["prompt"]
# from collections import Counter
# words = s.split(' ')
# result = dict(Counter(words))
# result = OrderedDict({k: v for k, v in reversed(sorted(result.items(), key=lambda item: item[1]))})
# import nltk
# from nltk.corpus import stopwords

# # nltk.download('stopwords')
# sws = list(set(stopwords.words("english")))
# titles = [
#     "",
#     "GENDER:",
#     "AGE:",
#     "LESIONS:",
#     "IMPRESSION:",
#     "FINDINGS:",
#     "COMPARISON:",
#     ".",
#     "TECHNIQUE:",
#     "INDICATION:",
# ]
# to_remove = lambda k: (k.lower() in sws) or (k in titles) or (len(k) <= 3)
# f_w = [(k, v) for k, v in result.items() if not to_remove(k)][:10]
# f_w

In [15]:
### 39.5
#  [('opacity', 31),
#  ('pleural', 30),
#  ('chest', 29),
#  ('pneumonia.', 29),
#  ('lateral', 28),
#  ('effusion', 28),
#  ('right', 28),
#  ('atelectasis', 25),
#  ('acute', 23),
#  ('pulmonary', 22)]

### 37.5
# [('pleural', 618),
#  ('chest', 573),
#  ('right', 556),
#  ('effusion', 516),
#  ('lateral', 513),
#  ('acute', 448),
#  ('PA', 392),
#  ('left', 386),
#  ('Chest', 385),
#  ('lung', 364)]

### 37.2
# [('right', 16),
#  ('pleural', 14),
#  ('lateral', 12),
#  ('atelectasis', 12),
#  ('lung', 11),
#  ('mild', 10),
#  ('PA', 10),
#  ('chest', 10),
#  ('thoracic', 9),
#  ('effusion', 9)]

### 36.5
# [('acute', 115),
#  ('lateral', 102),
#  ('lesion', 99),
#  ('found.', 98),
#  ('chest', 97),
#  ('effusion', 88),
#  ('pleural', 84),
#  ('PA', 78),
#  ('right', 71),
#  ('Chest', 67)]

###

In [16]:
# part 2: percentage.

In [17]:
# sorted_dict_37_5 = get_percentage_dict(df, "temperature_c", 37.5)
# sorted_dict_37_5['fever']

# sorted_dict_36_5 = get_percentage_dict(df, "temperature_c", 36.5)
# sorted_dict_36_5['fever']

# sorted_dict_38_5 = get_percentage_dict(df, "temperature_c", 38.5)
# sorted_dict_38_5['fever']

In [18]:
# words = s.split(" ")
# words = set(words)
# # remove stop words

# words = [w for w in words if not to_remove(w)]

# count_dict = {w: 0 for w in words}

# for w in tqdm(words):
#     for i, d in df[df[f"aug_{f}"] == occuring_v].iterrows():
#         if w in d["prompt"]:
#             count_dict[w] += 1
        
# sorted_dict = OrderedDict({k: v for k, v in reversed(sorted(count_dict.items(), key=lambda item: item[1]))})

# df_len = len(df[df[f"aug_{f}"] == occuring_v])
# p_w = [(k, v/ df_len) for k, v in sorted_dict.items()]
# p_w[:20]

# m_df = df[df['prompt'].apply(lambda x: "mediastinal" in x)]
# m_df.iloc[2]['prompt']

In [19]:
# 38.5

# [('male', 0.8947368421052632),
#  ('mediastinal', 0.868421052631579),
#  ('effusion', 0.8157894736842105),
#  ('normal', 0.7368421052631579),
#  ('fever', 0.6578947368421053),
#  ('lateral', 0.6578947368421053),
#  ('chest', 0.6578947368421053),
#  ('pneumonia', 0.631578947368421),
#  ('pleural', 0.631578947368421),
#  ('pneumothorax', 0.6052631578947368),
#  ('right', 0.5526315789473685),
#  ('lung', 0.5526315789473685),
#  ('acute', 0.5263157894736842),
#  ('silhouette', 0.5263157894736842),
#  ('seen', 0.5263157894736842),
#  ('pulmonary', 0.5),
#  ('atelectasis', 0.5),
#  ('opacity', 0.47368421052631576),
#  ('pneumonia.', 0.47368421052631576),
#  ('edema', 0.4473684210526316)]
# Q: Does Cardiac mediastinal often come with fever? or just because fever.

# 37.5 

# [('effusion', 0.9154704944178629),
#  ('fusion', 0.9154704944178629),
#  ('pneumothorax', 0.8038277511961722),
#  ('lateral', 0.7814992025518341),
#  ('chest', 0.7719298245614035),
#  ('pleura', 0.7496012759170654),
#  ('pleural', 0.7416267942583732),
#  ('male', 0.6539074960127592),
#  ('normal', 0.6491228070175439),
#  ('mediastinal', 0.645933014354067),
#  ('lung', 0.6220095693779905),
#  ('INDICATION:', 0.6108452950558214),
#  ('Female.', 0.5980861244019139),
#  ('pulm', 0.583732057416268),
#  ('pulmonary', 0.580542264752791),
#  ('view', 0.5789473684210527),
#  ('right', 0.5677830940988836),
#  ('TECHNIQUE', 0.5661881977671451),
#  ('able', 0.5566188197767146),
#  ('silhouette', 0.543859649122807)]

# 36.5

# [('found.', 0.8990825688073395),
#  ('found', 0.8990825688073395),
#  ('effusion', 0.8990825688073395),
#  ('lesion', 0.8990825688073395),
#  ('pneumo', 0.8440366972477065),
#  ('ateral', 0.8440366972477065),
#  ('lateral', 0.8348623853211009),
#  ('thorax', 0.8165137614678899),
#  ('pneumothorax', 0.8073394495412844),
#  ('mediastinal', 0.7981651376146789),
#  ('acute', 0.7889908256880734),
#  ('chest', 0.7706422018348624),
#  ('normal', 0.7339449541284404),
#  ('clear', 0.7155963302752294),
#  ('pleura', 0.6972477064220184),
#  ('pleural', 0.6880733944954128),
#  ('lung', 0.6788990825688074),
#  ('pulmonary', 0.6788990825688074),
#  ('view', 0.6055045871559633),
#  ('lungs', 0.5779816513761468)]
