# Prerequisites

- Datasets and Codebases

    *HAIM*
    https://www.physionet.org/content/haim-multimodal/1.0.1/

    *MIMIC-CXR*
    https://www.physionet.org/content/mimic-cxr-jpg/2.0.0/

    *MIMIC-IV*
    https://www.physionet.org/content/mimiciv/1.0/

- Run *1_filter.ipynb* first to generate *patients_cxr_diagnoses.csv*


# Preprocessing

In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import ast

In [2]:
# Read the CheXpert label CSV of MIMIC-CXR-JPG (one row per study) and preview it.
chexpert_csv_path = "/nvme/datasets/physionet.org/files/mimic-cxr-jpg/2.0.0/mimic-cxr-2.0.0-chexpert.csv"
cxr_data = pd.read_csv(chexpert_csv_path)
cxr_data.head()

Unnamed: 0,subject_id,study_id,Atelectasis,Cardiomegaly,Consolidation,Edema,Enlarged Cardiomediastinum,Fracture,Lung Lesion,Lung Opacity,No Finding,Pleural Effusion,Pleural Other,Pneumonia,Pneumothorax,Support Devices
0,10000032,50414267,,,,,,,,,1.0,,,,,
1,10000032,53189527,,,,,,,,,1.0,,,,,
2,10000032,53911762,,,,,,,,,1.0,,,,,
3,10000032,56699142,,,,,,,,,1.0,,,,,
4,10000764,57375967,,,1.0,,,,,,,,,-1.0,,


Titles of CXR diagnoses

In [3]:
# Every column except the two identifiers, "No Finding" and "Support Devices"
# is treated as a diagnosis.
non_diagnosis_columns = ["subject_id", "study_id", "No Finding", "Support Devices"]
cxr_diagnoses = cxr_data.drop(columns=non_diagnosis_columns).columns
print(f"Number of diagnoses: {len(cxr_diagnoses)}")
print(f"Diagnoses: {cxr_diagnoses}")

Number of diagnoses: 12
Diagnoses: Index(['Atelectasis', 'Cardiomegaly', 'Consolidation', 'Edema',
       'Enlarged Cardiomediastinum', 'Fracture', 'Lung Lesion', 'Lung Opacity',
       'Pleural Effusion', 'Pleural Other', 'Pneumonia', 'Pneumothorax'],
      dtype='object')


Get the pickled files that exist in both haim_mimiciv_key_ids.csv and patients_cxr_diagnoses.csv

In [4]:
# TODO
# Only the first 5000 key rows are used. The original row index is preserved
# on purpose: read_pickle uses it to locate each patient's pickle file.
df = pd.read_csv("haim_mimiciv_key_ids.csv").head(5000)

folder_path = "data_ehr_cxr"

df_patients_cxr_diagnoses = pd.read_csv("patients_cxr_diagnoses.csv")

# A row is kept when each of its three ids occurs in the matching column of
# patients_cxr_diagnoses.csv. Note that every id column is checked on its own;
# the (subject_id, hadm_id, stay_id) triple is not matched as a unit.
id_in_cxr_patients = pd.Series(True, index=df.index)
for id_column in ("subject_id", "hadm_id", "stay_id"):
    id_in_cxr_patients &= df[id_column].isin(df_patients_cxr_diagnoses[id_column])
df = df[id_in_cxr_patients]

print(f"Number of patients: {df['subject_id'].nunique()}")
df.head()

Number of patients: 642


Unnamed: 0,hadm_id,stay_id,subject_id
4,26184834.0,37510196.0,10001884.0
8,28662225.0,33987268.0,10002428.0
9,28662225.0,38875437.0,10002428.0
10,26295318.0,38392119.0,10002430.0
13,22774359.0,30676350.0,10003019.0


In [5]:
def read_pickle(index):
    """Load the HAIM patient pickle that belongs to one row of ``df``.

    Parameters
    ----------
    index : int
        Positional (``iloc``) index into the module-level ``df``.

    Returns
    -------
    tuple
        ``(subject_id, hadm_id, stay_id, patient_data)`` where the three ids
        are read from ``df`` and ``patient_data`` is the unpickled object.

    Raises
    ------
    ValueError
        If the row label of ``df`` at ``index`` is negative.

    Notes
    -----
    The file is located from the row *label* (``df.index``), not from the
    position: files are grouped in folders of 1000 (``folder00``,
    ``folder01``, ...) and named with the label zero-padded to 8 digits.
    For labels below 5000 this yields exactly the paths the previous
    hard-coded ladder produced; larger labels used to fail with an
    UnboundLocalError and now follow the same pattern (assumes the dataset
    keeps that layout beyond folder04 - TODO confirm).
    """
    subject_id = df["subject_id"].iloc[index]
    hadm_id = df["hadm_id"].iloc[index]
    stay_id = df["stay_id"].iloc[index]
    file_index = df.index[index]
    print(f"subject_id: {subject_id}")
    print(f"hadm_id: {hadm_id}")
    print(f"stay_id: {stay_id}")
    print(f"file_name: {file_index}")

    file_number = int(file_index)
    if file_number < 0:
        raise ValueError(f"Cannot build a pickle path for negative index {file_index}")

    file_path = (
        "Sample_Multimodal_Patient_Files/"
        f"folder{file_number // 1000:02d}/{file_number:08d}.pkl"
    )
    # NOTE(review): unpickling executes arbitrary code; only load files that
    # come from a trusted source.
    patient_data = pd.read_pickle(file_path)

    return subject_id, hadm_id, stay_id, patient_data

Merge the CXR data and chartevents of all patients into CSV files

In [6]:
# Export, for every patient in `df`, the labelled CXR studies and the usable
# chart events into two CSV files inside `folder_path`.

# DataFrame.to_csv does not create missing directories, so make sure the
# output folder exists before the first write.
os.makedirs(folder_path, exist_ok=True)

save_cxr_file_name = f"{folder_path}/cxr_data.csv"
save_charts_file_name = f"{folder_path}/charts_data.csv"

cxr_columns = [
    "subject_id",
    "hadm_id",
    "stay_id",
    "study_id",
    "image_path",
    "Note",
    "report_path",
    *cxr_diagnoses,
]
chart_columns = [
    "subject_id",
    "hadm_id",
    "stay_id",
    "itemid",
    "value",
    "valueuom",
    "label",
    "warning",
    "lownormalvalue",
    "highnormalvalue",
]

# NOTE: both files are written in append mode, so running this cell again adds
# the same rows a second time. Delete the two files first for a clean rebuild.
for i in range(len(df)):
    subject_id, hadm_id, stay_id, patient_data = read_pickle(i)

    # CXR: keep only studies that carry a value for at least one diagnosis.
    patient_cxr = patient_data.cxr.dropna(subset=cxr_diagnoses, how="all").copy()
    if patient_cxr.empty:
        # Without a labelled study the patient's chart events are skipped too.
        continue

    # hadm_id / stay_id come from the key table row; subject_id is taken from
    # the CXR frame itself.
    patient_cxr["hadm_id"] = hadm_id
    patient_cxr["stay_id"] = stay_id
    patient_cxr["image_path"] = (
        patient_cxr["Img_Folder"] + "/" + patient_cxr["Img_Filename"]
    )
    patient_cxr["report_path"] = (
        patient_cxr["Note_folder"] + "/" + patient_cxr["Note_file"]
    )
    patient_cxr[cxr_columns].to_csv(
        save_cxr_file_name,
        mode="a",
        # Write the header only when this call creates the file.
        header=(not os.path.exists(save_cxr_file_name)),
        index=False,
    )

    # Chartevents: drop events lacking a value, a unit or a label.
    patient_charts = patient_data.chartevents.dropna(
        subset=["value", "valueuom", "label"]
    )
    if patient_charts.empty:
        continue

    patient_charts[chart_columns].to_csv(
        save_charts_file_name,
        mode="a",
        header=(not os.path.exists(save_charts_file_name)),
        index=False,
    )

subject_id: 10001884.0
hadm_id: 26184834.0
stay_id: 37510196.0
file_name: 4
subject_id: 10002428.0
hadm_id: 28662225.0
stay_id: 33987268.0
file_name: 8
subject_id: 10002428.0
hadm_id: 28662225.0
stay_id: 38875437.0
file_name: 9
subject_id: 10002430.0
hadm_id: 26295318.0
stay_id: 38392119.0
file_name: 10
subject_id: 10003019.0
hadm_id: 22774359.0
stay_id: 30676350.0
file_name: 13
subject_id: 10003400.0
hadm_id: 20214994.0
stay_id: 32128372.0
file_name: 14
subject_id: 10004235.0
hadm_id: 24181354.0
stay_id: 34100191.0
file_name: 18
subject_id: 10007795.0
hadm_id: 28477357.0
stay_id: 31921355.0
file_name: 23
subject_id: 10011365.0
hadm_id: 26712576.0
stay_id: 37153661.0
file_name: 29
subject_id: 10014354.0
hadm_id: 27487226.0
stay_id: 34600477.0
file_name: 45
subject_id: 10014354.0
hadm_id: 27487226.0
stay_id: 38017367.0
file_name: 46
subject_id: 10020944.0
hadm_id: 29974575.0
stay_id: 30757476.0
file_name: 80
subject_id: 10021487.0
hadm_id: 28998349.0
stay_id: 38197705.0
file_name: 82
su

In [7]:
# Reload the exported CXR rows and keep a single row per study.
cxr_csv_path = f"{folder_path}/cxr_data.csv"
cxr_data = pd.read_csv(cxr_csv_path).drop_duplicates(subset=["study_id"])
print(f"Number of unique studies: {len(cxr_data)}")
print(f"Number of unique patients: {cxr_data['subject_id'].nunique()}")
print(f"Number of unique admissions: {cxr_data['hadm_id'].nunique()}")
print(f"Number of unique stays: {cxr_data['stay_id'].nunique()}")
print(cxr_data.head())

Number of unique studies: 6084
Number of unique patients: 614
Number of unique admissions: 614
Number of unique stays: 614
   subject_id     hadm_id     stay_id    study_id  \
0  10001884.0  26184834.0  37510196.0  57156853.0   
1  10001884.0  26184834.0  37510196.0  55893591.0   
2  10001884.0  26184834.0  37510196.0  56349965.0   
4  10001884.0  26184834.0  37510196.0  56308417.0   
6  10001884.0  26184834.0  37510196.0  56722923.0   

                                          image_path  \
0  files/p10/p10001884/s57156853/9fd47edd-0708720...   
1  files/p10/p10001884/s55893591/8d4eb7a8-0d3c4f7...   
2  files/p10/p10001884/s56349965/79863f89-595bf19...   
4  files/p10/p10001884/s56308417/f8d7008c-ed9b419...   
6  files/p10/p10001884/s56722923/c1ad3e27-62d05ef...   

                                                Note  \
0                                   FINAL REPORT\...   
1                                   FINAL REPORT\...   
2                                   FINAL REPORT\... 

In [8]:
# Reload the exported chart events. When a stay has several rows for the same
# chart label, only the first row (in file order) is kept.
charts_csv_path = f"{folder_path}/charts_data.csv"
charts_data = pd.read_csv(charts_csv_path).drop_duplicates(
    subset=["subject_id", "hadm_id", "stay_id", "label"], keep="first"
)

In [9]:
# Inner join on the stay keys: every study row is paired with every chart row
# of the same (subject_id, hadm_id, stay_id).
stay_keys = ["subject_id", "hadm_id", "stay_id"]
merged_data = cxr_data.merge(charts_data, on=stay_keys)
merged_data.to_csv(f"{folder_path}/merged_data.csv", index=False)
merged_data.head()

Unnamed: 0,subject_id,hadm_id,stay_id,study_id,image_path,Note,report_path,Atelectasis,Cardiomegaly,Consolidation,...,Pleural Other,Pneumonia,Pneumothorax,itemid,value,valueuom,label,warning,lownormalvalue,highnormalvalue
0,10001884.0,26184834.0,37510196.0,57156853.0,files/p10/p10001884/s57156853/9fd47edd-0708720...,FINAL REPORT\...,files/p10/p10001884/s57156853.txt,,,,...,,,,220292.0,5.0,L/min,Minute Volume Alarm - Low,0.0,,
1,10001884.0,26184834.0,37510196.0,57156853.0,files/p10/p10001884/s57156853/9fd47edd-0708720...,FINAL REPORT\...,files/p10/p10001884/s57156853.txt,,,,...,,,,220293.0,18.0,L/min,Minute Volume Alarm - High,0.0,,
2,10001884.0,26184834.0,37510196.0,57156853.0,files/p10/p10001884/s57156853/9fd47edd-0708720...,FINAL REPORT\...,files/p10/p10001884/s57156853.txt,,,,...,,,,220339.0,5.0,cmH2O,PEEP set,0.0,,
3,10001884.0,26184834.0,37510196.0,57156853.0,files/p10/p10001884/s57156853/9fd47edd-0708720...,FINAL REPORT\...,files/p10/p10001884/s57156853.txt,,,,...,,,,223873.0,50.0,cmH2O,Paw High,0.0,,
4,10001884.0,26184834.0,37510196.0,57156853.0,files/p10/p10001884/s57156853/9fd47edd-0708720...,FINAL REPORT\...,files/p10/p10001884/s57156853.txt,,,,...,,,,223874.0,900.0,mL,Vti High,0.0,,


Split the data into Atelectasis, Cardiomegaly, Consolidation, Edema, Pleural Effusion, Pneumonia, Pneumothorax

In [10]:
# Read the merged CXR/chart table back from disk and report its size.
merged_csv_path = f"{folder_path}/merged_data.csv"
df_merged_data = pd.read_csv(merged_csv_path)
print(f"Number of records: {len(df_merged_data)}")
df_merged_data.head()

Number of records: 477507


Unnamed: 0,subject_id,hadm_id,stay_id,study_id,image_path,Note,report_path,Atelectasis,Cardiomegaly,Consolidation,...,Pleural Other,Pneumonia,Pneumothorax,itemid,value,valueuom,label,warning,lownormalvalue,highnormalvalue
0,10001884.0,26184834.0,37510196.0,57156853.0,files/p10/p10001884/s57156853/9fd47edd-0708720...,FINAL REPORT\...,files/p10/p10001884/s57156853.txt,,,,...,,,,220292.0,5.0,L/min,Minute Volume Alarm - Low,0.0,,
1,10001884.0,26184834.0,37510196.0,57156853.0,files/p10/p10001884/s57156853/9fd47edd-0708720...,FINAL REPORT\...,files/p10/p10001884/s57156853.txt,,,,...,,,,220293.0,18.0,L/min,Minute Volume Alarm - High,0.0,,
2,10001884.0,26184834.0,37510196.0,57156853.0,files/p10/p10001884/s57156853/9fd47edd-0708720...,FINAL REPORT\...,files/p10/p10001884/s57156853.txt,,,,...,,,,220339.0,5.0,cmH2O,PEEP set,0.0,,
3,10001884.0,26184834.0,37510196.0,57156853.0,files/p10/p10001884/s57156853/9fd47edd-0708720...,FINAL REPORT\...,files/p10/p10001884/s57156853.txt,,,,...,,,,223873.0,50.0,cmH2O,Paw High,0.0,,
4,10001884.0,26184834.0,37510196.0,57156853.0,files/p10/p10001884/s57156853/9fd47edd-0708720...,FINAL REPORT\...,files/p10/p10001884/s57156853.txt,,,,...,,,,223874.0,900.0,mL,Vti High,0.0,,


In [11]:
def _charts_per_stay(label_rows, charts, label):
    """Collapse merged CXR/chart rows into one row per stay with a chart summary.

    Parameters
    ----------
    label_rows : pandas.DataFrame
        Rows of the merged table already restricted to one value of the
        diagnosis column ``label``.
    charts : list
        Chart names to keep; compared against the ``"label"`` column, which
        holds the chart name (not the diagnosis).
    label : str
        Name of the diagnosis column to carry over into the result.

    Returns
    -------
    pandas.DataFrame or None
        One row per (subject_id, hadm_id, stay_id) with a ``"charts"`` text
        column, or ``None`` when no row uses one of ``charts``.
    """
    stay_keys = ["subject_id", "hadm_id", "stay_id"]

    # Keep only the chart events that are in the charts list.
    selected = label_rows[label_rows["label"].isin(charts)]
    if selected.empty:
        return None

    # First value per chart and stay, then at most 5 chart rows per stay.
    # The explicit copy makes the column assignments below safe.
    selected = (
        selected.drop_duplicates(subset=stay_keys + ["label"])
        .groupby(stay_keys)
        .head(5)
        .copy()
    )

    # "<value> <unit> <chart name>" for every chart row ...
    selected["charts"] = selected.apply(
        lambda x: " ".join([str(x["value"]), str(x["valueuom"]), str(x["label"])]),
        axis=1,
    )
    # ... joined with ", " across all chart rows of the same stay.
    selected["charts"] = selected.groupby(stay_keys)["charts"].transform(
        lambda x: ", ".join(x)
    )

    # All rows of a stay now carry the same text; keep the first one.
    selected = selected.drop_duplicates(subset=stay_keys + ["charts"])

    return selected[
        [
            "study_id",
            "subject_id",
            "hadm_id",
            "stay_id",
            "image_path",
            "report_path",
            "Note",
            "charts",
            label,
        ]
    ]


for label in cxr_diagnoses:
    # Charts most correlated with this diagnosis: each non-blank line of the
    # file is a Python literal whose first element is the chart name.
    corr_file_path = f"{folder_path}/corr/corr_{label}.txt"
    with open(corr_file_path, "r") as file:
        charts = [ast.literal_eval(line)[0] for line in file if line.strip()]

    # Rows where the diagnosis is positive (1). Other diagnoses are NOT
    # required to be negative for the same study.
    positive_label_data = _charts_per_stay(
        df_merged_data[df_merged_data[label] == 1], charts, label
    )
    if positive_label_data is None:
        continue
    print(positive_label_data)

    # Rows where the diagnosis is negative (0).
    negative_label_data = _charts_per_stay(
        df_merged_data[df_merged_data[label] == 0], charts, label
    )
    if negative_label_data is None:
        # A diagnosis is only exported when both classes are present.
        continue
    print(negative_label_data)

    # Write positives followed by negatives in a single pass. The file is
    # overwritten so that re-running the cell does not duplicate rows.
    save_file_name = f"{folder_path}/{label}.csv"
    pd.concat([positive_label_data, negative_label_data]).to_csv(
        save_file_name, index=False
    )

          study_id  subject_id     hadm_id     stay_id  \
10391   57544796.0  10021487.0  28998349.0  38197705.0   
12640   59713053.0  10021927.0  24623461.0  34575919.0   
15928   58578322.0  10036086.0  28728587.0  38809220.0   
30733   51103039.0  10123063.0  24799331.0  30166935.0   
45500   54754996.0  10157256.0  21943311.0  31818753.0   
68126   55163602.0  10222587.0  29561541.0  32418443.0   
106558  56996892.0  10304606.0  25532105.0  36975538.0   
123529  59027741.0  10326773.0  24076064.0  38172427.0   
125084  57299722.0  10335293.0  23619901.0  32283185.0   
127536  58472100.0  10337896.0  23504565.0  33920408.0   
131573  59914738.0  10355745.0  24924037.0  38030074.0   
139760  55638347.0  10385501.0  26477308.0  31960106.0   
141575  58679660.0  10386233.0  22164804.0  39681051.0   
149105  56434528.0  10398981.0  28167602.0  34166660.0   
154928  52186298.0  10422699.0  27044055.0  39634792.0   
155046  55774687.0  10423466.0  24868706.0  38820625.0   
156764  560542