# This is the file to generate training dataframe for the **FULL** REFLACX dataset.

For More detail, have a look of `[1] reflacx_dataset_preprocessing.ipynb`.

In [1]:
import os
import pandas as pd

from data.paths import TabularDataPaths
from utils import print as print_f
from utils.preprocessing import get_split_list
from data.constants import XAMI_MIMIC_PATH, SPREADSHEET_FOLDER
from utils.init import reproducibility

In [2]:
seed = 0
reproducibility(0)
XAMI_MIMIC_PATH_str = "{XAMI_MIMIC_PATH}"

In [3]:
reflacx_meta_df = pd.read_csv(
    TabularDataPaths.SpreadSheet.get_sreadsheet(
        XAMI_MIMIC_PATH, TabularDataPaths.SpreadSheet.REFLACX.metadata,
    )
)
reflacx_meta_df.head(5)


Unnamed: 0.1,Unnamed: 0,id,split,eye_tracking_data_discarded,image,dicom_id,subject_id,image_size_x,image_size_y,Airway wall thickening,...,Support devices,Wide mediastinum,Abnormal mediastinal contour,Acute fracture,Enlarged hilum,Hiatal hernia,High lung volume / emphysema,Interstitial lung disease,Lung nodule or mass,Pleural abnormality
0,0,P102R108387,train,False,physionet.org/files/mimic-cxr/2.0.0/files/p18/...,34cedb74-d0996b40-6d218312-a9174bea-d48dc033,18111516,2544,3056,0.0,...,True,0.0,,,,,,,,
1,1,P102R379837,train,False,physionet.org/files/mimic-cxr/2.0.0/files/p18/...,34cedb74-d0996b40-6d218312-a9174bea-d48dc033,18111516,2544,3056,0.0,...,True,0.0,,,,,,,,
2,2,P102R558314,train,False,physionet.org/files/mimic-cxr/2.0.0/files/p18/...,34cedb74-d0996b40-6d218312-a9174bea-d48dc033,18111516,2544,3056,4.0,...,True,0.0,,,,,,,,
3,3,P102R765317,train,False,physionet.org/files/mimic-cxr/2.0.0/files/p18/...,34cedb74-d0996b40-6d218312-a9174bea-d48dc033,18111516,2544,3056,0.0,...,True,2.0,,,,,,,,
4,4,P102R915878,train,False,physionet.org/files/mimic-cxr/2.0.0/files/p18/...,34cedb74-d0996b40-6d218312-a9174bea-d48dc033,18111516,2544,3056,0.0,...,True,0.0,,,,,,,,


In [4]:
labels_cols = [
    "Airway wall thickening",
    "Atelectasis",
    "Consolidation",
    "Enlarged cardiac silhouette",
    "Fibrosis",
    "Fracture",
    "Groundglass opacity",
    "Pneumothorax",
    "Pulmonary edema",
    "Wide mediastinum",
    "Abnormal mediastinal contour",
    "Acute fracture",
    "Enlarged hilum",
    "Quality issue",
    "Support devices",
    "Hiatal hernia",
    "High lung volume / emphysema",
    "Interstitial lung disease",
    "Lung nodule or mass",
    "Pleural abnormality",
]


## inlcude the repetitive columns.
all_disease_cols = [
    "Airway wall thickening",
    "Atelectasis",
    "Consolidation",
    "Emphysema",
    "Enlarged cardiac silhouette",
    "Fibrosis",
    "Fracture",
    "Groundglass opacity",
    "Mass",
    "Nodule",
    "Other",
    "Pleural effusion",
    "Pleural thickening",
    "Pneumothorax",
    "Pulmonary edema",
    "Wide mediastinum",
    "Abnormal mediastinal contour",
    "Acute fracture",
    "Enlarged hilum",
    "Hiatal hernia",
    "High lung volume / emphysema",
    "Interstitial lung disease",
    "Lung nodule or mass",
    "Pleural abnormality",
]


In [5]:
# Checking the columns that contian nan values.
print_f.print_title("Columns have NaN value")
print([col for col in reflacx_meta_df.columns if reflacx_meta_df[col].isna().any()])


['Airway wall thickening', 'Emphysema', 'Fibrosis', 'Fracture', 'Mass', 'Nodule', 'Other', 'Pleural effusion', 'Pleural thickening', 'Quality issue', 'Wide mediastinum', 'Abnormal mediastinal contour', 'Acute fracture', 'Enlarged hilum', 'Hiatal hernia', 'High lung volume / emphysema', 'Interstitial lung disease', 'Lung nodule or mass', 'Pleural abnormality']


In [6]:
# Filling nan for numerical cols.
reflacx_meta_df[all_disease_cols] = reflacx_meta_df[all_disease_cols].fillna(0)

# Folling nan for boolean cols.
reflacx_meta_df[["Quality issue", "Support devices"]] = reflacx_meta_df[
    ["Quality issue", "Support devices"]
].fillna(False)


In [7]:
print_f.print_title("Columns have NaN value")
print([col for col in reflacx_meta_df.columns if reflacx_meta_df[col].isna().any()])

[]


In [8]:
reflacx_meta_df["Lung nodule or mass"] = reflacx_meta_df[
    ["Lung nodule or mass", "Mass", "Nodule"]
].max(axis=1)
del reflacx_meta_df["Mass"]
del reflacx_meta_df["Nodule"]

reflacx_meta_df["High lung volume / emphysema"] = reflacx_meta_df[
    ["High lung volume / emphysema", "Emphysema"]
].max(axis=1)
del reflacx_meta_df["Emphysema"]


reflacx_meta_df["Pleural abnormality"] = reflacx_meta_df[
    ["Pleural abnormality", "Pleural thickening", "Pleural effusion"]
].max(axis=1)
del reflacx_meta_df["Pleural thickening"]
del reflacx_meta_df["Pleural effusion"]



In [9]:
physio_file_path = "E:\\physionet.org\\files\\"


cxr_meta_df = pd.read_csv(
    TabularDataPaths.SpreadSheet.get_sreadsheet(
        XAMI_MIMIC_PATH, TabularDataPaths.SpreadSheet.cxr_meta
    )
)

core_patients_df = pd.read_csv(os.path.join( physio_file_path, "mimiciv\\2.0\\hosp\\patients.csv.gz"))


In [13]:
merged_df = (
    reflacx_meta_df.merge(
        cxr_meta_df,
        "left",
        left_on="dicom_id",
        right_on="dicom_id",
        suffixes=("", "cxr"),
    )
    .merge(core_patients_df, "left", on="subject_id", suffixes=("", "_patient"))
)

merged_df["age"] = merged_df["anchor_age"] + (
    merged_df["StudyDate"].apply(lambda x: int(str(x)[:4])) - merged_df["anchor_year"]
)

In [14]:
## Calculate the age according to the time when the CXR taken.
merged_df["image_path"] = (
    f"{XAMI_MIMIC_PATH_str}\patient_"
    + merged_df["subject_id"].astype(str)
    + "\CXR-JPG\s"
    + merged_df["study_id"].astype(str)
    + "\\"
    + merged_df["dicom_id"].astype(str)
    + ".jpg"
)
merged_df["anomaly_location_ellipses_path"] = (
    f"{XAMI_MIMIC_PATH_str}\patient_"
    + merged_df["subject_id"].astype(str)
    + "\REFLACX\\"
    + merged_df["id"].astype(str)
    + "\\anomaly_location_ellipses.csv"
)
merged_df = merged_df[~merged_df["eye_tracking_data_discarded"]]




In [15]:
needed_columns = [
    # ids
    "id",
    "dicom_id",
    "subject_id",
    "stay_id",
    "study_id",
    # image meta
    "image_path",
    "ViewPosition",
    "image_size_x",
    "image_size_y",
    "anomaly_location_ellipses_path",
    # labels
    "Airway wall thickening",
    "Atelectasis",
    "Consolidation",
    "Enlarged cardiac silhouette",
    "Fibrosis",
    "Groundglass opacity",
    "Other",
    "Pneumothorax",
    "Pulmonary edema",
    "Quality issue",
    "Support devices",
    "Wide mediastinum",
    "Abnormal mediastinal contour",
    "Acute fracture",
    "Enlarged hilum",
    "Hiatal hernia",
    "High lung volume / emphysema",
    "Interstitial lung disease",
    "Lung nodule or mass",
    "Pleural abnormality",
    # 'Fracture' # don't have any positive case

    # patients
    "age",
    "gender",
]

merged_df = merged_df[needed_columns]


In [16]:
# Checking the columns that contian nan values (from other joined table).
print_f.print_title("Columns have missing value in merged_df")
print(
    [
        (col, sum(merged_df[col].isna()))
        for col in merged_df.columns
        if merged_df[col].isna().any()
    ]
)

[('stay_id', 2362), ('age', 7), ('gender', 7)]


In [17]:
merged_df['split'] = get_split_list(len(merged_df))

In [18]:
merged_df.head(5)

Unnamed: 0,id,dicom_id,subject_id,stay_id,study_id,image_path,ViewPosition,image_size_x,image_size_y,anomaly_location_ellipses_path,...,Acute fracture,Enlarged hilum,Hiatal hernia,High lung volume / emphysema,Interstitial lung disease,Lung nodule or mass,Pleural abnormality,age,gender,split
0,P102R108387,34cedb74-d0996b40-6d218312-a9174bea-d48dc033,18111516,32067002.0,55032240,{XAMI_MIMIC_PATH}\patient_18111516\CXR-JPG\s55...,AP,2544,3056,{XAMI_MIMIC_PATH}\patient_18111516\REFLACX\P10...,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,69.0,F,train
1,P102R379837,34cedb74-d0996b40-6d218312-a9174bea-d48dc033,18111516,32067002.0,55032240,{XAMI_MIMIC_PATH}\patient_18111516\CXR-JPG\s55...,AP,2544,3056,{XAMI_MIMIC_PATH}\patient_18111516\REFLACX\P10...,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,69.0,F,test
2,P102R558314,34cedb74-d0996b40-6d218312-a9174bea-d48dc033,18111516,32067002.0,55032240,{XAMI_MIMIC_PATH}\patient_18111516\CXR-JPG\s55...,AP,2544,3056,{XAMI_MIMIC_PATH}\patient_18111516\REFLACX\P10...,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,69.0,F,train
3,P102R765317,34cedb74-d0996b40-6d218312-a9174bea-d48dc033,18111516,32067002.0,55032240,{XAMI_MIMIC_PATH}\patient_18111516\CXR-JPG\s55...,AP,2544,3056,{XAMI_MIMIC_PATH}\patient_18111516\REFLACX\P10...,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,69.0,F,train
4,P102R915878,34cedb74-d0996b40-6d218312-a9174bea-d48dc033,18111516,32067002.0,55032240,{XAMI_MIMIC_PATH}\patient_18111516\CXR-JPG\s55...,AP,2544,3056,{XAMI_MIMIC_PATH}\patient_18111516\REFLACX\P10...,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,69.0,F,train


In [19]:
from collections import OrderedDict

count_map = dict(merged_df[labels_cols].sum(axis=0))
OrderedDict({k: v for k, v in sorted(count_map.items(), key=lambda item: item[1])})

KeyError: "['Fracture'] not in index"

In [20]:
merged_df.to_csv(os.path.join(SPREADSHEET_FOLDER, "reflacx_cxr.csv"))