# MIMIC CXR Labels Pre-processing
- Download the dataset from https://physionet.org/content/mimic-cxr-jpg/2.0.0/
- Run this notebook using the correct paths to get the preprocessed csv file

In [None]:
import pandas as pd
import os

## Load MIMIC csv files

In [None]:
# Placeholder locations: replace each string with a real directory before running.
mimic_csv_path = "PATH TO MIMIC CSV DIR WHERE YOU HAVE .csv.gz FILES"  # directory holding the .csv.gz tables
mimic_files_path = "PATH TO MIMIC FILES DIR WHERE YOU HAVE p10-p19"  # root of the JPG tree
mimic_pt_files_path = "PATH WHERE YOU WILL SAVE YOUR PT VERSION OF MIMIC FILES"  # root for the .pt output

# Sanity check: list the contents of both input directories (shown as the cell output).
tuple(os.listdir(folder) for folder in (mimic_csv_path, mimic_files_path))

In [None]:
# Load the three MIMIC-CXR tables (metadata, split, CheXpert labels) in one pass;
# pandas decompresses the .gz files transparently based on the extension.
mimic_metadata_df, mimic_split_df, mimic_labels_df = (
    pd.read_csv(os.path.join(mimic_csv_path, csv_name))
    for csv_name in (
        "mimic-cxr-2.0.0-metadata.csv.gz",
        "mimic-cxr-2.0.0-split.csv.gz",
        "mimic-cxr-2.0.0-chexpert.csv.gz",
    )
)

## Join the csv files: metadata and split on `subject_id`, `study_id` and `dicom_id`, then the labels on `subject_id` and `study_id`

In [None]:
# Metadata and split both have one row per image, so they are aligned on the
# full image key. `join` defaults to a left join, keeping every metadata row.
_image_keys = ['subject_id', 'study_id', 'dicom_id']
_split_by_image = mimic_split_df.set_index(_image_keys)
meta_split_df = (
    mimic_metadata_df
    .set_index(_image_keys)
    .join(_split_by_image)
    .reset_index()
)

In [None]:
# Attach the label columns using the study-level key; the default left join
# keeps every row of `meta_split_df` even when no label row matches.
_study_keys = ['subject_id', 'study_id']
_labels_by_study = mimic_labels_df.set_index(_study_keys)
mimic_combined_df = (
    meta_split_df
    .set_index(_study_keys)
    .join(_labels_by_study)
    .reset_index()
)

## Add the `path_jpg` column to the joined dataframe
- Note that this column is the path to the original image file
- We need to convert the dataset to a .pt version for training (done in the next step)

In [None]:
def _jpg_path_for(row):
    """Return the source JPG location for one row: <root>/pXX/p<subject>/s<study>/<dicom>.jpg."""
    return f"{mimic_files_path}/p{str(row['subject_id'])[:2]}/p{row['subject_id']}/s{row['study_id']}/{row['dicom_id']}.jpg"


def _pt_path_for(row):
    """Return the matching .pt location, mirroring the same layout under the .pt root."""
    return f"{mimic_pt_files_path}/p{str(row['subject_id'])[:2]}/p{row['subject_id']}/s{row['study_id']}/{row['dicom_id']}.pt"


# `path_jpg` points at the original image; `path` points at the converted tensor file.
mimic_combined_df['path_jpg'] = mimic_combined_df.apply(_jpg_path_for, axis=1)
mimic_combined_df['path'] = mimic_combined_df.apply(_pt_path_for, axis=1)

## We will use only the "AP" and "PA" views

In [None]:
# Keep only rows whose ViewPosition is AP or PA; rows with any other value
# (including missing values) are dropped.
_is_frontal_view = mimic_combined_df["ViewPosition"].isin(["AP", "PA"])
mimic_preprocessed_df = mimic_combined_df.loc[_is_frontal_view]

In [None]:
# Display the first rows of the filtered frame as a quick visual check.
mimic_preprocessed_df.head()

In [None]:
# Display (number of rows, number of columns) of the filtered frame.
mimic_preprocessed_df.shape

## Save df

In [None]:
# Placeholder: replace with the destination of the preprocessed csv.
_preprocessed_csv_out = "PATH TO SAVE PREPROCESSED MIMIC CSV FILE"
# The row index is dropped so the file contains only the data columns.
mimic_preprocessed_df.to_csv(_preprocessed_csv_out, index=False)

# Convert .JPG to .PT
- We convert the images to .pt files to speed up the training process

In [None]:
import pandas as pd
import os
import tqdm
import matplotlib.pyplot as plt
import torch

In [None]:
# Placeholder locations: replace each string with a real directory before running.
# `mimic_files_path` is defined here as well because the conversion loop below
# rewrites it into `mimic_pt_files_path`; without it this section fails with a
# NameError when run in a fresh kernel. Use the same value as in the first section.
mimic_files_path = "PATH TO MIMIC FILES DIR WHERE YOU HAVE p10-p19"
mimic_pt_files_path = "PATH WHERE YOU WILL SAVE YOUR PT VERSION OF MIMIC FILES"
# Create the output root up front (no error if it already exists).
os.makedirs(mimic_pt_files_path, exist_ok=True)

In [None]:
# Placeholder: replace with the location of the csv written by the pre-processing section.
_preprocessed_csv_in = "PATH TO PREPROCESSED MIMIC CSV FILE"
mimic_preprocessed_df = pd.read_csv(_preprocessed_csv_in)

In [None]:
# Convert every JPG listed in the preprocessed csv into a min-max scaled
# float tensor saved as a .pt file, mirroring the source directory layout.
mimic_jpg_paths = mimic_preprocessed_df["path_jpg"].tolist()

# This notebook does `import tqdm`, which binds the module, so the progress bar
# is `tqdm.tqdm`. Each item is a single path string, so there is nothing to unpack.
for jpg_path in tqdm.tqdm(mimic_jpg_paths, total=len(mimic_jpg_paths)):
    # Swap the base directory (first occurrence only) and then only the extension.
    pt_path = jpg_path.replace(mimic_files_path, mimic_pt_files_path, 1)
    pt_path = os.path.splitext(pt_path)[0] + ".pt"
    if os.path.exists(pt_path):
        continue  # already converted, so the cell can be re-run after an interruption

    img = torch.tensor(plt.imread(jpg_path))
    if img.ndim == 2:
        # Grayscale images load as (H, W); add a leading channel axis.
        # NOTE(review): confirm the training loader expects (1, H, W) for these.
        img = img.unsqueeze(0)
    else:
        # (H, W, C) -> (C, H, W)
        img = img.permute(2, 0, 1)

    # Scale between 0 and 1. A constant image would give 0/0 = NaN, so map it to zeros.
    img = img.float()
    lowest, highest = img.min(), img.max()
    if highest > lowest:
        img = (img - lowest) / (highest - lowest)
    else:
        img = torch.zeros_like(img)

    # torch.save does not create missing parent directories (pXX/p<subject>/s<study>).
    pt_dir = os.path.dirname(pt_path)
    if pt_dir:
        os.makedirs(pt_dir, exist_ok=True)
    torch.save(img, pt_path)