In [None]:
#!/usr/bin/env python

# Notebook metadata: authorship, credits, version and contact details.
__author__ = "Vivek Chamala"
__copyright__ = "Copyright 2023"
__credits__ = ["Andrew D'Amico", "Christoper Alexander", "Katya Nosulko", "Vivek Chamala", "Matthew Conger"]
# No license has been specified.
__license__ = ""
__version__ = "0.0.1"
# NOTE(review): spelled "Andrew Damico" here but "Andrew D'Amico" in
# __credits__ above -- confirm which spelling is intended.
__maintainer__ = "Andrew Damico"
__email__ = "andrew.damico@u.northwestern.edu"

In [1]:
import os

import pandas as pd

In [2]:
# Root directory holding the MIMIC-CXR CSV files read below. Set the
# MIMIC_CXR_BASE_DIR environment variable to point the notebook at another
# location; when it is unset the original local path is used unchanged.
# BASE_DIR = '/Users/vivek/Documents/MSDS/498/data/mimic-cxr-jpg'
BASE_DIR = os.environ.get('MIMIC_CXR_BASE_DIR', '/Volumes/PRO-G40/msds498')

# Per-image metadata and per-study labels, loaded once and reused by later cells.
metadata_df = pd.read_csv(f'{BASE_DIR}/mimic-cxr-2.0.0-metadata.csv')
label_df = pd.read_csv(f'{BASE_DIR}/mimic-cxr-2.0.0-negbio.csv')

In [19]:
def get_image_metadata(mdf, view='antero-posterior', orientation='Erect', random_state=42):
    """Build a shuffled table of image records for single-image studies.

    Parameters
    ----------
    mdf : pandas.DataFrame
        Metadata frame. Must contain the columns ``subject_id``, ``study_id``,
        ``dicom_id``, ``Rows``, ``Columns``,
        ``ViewCodeSequence_CodeMeaning`` and
        ``PatientOrientationCodeSequence_CodeMeaning``.
    view : str, optional
        Value of ``ViewCodeSequence_CodeMeaning`` to keep.
    orientation : str, optional
        Value of ``PatientOrientationCodeSequence_CodeMeaning`` to keep.
    random_state : int, optional
        Seed passed to ``DataFrame.sample`` so the shuffle is reproducible.

    Returns
    -------
    pandas.DataFrame
        One row per selected image with the columns ``dicom_id``,
        ``subject_id``, ``study_id`` (all strings), ``image`` (relative path
        of the JPG file), ``rows`` and ``columns``. Rows are shuffled and the
        index is reset. The six columns are present even when nothing is
        selected, so downstream merges on ``subject_id``/``study_id`` still work.
    """
    study_keys = ['subject_id', 'study_id']

    # A study is kept only if its (subject_id, study_id) pair occurs exactly
    # once. Rows with a missing key are excluded, which matches the default
    # groupby behaviour of dropping NaN groups.
    has_keys = mdf[study_keys].notna().all(axis=1)
    is_single_image = ~mdf.duplicated(subset=study_keys, keep=False)

    # The single-image test is evaluated on the full frame, before the
    # view/orientation filter, so the selection order does not matter.
    keep = (
        has_keys
        & is_single_image
        & (mdf['ViewCodeSequence_CodeMeaning'] == view)
        & (mdf['PatientOrientationCodeSequence_CodeMeaning'] == orientation)
    )
    selected = mdf.loc[keep].reset_index(drop=True)

    # Identifiers are normalised to integer-formatted strings.
    subject_id = selected['subject_id'].astype('int64').astype(str)
    study_id = selected['study_id'].astype('int64').astype(str)
    dicom_id = selected['dicom_id'].astype(str)

    data_df = pd.DataFrame(
        {
            "dicom_id": dicom_id,
            "subject_id": subject_id,
            "study_id": study_id,
            # files/p<first two chars of subject>/p<subject>/s<study>/<dicom>.jpg
            "image": (
                "files/p" + subject_id.str[:2]
                + "/p" + subject_id
                + "/s" + study_id
                + "/" + dicom_id + ".jpg"
            ),
            "rows": selected['Rows'],
            "columns": selected['Columns'],
        }
    )

    # Shuffle deterministically so later positional splits are random but repeatable.
    return data_df.sample(frac=1, random_state=random_state).reset_index(drop=True)

def get_population_labels(data_df, label_df):
    """Attach label columns to the image records.

    The ``subject_id`` and ``study_id`` columns of ``label_df`` are cast to
    strings on a copy (the caller's frame is not modified) and the two frames
    are inner-joined on that key pair, so only records present in both frames
    are returned.
    """
    join_keys = ['subject_id', 'study_id']
    labels_as_text = label_df.astype({key: 'str' for key in join_keys})
    return data_df.merge(labels_as_text, how='inner', on=join_keys)

def get_train_val_test(data_df, train_frac=.7):
    """Split a frame positionally into train, validation and test parts.

    The first ``train_frac`` share of the rows becomes the training set; the
    remaining rows are divided in half between validation and test. No
    shuffling is done here, so pass an already shuffled frame.

    Parameters
    ----------
    data_df : pandas.DataFrame
        Frame to split.
    train_frac : float, optional
        Fraction of rows assigned to the training set (between 0 and 1).

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_df, val_df, test_df)``.

    Raises
    ------
    ValueError
        If ``train_frac`` is outside the range [0, 1].
    """
    if not 0 <= train_frac <= 1:
        raise ValueError(f"train_frac must be between 0 and 1, got {train_frac}")

    # Size the split from the frame that was passed in, not from a notebook
    # global, so the function works on any frame and on a fresh kernel.
    total = len(data_df)
    train_end = int(round(total * train_frac, 0))
    val_end = int(round((total - train_end) / 2, 0)) + train_end

    train_df = data_df.iloc[:train_end]
    val_df = data_df.iloc[train_end:val_end]
    test_df = data_df.iloc[val_end:]

    print(f'Train Count = {len(train_df)}')
    print(f'Validation Count = {len(val_df)}')
    print(f'Test Count = {len(test_df)}')

    return train_df, val_df, test_df

def save_train_test_val(train_df, val_df, test_df, output_dir=None):
    """Write the three splits to ``train.csv``, ``validation.csv`` and ``test.csv``.

    Parameters
    ----------
    train_df, val_df, test_df : pandas.DataFrame
        Frames to write; the index is not written.
    output_dir : str, optional
        Destination directory. Defaults to ``{BASE_DIR}/output``. The
        directory is created if it does not exist yet.

    Returns
    -------
    None
    """
    if output_dir is None:
        output_dir = f'{BASE_DIR}/output'

    # to_csv does not create missing directories, so make sure the target exists.
    os.makedirs(output_dir, exist_ok=True)

    splits = (("train", train_df), ("validation", val_df), ("test", test_df))
    for split_name, split_df in splits:
        split_df.to_csv(f"{output_dir}/{split_name}.csv", index=False, sep=",")
    return
    

In [20]:
# Build the shuffled image-record table from the raw metadata frame.
data_df = get_image_metadata(metadata_df)

In [21]:
# Inner-join the image records with the label table on subject_id/study_id.
data_label_df = get_population_labels(data_df, label_df)

In [6]:
# Disabled step: split the records into train/validation/test sets.
# train_df, val_df, test_df = get_train_val_test(data_df)

In [7]:
# Disabled step: write the three splits to CSV files.
# save_train_test_val(train_df, val_df, test_df)

In [22]:
# Destination for the full labelled-population export. The directory is
# created up front because DataFrame.to_csv does not create missing folders.
output_dir = f'{BASE_DIR}/output/all'
os.makedirs(output_dir, exist_ok=True)

In [23]:
# Write the joined image/label table to CSV without the index column.
data_label_df.to_csv(f"{output_dir}/population_images_labels.csv", index=False)