In [8]:
import os
import ray
import dask
import pandas as pd
from PIL import Image
import time
import numpy as np
import datetime
import os



In [2]:
# Locations of the three metadata tables, relative to this notebook.
reports_path = '../data/raw/physionet.org/files/mimic-cxr/2.0.0/cxr-study-list.csv'
labels_path = '../data/raw/physionet.org/files/mimic-cxr-jpg/2.0.0/mimic-cxr-2.0.0-chexpert.csv'
images_path = '../data/raw/physionet.org/files/mimic-cxr-jpg/2.0.0/mimic-cxr-2.0.0-split.csv'

# Read each CSV into its own DataFrame (study list, labels, image split table).
reports, labels, images = (
    pd.read_csv(csv_path)
    for csv_path in (reports_path, labels_path, images_path)
)

# Merge the reports with the labels


In [3]:
# Attach the label columns to each study. validate='one_to_one' makes pandas
# raise if study_id is duplicated in either table.
merged_inner = reports.merge(
    labels,
    on='study_id',
    how='inner',
    validate='one_to_one',
)

In [4]:
# Expand every study into one row per entry of the image table.
# validate='one_to_many' makes pandas raise if study_id is not unique on the
# left (study-level) side.
merged_o_m = merged_inner.merge(
    images,
    on='study_id',
    how='inner',
    validate='one_to_many',
)

In [5]:
def get_concat_h(im1, im2):
    """Return a new RGB image with ``im1`` and ``im2`` placed side by side.

    ``im1`` is pasted at the left edge and ``im2`` immediately to its right,
    both aligned to the top.  The canvas is as tall as the taller of the two
    inputs so that neither image is cropped; any area not covered by the
    shorter image keeps the default fill of ``Image.new`` (black).

    Parameters
    ----------
    im1, im2 : PIL.Image.Image
        Images to join.  Neither input is modified.

    Returns
    -------
    PIL.Image.Image
        The combined image in ``'RGB'`` mode.
    """
    # Sizing the canvas from im1.height alone would silently cut off the
    # bottom of a taller im2 when it is pasted.
    canvas_size = (im1.width + im2.width, max(im1.height, im2.height))
    dst = Image.new('RGB', canvas_size)
    dst.paste(im1, (0, 0))
    dst.paste(im2, (im1.width, 0))
    return dst

In [6]:
def _report_progress(elapsed):
    """Update the notebook-level progress globals and print the time estimate.

    Uses the module-level ``counter`` (number of studies still to process) and
    ``times`` (list of per-study durations in seconds) that the calling cell
    defines before running ``merge_studies``.  If either one is missing the
    function does nothing, so ``merge_studies`` can also be called on its own.
    """
    global counter
    module_state = globals()
    if 'counter' not in module_state or 'times' not in module_state:
        return
    counter = counter - 1
    times.append(elapsed)
    print('Remaining time: {}'.format(str(datetime.timedelta(seconds=counter*np.mean(times)))))


def _fuse_images(image_paths, fused_path):
    """Join the images at ``image_paths`` left to right and save the result once."""
    with Image.open(image_paths[0]) as first:
        # copy() loads the pixel data, so the source file can be closed safely.
        fused = first.copy()
    for image_path in image_paths[1:]:
        with Image.open(image_path) as next_image:
            fused = get_concat_h(fused, next_image)
    fused.save(fused_path)


def merge_studies(df,
                  reports_root='../data/raw/physionet.org/files/mimic-cxr/2.0.0/',
                  images_root='../data/raw/physionet.org/files/mimic-cxr-jpg/2.0.0/',
                  fusions_dir='../data/raw/mimic_fusions/'):
    """Build the sample rows (image path, labels, report, type) for one study.

    Every returned row carries a ``study_type``:

    * ``multi``  - the horizontally fused image of a study with several images
      (one row, one label set, one report).
    * ``broken`` - each individual image of a study with several images,
      repeating the study's labels and report.
    * ``unique`` - the single image of a study that has only one image.

    Parameters
    ----------
    df : pandas.DataFrame
        All rows of a single study.  Must provide the columns ``dicom_id``,
        ``path``, ``study_id`` and the 14 label columns listed in the body.
        Labels, report path and study id are taken from the first row.
    reports_root : str
        Prefix prepended to ``path`` to locate the report text file.
    images_root : str
        Prefix prepended to the study folder to locate the ``.jpg`` images.
    fusions_dir : str
        Prefix of the fused image file, which is named ``<study_id>.jpg``.
        The directory must already exist.

    Returns
    -------
    pandas.DataFrame
        Columns ``path``, the label columns, ``report`` and ``study_type``.

    Notes
    -----
    * For studies with more than one image the fused JPEG is written to disk.
      Images are joined in row order, in memory, and saved a single time.
      Re-saving every intermediate step to the same file (as was done before)
      overwrote earlier fusions and dropped images for studies with four or
      more images.
    * A report that cannot be read is reported with ``print`` and replaced by
      an empty string; a missing image still raises.
    * If the globals ``counter`` and ``times`` exist they are updated and a
      remaining-time estimate is printed.
    """
    tick = time.time()

    img_names = df['dicom_id'].to_list()
    study_path = df['path'].to_list()[0]
    study_id = str(df.study_id.to_list()[0])

    # The study's image folder is the report path minus its last four
    # characters (presumably the '.txt' extension - confirm against the data).
    study_dir = images_root + study_path[:-4] + '/'
    img_paths = [study_dir + img_name + '.jpg' for img_name in img_names]
    is_multi = len(img_paths) > 1

    if is_multi:
        concat_path = fusions_dir + study_id + '.jpg'
        _fuse_images(img_paths, concat_path)

    # All images of a study share one report and one set of labels.
    full_study_path = reports_root + study_path
    try:
        with open(full_study_path) as f:
            report = f.read().strip()
    except (OSError, UnicodeDecodeError):
        # Best effort: keep going with an empty report, but do not swallow
        # unrelated errors such as KeyboardInterrupt.
        print('problems reading\n'+full_study_path)
        report = ''

    label_cols=['Atelectasis', 'Cardiomegaly',
               'Consolidation', 'Edema', 'Enlarged Cardiomediastinum', 'Fracture',
               'Lung Lesion', 'Lung Opacity', 'No Finding', 'Pleural Effusion',
               'Pleural Other', 'Pneumonia', 'Pneumothorax', 'Support Devices']
    label_values = df[label_cols].iloc[0].to_list()

    rows = []
    if is_multi:
        # One row for the fused image, then one row per original image.
        rows.append([concat_path] + label_values + [report] + ['multi'])
        for img_path in img_paths:
            rows.append([img_path] + label_values + [report] + ['broken'])
    elif img_paths:
        rows.append([img_paths[0]] + label_values + [report] + ['unique'])

    rows_df = pd.DataFrame(rows, columns=['path'] + label_cols + ['report', 'study_type'])

    _report_progress(time.time() - tick)

    return rows_df
    

In [None]:
# Make sure both output locations exist before the long-running merge starts.
os.makedirs('../data/raw/mimic_fusions',exist_ok=True)
os.makedirs('../data/intermediate',exist_ok=True)

# Progress state used by merge_studies: number of studies still to process and
# the seconds spent on each study so far.
counter=merged_o_m['study_id'].nunique()
times=[]

# Keep the result in a variable so that a failed write does not throw away the
# computation, and write to '../data/...' (the previous '..data/...' path was
# missing a slash and pointed at a non-existent directory).
inter_mimic=merged_o_m.groupby(['study_id']).apply(merge_studies).reset_index(drop=True)
inter_mimic.to_csv('../data/intermediate/inter_mimic.csv')

Remaining time: 0:13:37.974027
Remaining time: 1 day, 4:08:35.000257
Remaining time: 18:49:03.052081
Remaining time: 1 day, 3:00:05.785967
Remaining time: 21:38:09.151350
Remaining time: 18:03:47.562313
Remaining time: 23:22:04.915119
Remaining time: 20:28:09.902097
Remaining time: 23:50:41.908552
Remaining time: 1 day, 2:25:20.461931
Remaining time: 1 day, 0:02:15.783571
Remaining time: 1 day, 9:57:31.271412
Remaining time: 1 day, 11:30:10.200757
Remaining time: 1 day, 10:58:36.006438
Remaining time: 1 day, 10:31:32.188823
Remaining time: 1 day, 13:02:10.818934
Remaining time: 1 day, 14:00:53.406393
Remaining time: 1 day, 16:05:48.239499
Remaining time: 1 day, 18:51:44.443918
Remaining time: 1 day, 16:43:45.098590
Remaining time: 1 day, 17:26:11.999792
Remaining time: 1 day, 15:33:39.958311
Remaining time: 1 day, 15:02:38.650126
Remaining time: 1 day, 13:25:26.968468
Remaining time: 1 day, 18:00:45.454401
Remaining time: 1 day, 16:24:16.088313
Remaining time: 1 day, 20:55:55.466901
Re

# Display the merged reports/labels frame (the cell's last expression is rendered by Jupyter).
merged_inner