In [1]:
import pandas as pd
from PIL import Image
import cv2
import numpy as np
import random
from tqdm import tqdm
import os

def rle2mask(mask_rle: str, label=1, shape=(3520,4280)):
    """
    Decode a run-length-encoded mask into a 2-D array.

    mask_rle: whitespace-separated "start length start length ..." string;
              start positions are 1-based indices into the flattened mask
    label:    value written into every pixel covered by a run
    shape:    (height, width) of the array to return
    Returns a uint8 numpy array: `label` inside the runs, 0 elsewhere
    """
    tokens = mask_rle.split()
    # Even-positioned tokens are starts (shifted to 0-based), odd-positioned ones are lengths.
    starts = np.asarray(tokens[::2], dtype=int) - 1
    ends = starts + np.asarray(tokens[1::2], dtype=int)

    # Paint the runs on a flat canvas; the final reshape lays it out as (height, width).
    canvas = np.zeros(shape[0] * shape[1], dtype=np.uint8)
    for begin, end in zip(starts, ends):
        canvas[begin:end] = label
    return canvas.reshape(shape)


def mask2rle(img):
    """
    Encode a mask as a run-length string.

    img: numpy array, 1 - mask, 0 - background
    Returns "start length start length ..." with 1-based start positions
    into the flattened array (empty string when nothing is set)
    """
    # Pad with a zero on both sides so runs touching either end still
    # produce a transition.
    padded = np.concatenate(([0], img.flatten(), [0]))
    # 1-based positions at which the value changes: run starts and run ends alternate.
    changes = np.flatnonzero(padded[1:] != padded[:-1]) + 1
    # Turn every "end" position into a run length.
    changes[1::2] -= changes[::2]
    return ' '.join(map(str, changes))

def decode_both_lungs(row, label=1):
    """
    Decode the right- and left-lung RLE masks of one record into a single mask.

    row:   mapping / Series providing "Right Lung", "Left Lung" (RLE strings)
           and "Height", "Width" (convertible to int)
    label: value written into every lung pixel
    Returns a uint8 numpy array of shape (Height, Width): `label` where either
    lung is present, 0 elsewhere
    """
    shape = (int(row["Height"]), int(row["Width"]))

    right = rle2mask(mask_rle=row["Right Lung"], label=label, shape=shape)
    left = rle2mask(mask_rle=row["Left Lung"], label=label, shape=shape)

    # Union of the two masks. Adding them would yield 2 * label wherever the
    # masks overlap (and wrap around in uint8 for large labels), so those
    # pixels would no longer compare equal to `label` downstream.
    return np.maximum(right, left)


def bounding_box(image, label=1, padding=None):
    """
    Compute a padded bounding box around all pixels equal to `label`.

    image:   2-D numpy array (rows = y, columns = x)
    label:   pixel value that marks the region of interest
    padding: margin in pixels added on every side; when None a random margin
             between 100 and 200 (inclusive) is drawn, as before
    Returns (x_min, y_min, x_max, y_max) as ints, clamped to valid pixel
    indices of `image`. If no pixel equals `label`, the box covering the
    whole image is returned.
    """
    if padding is None:
        padding = random.randint(100,200)

    height, width = image.shape[:2]
    segmentation = np.where(image == label)
    rows, cols = segmentation[0], segmentation[1]

    # Nothing to enclose: fall back to the full image instead of failing on
    # unassigned coordinates.
    if rows.size == 0:
        return 0, 0, width - 1, height - 1

    # Clamp against the image dimensions (not the number of mask pixels).
    x_min = max(int(cols.min()) - padding, 0)
    x_max = min(int(cols.max()) + padding, width - 1)
    y_min = max(int(rows.min()) - padding, 0)
    y_max = min(int(rows.max()) + padding, height - 1)

    return x_min, y_min, x_max, y_max

def expand2square(pil_img, background_color):
    """
    Pad an image to a square canvas, centring it along the shorter axis.

    pil_img:          image object exposing `.size` as (width, height) and `.mode`
    background_color: fill value for the padded area
    Returns the input object unchanged when it is already square, otherwise a
    new image whose side equals the longer dimension of the input.
    """
    width, height = pil_img.size
    if width == height:
        return pil_img

    side = max(width, height)
    canvas = Image.new(pil_img.mode, (side, side), background_color)
    # Landscape input is centred vertically, portrait input horizontally.
    if width > height:
        offset = (0, (side - height) // 2)
    else:
        offset = ((side - width) // 2, 0)
    canvas.paste(pil_img, offset)
    return canvas

def crop_and_resize(image, patient_record,target_size):
    """
    Crop an image to its padded lung region, shrink it and pad it to a square.

    image:          image object providing `.crop(box)`; the crop result must
                    provide `.thumbnail(size)` (presumably a PIL image - confirm
                    against the caller)
    patient_record: record accepted by `decode_both_lungs`
    target_size:    size passed to `.thumbnail`
    Returns the result of `expand2square` on the cropped, thumbnailed image
    with background value 0.
    """
    lung_mask = decode_both_lungs(patient_record)
    lung_region = image.crop(bounding_box(lung_mask))

    # thumbnail is called for its in-place effect; its return value is not used.
    lung_region.thumbnail(target_size)

    return expand2square(lung_region, 0)



In [2]:
# Data locations. The defaults are the original absolute paths; each one can be
# overridden with an environment variable so the notebook also runs on other
# machines. os.path.join(path, "") guarantees the trailing separator that the
# later `directory + filename` concatenations rely on.
mask_dir = os.path.join(
    os.environ.get(
        "CHEXMASK_DIR",
        "/home/data_shares/purrlab/physionet.org/files/chexmask-cxr-segmentation-data/0.2/OriginalResolution/",
    ),
    "",
)
input_dir = os.path.join(
    os.environ.get("CHESTXRAY14_DIR", "/home/data_shares/purrlab_students/ChestX-ray14/"),
    "",
)
output_dir = os.path.join(
    os.environ.get("CHESTXRAY14_OUTPUT_DIR", "/home/caap/Thesis-Synthex/data/chextx-ray14/"),
    "",
)

In [3]:
# Load the segmentation-mask table (from mask_dir) and the image metadata
# table (from input_dir); both directory strings end with a path separator.
chestx_ray_mask = pd.read_csv(f"{mask_dir}ChestX-Ray8.csv")
chestx_ray_df = pd.read_csv(f"{input_dir}Data_Entry_2017.csv")

In [4]:
# Findings that get their own 0/1 indicator column.
labels_to_encode = [
    'Effusion',
    'Pneumothorax',
    'Atelectasis',
    'Cardiomegaly',
    'Pneumonia',
]

def encode_labels(row, label, column_name):
    """
    Tell whether `label` is listed in a "|"-separated field of `row`.

    row:         mapping / Series holding the field
    label:       exact entry to look for (whole-entry match, not substring)
    column_name: key of the "|"-separated string inside `row`
    Returns 1 if `label` is one of the entries, otherwise 0
    """
    entries = row[column_name].split("|")
    return int(label in entries)

# Add one 0/1 indicator column per label in labels_to_encode.
# The "|"-separated findings are split once, then each label is tested with a
# Series-level map. This gives the same values as applying encode_labels row
# by row, but avoids DataFrame.apply(axis=1), which builds a Series for every
# row on every pass.
findings_per_image = chestx_ray_df['Finding Labels'].str.split("|")

for label in labels_to_encode:
    chestx_ray_df[label] = findings_per_image.map(
        lambda findings, label=label: int(label in findings)
    )


In [5]:
# Preview the mask table loaded from ChestX-Ray8.csv (rendered as the cell output).
chestx_ray_mask

Unnamed: 0,Image Index,Dice RCA (Mean),Dice RCA (Max),Landmarks,Left Lung,Right Lung,Heart,Height,Width
0,00025787_047.png,0.826465,0.870102,"488,115,449,118,411,132,374,153,344,177,324,20...",200402 3 201422 11 202442 18 203463 24 204483 ...,118243 8 119254 24 120265 40 121281 51 122302 ...,448036 9 449048 27 450064 41 451086 48 452107 ...,1024,1024
1,00026251_000.png,0.833988,0.928886,"426,124,386,128,345,146,303,171,270,203,246,23...",136822 22 137826 46 138848 52 139870 58 140892...,127399 7 128413 21 129427 35 130441 49 131458 ...,445964 26 446985 33 448007 40 449028 47 450049...,1024,1024
2,00026194_002.png,0.759308,0.794598,"439,217,402,226,371,245,345,271,324,297,311,32...",189085 9 190096 27 191113 39 192135 45 193158 ...,221645 20 222646 45 223666 50 224686 56 225706...,395796 5 396816 13 397836 21 398856 29 399877 ...,1024,1024
3,00025227_012.png,0.903819,0.930990,"397,79,357,82,314,98,274,128,236,165,212,205,1...",100976 22 101998 45 103020 49 104042 53 105064...,81288 8 82299 23 83309 40 84325 50 85346 55 86...,498141 13 499161 29 500181 36 501201 43 502221...,1024,1024
4,00028166_003.png,0.850303,0.869093,"392,180,374,184,353,201,327,225,304,250,284,27...",163435 4 164455 12 165475 20 166496 27 167516 ...,184711 4 185731 12 186750 20 187770 27 188791 ...,464455 11 465455 37 466458 59 467469 74 468485...,1024,1024
...,...,...,...,...,...,...,...,...,...
112115,00020215_002.png,0.859157,0.889629,"367,121,327,126,286,140,249,159,217,186,192,21...",125560 14 126562 42 127573 62 128595 71 129616...,124269 7 125285 23 126301 39 127317 54 128333 ...,419305 35 420324 52 421343 64 422361 74 423380...,1024,1024
112116,00019871_002.png,0.849452,0.888343,"351,185,321,196,293,228,262,268,236,312,221,35...",168575 10 169581 32 170595 45 171617 51 172640...,189791 3 190812 9 191834 14 192855 20 193876 2...,655879 23 656887 57 656966 13 657909 96 658931...,1024,1024
112117,00019721_000.png,0.866146,0.910598,"389,125,341,131,294,149,249,178,207,214,177,25...",137891 13 138892 40 139903 57 140925 63 141947...,128387 5 129403 17 130419 29 131435 41 132451 ...,496143 10 497160 29 498176 48 499193 64 500212...,1024,1024
112118,00020062_001.png,0.852984,0.902476,"396,169,361,173,326,186,292,207,262,233,238,26...",169610 10 170622 29 171633 48 172649 63 173671...,173449 7 174464 21 175480 34 176495 48 177513 ...,449037 13 450055 34 451073 50 452091 63 453111...,1024,1024


In [6]:
# Number of positive cases per encoded label, restricted to rows whose
# "View Position" is "PA".
(
    chestx_ray_df
    .loc[chestx_ray_df["View Position"] == "PA", labels_to_encode]
    .sum()
)

Effusion        6589
Pneumothorax    3407
Atelectasis     5728
Cardiomegaly    1563
Pneumonia        630
dtype: int64

In [7]:
# Keep only PA-view metadata rows and attach the mask columns; the inner join
# on "Image Index" drops images that are missing from either table.
chestx_ray_joined = pd.merge(
    chestx_ray_df.loc[chestx_ray_df["View Position"] == "PA"],
    chestx_ray_mask,
    how="inner",
    on="Image Index",
)

In [8]:
# Index every image under input_dir as [relative folder, file name] pairs.
# Only entries that really contain an "images" sub-directory are scanned, so
# stray files next to the image folders (anything that is not a .csv, e.g. a
# README) no longer make os.listdir fail. Sorting makes the listing order
# independent of the file system.
folders = sorted(
    folder
    for folder in os.listdir(input_dir)
    if not folder.endswith(".csv") and os.path.isdir(input_dir + folder + "/images/")
)
files = [
    [folder + "/images/", file]
    for folder in folders
    for file in sorted(os.listdir(input_dir + folder + "/images/"))
]

In [9]:
# One row per image file found on disk: its relative folder and its file name
# (the file name column is called "Image Index" so it can serve as a join key).
chestx_ray_file_df = pd.DataFrame(
    data=files,
    columns=["Folder", "Image Index"],
)

In [10]:
# Attach the on-disk folder of every image; the inner join drops records whose
# file was not found. Any "Folder" column left over from a previous run of this
# cell is removed first, so re-running the cell does not produce
# "Folder_x" / "Folder_y" duplicates.
chestx_ray_joined = (
    chestx_ray_joined
    .drop(columns=["Folder"], errors="ignore")
    .merge(chestx_ray_file_df, how="inner", on="Image Index")
)

In [13]:
# Persist the joined table. The path is relative, so the file is written to
# the current working directory, with the DataFrame index included.
chestx_ray_joined.to_csv(path_or_buf="chestx_ray14_test.csv")