In [223]:
import cv2
import glob
from tqdm import tqdm
import numpy as np
import pandas as pd

import pydicom
from pydicom.pixel_data_handlers import apply_windowing

import matplotlib.pyplot as plt
%matplotlib inline

from libs.image_processing import to_3_channels, roi_extraction_cv2, show_img_and_roi, crop_img, resize_and_pad, check_mkdir

## BMCD

In [246]:
def bmcd_to_png(bmcd_dcm):
    patient_id = bmcd_dcm.split("/")[-2]
    image_id = patient_id + "_" + bmcd_dcm.split("/")[-1].split(".")[0] 
    
    dcmfile = pydicom.dcmread(bmcd_dcm)
    img, is_monochrome1 = dcmfile.pixel_array, dcmfile.PhotometricInterpretation == 'MONOCHROME1'

    img = to_3_channels(img, is_monochrome1)
    roi = roi_extraction_cv2(img)
    img = crop_img(img, roi)
    img = resize_and_pad(img)

    save_path = f"/home/FanHuang247817/train_images_png2/bmcd_{patient_id}/{image_id}.png"
    check_mkdir(save_path)
    cv2.imwrite(save_path, img)

    return "bmcd_" + patient_id, image_id

In [247]:
bmcd_dcms = glob.glob("./external/BMCD/*/*/*dcm")
bmcd_patient_ids, bmcd_image_ids = [], []
for bmcd_dcm in tqdm(bmcd_dcms):
    p_id, i_id = bmcd_to_png(bmcd_dcm)
    bmcd_patient_ids.append(p_id)
    bmcd_image_ids.append(i_id)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 154/154 [01:39<00:00,  1.55it/s]


## CMMD

In [259]:
def cmmd_to_png(cmmd_dcm):
    patient_id = cmmd_dcm.split("/")[-4]
    image_id = patient_id + "_" + cmmd_dcm.split("/")[-1][:-4]
    
    dcmfile = pydicom.dcmread(cmmd_dcm)
    img, is_monochrome1 = dcmfile.pixel_array, dcmfile.PhotometricInterpretation == 'MONOCHROME1'
    img = apply_windowing(img, dcmfile)

    img = to_3_channels(img, is_monochrome1)
    roi = roi_extraction_cv2(img)
    img = crop_img(img, roi)
    img = resize_and_pad(img)

    save_path = f"/home/FanHuang247817/train_images_png2/{patient_id}/{image_id}.png"
    check_mkdir(save_path)
    cv2.imwrite(save_path, img)

    return patient_id, image_id

In [260]:
df_cmmd = pd.read_excel("/home/FanHuang247817/rsna_breast/external/CMMD/CMMD_clinicaldata_revision.xlsx")
df_cmmd.head()

Unnamed: 0,ID1,LeftRight,Age,number,abnormality,classification,subtype
0,D1-0001,R,44,2,calcification,Benign,
1,D1-0002,L,40,2,calcification,Benign,
2,D1-0003,L,39,2,calcification,Benign,
3,D1-0004,L,41,2,calcification,Benign,
4,D1-0005,R,42,2,calcification,Benign,


In [261]:
cmmd_malignant = df_cmmd.loc[df_cmmd["classification"]=="Malignant", "ID1"].tolist()
cmmd_dcms = glob.glob("/home/FanHuang247817/rsna_breast/external/CMMD/CMMD/*/*/*/*.dcm")

cmmd_malignant_dcms = []
for cmmd_dcm in cmmd_dcms:
    if cmmd_dcm.split("/")[-4] in set(cmmd_malignant):
        cmmd_malignant_dcms.append(cmmd_dcm)

In [262]:
cmmd_patient_ids, cmmd_image_ids = [], []
for cmmd_dcm in tqdm(cmmd_malignant_dcms):
    p_id, i_id = cmmd_to_png(cmmd_dcm)
    cmmd_patient_ids.append(p_id), cmmd_image_ids.append(i_id)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1654/1654 [09:16<00:00,  2.97it/s]


## DDSM

In [300]:
def ddsm_to_png(ddsm_png):
    patient_id = "ddsm_" + ddsm_png.split("/")[-2]
    image_id = ddsm_png.split("/")[-1]
    
    img = cv2.imread(ddsm_png)
    roi = roi_extraction_cv2(img)
    img = crop_img(img, roi)
    img = resize_and_pad(img)

    save_path = f"/home/FanHuang247817/train_images_png2/{patient_id}/{image_id}"
    check_mkdir(save_path)
    cv2.imwrite(save_path, img)

    return patient_id, image_id[:-4]

In [301]:
ddsm_pngs = glob.glob("/home/FanHuang247817/rsna_breast/external/DDSM/*/*.png")
ddsm_pos = [png.replace("_Mask2", "") for png in ddsm_pngs if "Mask" in png]
ddsm_pos = [png.replace("_Mask", "") for png in ddsm_pos if "Mask" in png]

In [302]:
ddsm_patient_ids, ddsm_image_ids = [], []
for ddsm in tqdm(ddsm_pos):
    p_id, i_id = ddsm_to_png(ddsm)
    ddsm_patient_ids.append(p_id), ddsm_image_ids.append(i_id)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1429/1429 [06:31<00:00,  3.65it/s]


## Generate Pickle 

In [283]:
patient_ids = ddsm_patient_ids + cmmd_patient_ids + cmmd_patient_ids
image_ids = ddsm_image_ids + cmmd_image_ids + cmmd_image_ids
print(len(patient_ids), len(image_ids))

4737 4737


In [293]:
site_id = [1] * len(patient_ids)
laterality = ["L"] * len(patient_ids)
view = ["CC"] * len(patient_ids)
age = [42.0] * len(patient_ids)
cancer = [1] * len(patient_ids)
biopsy = [0] * len(patient_ids)
invasive = [0] * len(patient_ids)
BIRADS = [1.0] * len(patient_ids)
implant = [0] * len(patient_ids)
density = ["C"] * len(patient_ids)
machine_id = [49] * len(patient_ids)
difficult_negative_case = [False] * len(patient_ids)
patient_view = ["666L"] * len(patient_ids)

dict_ = {'site_id': site_id, 'patient_id': patient_ids, 'image_id': image_ids, 
        'laterality': laterality, "view": view, "age": age, "cancer": cancer, "biopsy": biopsy,
        'invasive': invasive, 'BIRADS': BIRADS, 'implant': implant, "density": density, "machine_id": machine_id,
        'difficult_negative_case': difficult_negative_case, 'patient_view': patient_view} 
   
df_ex = pd.DataFrame(dict_)

In [294]:
df_ex.head()

Unnamed: 0,site_id,patient_id,image_id,laterality,view,age,cancer,biopsy,invasive,BIRADS,implant,density,machine_id,difficult_negative_case,patient_view
0,1,ddsm_1488,A_1488_1.LEFT_MLO,L,CC,42.0,1,0,0,1.0,0,C,49,False,666L
1,1,ddsm_0133,C_0133_1.LEFT_MLO,L,CC,42.0,1,0,0,1.0,0,C,49,False,666L
2,1,ddsm_0133,C_0133_1.LEFT_CC,L,CC,42.0,1,0,0,1.0,0,C,49,False,666L
3,1,ddsm_1626,A_1626_1.LEFT_MLO,L,CC,42.0,1,0,0,1.0,0,C,49,False,666L
4,1,ddsm_1626,A_1626_1.LEFT_CC,L,CC,42.0,1,0,0,1.0,0,C,49,False,666L


In [296]:
df_ex.to_pickle(f'./df/train_ex.pkl')