In [1]:
import pandas as pd
import numpy as np
import os
import pydicom
from tqdm import tqdm
from skimage.io import imsave
from skimage.transform import resize

In [2]:
data_path = r'D:\Dataset\DukeDataset\ClassicDataPath\manifest-1654812109500'
boxes_path = r'D:\Dataset\DukeDataset\OtherFiles\Annotation_Boxes.CSV'
mapping_path = r'D:\Dataset\DukeDataset\OtherFiles\Breast-Cancer-MRI-filepath_filename-mapping.CSV'
target_png_dir = r'C:\Users\USER\Desktop\Directory\DukedataProcessing\png_out_64'
if not os.path.exists(target_png_dir):
	os.makedirs(target_png_dir)

In [3]:
boxes_df = pd.read_csv(boxes_path)
display(boxes_df)

Unnamed: 0,Patient ID,Start Row,End Row,Start Column,End Column,Start Slice,End Slice
0,Breast_MRI_001,234,271,308,341,89,112
1,Breast_MRI_002,251,294,108,136,59,72
2,Breast_MRI_003,351,412,82,139,96,108
3,Breast_MRI_004,262,280,193,204,86,95
4,Breast_MRI_005,188,213,138,178,76,122
...,...,...,...,...,...,...,...
917,Breast_MRI_918,345,395,338,395,62,85
918,Breast_MRI_919,285,312,369,397,98,109
919,Breast_MRI_920,172,193,337,355,87,101
920,Breast_MRI_921,328,374,404,446,97,121


In [4]:
# only consider fat-satured "pre" MR exams
mapping_df = pd.read_csv(mapping_path)
mapping_df = mapping_df[mapping_df['original_path_and_filename'].str.contains('pre')] # Boolean indexing

# remove entries from patients that we are not including (we only include patients 1 to 100)
# join을 통한 문자열 연결
# '|'이 들어간 문자열은 대개 정규 표현식에서 사용되는데, 문자열 중 어느 하나라도 포함되어 있으면 match된다는 의미. 
crossref_pattern = '|'.join(["DICOM_Images/Breast_MRI_{:03d}".format(s) for s in list(range(1, 101))])
mapping_df = mapping_df[mapping_df['original_path_and_filename'].str.contains(crossref_pattern)]

# mapping_df는 pre가 들어간 dicom파일로 boolean indexing된 후에 "DICOM_Images/Breast_MRI_001~100"으로 한 번더 boolean indexing된다.
# mapping_df의 각 행은 하나의 전체 3D MRI 볼륨의 각각의 슬라이스를 나타낸다.

  mapping_df = pd.read_csv(mapping_path)


In [5]:
display(mapping_df)

Unnamed: 0,sop_instance_UID,original_path_and_filename,classic_path,descriptive_path,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10,Unnamed: 11,Unnamed: 12,Unnamed: 13,Unnamed: 14,series_sort
640,1.3.6.1.4.1.14519.5.2.1.1622543824533841316221...,DICOM_Images/Breast_MRI_001/pre/Breast_MRI_001...,Duke-Breast-Cancer-MRI/Breast_MRI_001/1.3.6.1....,Duke-Breast-Cancer-MRI/BreastMRI001/01-01-1990...,,,,,,,,,,,,
641,1.3.6.1.4.1.14519.5.2.1.2095737470789034615864...,DICOM_Images/Breast_MRI_001/pre/Breast_MRI_001...,Duke-Breast-Cancer-MRI/Breast_MRI_001/1.3.6.1....,Duke-Breast-Cancer-MRI/BreastMRI001/01-01-1990...,,,,,,,,,,,,
642,1.3.6.1.4.1.14519.5.2.1.7648352393594634493272...,DICOM_Images/Breast_MRI_001/pre/Breast_MRI_001...,Duke-Breast-Cancer-MRI/Breast_MRI_001/1.3.6.1....,Duke-Breast-Cancer-MRI/BreastMRI001/01-01-1990...,,,,,,,,,,,,
643,1.3.6.1.4.1.14519.5.2.1.2333724865431423854264...,DICOM_Images/Breast_MRI_001/pre/Breast_MRI_001...,Duke-Breast-Cancer-MRI/Breast_MRI_001/1.3.6.1....,Duke-Breast-Cancer-MRI/BreastMRI001/01-01-1990...,,,,,,,,,,,,
644,1.3.6.1.4.1.14519.5.2.1.3037519639975787441522...,DICOM_Images/Breast_MRI_001/pre/Breast_MRI_001...,Duke-Breast-Cancer-MRI/Breast_MRI_001/1.3.6.1....,Duke-Breast-Cancer-MRI/BreastMRI001/01-01-1990...,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
83129,1.3.6.1.4.1.14519.5.2.1.3043945846749761802398...,DICOM_Images/Breast_MRI_100/pre/Breast_MRI_100...,Duke-Breast-Cancer-MRI/Breast_MRI_100/1.3.6.1....,Duke-Breast-Cancer-MRI/BreastMRI100/01-01-1990...,,,,,,,,,,,,
83130,1.3.6.1.4.1.14519.5.2.1.1170544914161985772074...,DICOM_Images/Breast_MRI_100/pre/Breast_MRI_100...,Duke-Breast-Cancer-MRI/Breast_MRI_100/1.3.6.1....,Duke-Breast-Cancer-MRI/BreastMRI100/01-01-1990...,,,,,,,,,,,,
83131,1.3.6.1.4.1.14519.5.2.1.1532615816262730992890...,DICOM_Images/Breast_MRI_100/pre/Breast_MRI_100...,Duke-Breast-Cancer-MRI/Breast_MRI_100/1.3.6.1....,Duke-Breast-Cancer-MRI/BreastMRI100/01-01-1990...,,,,,,,,,,,,
83132,1.3.6.1.4.1.14519.5.2.1.2656617917685995096480...,DICOM_Images/Breast_MRI_100/pre/Breast_MRI_100...,Duke-Breast-Cancer-MRI/Breast_MRI_100/1.3.6.1....,Duke-Breast-Cancer-MRI/BreastMRI100/01-01-1990...,,,,,,,,,,,,


In [6]:
def save_dcm_slice(dcm_fname, label, vol_idx, vol_png_path):
    # create a path to save the slice .png file in, according to the original DICOM filename and target label
    png_path = dcm_fname.split('\\')[-1].replace('.dcm', '-{}.png'.format(vol_idx)) # ex) png_path = 1-001-1.png
    label_dir = 'pos' if label == 1 else 'neg'
    png_path = os.path.join(vol_png_path, label_dir, png_path)

    if not os.path.exists(png_path):
        # only make the png image if it doesn't already exist (if you're running this after the first time)

        # load DICOM file with pydicom library
        try:            
            dcm = pydicom.dcmread(dcm_fname)
        except FileNotFoundError:
            # fix possible errors in filename from list
            dcm_fname_split = dcm_fname.split('\\')
            dcm_fname_end = dcm_fname_split[-1]
            assert dcm_fname_end.split('-')[1][0] == '0'

            dcm_fname_end_split = dcm_fname_end.split('-')
            dcm_fname_end = '-'.join([dcm_fname_end_split[0], dcm_fname_end_split[1][1:]])

            dcm_fname_split[-1] = dcm_fname_end
            dcm_fname = '\\'.join(dcm_fname_split)
            dcm = pydicom.dcmread(dcm_fname)


        # DICOM을 픽셀 intensity 값의 숫자 배열로 변환한다.
        img = dcm.pixel_array
        img = img.astype(float) * 255. / img.max() 
        # convert from float -> uint8        
        img = img.astype(np.uint8)
        
        # invert image if necessary, according to DICOM metadata
        # 필요한 경우 DICOM metadat를 사용해 invert한다.
        # https://dicom.innolitics.com/ciods/rt-dose/image-pixel/00280004
        img_type = dcm.PhotometricInterpretation
        if img_type == "MONOCHROME1":
            img = np.invert(img) # 2의 보수를 반환하여 numpy 배열을 반전시킴. 의료용 이미지 처리에서 이미지 색상이 검은색인 경우 해당 색상을 흰색으로 변환한다.

        # 이미지 크기를 64x64로 변환한다
        img_resized = resize(img, (64, 64), anti_aliasing=True)
        img_resized = (img_resized * 255).astype(np.uint8)  # 다시 uint8로 변환

        # save final .png
        # imsave(png_path, img)
        imsave(png_path, img_resized)

In [7]:
# 각 클래스에서 추출된 실험의 개수
ct_negative = 0
ct_positive = 0

# 각 환자 볼륨의 반복 인덱스 초기화
vol_idx = -1

for row_idx, row in tqdm(mapping_df.iterrows(), total=len(mapping_df)): # 전체 데이터 저장
    # indices start at 1 here
    """
    'original_path_and_filename' : ex) DICOM_Images/Breast_MRI_001/post_1/Breast_MRI_001_post_1_001.dcm 

    new_vol_idx: 'Breast_MRI_001'-> '001' 
    slice_idx: 'Breast_MRI_001_pre_001.dcm' -> '001.dcm' -> .dcm을 '' (공백으로). 
    new_vol_idx, slice_idx 둘 다 int로 변환될 때, 세 글자 포맷팅이 날라간다. (001이 아니라 1로 저장.)
    """
    # 환자 전체 데이터인 볼륨의 인덱스와 볼륨의 슬라이스 인덱스를 추출한다.
    new_vol_idx = int((row['original_path_and_filename'].split('/')[1]).split('_')[-1])
    slice_idx = int(((row['original_path_and_filename'].split('/')[-1]).split('_')[-1]).replace('.dcm', ''))    

    # volume의 start, end slice를 지정하는 것이라 볼륨의 각 dcm파일을 모두 순회해야 다음 볼륨으로 넘어간다.
    if new_vol_idx != vol_idx:
        box_row = boxes_df.iloc[[new_vol_idx-1]] # iloc: int인덱스로 행 선택. iloc[]안에 []리스트를 사용하면 결과가 DataFrame으로 반환된다. []로 감싸지 않으면 series로 반환.
        start_slice = int(box_row['Start Slice'].iloc[0]) # 엑셀의 2행 기준으로 값: 89
        end_slice = int(box_row['End Slice'].iloc[0]) # 엑셀의 2행 기준으로 값: 112
              
        assert end_slice >= start_slice # assertion에서 조건이 false면 프로그램을 중단하고 에러 메시지를 출력한다.
    vol_idx = new_vol_idx
    
    vol_png_path = os.path.join(target_png_dir, str(vol_idx))
    if not os.path.exists(vol_png_path):
        os.makedirs(vol_png_path)
        os.makedirs(os.path.join(vol_png_path, 'pos'))
        os.makedirs(os.path.join(vol_png_path, 'neg'))

    # get DICOM filename
    dcm_fname = str(row['classic_path'])
    dcm_fname = dcm_fname.replace("/", "\\")
    dcm_fname = os.path.join(data_path, dcm_fname)

    # 슬라이스 레이블 결정:
    # (1) 3D box범위 안이면, 양성으로 저장
    if slice_idx >= start_slice and slice_idx < end_slice: # slice_idx가 tumor 범위 안에 있으면 true.
        save_dcm_slice(dcm_fname, 1, vol_idx, vol_png_path)
        ct_positive += 1

    # (2) 3D box 범위에서 5슬라이스 이상 차이나면 음성으로 저장 
    elif (slice_idx + 5) <= start_slice or (slice_idx - 5) > end_slice:
        save_dcm_slice(dcm_fname, 0, vol_idx, vol_png_path)
        ct_negative += 1

100%|██████████| 17116/17116 [01:59<00:00, 143.13it/s]
