### Script for DDSM handling

### Importing libraries

In [44]:
# Importing libraries
import numpy as np
import cv2
import os
import pydicom as dicom
import fnmatch

## Functions

In [45]:
# Cropping borders
def cropBorders(img, l=0.02, r=0.02, u=0.04, d=0.04):
    """Trim a fractional margin from each side of an image.

    Args:
        img: Array whose first two axes are rows and columns. Any further
            axes (e.g. a channel axis) are kept untouched.
        l: Fraction of the width removed from the left edge.
        r: Fraction of the width removed from the right edge.
        u: Fraction of the height removed from the top edge.
        d: Fraction of the height removed from the bottom edge.

    Returns:
        A view of ``img`` restricted to the remaining rows and columns.
    """
    # Only the first two axes are spatial; unpacking the full shape would
    # fail for arrays that carry an extra (channel) axis.
    nrows, ncols = img.shape[:2]
    # Convert the fractional margins into start/end indices.
    l_crop = int(ncols * l)
    r_crop = int(ncols * (1 - r))
    u_crop = int(nrows * u)
    d_crop = int(nrows * (1 - d))
    return img[u_crop:d_crop, l_crop:r_crop]

# MinMax Normalize
def minMaxNormalise(img):
    """Linearly rescale an array so its minimum maps to 0 and its maximum to 1.

    Args:
        img: Numeric array. Floating-point inputs keep their dtype; every
            other dtype is converted to float64 before the arithmetic so that
            integer subtraction cannot overflow.

    Returns:
        A floating-point array of the same shape. A constant input (zero
        dynamic range) yields all zeros instead of the NaNs a 0/0 division
        would produce.
    """
    arr = np.asarray(img)
    if not np.issubdtype(arr.dtype, np.floating):
        arr = arr.astype(np.float64)
    lowest = arr.min()
    span = arr.max() - lowest
    # Guard against division by zero for flat images.
    if span == 0:
        return np.zeros_like(arr)
    return (arr - lowest) / span

# Binarise Image
def Binarise(img, thresh, maxval):
    """Threshold an image into a two-level uint8 array.

    Pixels with a value of at least ``thresh`` are set to ``maxval``; all
    other pixels are 0. The result has the same shape as ``img``.
    """
    output = np.zeros(shape=img.shape, dtype=np.uint8)
    at_or_above = img >= thresh
    output[at_or_above] = maxval
    return output

# Expand to capture artefacts
def editMask(mask, ksize=(23, 23), operation="open"):
    """Clean up a binary mask morphologically, then dilate it.

    Args:
        mask: Binary mask image passed to ``cv2.morphologyEx``.
        ksize: Size of the rectangular structuring element.
        operation: ``"open"`` or ``"close"``; selects the first morphological
            pass applied before the dilation.

    Returns:
        The mask after the chosen opening/closing followed by a dilation
        with the same kernel.

    Raises:
        ValueError: If ``operation`` is neither ``"open"`` nor ``"close"``.
    """
    # Validate first: an unknown operation used to fall through both
    # branches and fail later with an UnboundLocalError.
    if operation not in ("open", "close"):
        raise ValueError(
            "operation must be 'open' or 'close', got {!r}".format(operation)
        )
    kernel = cv2.getStructuringElement(shape=cv2.MORPH_RECT, ksize=ksize)
    if operation == "open":
        edited_mask = cv2.morphologyEx(mask, cv2.MORPH_OPEN, kernel)
    else:
        edited_mask = cv2.morphologyEx(mask, cv2.MORPH_CLOSE, kernel)
    # Then dilate
    edited_mask = cv2.morphologyEx(edited_mask, cv2.MORPH_DILATE, kernel)
    return edited_mask

# Sorts contours based on area
def sortContoursByArea(contours, reverse=True):
    """Order contours by enclosed area and compute their bounding boxes.

    Args:
        contours: Iterable of contours accepted by ``cv2.contourArea``.
        reverse: When True (default) the largest contour comes first.

    Returns:
        ``(ordered, boxes)`` where ``ordered`` is the list of contours sorted
        by area and ``boxes[i]`` is ``cv2.boundingRect(ordered[i])``.
    """
    ordered = list(contours)
    ordered.sort(key=cv2.contourArea, reverse=reverse)
    boxes = list(map(cv2.boundingRect, ordered))
    return ordered, boxes

# Find largest contours
def xLargestBlobs(mask, top_x=None, reverse=True):
    """Keep only the ``top_x`` largest connected blobs of a binary mask.

    Args:
        mask: Binarised image; the regions to keep must be non-zero (white).
        top_x: Number of blobs to keep. ``None``, or a value larger than the
            number of contours found, keeps every blob.
        reverse: Passed to ``sortContoursByArea``; True sorts largest first.

    Returns:
        ``(n_contours, X_large_blobs)``: the number of external contours
        found and a uint8 array shaped like ``mask`` in which the selected
        blobs are filled with 1 and everything else is 0. When no contour is
        found the array is all zeros.
    """
    # Find all external contours. Taking the last two items keeps this
    # working on OpenCV builds that return (image, contours, hierarchy).
    contours, hierarchy = cv2.findContours(
        image=mask, mode=cv2.RETR_EXTERNAL, method=cv2.CHAIN_APPROX_NONE
    )[-2:]
    n_contours = len(contours)
    # Black canvas to draw on; also the result when there is nothing to draw.
    X_large_blobs = np.zeros(mask.shape, np.uint8)
    if n_contours == 0:
        return n_contours, X_large_blobs
    # Test for None first: comparing an int with None raises TypeError.
    if top_x is None or top_x > n_contours:
        top_x = n_contours
    # Sort contours based on contour area.
    sorted_contours, _ = sortContoursByArea(contours=contours, reverse=reverse)
    X_large_blobs = cv2.drawContours(
        image=X_large_blobs,  # Canvas the contours are drawn on.
        contours=sorted_contours[:top_x],  # The top X contours.
        contourIdx=-1,  # Draw every contour in the list.
        color=1,  # Fill value, so the result is a 0/1 mask.
        thickness=-1,  # Negative thickness fills the contour interior.
    )
    return n_contours, X_large_blobs

# Apply mask to get the right image
def applyMask(img, mask):
    """Return a copy of ``img`` with every pixel outside the mask set to 0.

    A pixel is outside the mask where ``mask`` equals 0. The input image is
    left unmodified.
    """
    outside = mask == 0
    result = img.copy()
    result[outside] = 0
    return result

# Contrast enhancement
def clahe(img, clip=2.0, tile=(8, 8)):
    """Apply CLAHE contrast enhancement to an image.

    The image is first min-max stretched to the 0-255 range, converted to
    uint8, and then equalised with OpenCV's CLAHE.

    Args:
        img: Input image.
        clip: ``clipLimit`` given to ``cv2.createCLAHE``.
        tile: ``tileGridSize`` given to ``cv2.createCLAHE``.

    Returns:
        The uint8 image produced by the CLAHE operator.
    """
    # Stretch intensities to 0-255 as 32-bit floats.
    stretched = cv2.normalize(
        img, None, alpha=0, beta=255, norm_type=cv2.NORM_MINMAX, dtype=cv2.CV_32F
    )
    # CLAHE is applied to the 8-bit version of the stretched image.
    eight_bit = stretched.astype("uint8")
    equaliser = cv2.createCLAHE(clipLimit=clip, tileGridSize=tile)
    return equaliser.apply(eight_bit)

# Image preprocess
def MammoPreprocess(img):
    """Run the full preprocessing pipeline on a mammogram.

    Steps: crop the borders, resize to 512x512, min-max normalise, remove
    artefacts by keeping only the largest bright blob, enhance contrast with
    CLAHE, and rescale the result to the 0-255 range.

    Args:
        img: 2-D mammogram pixel array.

    Returns:
        A 512x512 floating-point array with values scaled to [0, 255].
    """
    # Step 1: Initial crop and resize.
    cropped_img = cropBorders(img=img)
    # The interpolation flag must be given by keyword: the third positional
    # parameter of cv2.resize is `dst`, not `interpolation`.
    crop_resized_img = cv2.resize(
        cropped_img, (512, 512), interpolation=cv2.INTER_CUBIC
    )
    # Step 2: Min-max normalise.
    norm_img = minMaxNormalise(img=crop_resized_img)
    # Step 3: Remove artefacts. Threshold just above 0, clean the mask, and
    # keep only the single largest blob.
    binarised_img = Binarise(img=norm_img, thresh=1e-4, maxval=255)
    edited_mask = editMask(mask=binarised_img)
    _, xlargest_mask = xLargestBlobs(mask=edited_mask, top_x=1)
    masked_img = applyMask(img=norm_img, mask=xlargest_mask)
    # Step 4: CLAHE enhancement.
    clahe_img = clahe(img=masked_img)
    # Step 5: Min-max normalise and scale to 0-255.
    img_pre = minMaxNormalise(img=clahe_img) * 255
    return img_pre

def MaskPreprocess(mask):
    """Crop, resize and binarise a ROI mask so it aligns with the image.

    The mask gets the same border crop and 512x512 resize as the mammogram,
    then is thresholded so that values above 200 become 255 and all others 0.

    Args:
        mask: 2-D mask pixel array.

    Returns:
        A 512x512 mask containing only the values 0 and 255.
    """
    # Initial crop.
    cropped_mask = cropBorders(img=mask)
    # Resize. The interpolation flag must be given by keyword: the third
    # positional parameter of cv2.resize is `dst`, not `interpolation`.
    mask = cv2.resize(cropped_mask, (512, 512), interpolation=cv2.INTER_CUBIC)
    # No contour extraction here: its result was never used, and it could
    # fail on a mask that is empty after cropping.
    # Threshold it
    _, mask = cv2.threshold(mask, 200, 255, cv2.THRESH_BINARY)
    return mask


# Read the images and masks listed in a case CSV and rewrite them as preprocessed JPEGs
def dataloader(filepath, subset, max_count=100):
    """Preprocess the images and masks listed in a case-description CSV.

    For every accepted CSV row, each DICOM file under the row's image folder
    is preprocessed with ``MammoPreprocess`` and each two-valued DICOM file
    under the row's mask folder with ``MaskPreprocess``. Results are written
    as JPEGs under ``<Processed>/<subset>images/`` and
    ``<Processed>/<subset>masks/``. Consecutive masks that map to the same
    output path are merged (pixel-wise union) rather than overwritten.

    Args:
        filepath: Path of the CSV file to read.
        subset: Sub-folder appended to the output root by plain string
            concatenation, so it must end with ``/`` (e.g. ``'train/'``).
        max_count: Stop once the row tracker reaches this value. The tracker
            starts at 1 after the header, so the default handles 99 rows.
            ``None`` processes the whole file.

    Returns:
        0, always.
    """
    # Tracker: 0 until the header has been skipped, then 1 + rows handled.
    count = 0
    # Last mask written and its output path, kept across rows so that
    # several masks belonging to the same image can be merged.
    prev_mask = None
    prev_path = None
    # Path to downloaded images - path to dataset
    down_path = 'D:/Windows/torrents/manifest-ZkhPvrLo5216730872708713142/CBIS-DDSM/'
    # Where to create the file with new masks and images
    home_path = 'D:/Windows/torrents/manifest-ZkhPvrLo5216730872708713142/Processed/' + subset
    image_dir = home_path + 'images/'
    mask_dir = home_path + 'masks/'
    # cv2.imwrite does not create missing directories; without these the
    # writes below fail without raising.
    os.makedirs(image_dir, exist_ok=True)
    os.makedirs(mask_dir, exist_ok=True)
    # Main code
    with open(str(filepath), "r") as input_file:
        for line in input_file:
            # Ignore the first line of the csv (header).
            if count == 0:
                count = 1
                continue
            # Ignore lines that start with a double quote.
            # NOTE(review): presumably continuation lines of multi-line
            # quoted fields; the csv module would parse these properly.
            if line[0] == '"':
                continue
            fields = line.split(',')
            # Columns 11 and 13 hold the image and mask file paths; only the
            # first path component (the case folder) is used.
            img_folder = fields[11].replace('"', '').split('/')[0]
            msk_folder = fields[13].replace('"', '').split('/')[0]
            img_path = down_path + img_folder
            msk_path = down_path + msk_folder
            # Both outputs are named after the image folder so that an image
            # and its mask share the same file name.
            image_path = image_dir + img_folder + '.jpg'
            mask_path = mask_dir + img_folder + '.jpg'
            # Read images
            for path, _subdirs, files in os.walk(img_path):
                for name in files:
                    img = dicom.dcmread(os.path.join(path, name)).pixel_array
                    img = MammoPreprocess(img)
                    cv2.imwrite(image_path, img)
                    print(image_path)
            # Read masks
            for path, _subdirs, files in os.walk(msk_path):
                for name in files:
                    mask = dicom.dcmread(os.path.join(path, name)).pixel_array
                    # Only two-valued files are treated as masks; anything
                    # else in the folder is skipped.
                    if len(np.unique(mask)) != 2:
                        continue
                    mask = MaskPreprocess(mask)
                    # Same output path as the previous mask: merge the two.
                    if prev_path == mask_path:
                        print(mask_path)
                        # Masks are 0/255, so the element-wise maximum is
                        # their union and cannot overflow the way a sum can.
                        mask = np.maximum(mask, prev_mask)
                    cv2.imwrite(mask_path, mask)
                    # Prepare for next iteration
                    prev_path = mask_path
                    prev_mask = mask
            count = count + 1
            print(count)
            if max_count is not None and count >= max_count:
                break
    return 0

### Main code

In [46]:
# Case-description CSV files for the training and test splits.
filepath_1 = 'C:/Users/xatzo/Downloads/mass_case_description_train_set.csv'
filepath_2 = 'C:/Users/xatzo/Downloads/mass_case_description_test_set.csv'

# Process each split. The second argument is concatenated onto the output
# root inside dataloader, so it must end with '/'.
result = dataloader(filepath_1, 'train/')
result = dataloader(filepath_2, 'test/')

D:/Windows/torrents/manifest-ZkhPvrLo5216730872708713142/Processed/train/images/Mass-Training_P_00001_LEFT_CC.jpg
2
D:/Windows/torrents/manifest-ZkhPvrLo5216730872708713142/Processed/train/images/Mass-Training_P_00001_LEFT_MLO.jpg
3
D:/Windows/torrents/manifest-ZkhPvrLo5216730872708713142/Processed/train/images/Mass-Training_P_00004_LEFT_CC.jpg
4
D:/Windows/torrents/manifest-ZkhPvrLo5216730872708713142/Processed/train/images/Mass-Training_P_00004_LEFT_MLO.jpg
5
D:/Windows/torrents/manifest-ZkhPvrLo5216730872708713142/Processed/train/images/Mass-Training_P_00004_RIGHT_MLO.jpg
6
D:/Windows/torrents/manifest-ZkhPvrLo5216730872708713142/Processed/train/images/Mass-Training_P_00009_RIGHT_CC.jpg
7
D:/Windows/torrents/manifest-ZkhPvrLo5216730872708713142/Processed/train/images/Mass-Training_P_00009_RIGHT_MLO.jpg
8
D:/Windows/torrents/manifest-ZkhPvrLo5216730872708713142/Processed/train/images/Mass-Training_P_00015_LEFT_MLO.jpg
9
D:/Windows/torrents/manifest-ZkhPvrLo5216730872708713142/Process

67
D:/Windows/torrents/manifest-ZkhPvrLo5216730872708713142/Processed/train/images/Mass-Training_P_00092_LEFT_CC.jpg
68
D:/Windows/torrents/manifest-ZkhPvrLo5216730872708713142/Processed/train/images/Mass-Training_P_00092_LEFT_MLO.jpg
69
D:/Windows/torrents/manifest-ZkhPvrLo5216730872708713142/Processed/train/images/Mass-Training_P_00092_LEFT_MLO.jpg
D:/Windows/torrents/manifest-ZkhPvrLo5216730872708713142/Processed/train/masks/Mass-Training_P_00092_LEFT_MLO.jpg
70
D:/Windows/torrents/manifest-ZkhPvrLo5216730872708713142/Processed/train/images/Mass-Training_P_00092_RIGHT_CC.jpg
71
D:/Windows/torrents/manifest-ZkhPvrLo5216730872708713142/Processed/train/images/Mass-Training_P_00092_RIGHT_MLO.jpg
72
D:/Windows/torrents/manifest-ZkhPvrLo5216730872708713142/Processed/train/images/Mass-Training_P_00092_RIGHT_MLO.jpg
D:/Windows/torrents/manifest-ZkhPvrLo5216730872708713142/Processed/train/masks/Mass-Training_P_00092_RIGHT_MLO.jpg
73
D:/Windows/torrents/manifest-ZkhPvrLo5216730872708713142/Pr

26
D:/Windows/torrents/manifest-ZkhPvrLo5216730872708713142/Processed/test/images/Mass-Test_P_00145_LEFT_MLO.jpg
27
D:/Windows/torrents/manifest-ZkhPvrLo5216730872708713142/Processed/test/images/Mass-Test_P_00147_RIGHT_CC.jpg
28
D:/Windows/torrents/manifest-ZkhPvrLo5216730872708713142/Processed/test/images/Mass-Test_P_00147_RIGHT_MLO.jpg
29
D:/Windows/torrents/manifest-ZkhPvrLo5216730872708713142/Processed/test/images/Mass-Test_P_00156_RIGHT_MLO.jpg
30
D:/Windows/torrents/manifest-ZkhPvrLo5216730872708713142/Processed/test/images/Mass-Test_P_00158_RIGHT_MLO.jpg
31
D:/Windows/torrents/manifest-ZkhPvrLo5216730872708713142/Processed/test/images/Mass-Test_P_00159_RIGHT_MLO.jpg
32
D:/Windows/torrents/manifest-ZkhPvrLo5216730872708713142/Processed/test/images/Mass-Test_P_00171_RIGHT_CC.jpg
33
D:/Windows/torrents/manifest-ZkhPvrLo5216730872708713142/Processed/test/images/Mass-Test_P_00171_RIGHT_MLO.jpg
34
D:/Windows/torrents/manifest-ZkhPvrLo5216730872708713142/Processed/test/images/Mass-Test

95
D:/Windows/torrents/manifest-ZkhPvrLo5216730872708713142/Processed/test/images/Mass-Test_P_00405_LEFT_CC.jpg
96
D:/Windows/torrents/manifest-ZkhPvrLo5216730872708713142/Processed/test/images/Mass-Test_P_00405_LEFT_MLO.jpg
97
D:/Windows/torrents/manifest-ZkhPvrLo5216730872708713142/Processed/test/images/Mass-Test_P_00409_RIGHT_CC.jpg
98
D:/Windows/torrents/manifest-ZkhPvrLo5216730872708713142/Processed/test/images/Mass-Test_P_00409_RIGHT_MLO.jpg
99
D:/Windows/torrents/manifest-ZkhPvrLo5216730872708713142/Processed/test/images/Mass-Test_P_00429_LEFT_CC.jpg
100
