**Dataset Extraction & Augmentation** \\
Spring 2024, Senior Thesis \\
Christian Granados

Details the datasets used for image segmentation and links to their sources. This notebook also outlines and performs data augmentation techniques on these datasets due to their low volume. Techniques are guided by modifications performed in research papers covered in the literature review portion of this senior thesis.

HAM10000 is a skin lesion, biomedical image segmentation dataset. It was chosen for its large amount of data, large diversity of data, and expert-tuned binary masks (the generation process for which has been detailed in a research paper).

[Link to Harvard Dataverse Page](https://dataverse.harvard.edu/dataset.xhtml;jsessionid=38ff811c3afc0eb08b5092945918?persistentId=doi:10.7910/DVN/DBW86T) \\
[Link to Segmentation Masks Dataset](https://www.kaggle.com/datasets/tschandl/ham10000-lesion-segmentations) \\
[Human–Computer Collaboration for Skin Cancer Recognition](https://www.nature.com/articles/s41591-020-0942-0)

**Methods**
- Addition of Random Noise (Salt & Pepper)
- Subselection via Cropping
- Flipping, Rotation, and Scaling
- Brightness & Contrast

In [None]:
from google.colab.patches import cv2_imshow
from collections import defaultdict
import matplotlib.pyplot as plt
from google.colab import files
from scipy.stats import norm
import seaborn as sns
import pandas as pd
import numpy as np
import random
import string
import glob
import cv2
import os
import re

In [None]:
# Mount Google Drive at /content/drive; every dataset path below lives under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
def add_random_sp_noise(img, prop = 0.005):
    '''
    Returns a copy of the image with salt and pepper
    noise added; the input is left untouched.
    Prop denotes the proportion of pixel locations
    that are replaced by noise: half of them become
    black (0) and half become white (255). A noisy
    location is overwritten in every channel, so both
    greyscale (H x W) and multi-channel (H x W x C)
    images are supported. Assumed pixel values
    between 0 and 255.

    img - cv2.Mat object or numpy array
    prop - float between 0 and 1 (inclusive)
    '''
    imgc = img.copy()

    # One uniform draw per pixel location (not per channel), so all
    # channels of a noisy pixel receive the same value.
    probs = np.random.random(imgc.shape[:2])

    # A 2-D boolean mask selects whole pixels; the scalar broadcasts over
    # any trailing channel axis, so no per-shape special casing is needed.
    imgc[probs < (prop / 2)] = 0
    imgc[probs > (1 - (prop / 2))] = 255

    return imgc

In [None]:
def random_crop_selection(img, mask, sx, sy):
    '''
    Returns a random sub-selection of the image and
    ground-truth mask that is sx wide and sy tall.
    The same window is cut from both arrays, and the
    returned crops are copies (the inputs are not
    modified). The mask is assumed to have the same
    height and width as the image.

    img, mask - cv2.Mat objects or numpy arrays
    sx, sy - integers > 0 and <= size of image

    Raises ValueError if the requested crop does not
    fit inside the image.
    '''
    sx, sy = int(sx), int(sy)

    # Only the two leading axes matter, so greyscale arrays work too.
    h, w = img.shape[:2]
    if not (0 < sx <= w and 0 < sy <= h):
        raise ValueError(
            "crop of %d x %d does not fit inside a %d x %d image" % (sx, sy, w, h)
        )

    # x offsets are bounded by the width and y offsets by the height.
    # randint's upper bound is exclusive, hence the + 1 so that the
    # last valid offset (and a full-size crop) stays reachable.
    rx = np.random.randint(low = 0, high = w - sx + 1)
    ry = np.random.randint(low = 0, high = h - sy + 1)

    return (
        img[ry : ry + sy, rx : rx + sx].copy(),
        mask[ry : ry + sy, rx : rx + sx].copy()
        )

In [None]:
def rand_flip_rot_scale(img, mask, flip = True, rotate = True, rot_max = 360, scale = True, scal_min = 0.8, scal_max = 2.0):
    '''
    Depending on whether flip, rotate, and scale are
    set to true, applies the same random geometric
    transform to copies of the passed in image and mask:
    a random flip (horiz, vert, both, none), a rotation
    by a uniformly random whole number of degrees on
    [0, rot_max], and a scaling about the image centre
    by a uniformly random float on [scal_min, scal_max].
    The output keeps the input height and width.

    Scaling is applied even when rotate is False (the
    rotation angle is then 0).

    img, mask - cv2.Mat objects or numpy arrays
    flip, rotate, scale - boolean; whether to use functionality or not
    rot_max - integer [0, 360]; maximum degrees to rotate image by
    scal_min, scal_max - float [0, inf); minimum and maximum scale factors
    '''
    imgc = img.copy()
    maskc = mask.copy()

    # -2 stands for "no flip"; -1, 0 and 1 are passed to cv2.flip as the
    # flip code (negative = both axes, 0 = vertical, positive = horizontal).
    t = int(np.random.randint(low = -2, high = 2))
    if flip and (t != -2):
        # cv2.flip handles multi-channel arrays directly, so the mask no
        # longer has to share the image's channel count.
        imgc = cv2.flip(imgc, t)
        maskc = cv2.flip(maskc, t)

    scl = 1.0
    if scale:
        scl = float(np.random.random_sample() * (scal_max - scal_min) + scal_min)

    ang = 0
    if rotate:
        ang = int(np.random.randint(low = 0, high = rot_max + 1))

    # Rotation and scaling share one affine warp; run it when either is
    # requested so that scale = True is honoured without rotation.
    if rotate or scale:
        h, w = imgc.shape[:2]
        center = (w // 2, h // 2)
        rot_mat = cv2.getRotationMatrix2D(center = center, angle = ang, scale = scl)
        imgc = cv2.warpAffine(imgc, rot_mat, (w, h))
        # Nearest-neighbour keeps mask values from being blended into
        # in-between greys along the lesion boundary.
        maskc = cv2.warpAffine(maskc, rot_mat, (w, h), flags = cv2.INTER_NEAREST)

    return (imgc, maskc)


In [None]:
def rand_brig_cont(img, brig_min = -75, brig_max = 75, cont_min = 0.7, cont_max = 1.3):
    '''
    For an image, will uniformly select a contrast
    factor between cont_min and cont_max and a
    brightness offset between brig_min and brig_max,
    then return a new uint8 image computed as
    pixel * contrast + brightness, rounded and
    clamped to [0, 255]. The input is not modified.

    Values that fall below 0 are clamped to 0 rather
    than mirrored to their absolute value, so
    darkening never brightens dark pixels.

    img - cv2.Mat or Numpy array
    brig_min, brig_max - (-inf, inf) float
    cont_min, cont_max - float (0, inf]
    '''
    # Contrast is drawn before brightness (same draw order as before).
    c = np.random.random_sample() * (cont_max - cont_min) + cont_min
    b = np.random.random_sample() * (brig_max - brig_min) + brig_min

    # Work in float so neither the product nor the offset can wrap around
    # in uint8 arithmetic before the final clamp.
    adj = np.asarray(img, dtype = np.float64) * c + b
    adj_imgc = np.clip(np.rint(adj), 0, 255).astype(np.uint8)

    return adj_imgc

Retrieves mask and images file names from downloaded HAM10000 dataset. Removes duplicates, non-matching images/masks, or those in the wrong format.

In [None]:
# WARNING: this cell permanently deletes files from the dataset folders.
imgs_list = glob.glob('/content/drive/MyDrive/Granados_Thesis_SP24/HAM10000/HAM10000_IMGS/*')
masks_list = glob.glob('/content/drive/MyDrive/Granados_Thesis_SP24/HAM10000/HAM10000_MASKS/*')

# Maps an image id (the digits after the first leading 0) to a counter:
# +1 when the image is seen, -1 for every mask found with the same id.
# 0 therefore means "exactly one image and one mask".
a = {}
for f in imgs_list:
    m = re.search(r"0(\d+)\.jpg", f)
    if m is None:
        # Not in the expected ISIC .jpg naming scheme: report and delete.
        # Checking the match explicitly (instead of a bare except) means an
        # interrupt or an unrelated error can no longer trigger a deletion.
        print(f)
        os.remove(f)
        continue

    cf = m.group(1)
    if cf not in a:
        a[cf] = 1
    else:
        # Second file resolving to an id that was already seen
        os.remove(f)

for f in masks_list:
    m = re.search(r"0(\d+)_segmentation\.png", f)
    if m is None:
        # Not in the expected *_segmentation.png naming scheme
        print(f)
        os.remove(f)
        continue

    cf = m.group(1)
    if cf not in a:
        # Mask without a corresponding image
        os.remove(f)
    else:
        a[cf] -= 1

for k, v in a.items():
    if v == 1:
        # Image that never got a mask
        os.remove("/content/drive/MyDrive/Granados_Thesis_SP24/HAM10000/HAM10000_IMGS/ISIC_0" + str(k) + ".jpg")
    elif v == -1:
        # Id that matched two masks
        os.remove("/content/drive/MyDrive/Granados_Thesis_SP24/HAM10000/HAM10000_MASKS/ISIC_0" + str(k) + "_segmentation.png")

# Drop the paths that were just deleted so that later cells iterating over
# these lists only see files that still exist.
imgs_list = [f for f in imgs_list if os.path.exists(f)]
masks_list = [f for f in masks_list if os.path.exists(f)]

Resizing images from 450 x 600 $\to$ 256 x 256. Initially the images will be center cropped to get images of size 450 x 450, then rescaled to 256 x 256. Square images whose side length is a power of two (2) will make model building significantly easier.  

In [None]:
# cv2.imwrite does not create missing folders (it just reports failure),
# so make sure the output folder exists before the loop starts.
os.makedirs(r"/content/drive/MyDrive/Granados_Thesis_SP24/HAM10000/HAM10000_IMGS_CROP", exist_ok = True)

for i, f in enumerate(imgs_list):
    img = cv2.imread(f)
    if img is None:
        # cv2.imread returns None (no exception) for a missing/unreadable file
        print("Unreadable image:", i, f)
        continue
    h, w = img.shape[:2]

    # Centre-crop any side longer than 450 px down to 450 px
    if h > 450:
        d = (h - 450) // 2
        img = img[d:d+450, :, :]

    if w > 450:
        d = (w - 450) // 2
        img = img[:, d:d+450, :]

    resimg = cv2.resize(img, (256, 256), interpolation = cv2.INTER_CUBIC)

    # The regex drops the first leading 0 of the id, so it is re-added here
    fid = "0" + re.search(r"0(\d+)\.jpg", f).group(1)
    fpath = r"/content/drive/MyDrive/Granados_Thesis_SP24/HAM10000/HAM10000_IMGS_CROP/ISIC_" + fid + ".jpg"
    tmp = cv2.imwrite(fpath, resimg)
    if not tmp:
        # Report the index of any image that failed to save
        print(tmp, i)


In [None]:
# cv2.imwrite does not create missing folders (it just reports failure),
# so make sure the output folder exists before the loop starts.
os.makedirs(r"/content/drive/MyDrive/Granados_Thesis_SP24/HAM10000/HAM10000_MASKS_CROP", exist_ok = True)

for i, f in enumerate(masks_list):
    img = cv2.imread(f)
    if img is None:
        # cv2.imread returns None (no exception) for a missing/unreadable file
        print("Unreadable mask:", i, f)
        continue
    h, w = img.shape[:2]

    # Same centre crop as for the images, so mask and image stay aligned
    if h > 450:
        d = (h - 450) // 2
        img = img[d:d+450, :, :]

    if w > 450:
        d = (w - 450) // 2
        img = img[:, d:d+450, :]

    # Nearest-neighbour resampling only ever copies existing pixel values,
    # so the resized mask keeps the original label values. Cubic
    # interpolation would blend them into in-between greys along the edges.
    resimg = cv2.resize(img, (256, 256), interpolation = cv2.INTER_NEAREST)

    # The regex drops the first leading 0 of the id, so it is re-added here
    fid = "0" + re.search(r"0(\d+)_segmentation\.png", f).group(1)
    fpath = r"/content/drive/MyDrive/Granados_Thesis_SP24/HAM10000/HAM10000_MASKS_CROP/ISIC_" + fid + "_segmentation.png"
    tmp = cv2.imwrite(fpath, resimg)
    if not tmp:
        # Report the index of any mask that failed to save
        print(tmp, i)

**Original Dataset:** 10k \\
**Salt & Pepper Noise:** 2.5K \\
**Random Crop + Salt & Pepper Noise:** 2.5k \\
**Flip, Rotate, Scale + Salt & Pepper Noise:** 2.5k \\
**Brightness, Contrast + Salt & Pepper Noise:** 2.5k

In [None]:
# Create the two augmentation output folders unless they are already there.
for aug_dir in (
    "/content/drive/MyDrive/Granados_Thesis_SP24/HAM10000/HAM10000_IMGS_AUG/",
    "/content/drive/MyDrive/Granados_Thesis_SP24/HAM10000/HAM10000_MASKS_AUG/",
):
    if os.path.exists(aug_dir):
        continue
    os.mkdir(aug_dir)

In [None]:
# Salt & pepper augmentation: pick up to 2500 random images, add noise to the
# cropped image and save it next to an unchanged copy of its mask.
# min() keeps random.sample from raising when fewer than 2500 images exist.
for i, f in enumerate(random.sample(imgs_list, min(2500, len(imgs_list)))):
    # Random 10-character tag shared by the augmented image and its mask
    uuid = ''.join(random.choices(string.ascii_uppercase + string.digits, k = 10))
    fid = "0" + re.search(r"0(\d+)\.jpg", f).group(1)

    # Inputs: the 256 x 256 cropped image/mask pair
    fnid = r"/content/drive/MyDrive/Granados_Thesis_SP24/HAM10000/HAM10000_IMGS_CROP/ISIC_" + fid + ".jpg"
    mid = r"/content/drive/MyDrive/Granados_Thesis_SP24/HAM10000/HAM10000_MASKS_CROP/ISIC_" + fid + "_segmentation.png"

    # Outputs. The mask keeps the .jpg extension because img_to_mask_path
    # derives the mask name from the image name, extension included.
    # NOTE(review): JPEG is lossy, so saved mask values may not stay strictly
    # two-valued; switching to PNG would need img_to_mask_path to change too.
    f_nu = r"/content/drive/MyDrive/Granados_Thesis_SP24/HAM10000/HAM10000_IMGS_AUG/IMGS_SP_" + uuid + ".jpg"
    m_nu = r"/content/drive/MyDrive/Granados_Thesis_SP24/HAM10000/HAM10000_MASKS_AUG/MASKS_SP_" + uuid + ".jpg"

    # Each file is read once (the original read both files twice)
    img = cv2.imread(fnid)
    mask = cv2.imread(mid)
    if img is None or mask is None:
        # cv2.imread returns None for a missing/unreadable file; skip the
        # pair instead of aborting the whole run
        print(i, "Unreadable image/mask pair:", fnid, mid)
        continue

    nimg = add_random_sp_noise(img, prop = 0.005)

    a = cv2.imwrite(f_nu, nimg)
    b = cv2.imwrite(m_nu, mask)

    # Progress every 10th item, plus every failed write
    if ((i % 10) == 0) or not a or not b:
        print(i, a, b)

In [None]:
# Random-crop augmentation: for 2500 randomly chosen images, cut a random
# 256 x 256 window out of the original (uncropped) image/mask pair, add
# salt & pepper noise to the image crop and save both under a shared tag.
i = 0
for src in random.sample(imgs_list, 2500):
    tag = ''.join(random.choices(string.ascii_uppercase + string.digits, k = 10))
    isic_id = "0" + re.search(r"0(\d+)\.jpg", src).group(1)

    # Source pair comes from the original folders, not the *_CROP ones
    src_img = cv2.imread(r"/content/drive/MyDrive/Granados_Thesis_SP24/HAM10000/HAM10000_IMGS/ISIC_" + isic_id + ".jpg")
    src_mask = cv2.imread(r"/content/drive/MyDrive/Granados_Thesis_SP24/HAM10000/HAM10000_MASKS/ISIC_" + isic_id + "_segmentation.png")

    crop_img, crop_mask = random_crop_selection(src_img, src_mask, 256, 256)
    noisy_crop = add_random_sp_noise(crop_img, prop = 0.005)

    img_ok = cv2.imwrite(
        r"/content/drive/MyDrive/Granados_Thesis_SP24/HAM10000/HAM10000_IMGS_AUG/IMGS_CROP_SP_" + tag + ".jpg",
        noisy_crop,
    )
    mask_ok = cv2.imwrite(
        r"/content/drive/MyDrive/Granados_Thesis_SP24/HAM10000/HAM10000_MASKS_AUG/MASKS_CROP_SP_" + tag + ".jpg",
        crop_mask,
    )

    # Report every failed write, and every 10th item as a progress marker
    if not (img_ok and mask_ok) or (i % 10) == 0:
        print(i, img_ok, mask_ok)
    i += 1

In [None]:
# Flip/rotate/scale augmentation on up to 2500 randomly chosen cropped pairs.
# A failing item is retried a bounded number of times and then skipped; the
# original retried the same item forever and its bare except also swallowed
# KeyboardInterrupt, so the cell could not be stopped.
fnames = random.sample(imgs_list, min(2500, len(imgs_list)))
for i, f in enumerate(fnames):
    fid = "0" + re.search(r"0(\d+)\.jpg", f).group(1)

    fnid = r"/content/drive/MyDrive/Granados_Thesis_SP24/HAM10000/HAM10000_IMGS_CROP/ISIC_" + fid + ".jpg"
    mid = r"/content/drive/MyDrive/Granados_Thesis_SP24/HAM10000/HAM10000_MASKS_CROP/ISIC_" + fid + "_segmentation.png"

    img = cv2.imread(fnid)
    mask = cv2.imread(mid)
    if img is None or mask is None:
        # cv2.imread returns None for a missing/unreadable file; retrying
        # cannot help, so move on to the next item
        print(i, "Rotate, Flip, Scale Failed", "unreadable:", fnid, mid)
        continue

    # Each attempt draws new random transform parameters and a new tag
    for attempt in range(3):
        uuid = ''.join(random.choices(string.ascii_uppercase + string.digits, k = 10))
        f_nu = r"/content/drive/MyDrive/Granados_Thesis_SP24/HAM10000/HAM10000_IMGS_AUG/IMGS_ROT_SP_" + uuid + ".jpg"
        m_nu = r"/content/drive/MyDrive/Granados_Thesis_SP24/HAM10000/HAM10000_MASKS_AUG/MASKS_ROT_SP_" + uuid + ".jpg"

        try:
            cimg, cmask = rand_flip_rot_scale(img, mask)
            nimg = add_random_sp_noise(cimg, prop = 0.005)

            a = cv2.imwrite(f_nu, nimg)
            b = cv2.imwrite(m_nu, cmask)
        except Exception as err:
            print(i, "Rotate, Flip, Scale Failed", repr(err))
            continue

        # Progress every 10th item, plus every failed write
        if ((i % 10) == 0) or not a or not b:
            print(i, a, b)
        break
In [None]:
# Brightness/contrast augmentation on up to 2500 randomly chosen cropped
# pairs; the mask is saved unchanged because only pixel intensities move.
# A failing item is retried a bounded number of times and then skipped; the
# original retried the same item forever and its bare except also swallowed
# KeyboardInterrupt, so the cell could not be stopped.
fnames = random.sample(imgs_list, min(2500, len(imgs_list)))
for i, f in enumerate(fnames):
    fid = "0" + re.search(r"0(\d+)\.jpg", f).group(1)

    fnid = r"/content/drive/MyDrive/Granados_Thesis_SP24/HAM10000/HAM10000_IMGS_CROP/ISIC_" + fid + ".jpg"
    mid = r"/content/drive/MyDrive/Granados_Thesis_SP24/HAM10000/HAM10000_MASKS_CROP/ISIC_" + fid + "_segmentation.png"

    img = cv2.imread(fnid)
    mask = cv2.imread(mid)
    if img is None or mask is None:
        # cv2.imread returns None for a missing/unreadable file; retrying
        # cannot help, so move on to the next item
        print(i, "Brightness, Constrast Failed", "unreadable:", fnid, mid)
        continue

    # Each attempt draws new random brightness/contrast values and a new tag
    for attempt in range(3):
        uuid = ''.join(random.choices(string.ascii_uppercase + string.digits, k = 10))
        f_nu = r"/content/drive/MyDrive/Granados_Thesis_SP24/HAM10000/HAM10000_IMGS_AUG/IMGS_BRIG_SP_" + uuid + ".jpg"
        m_nu = r"/content/drive/MyDrive/Granados_Thesis_SP24/HAM10000/HAM10000_MASKS_AUG/MASKS_BRIG_SP_" + uuid + ".jpg"

        try:
            cimg = rand_brig_cont(img)
            nimg = add_random_sp_noise(cimg, prop = 0.005)

            a = cv2.imwrite(f_nu, nimg)
            b = cv2.imwrite(m_nu, mask)
        except Exception as err:
            print(i, "Brightness, Constrast Failed", repr(err))
            continue

        # Progress every 10th item, plus every failed write
        if ((i % 10) == 0) or not a or not b:
            print(i, a, b)
        break

To make querying and loading data easier in the Dataset/Dataloader portion of the model notebook, included is a CSV that details an index, image path, and corresponding mask path.

In [None]:
# List every file in the cropped and augmented image/mask folders, one glob
# per folder, unpacked in the order: cropped images, cropped masks,
# augmented images, augmented masks.
imgs_crop_list, masks_crop_list, imgs_aug_list, masks_aug_list = [
    glob.glob(pattern)
    for pattern in (
        '/content/drive/MyDrive/Granados_Thesis_SP24/HAM10000/HAM10000_IMGS_CROP/*',
        '/content/drive/MyDrive/Granados_Thesis_SP24/HAM10000/HAM10000_MASKS_CROP/*',
        '/content/drive/MyDrive/Granados_Thesis_SP24/HAM10000/HAM10000_IMGS_AUG/*',
        '/content/drive/MyDrive/Granados_Thesis_SP24/HAM10000/HAM10000_MASKS_AUG/*',
    )
]

In [None]:
# Put each of the four path listings into ascending (lexicographic) order.
for listing in (imgs_crop_list, masks_crop_list, imgs_aug_list, masks_aug_list):
    listing.sort()

In [None]:
from skimage import io
from skimage.color import rgb2gray
import torchvision.transforms as T
import torch
# Scratch check: read the first cropped mask (after sorting) and convert it to greyscale.
t = rgb2gray(io.imread(masks_crop_list[0]))

In [None]:
# Turn the greyscale mask into a tensor and cast it to booleans (non-zero -> True).
c = T.ToTensor()(t).bool()
# Element-wise complement of the boolean mask
c_inv = ~c

In [None]:
# Join the mask and its complement along dim 0, both as integer tensors.
mask_channel = c.int()
inverse_channel = (~c).int()
d = torch.cat([mask_channel, inverse_channel], dim = 0)

In [None]:
# Bare expression: the notebook displays the concatenated tensor as the cell output.
d

tensor([[[0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         ...,
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0]],

        [[1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 1, 1, 1],
         ...,
         [1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 1, 1, 1]]], dtype=torch.int32)

In [None]:
def img_to_mask_path(img):
    '''
    Maps the path of a cropped or augmented image to
    the path where its ground-truth mask is expected.

    Augmented images
        <root>/HAM10000_IMGS_AUG/IMGS_<rest>
        -> <root>/HAM10000_MASKS_AUG/MASKS_<rest>
    Original (cropped) images
        <root>/HAM10000_IMGS_CROP/ISIC_<id>.jpg
        -> <root>/HAM10000_MASKS_CROP/ISIC_<id>_segmentation.png

    <root> is everything above the image's own folder.
    Only string manipulation is done; the returned
    path is not checked for existence.

    img - string; "/"-separated path to an image file
    '''
    # Split off the file name and its folder; what is left is the dataset root
    parts = img.rsplit("/", 2)
    start = parts[0] if len(parts) == 3 else ""
    fname = parts[-1]

    # Augmented images
    if "HAM10000_IMGS_AUG" in img:
        mid = "/HAM10000_MASKS_AUG/MASKS_"
        # The mask shares the image's name apart from the IMGS_/MASKS_ prefix
        end = fname[len("IMGS_"):] if fname.startswith("IMGS_") else fname

    # Original Images (Cropped)
    else:
        mid = "/HAM10000_MASKS_CROP/ISIC_"
        # Take the id from this image's own file name; the original used a
        # fixed entry of masks_crop_list, which sent every image to one mask
        isic_id = fname.rsplit(".", 1)[0].split("_")[-1]
        end = isic_id + "_segmentation.png"

    return start + mid + end

In [None]:
# Build the image -> mask index. An image is kept only when the mask path
# derived from its name is among the mask files found on disk.
known_masks = set(masks_crop_list + masks_aug_list)

# Rows are collected in a plain list and turned into a DataFrame once at the
# end; appending with res.loc[len(res)] gets slower as the frame grows.
rows = []
for img in imgs_crop_list + imgs_aug_list:
    cmask = img_to_mask_path(img)
    if cmask in known_masks:
        rows.append({"img_path" : img, "mask_path" : cmask})
    else:
        # No mask file at the expected location
        print("Invalid:", img, cmask)

res = pd.DataFrame(rows, columns = ["img_path", "mask_path"])


/content/drive/MyDrive/Granados_Thesis_SP24/HAM10000/HAM10000_IMGS_AUG/IMGS_CROP_SP_78P9F51B37.jpg /content/drive/MyDrive/Granados_Thesis_SP24/HAM10000/HAM10000_MASKS_AUG/MASKS_CROP_SP_78P9F51B37.jpg 0
/content/drive/MyDrive/Granados_Thesis_SP24/HAM10000/HAM10000_IMGS_AUG/IMGS_CROP_SP_VYHN05AGL9.jpg /content/drive/MyDrive/Granados_Thesis_SP24/HAM10000/HAM10000_MASKS_AUG/MASKS_CROP_SP_VYHN05AGL9.jpg 0


In [None]:
# Save the image/mask index as a CSV; index = True also writes the row index as a column.
res.to_csv("/content/drive/MyDrive/Granados_Thesis_SP24/HAM10000/image_indexing.csv", index = True, chunksize = 2500)