In [1]:
#Import Relevant Data Science Modules
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import cv2
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

In [2]:
#Data Augmentation and Formatting Steps
def random_crop(img, newdim, width):
    """Resize ``img`` to ``newdim`` x ``newdim``, cropping a random square first half of the time.

    A coin flip decides whether a ``width`` x ``width`` patch is cut out at a
    uniformly random position before resizing, or whether the whole image is
    resized as-is.

    Parameters
    ----------
    img : numpy.ndarray
        Image indexed as ``img[row, col, ...]``.
    newdim : int
        Side length of the square output, passed to ``cv2.resize``.
    width : int
        Side length of the square patch to cut out when cropping.

    Returns
    -------
    numpy.ndarray
        The result of ``cv2.resize`` with nearest-neighbour interpolation.

    Raises
    ------
    AssertionError
        If cropping was selected and ``img`` is smaller than ``width`` along
        either of its first two axes.
    """
    # Coin flip from the global NumPy RNG: 1 -> crop, 0 -> keep the full image.
    crop = np.random.randint(0, 2, 1)[0]
    if crop == 1:
        assert img.shape[0] >= width
        assert img.shape[1] >= width
        # randint's upper bound is exclusive, so "+ 1" is needed to make the
        # last valid offset reachable. It also keeps the call legal when the
        # image is exactly `width` wide (only offset 0 exists in that case).
        x = np.random.randint(0, img.shape[1] - width + 1)
        y = np.random.randint(0, img.shape[0] - width + 1)
        img = img[y:y+width, x:x+width]
    # Both paths end with the same nearest-neighbour resize.
    return cv2.resize(img, dsize=(newdim, newdim), interpolation = cv2.INTER_NEAREST)

def image_process(images,newdim,augment):
    """Resize a stack of single-channel images and expand each to three channels.

    Parameters
    ----------
    images : numpy.ndarray
        Stack of images; the first axis indexes the individual images.
    newdim : int
        Side length of the square output images.
    augment : bool
        When equal to ``True`` each image goes through ``random_crop`` (random
        crop + resize); otherwise it is only resized.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n_images, newdim, newdim, 3)`` in NumPy's default
        float dtype, holding the pixel values divided by 255.
        NOTE(review): the /255 scaling presumably assumes 8-bit input - confirm.
    """
    n_images = images.shape[0]
    # One candidate crop width in [30, 150) per image, drawn up front even when
    # augmentation is off.
    widths = np.random.randint(30, 150, n_images)
    target_size = (newdim, newdim)
    batch = np.zeros(shape=(n_images, newdim, newdim, 3))

    for idx, (raw, width) in enumerate(zip(images, widths)):
        # Explicit comparison with True is kept on purpose: truthy values that
        # are not equal to True take the plain-resize path.
        if augment == True:
            resized = random_crop(raw, newdim, width)
        else:
            resized = cv2.resize(raw, dsize=target_size, interpolation = cv2.INTER_NEAREST)
        # Add an explicit channel axis and rescale before the colour conversion.
        gray = np.array(resized).reshape(newdim, newdim, 1).astype('float32') / 255
        batch[idx] = cv2.cvtColor(gray, cv2.COLOR_GRAY2RGB)

    return batch

def data_process(filenames,path2dat,newdim,augment):
    """Load several ``.npy`` image files, process each, and stack the results.

    Parameters
    ----------
    filenames : iterable of str
        File names to load, in the order they should appear in the output.
    path2dat : str
        Prefix prepended to every file name by plain string concatenation, so
        it must already end with a path separator.
    newdim : int
        Output side length, forwarded to ``image_process``.
    augment : bool
        Augmentation switch, forwarded to ``image_process``.

    Returns
    -------
    numpy.ndarray
        The per-file outputs of ``image_process`` joined along the first axis.

    Raises
    ------
    ValueError
        If ``filenames`` is empty.
    """
    filenames = list(filenames)
    if not filenames:
        raise ValueError("data_process needs at least one file name")

    # Collect the processed chunks and join them once at the end; stacking
    # onto a growing array inside the loop would recopy everything each pass.
    chunks = []
    total_rows = 0
    for file in filenames:
        print("Processing:" + file)
        chunk = image_process(np.load(path2dat + file), newdim, augment)
        chunks.append(chunk)
        total_rows += chunk.shape[0]
        # Report the shape the combined array has so far.
        print("Current Size:" + str((total_rows,) + chunk.shape[1:]))

    if len(chunks) == 1:
        # Nothing to join - hand the single chunk back without copying it.
        return chunks[0]
    return np.concatenate(chunks, axis=0)

def labels_process(labelnames,path2dat):
    """Load several ``.npy`` label files and join them end to end.

    Parameters
    ----------
    labelnames : iterable of str
        File names to load, in the order they should appear in the output.
    path2dat : str
        Prefix prepended to every file name by plain string concatenation, so
        it must already end with a path separator.

    Returns
    -------
    numpy.ndarray
        The loaded arrays concatenated along the first axis. The values are
        returned exactly as stored; no relabelling is applied here.

    Raises
    ------
    ValueError
        If ``labelnames`` is empty.
    """
    labelnames = list(labelnames)
    if not labelnames:
        raise ValueError("labels_process needs at least one file name")

    # Gather the pieces and concatenate once instead of regrowing the array
    # on every iteration.
    parts = []
    merged_shape = None
    for file in labelnames:
        print("Processing:" + file)
        part = np.load(path2dat + file)
        parts.append(part)
        # Track the shape the combined array has so far, for the progress line.
        if merged_shape is None:
            merged_shape = part.shape
        else:
            merged_shape = (merged_shape[0] + part.shape[0],) + merged_shape[1:]
        print("Current Size:" + str(merged_shape))

    if len(parts) == 1:
        return parts[0]
    return np.concatenate(parts)

def join_val_train(images,labels,path2dat,newdim):
    """Merge two image files and two label files into one processed dataset.

    Only the first two entries of ``images`` and of ``labels`` are read.

    Parameters
    ----------
    images : sequence of str
        Names of the image ``.npy`` files; entries 0 and 1 are loaded.
    labels : sequence of str
        Names of the label ``.npy`` files; entries 0 and 1 are loaded.
    path2dat : str
        Prefix prepended to every file name by plain string concatenation.
    newdim : int
        Output side length, forwarded to ``image_process``.

    Returns
    -------
    tuple
        ``(im, lab)``: the stacked images after ``image_process`` with
        augmentation off, and the concatenated labels with every value
        greater than zero replaced by 1.
    """
    # Images: stack the two files along the first axis.
    stacked = np.vstack([
        np.load(path2dat + images[0]),
        np.load(path2dat + images[1]),
    ])

    # Labels: join the two files, then collapse every positive value to 1.
    merged = np.concatenate([
        np.load(path2dat + labels[0]),
        np.load(path2dat + labels[1]),
    ])
    merged[merged > 0] = 1

    # These images are never augmented.
    processed = image_process(stacked, newdim, augment=False)
    return processed, merged

In [6]:
#Process Validation and Test Data
path2dat = "C:/Users/alex/Hands-On-Machine Learning/Project - Mammography/Data/"
path2out = path2dat + "/Processed/"
vt_im_files = ["cv10_data.npy","test10_data.npy"]
vt_lab_files = ["cv10_labels.npy","test10_labels.npy"]

# Fixed seed so the validation/test partition written to disk is the same on
# every run; without it train_test_split shuffles differently each time.
SPLIT_SEED = 42

# Pool both files, resize to 64x64 and binarise the labels (see join_val_train).
vt_im, vt_lab = join_val_train(vt_im_files,vt_lab_files,path2dat,newdim=64)

# Split the pool 50/50 into validation and test, stratified on the label.
validation_images, test_images, validation_labels, test_labels = train_test_split(
    vt_im, vt_lab, test_size=0.5, stratify=vt_lab, random_state=SPLIT_SEED
)
np.save(path2out + "Validation_Images.npy",validation_images)
np.save(path2out + "Validation_Labels.npy",validation_labels)
np.save(path2out + "Test_Images.npy",test_images)
np.save(path2out + "Test_Labels.npy",test_labels)
# Release the arrays; kernel memory persists across cells.
del validation_images, test_images, validation_labels, test_labels,vt_im,vt_lab

In [3]:
#Process Raw Images
# Resize the five raw training image files to 64x64 (no augmentation) and
# store them as a single array.
path2dat = "C:/Users/alex/Hands-On-Machine Learning/Project - Mammography/Data/"
path2out = path2dat + "/Processed/"
filenames = [
    "train_img_raw_0.npy",
    "train_img_raw_1.npy",
    "train_img_raw_2.npy",
    "train_img_raw_3.npy",
    "train_img_raw_4.npy",
]
train_images = data_process(filenames, path2dat, newdim=64, augment=False)
np.save(path2out + "Train_Images0.npy", train_images)
# Release the array; kernel memory persists across cells.
del train_images

Processing:train_img_raw_0.npy
Current Size:(11177, 64, 64, 3)
Processing:train_img_raw_1.npy
Current Size:(22354, 64, 64, 3)
Processing:train_img_raw_2.npy
Current Size:(33531, 64, 64, 3)
Processing:train_img_raw_3.npy
Current Size:(44708, 64, 64, 3)
Processing:train_img_raw_4.npy
Current Size:(55885, 64, 64, 3)


In [4]:
#Process Training Labels (Raw)
# Uses path2dat / path2out as defined by an earlier cell.
labelnames = [
    "train_lab_raw_0.npy",
    "train_lab_raw_1.npy",
    "train_lab_raw_2.npy",
    "train_lab_raw_3.npy",
    "train_lab_raw_4.npy",
]
train_labels = labels_process(labelnames=labelnames, path2dat=path2dat)
np.save(path2out + "Train_Labels0.npy", train_labels)
del train_labels

Processing:train_lab_raw_0.npy
Current Size:(11177,)
Processing:train_lab_raw_1.npy
Current Size:(22354,)
Processing:train_lab_raw_2.npy
Current Size:(33531,)
Processing:train_lab_raw_3.npy
Current Size:(44708,)
Processing:train_lab_raw_4.npy
Current Size:(55885,)


In [7]:
#Process Upscaled Images
# Resize the five "aug" training image files to 64x64 without random cropping.
# Uses path2dat / path2out as defined by an earlier cell.
filenames = [
    "train_img_aug_0.npy",
    "train_img_aug_1.npy",
    "train_img_aug_2.npy",
    "train_img_aug_3.npy",
    "train_img_aug_4.npy",
]
train_images = data_process(filenames, path2dat, newdim=64, augment=False)
np.save(path2out + "Train_Images1.npy", train_images)
del train_images

Processing:train_img_aug_0.npy
Current Size:(18572, 64, 64, 3)
Processing:train_img_aug_1.npy
Current Size:(36879, 64, 64, 3)
Processing:train_img_aug_2.npy
Current Size:(55276, 64, 64, 3)
Processing:train_img_aug_3.npy
Current Size:(74008, 64, 64, 3)
Processing:train_img_aug_4.npy
Current Size:(92405, 64, 64, 3)


In [9]:
#Process Training Labels (Generic)
# Uses path2dat / path2out as defined by an earlier cell.
labelnames = [
    "train_lab_aug_0.npy",
    "train_lab_aug_1.npy",
    "train_lab_aug_2.npy",
    "train_lab_aug_3.npy",
    "train_lab_aug_4.npy",
]
train_labels = labels_process(labelnames=labelnames, path2dat=path2dat)
np.save(path2out + "Train_Labels.npy", train_labels)
del train_labels

Processing:train_lab_aug_0.npy
Current Size:(18572,)
Processing:train_lab_aug_1.npy
Current Size:(36879,)
Processing:train_lab_aug_2.npy
Current Size:(55276,)
Processing:train_lab_aug_3.npy
Current Size:(74008,)
Processing:train_lab_aug_4.npy
Current Size:(92405,)


In [12]:
#Process Upscaled Images + Random Cropping
# Name the input files here instead of relying on whatever `filenames` was left
# bound to: the raw-image cell assigns the same variable, so running cells out
# of order would otherwise crop the wrong files.
# Uses path2dat / path2out as defined by an earlier cell.
filenames = ["train_img_aug_0.npy","train_img_aug_1.npy","train_img_aug_2.npy","train_img_aug_3.npy","train_img_aug_4.npy"]

# Seed NumPy's global RNG (the one random_crop and image_process draw from) so
# the augmented set written to disk can be regenerated exactly.
AUGMENT_SEED = 42
np.random.seed(AUGMENT_SEED)

train_images = data_process(filenames,path2dat,64,True)
np.save(path2out + "Train_Images2.npy",train_images)
del train_images

Processing:train_img_aug_0.npy
Current Size:(18572, 64, 64, 3)
Processing:train_img_aug_1.npy
Current Size:(36879, 64, 64, 3)
Processing:train_img_aug_2.npy
Current Size:(55276, 64, 64, 3)
Processing:train_img_aug_3.npy
Current Size:(74008, 64, 64, 3)
Processing:train_img_aug_4.npy
Current Size:(92405, 64, 64, 3)
