In [1]:
import re
import numpy as np
import pandas as pd
import os
import PIL
import random
import shutil
import matplotlib.pyplot as plt
import PIL
import sys
import cv2
from tqdm import tqdm
from PIL import Image, ImageMath

from skimage.transform import rescale, resize, downscale_local_mean
from img_processing_256 import mask_img, rename
%matplotlib inline

In [2]:
def random_rotate_image_train(img):
    rotations = np.random.randint(low=-3, high=3)
    return np.rot90(img, rotations)

In [3]:
def random_flip_img_train(img):
    fliplr = np.random.binomial(1,0.5)
    flipud = np.random.binomial(1,0.5)
    
    if fliplr:
        img = np.flip(img, 1)
    if flipud:
        img = np.flip(img, 0)
        
    return random_rotate_image_train(img)

In [4]:
def crop_img(img):
    slice_size=512
    tile_size=256
    img_h = img.shape[0]
    img_w = img.shape[1]
    
    # make sure the image is big enough to use
    if (img_h < slice_size) or (img_w < slice_size):
        print("Error - image is wrong size!", img.shape)
        return np.array([0])
    
    # pick a random place to start the crop so that the crop will be the right size
    start_row = np.random.randint(low=0, high=(img_h - slice_size))
    start_col = np.random.randint(low=0, high=(img_w - slice_size))
    
    end_row = start_row + slice_size
    end_col = start_col + slice_size
    
    # crop the image and randomly rotate it
    cropped_img = random_flip_img_train(img[start_row:end_row, start_col:end_col])
    
    # make sure the image is the right size
    if cropped_img.shape[0] == cropped_img.shape[1]:
        # resize it and return it
        cropped_img = cropped_img.astype('float32')
        cropped_img = cv2.resize(cropped_img, dsize=(tile_size, tile_size), interpolation=cv2.INTER_CUBIC)
        return cropped_img.reshape((tile_size, tile_size, 1))
    
    # else repeat until the image is the right size
    else:
        return crop_img(img)

In [5]:
def create_patches(mask_dir, img_dir, Lbls, size=256, debug=True):
    patch_list = []
    Lbl_list = []
    FN_list = []
    roi_sizes = []
    full_size = 512
    masks = os.listdir(mask_dir)
    counter = 0
    if debug is None:
        progress(counter, len(masks), 'WORKING')
    for mask in tqdm(masks):
        counter += 1
        if debug is None:
            progress(counter, len(masks), mask)    
            
        base_img_file = mask[:-6] + ".png"
        full_img = PIL.Image.open(img_dir+"/"+base_img_file)
        compare = base_img_file[14:]
        
        try:
            Lbl = Lbls.loc[compare]['Class']
        except:
            print("Error LabelNotFound", base_img_file)
            continue
        
        full_img_arr = np.array(full_img)[:,:]
        ctr_row, ctr_col, too_big, full_img_arr, mask_size = mask_img(mask_dir + "/" + mask,full_img_arr, half=False,
                                                                         output=debug)
        img_h, img_w = full_img_arr.shape
        try:
            mask_H = mask_size[0]
            mask_W = mask_size[1]
            roi_size = np.max([mask_H, mask_W])
            if debug:
                print("Mask", mask, " Height:", mask_H, "Width:", mask_W)
        except:
            print("Mask Size Error:", mask_size, "for", mask)
        # Record roi size for DDSM image crop
        roi_sizes.append(roi_size)
        if (ctr_row == 0) and (ctr_col == 0):
            print("Error, skipping", mask)
            continue
        """
        Extract the ROI depending on it's size
        If the ROI is smaller than a slice extract it with some padding
        """
        if roi_size < full_size:
            if debug:
                print("ROI small", mask)
            ## Make sure the size of the ROI is at least as big as a tile will be
            adj_mask_H = int(np.max([full_size * 1.4, mask_H]))
            adj_mask_W = int(np.max([full_size * 1.4, mask_W]))
            ## Extract the full ROI with 20% padding on either side
            start_row = int(np.max([ctr_row - (adj_mask_H // 2), 0]))
            end_row = start_row + adj_mask_H
            if end_row > img_h:
                end_row = img_h
                start_row = img_h - adj_mask_H
            start_col = int(np.max([ctr_col - (adj_mask_W // 2), 0]))
            end_col = start_col + adj_mask_W
            if end_col > img_w:
                end_col = img_w
                start_col = img_w - adj_mask_W

            # extract the ROI and randomly flip it
            roi_img = random_flip_img_train(full_img_arr[start_row:end_row, start_col:end_col])
        # else extract the ROI with less padding
        else:
            if debug:
                print("ROI Big", mask)
            # padding for the random cropping
            adj_mask_H = int(np.max([full_size * 1.15, mask_H]))
            adj_mask_W = int(np.max([full_size * 1.15, mask_W]))
            start_row = np.max([ctr_row - (adj_mask_H // 2), 0])
            end_row = start_row + adj_mask_H
            if end_row > img_h:
                end_row = img_h
                start_row = img_h - adj_mask_H
            start_col = np.max([ctr_col - (adj_mask_W // 2), 0])
            end_col = start_col + adj_mask_W
            if end_col > img_w:
                end_col = img_w
                start_col = img_w - adj_mask_W
            # extract the ROI and randomly flip it
            roi_img = random_flip_img_train(full_img_arr[start_row:end_row, start_col:end_col])
              
        patch_1 = crop_img(roi_img)
        patch_2 = crop_img(roi_img)
        patch_3 = crop_img(roi_img)
         
        if (patch_1.shape[0] == size) and (patch_1.shape[1] == size):
            patch_list.append(patch_1)
            Lbl_list.append(Lbl)
            FN_list.append(base_img_file)
                
        if (patch_2.shape[0] == size) and (patch_2.shape[1] == size):
            patch_list.append(patch_2)
            Lbl_list.append(Lbl)
            FN_list.append(base_img_file)
        
        if (patch_3.shape[0] == size) and (patch_2.shape[1] == size):
            patch_list.append(patch_3)
            Lbl_list.append(Lbl)
            FN_list.append(base_img_file)
                
    return np.array(patch_list), np.array(Lbl_list), np.array(FN_list), roi_sizes

In [6]:
train_labels = pd.read_pickle("label old ver/train_label.pkl")
train_labels['IMAGE_NAME2'] = train_labels.index
train_labels = train_labels.drop_duplicates(['IMAGE_NAME2'])
## use a copy on the local drive to make testing faster
mask_dir = "D:/mammography/mask/train_png/Calc"
img_dir = "D:/mammography/full/train_png/Calc"

train_calc_patch, train_calc_Lbl, train_calc_FN, train_calc_roi_size = \
        create_patches(mask_dir, img_dir, Lbls=train_labels, debug=True)

  0%|                                                                                 | 1/1546 [00:00<14:48,  1.74it/s]

Trimming borders D:/mammography/mask/train_png/Calc/Calc-Training_P_00005_RIGHT_CC_1.png
Mask Calc-Training_P_00005_RIGHT_CC_1.png  Height: 549 Width: 637
ROI Big Calc-Training_P_00005_RIGHT_CC_1.png


  0%|                                                                                 | 2/1546 [00:01<14:35,  1.76it/s]

Trimming borders D:/mammography/mask/train_png/Calc/Calc-Training_P_00005_RIGHT_MLO_1.png
Mask Calc-Training_P_00005_RIGHT_MLO_1.png  Height: 509 Width: 641
ROI Big Calc-Training_P_00005_RIGHT_MLO_1.png


  0%|▏                                                                                | 3/1546 [00:01<14:36,  1.76it/s]

Trimming borders D:/mammography/mask/train_png/Calc/Calc-Training_P_00007_LEFT_CC_1.png
Mask Calc-Training_P_00007_LEFT_CC_1.png  Height: 641 Width: 537
ROI Big Calc-Training_P_00007_LEFT_CC_1.png


  0%|▏                                                                                | 4/1546 [00:02<14:41,  1.75it/s]

Trimming borders D:/mammography/mask/train_png/Calc/Calc-Training_P_00007_LEFT_MLO_1.png
Mask Calc-Training_P_00007_LEFT_MLO_1.png  Height: 649 Width: 721
ROI Big Calc-Training_P_00007_LEFT_MLO_1.png


  0%|▎                                                                                | 5/1546 [00:02<13:51,  1.85it/s]

Trimming borders D:/mammography/mask/train_png/Calc/Calc-Training_P_00008_LEFT_CC_1.png
Mask Calc-Training_P_00008_LEFT_CC_1.png  Height: 121 Width: 137
ROI small Calc-Training_P_00008_LEFT_CC_1.png


  0%|▎                                                                                | 6/1546 [00:03<13:04,  1.96it/s]

Trimming borders D:/mammography/mask/train_png/Calc/Calc-Training_P_00008_LEFT_CC_2.png
Mask Calc-Training_P_00008_LEFT_CC_2.png  Height: 129 Width: 137
ROI small Calc-Training_P_00008_LEFT_CC_2.png


  0%|▎                                                                                | 7/1546 [00:03<12:27,  2.06it/s]

Trimming borders D:/mammography/mask/train_png/Calc/Calc-Training_P_00008_LEFT_CC_3.png
Mask Calc-Training_P_00008_LEFT_CC_3.png  Height: 73 Width: 105
ROI small Calc-Training_P_00008_LEFT_CC_3.png


  1%|▍                                                                                | 8/1546 [00:04<12:34,  2.04it/s]

Trimming borders D:/mammography/mask/train_png/Calc/Calc-Training_P_00008_LEFT_MLO_1.png
Mask Calc-Training_P_00008_LEFT_MLO_1.png  Height: 89 Width: 97
ROI small Calc-Training_P_00008_LEFT_MLO_1.png


  1%|▍                                                                                | 9/1546 [00:04<12:13,  2.10it/s]

Trimming borders D:/mammography/mask/train_png/Calc/Calc-Training_P_00008_LEFT_MLO_2.png
Mask Calc-Training_P_00008_LEFT_MLO_2.png  Height: 105 Width: 89
ROI small Calc-Training_P_00008_LEFT_MLO_2.png


  1%|▌                                                                               | 10/1546 [00:04<11:49,  2.17it/s]

Trimming borders D:/mammography/mask/train_png/Calc/Calc-Training_P_00008_LEFT_MLO_3.png
Mask Calc-Training_P_00008_LEFT_MLO_3.png  Height: 81 Width: 105
ROI small Calc-Training_P_00008_LEFT_MLO_3.png


  1%|▌                                                                               | 11/1546 [00:05<12:11,  2.10it/s]

Trimming borders D:/mammography/mask/train_png/Calc/Calc-Training_P_00008_RIGHT_CC_1.png
Mask Calc-Training_P_00008_RIGHT_CC_1.png  Height: 233 Width: 209
ROI small Calc-Training_P_00008_RIGHT_CC_1.png


  1%|▌                                                                               | 12/1546 [00:05<11:50,  2.16it/s]

Trimming borders D:/mammography/mask/train_png/Calc/Calc-Training_P_00008_RIGHT_CC_2.png
Mask Calc-Training_P_00008_RIGHT_CC_2.png  Height: 105 Width: 129
ROI small Calc-Training_P_00008_RIGHT_CC_2.png


  1%|▋                                                                               | 13/1546 [00:06<11:27,  2.23it/s]

Trimming borders D:/mammography/mask/train_png/Calc/Calc-Training_P_00008_RIGHT_CC_3.png
Mask Calc-Training_P_00008_RIGHT_CC_3.png  Height: 113 Width: 97
ROI small Calc-Training_P_00008_RIGHT_CC_3.png


  1%|▋                                                                               | 14/1546 [00:06<11:15,  2.27it/s]

Trimming borders D:/mammography/mask/train_png/Calc/Calc-Training_P_00008_RIGHT_CC_4.png
Mask Calc-Training_P_00008_RIGHT_CC_4.png  Height: 73 Width: 89
ROI small Calc-Training_P_00008_RIGHT_CC_4.png


  1%|▊                                                                               | 15/1546 [00:07<11:04,  2.30it/s]

Trimming borders D:/mammography/mask/train_png/Calc/Calc-Training_P_00008_RIGHT_CC_5.png
Mask Calc-Training_P_00008_RIGHT_CC_5.png  Height: 89 Width: 121
ROI small Calc-Training_P_00008_RIGHT_CC_5.png


  1%|▊                                                                               | 16/1546 [00:07<11:58,  2.13it/s]

Trimming borders D:/mammography/mask/train_png/Calc/Calc-Training_P_00008_RIGHT_MLO_1.png
Mask Calc-Training_P_00008_RIGHT_MLO_1.png  Height: 241 Width: 113
ROI small Calc-Training_P_00008_RIGHT_MLO_1.png


  1%|▉                                                                               | 17/1546 [00:08<11:39,  2.19it/s]

Trimming borders D:/mammography/mask/train_png/Calc/Calc-Training_P_00008_RIGHT_MLO_2.png
Mask Calc-Training_P_00008_RIGHT_MLO_2.png  Height: 65 Width: 57
ROI small Calc-Training_P_00008_RIGHT_MLO_2.png


  1%|▉                                                                               | 18/1546 [00:08<11:28,  2.22it/s]

Trimming borders D:/mammography/mask/train_png/Calc/Calc-Training_P_00008_RIGHT_MLO_3.png
Mask Calc-Training_P_00008_RIGHT_MLO_3.png  Height: 73 Width: 81
ROI small Calc-Training_P_00008_RIGHT_MLO_3.png


  1%|▉                                                                               | 19/1546 [00:09<11:18,  2.25it/s]

Trimming borders D:/mammography/mask/train_png/Calc/Calc-Training_P_00008_RIGHT_MLO_4.png
Mask Calc-Training_P_00008_RIGHT_MLO_4.png  Height: 97 Width: 97
ROI small Calc-Training_P_00008_RIGHT_MLO_4.png


  1%|▉                                                                               | 19/1546 [00:09<12:33,  2.03it/s]


IndexError: too many indices for array

In [None]:
print("Train calc patches shape:", train_calc_patch.shape)
print("Train calc Labels:", len(train_calc_Lbl))
print("Train calc File Name:", len(train_calc_FN))

In [None]:
print("ROI Mean Size:", np.round(np.mean(train_calc_roi_size),2))
print("ROI Min Size:", np.min(train_calc_roi_size))
print("ROI Max Size:", np.max(train_calc_roi_size))
print("ROI Size Std:", np.round(np.std(train_calc_roi_size),2))

In [None]:
np.save(os.path.join("Processed_abnorm_256", "train_calc_patch.npy"), train_calc_patch)
np.save(os.path.join("Processed_abnorm_256", "train_calc_Lbl.npy"), np.array(train_calc_Lbl))
np.save(os.path.join("Processed_abnorm_256", "train_calc_FN.npy"), train_calc_FN)
np.save(os.path.join("Processed_abnorm_256", "train_calc_roi_size.npy"), np.array(train_calc_roi_size))

In [None]:
test_labels = pd.read_pickle("label old ver/test_label.pkl")
test_labels['IMAGE_NAME2'] = test_labels.index
#test_labels = test_labels.drop_duplicates(['IMAGE_NAME2'])
test_labels

In [None]:
#Mass_training
train_labels = pd.read_pickle("label old ver/train_label.pkl")
train_labels['IMAGE_NAME2'] = train_labels.index
train_labels = train_labels.drop_duplicates(['IMAGE_NAME2'])

## use a copy on the local drive to make testing faster
mask_dir = "D:/mammography/mask/train_png/Mass"
img_dir = "D:/mammography/full/train_png/Mass"

train_mass_patch, train_mass_Lbl, train_mass_FN, train_mass_roi_size = \
    create_patches(mask_dir, img_dir, Lbls=train_labels, debug=True)

In [None]:
print("ROI Mean Size:", np.mean(train_mass_roi_size))
print("ROI Min Size:", np.min(train_mass_roi_size))
print("ROI Max Size:", np.max(train_mass_roi_size))
print("ROI Size Std:", np.std(train_mass_roi_size))

In [None]:
np.save(os.path.join("Processed_abnorm_256", "train_mass_patch.npy"), train_mass_patch)
np.save(os.path.join("Processed_abnorm_256", "train_mass_Lbl.npy"), np.array(train_mass_Lbl))
np.save(os.path.join("Processed_abnorm_256", "train_mass_FN.npy"), train_mass_FN)
np.save(os.path.join("Processed_abnorm_256", "train_mass_roi_size.npy"), np.array(train_mass_roi_size))

In [None]:
def create_patches(mask_dir, img_dir, Lbls, size=256, debug=True):
    patch_list = []
    Lbl_list = []
    FN_list = []
    roi_sizes = []
    full_size = 512
    masks = os.listdir(mask_dir)
    counter = 0
    if debug is None:
        progress(counter, len(masks), 'WORKING')
    for mask in tqdm(masks):
        counter += 1
        if debug is None:
            progress(counter, len(masks), mask)    
            
        base_img_file = mask[:-6] + ".png"
        full_img = PIL.Image.open(img_dir+"/"+base_img_file)
        compare = base_img_file[10:]
        
        try:
            Lbl = Lbls.loc[compare]['Class']
        except:
            print("Error LabelNotFound", base_img_file)
            continue
        
        full_img_arr = np.array(full_img)[:,:]
        ctr_row, ctr_col, too_big, full_img_arr, mask_size = mask_img(mask_dir + "/" + mask,full_img_arr, half=False,
                                                                         output=debug)
        img_h, img_w = full_img_arr.shape
        try:
            mask_H = mask_size[0]
            mask_W = mask_size[1]
            roi_size = np.max([mask_H, mask_W])
            if debug:
                print("Mask", mask, " Height:", mask_H, "Width:", mask_W)
        except:
            print("Mask Size Error:", mask_size, "for", mask)
        # Record roi size for DDSM image crop
        roi_sizes.append(roi_size)
        if (ctr_row == 0) and (ctr_col == 0):
            print("Error, skipping", mask)
            continue
        """
        Extract the ROI depending on it's size
        If the ROI is smaller than a slice extract it with some padding
        """
        if roi_size < full_size:
            if debug:
                print("ROI small", mask)
            ## Make sure the size of the ROI is at least as big as a tile will be
            adj_mask_H = int(np.max([full_size * 1.4, mask_H]))
            adj_mask_W = int(np.max([full_size * 1.4, mask_W]))
            ## Extract the full ROI with 20% padding on either side
            start_row = int(np.max([ctr_row - (adj_mask_H // 2), 0]))
            end_row = start_row + adj_mask_H
            if end_row > img_h:
                end_row = img_h
                start_row = img_h - adj_mask_H
            start_col = int(np.max([ctr_col - (adj_mask_W // 2), 0]))
            end_col = start_col + adj_mask_W
            if end_col > img_w:
                end_col = img_w
                start_col = img_w - adj_mask_W

            # extract the ROI and randomly flip it
            roi_img = random_flip_img_train(full_img_arr[start_row:end_row, start_col:end_col])
        # else extract the ROI with less padding
        else:
            if debug:
                print("ROI Big", mask)
            # padding for the random cropping
            adj_mask_H = int(np.max([full_size * 1.15, mask_H]))
            adj_mask_W = int(np.max([full_size * 1.15, mask_W]))
            start_row = np.max([ctr_row - (adj_mask_H // 2), 0])
            end_row = start_row + adj_mask_H
            if end_row > img_h:
                end_row = img_h
                start_row = img_h - adj_mask_H
            start_col = np.max([ctr_col - (adj_mask_W // 2), 0])
            end_col = start_col + adj_mask_W
            if end_col > img_w:
                end_col = img_w
                start_col = img_w - adj_mask_W
            # extract the ROI and randomly flip it
            roi_img = random_flip_img_train(full_img_arr[start_row:end_row, start_col:end_col])
              
        patch_1 = crop_img(roi_img)
        patch_2 = crop_img(roi_img)
        patch_3 = crop_img(roi_img)
         
        if (patch_1.shape[0] == size) and (patch_1.shape[1] == size):
            patch_list.append(patch_1)
            Lbl_list.append(Lbl)
            FN_list.append(base_img_file)
                
        if (patch_2.shape[0] == size) and (patch_2.shape[1] == size):
            patch_list.append(patch_2)
            Lbl_list.append(Lbl)
            FN_list.append(base_img_file)
        
        if (patch_3.shape[0] == size) and (patch_2.shape[1] == size):
            patch_list.append(patch_3)
            Lbl_list.append(Lbl)
            FN_list.append(base_img_file)
                
    return np.array(patch_list), np.array(Lbl_list), np.array(FN_list), roi_sizes

In [None]:
#Calc_test
test_labels = pd.read_pickle("label old ver/test_label.pkl")
test_labels['IMAGE_NAME2'] = test_labels.index
test_labels = test_labels.drop_duplicates(['IMAGE_NAME2'])

## use a copy on the local drive to make testing faster
mask_dir = "D:/mammography/mask/test_png/Calc"
img_dir = "D:/mammography/full/test_png/Calc"

test_calc_patch, test_calc_Lbl, test_calc_FN, test_calc_roi_size = \
    create_patches(mask_dir, img_dir, Lbls=test_labels, debug=True)

In [None]:
print("Test calc patches shape:", test_calc_patch.shape)
print("Test calc Labels:", len(test_calc_Lbl))
print("Test calc File Name:", len(test_calc_FN))

In [None]:
print("ROI Mean Size:", np.round(np.mean(test_calc_roi_size),2))
print("ROI Min Size:", np.min(test_calc_roi_size))
print("ROI Max Size:", np.max(test_calc_roi_size))
print("ROI Size Std:", np.round(np.std(test_calc_roi_size),2))

In [None]:
np.save(os.path.join("Processed_abnorm_256", "test_calc_patch.npy"), test_calc_patch)
np.save(os.path.join("Processed_abnorm_256", "test_calc_Lbl.npy"), np.array(test_calc_Lbl))
np.save(os.path.join("Processed_abnorm_256", "test_calc_FN.npy"), test_calc_FN)
np.save(os.path.join("Processed_abnorm_256", "test_calc_roi_size.npy"), np.array(test_calc_roi_size))

In [None]:
#Mass_test
test_labels = pd.read_pickle("label old ver/test_label.pkl")
test_labels['IMAGE_NAME2'] = test_labels.index
test_labels = test_labels.drop_duplicates(['IMAGE_NAME2'])

## use a copy on the local drive to make testing faster
mask_dir = "D:/mammography/mask/test_png/Mass"
img_dir = "D:/mammography/full/test_png/Mass"

test_mass_patch, test_mass_Lbl, test_mass_FN, test_mass_roi_size = \
    create_patches(mask_dir, img_dir, Lbls=test_labels, debug=True)

In [None]:
print("test mass patches shape:", test_mass_patch.shape)
print("test mass Labels:", len(test_mass_Lbl))
print("test mass File Name:", len(test_mass_FN))

In [None]:
print("ROI Mean Size:", np.round(np.mean(test_mass_roi_size),2))
print("ROI Min Size:", np.min(test_mass_roi_size))
print("ROI Max Size:", np.max(test_mass_roi_size))
print("ROI Size Std:", np.round(np.std(test_mass_roi_size),2))

In [None]:
np.save(os.path.join("Processed_abnorm_256", "test_mass_patch.npy"), test_mass_patch)
np.save(os.path.join("Processed_abnorm_256", "test_mass_Lbl.npy"), np.array(test_mass_Lbl))
np.save(os.path.join("Processed_abnorm_256", "test_mass_FN.npy"), test_mass_FN)
np.save(os.path.join("Processed_abnorm_256", "test_mass_roi_size.npy"), np.array(test_mass_roi_size))

In [None]:
train_patch = np.concatenate([train_mass_patch, train_calc_patch], axis=0)
train_Lbl = np.concatenate([train_mass_Lbl, train_calc_Lbl], axis=0)
train_FN = np.concatenate([train_mass_FN, train_calc_FN], axis=0)

test_patch = np.concatenate([test_mass_patch, test_calc_patch], axis=0)
test_Lbl = np.concatenate([test_mass_Lbl, test_calc_Lbl], axis=0)
test_FN = np.concatenate([test_mass_FN, test_calc_FN], axis=0)

In [None]:
print("Train Patches:", train_patch.shape)
print("Train Lables:", train_Lbl.shape)
print("Train File Names:", train_FN.shape)

print("Test Patches:", test_patch.shape)
print("Test Lables:", test_Lbl.shape)
print("Test File Names:", test_FN.shape)

In [None]:
np.save(os.path.join("Processed_abnorm_256", "abnormal_train_patch.npy"), train_patch)
np.save(os.path.join("Processed_abnorm_256", "abnormal_train_Lbl.npy"), train_Lbl)
np.save(os.path.join("Processed_abnorm_256", "abnormal_train_FN.npy"), train_FN)

np.save(os.path.join("Processed_abnorm_256", "abnormal_test_patch.npy"), test_patch)
np.save(os.path.join("Processed_abnorm_256", "abnormal_test_Lbl.npy"), test_Lbl)
np.save(os.path.join("Processed_abnorm_256", "abnormal_test_FN.npy"), test_FN)

In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
%matplotlib inline

In [None]:
#train data
CBIS_train_patches = np.load(os.path.join("./Processed_abnorm_256", "abnormal_train_patch.npy" ))
CBIS_train_labels = np.load(os.path.join("./Processed_abnorm_256", "abnormal_train_Lbl.npy" ))
CBIS_train_FNs = np.load(os.path.join("./Processed_abnorm_256", "abnormal_train_FN.npy" ))

#test data
CBIS_test_patches = np.load(os.path.join("./Processed_abnorm_256", "abnormal_test_patch.npy" ))
CBIS_test_labels = np.load(os.path.join("./Processed_abnorm_256", "abnormal_test_Lbl.npy" ))
CBIS_test_FNs = np.load(os.path.join("./Processed_abnorm_256", "abnormal_test_FN.npy" ))



print("Abnaormal train Patches:", CBIS_train_patches.shape)
print("Abnaormal train Labels:", CBIS_train_labels.shape)
print("Abnaormal train File Names:", CBIS_train_FNs.shape)
print("\n")
print("Abnaormal test Patches:", CBIS_test_patches.shape)
print("Abnaormal test Labels:", CBIS_test_labels.shape)
print("Abnaormal test File Names:", CBIS_test_FNs.shape)

In [None]:
print(CBIS_test_labels)

In [None]:
MALIGNANT_mass=0
MALIGNANT_calcification=0
BENIGN_calcification = 0
BENIGN_mass = 0

for name in CBIS_test_labels:
    if(name == 'MALIGNANT_mass'):
        MALIGNANT_mass += 1
    elif(name == 'MALIGNANT_calcification'):
        MALIGNANT_calcification += 1
    elif(name == 'BENIGN_calcification'):
        BENIGN_calcification += 1
    elif(name == 'BENIGN_mass'):
        BENIGN_mass += 1
print(MALIGNANT_mass)
print(MALIGNANT_calcification)
print(BENIGN_calcification)
print(BENIGN_mass)

In [None]:
N = 20
idx = random.sample(range(len(CBIS_train_patches)), k=N)
plt.figure(figsize=(15,15))
for i, j in enumerate(idx):
    plt.subplot(5,4,i+1)
    plt.imshow(CBIS_train_patches[j].reshape(256, 256), cmap='gist_heat')
    plt.title(CBIS_train_labels[j] + "\n" + CBIS_train_FNs[j] + str(j))
    plt.tight_layout()
plt.show()

In [None]:
#combine train and test data 

CBIS_all_patches = np.concatenate([CBIS_train_patches, CBIS_test_patches], axis=0)
CBIS_all_labels = np.concatenate([CBIS_train_labels, CBIS_test_labels], axis=0)
CBIS_all_FNs = np.concatenate([CBIS_train_FNs, CBIS_test_FNs], axis=0)

CBIS_all_patches, CBIS_all_labels, CBIS_all_FNs = \
shuffle(CBIS_all_patches, CBIS_all_labels, CBIS_all_FNs, random_state=19510705)

In [None]:
#split the combined data into train and test
train_patches, test_patches, train_labels, test_labels, train_FNs, test_FNs = \
train_test_split(CBIS_all_patches, CBIS_all_labels, CBIS_all_FNs, test_size = 0.183565, random_state=19430727)

In [None]:
#train data
train_images = train_patches
train_labels = train_labels
train_FNs = train_FNs

#test data
test_images =  test_patches
test_labels = test_labels
test_FNs = test_FNs

In [None]:
le = preprocessing.LabelEncoder()
le.fit(train_labels)

In [None]:
list(le.classes_)

In [None]:
#Convert Normal to 0 
train_labels_en = le.transform(train_labels)
#train_labels_en[train_labels_en==]=0

test_labels_en = le.transform(test_labels)
#test_labels_en[test_labels_en==5]=0

In [None]:
train_bin_labels = np.zeros(len(train_labels_en)).astype(np.int32)
train_bin_labels[train_labels_en != 0] = 1

test_bin_labels = np.zeros(len(test_labels_en)).astype(np.int32)
test_bin_labels[test_labels_en != 0] = 1

In [None]:
np.unique(train_bin_labels)

In [None]:
np.unique(test_bin_labels)

In [None]:
np.save(os.path.join("Label", "train_labels_en.npy"), train_labels_en)
np.save(os.path.join("Label", "test_labels_en.npy"), test_labels_en)
np.save(os.path.join("Label", "train_bin_labels.npy"), train_bin_labels)
np.save(os.path.join("Label", "test_bin_labels.npy"), test_bin_labels)

In [None]:
X_val, X_test, y_val, y_test, y_val_multi, y_test_multi = \
    train_test_split(test_images, test_bin_labels, test_labels_en, test_size=0.5, random_state=19730104)
X_train, y_train, y_train_multi = \
     shuffle(train_images, train_bin_labels, train_labels_en, random_state=100)

In [None]:
X_test = test_images
y_test = test_labels_en

X_train = train_images
y_train = train_labels_en

In [None]:
X_train.shape

In [None]:
y_train.shape

In [None]:
np.save(os.path.join("Data/256", 'X_train.npy'), X_train)
np.save(os.path.join("Data/256", 'y_train.npy'), y_train)
np.save(os.path.join("Data/256", 'train_labels_multi.npy'), y_train_multi)

In [None]:
np.save(os.path.join("Data/256", 'X_val.npy'), X_val)
np.save(os.path.join("Data/256", 'y_val.npy'), y_val)
np.save(os.path.join("Data/256", 'y_val_labels_multi.npy'), y_val_multi)

In [None]:
np.save(os.path.join("Data/256", 'X_test.npy'), X_test)
np.save(os.path.join("Data/256", 'y_test.npy'), y_test)
np.save(os.path.join("Data/256", 'y_test_labels_multi.npy'), y_test_multi)