# Create .npy Files from IDC Images

In [25]:
import pandas as pd
import tensorflow as tf
import numpy as np
import matplotlib.pylab as plt
from progressbar import ProgressBar
from PIL import Image
import random
%matplotlib inline

## Import Images

In [26]:
import os

folders = []
files = []

for entry in os.scandir('Desktop/IDC_regular_ps50_idx5'):
    if entry.is_dir():
        folders.append(entry.path)
    elif entry.is_file():
        files.append(entry.path)

# print('Folders:')
# print(folders[0:10])
print('Number of patients:')
print(len(folders))

Number of patients:
279


Create Files with path extensions /0 and /1 to each patient folder name

In [27]:
folder_extensions_0 = []
folder_extensions_1 = []


def add_extensions(folder_names):
    """
    Formats folder names to include 0 for IDC- and 1 for IDC+ images
    """
    for folder in folder_names:
        with_0 = folder + '/0'
        with_1 = folder + '/1'
        folder_extensions_0.append(with_0)
        folder_extensions_1.append(with_1)
    return folder_extensions_0, folder_extensions_1


add_extensions(folders)

(['Desktop/IDC_regular_ps50_idx5/9036/0',
  'Desktop/IDC_regular_ps50_idx5/10268/0',
  'Desktop/IDC_regular_ps50_idx5/10257/0',
  'Desktop/IDC_regular_ps50_idx5/8913/0',
  'Desktop/IDC_regular_ps50_idx5/13613/0',
  'Desktop/IDC_regular_ps50_idx5/8914/0',
  'Desktop/IDC_regular_ps50_idx5/15510/0',
  'Desktop/IDC_regular_ps50_idx5/10259/0',
  'Desktop/IDC_regular_ps50_idx5/16165/0',
  'Desktop/IDC_regular_ps50_idx5/10292/0',
  'Desktop/IDC_regular_ps50_idx5/12951/0',
  'Desktop/IDC_regular_ps50_idx5/10261/0',
  'Desktop/IDC_regular_ps50_idx5/10295/0',
  'Desktop/IDC_regular_ps50_idx5/9259/0',
  'Desktop/IDC_regular_ps50_idx5/12750/0',
  'Desktop/IDC_regular_ps50_idx5/13020/0',
  'Desktop/IDC_regular_ps50_idx5/16552/0',
  'Desktop/IDC_regular_ps50_idx5/12905/0',
  'Desktop/IDC_regular_ps50_idx5/9266/0',
  'Desktop/IDC_regular_ps50_idx5/16555/0',
  'Desktop/IDC_regular_ps50_idx5/13018/0',
  'Desktop/IDC_regular_ps50_idx5/9261/0',
  'Desktop/IDC_regular_ps50_idx5/9257/0',
  'Desktop/IDC_reg

In [28]:
import os

images_0 = []
images_1 = []


def pywalker_0(path):
    """
    Walks files and appends to IDC- list
    """
    for root, dirs, files in os.walk(path):
        for file_ in files:
            images_0.append(os.path.join(root, file_))


def pywalker_1(path):
    """
    Wals files and appends to IDC+ list
    """
    for root, dirs, files in os.walk(path):
        for file_ in files:
            images_1.append(os.path.join(root, file_))


if __name__ == '__main__':
    for filename in folder_extensions_0:
        pywalker_0(filename)

if __name__ == '__main__':
    for filename in folder_extensions_1:
        pywalker_1(filename)

In [29]:
print(len(images_0))
print(len(images_1))

198738
78786


### Identifying how many benign images do not fit the 50x50 pixel requirement


In [30]:
# Change to apply to all images
sub = images_0[:1000]
how_many_screwups = 0
pbar = ProgressBar()

for x in pbar(range(100)):
    for i in range(len(sub)):
        temp_image = Image.open(sub[i])
        temp = list(temp_image.getdata())
        temp_array = np.array(temp)
        if temp_array.shape != (2500, 3):
            how_many_screwups = how_many_screwups + 1
print(how_many_screwups)

100% (100 of 100) |######################| Elapsed Time: 0:03:31 Time:  0:03:31


5500


### Identifying how many malignant images do not fit the 50x50 pixel requirement

In [33]:
# Change to apply to all images 
sub2 = images_1[:1000]
how_many_screwups2 = 0
pbar2 = ProgressBar()

for x in pbar2(range(100)):
    for i in range(len(sub2)):
        temp_image = Image.open(sub2[i])
        temp = list(temp_image.getdata())
        temp_array = np.array(temp)
        if temp_array.shape != (2500, 3):
            how_many_screwups2 += 1
print(how_many_screwups2)

100% (100 of 100) |######################| Elapsed Time: 0:03:08 Time:  0:03:08


0


# For Color Images

### Create 2d Array of all 50x50 benign images

In [34]:
images_0_data = []
benign = images_0[:]
for i in range(len(benign)):
    temp_image = Image.open(benign[i])
    temp = list(temp_image.getdata())
    temp_array = np.array(temp)
    if temp_array.shape == (2500, 3):
        data_array = temp_array.reshape(50, 50, 3)
        data = data_array/255
        images_0_data.append(data)
print('Done!')

Done!


In [35]:
np.save('benign_rgb', images_0_data)
print('Done!')

Done!


In [36]:
benign_subset = random.sample(images_0_data, 5000)
np.save('benign_subset_rgb', benign_subset)

In [39]:
benign_subset10k = random.sample(images_0_data, 10000)
np.save('benign_subset_rgb10k', benign_subset10k)

### Create 2d Array of all 50x50 malignant images

In [40]:
images_1_data = []
malignant = images_1[:]
for i in range(len(malignant)):
    temp_image = Image.open(malignant[i])
    temp = list(temp_image.getdata())
    temp_array = np.array(temp)
    if temp_array.shape == (2500, 3):
        data_array = temp_array.reshape(50, 50, 3)
        data = data_array/255
        images_1_data.append(data)
print('Done!')

Done!


In [41]:
np.save('malignant_rgb', images_1_data)

In [42]:
malignant_subset = random.sample(images_1_data, 5000)
np.save('malignant_subset_rgb', malignant_subset)

In [44]:
malignant_subset20k = random.sample(images_1_data, 10000)
np.save('malignant_subset_rgb10k', malignant_subset20k)

## For Black and White Images

In [45]:
images_0_bw = []
benign = images_0[:]
for i in range(len(benign)):
    temp_image = Image.open(benign[i])
    temp_grey = temp_image.convert('L')
    temp = list(temp_grey.getdata())
    temp_array = np.array(temp)
    if temp_array.shape == (2500,):
        data_array = temp_array.reshape(50, 50, 1)
        data = data_array/255
        images_0_bw.append(data)
print('Done!')

Done!


In [46]:
np.save('benign_grey', images_0_bw)
print('Done!')

Done!


In [47]:
benign_subset_grey = random.sample(images_0_bw, 10000)
np.save('benign_subset_grey', benign_subset_grey)

In [48]:
images_1_bw = []
malignant = images_1[:]
for i in range(len(malignant)):
    temp_image = Image.open(malignant[i])
    temp_grey = temp_image.convert('L')
    temp = list(temp_grey.getdata())
    temp_array = np.array(temp)
    if temp_array.shape == (2500,):
        data_array = temp_array.reshape(50, 50, 1)
        data = data_array/255
        images_1_bw.append(data)
print('Done!')

Done!


In [49]:
np.save('malignant_grey', images_1_bw)

In [50]:
malignant_subset_grey = random.sample(images_1_bw, 10000)
np.save('malignant_subset_grey', malignant_subset_grey)