# Load and Clean Images

## Libraries to Import

In [None]:
import os
from os import path
import shutil
import numpy as np
import pandas as pd
import tensorflow as tf

## Creating Train/Test/Val Splits

Our testing set was not split into normal and cancer directories; only a CSV with labels and file names was provided. So we build one list of file names for the positive class (label 1) and one for the negative class (label 0).

In [None]:
# Label sheet for the preliminary test set; the cells below read its
# `labels` and `new_names` columns.
labels_csv_path = '/Users/arencarpenter/Desktop/testing/C-NMC_test_prelim_phase_data_labels.csv'
all_df_labels = pd.read_csv(labels_csv_path)

In [None]:
# Rows labelled 1 form the positive class; keep their file names as a plain list.
is_positive = all_df_labels['labels'] == 1
pos_images = all_df_labels[is_positive]
pos_image_list = list(pos_images['new_names'])

In [None]:
# Rows labelled 0 form the negative class; keep their file names as a plain list.
is_negative = all_df_labels['labels'] == 0
neg_images = all_df_labels[is_negative]
neg_image_list = list(neg_images['new_names'])

Use `shutil` to move the positive-class files into the `all` directory.

In [None]:
src = "/Users/arencarpenter/Desktop/testing/C-NMC_test_prelim_phase_data"
dst = "/Users/arencarpenter/Desktop/testing/all"

# shutil.move only moves a file *into* `dst` when `dst` is an existing
# directory; otherwise the file is renamed to `dst` itself and each later move
# can overwrite it. Create the class folder up front so no image is lost.
os.makedirs(dst, exist_ok=True)

# A set makes the membership test constant-time per directory entry.
pos_image_names = set(pos_image_list)

# Only regular files that the label sheet marks as positive are moved.
files = [i for i in os.listdir(src) if i in pos_image_names and path.isfile(path.join(src, i))]
for f in files:
    shutil.move(path.join(src, f), dst)

Use `shutil` to move the negative-class files into the `normal` directory.

In [None]:
src = "/Users/arencarpenter/Desktop/testing/C-NMC_test_prelim_phase_data"
dst = "/Users/arencarpenter/Desktop/testing/normal"

# shutil.move only moves a file *into* `dst` when `dst` is an existing
# directory; otherwise the file is renamed to `dst` itself and each later move
# can overwrite it. Create the class folder up front so no image is lost.
os.makedirs(dst, exist_ok=True)

# A set makes the membership test constant-time per directory entry.
neg_image_names = set(neg_image_list)

# Only regular files that the label sheet marks as negative are moved.
files = [i for i in os.listdir(src) if i in neg_image_names and path.isfile(path.join(src, i))]
for f in files:
    shutil.move(path.join(src, f), dst)

The training data came split into subdirectories, so we move the contents of the negative-class (`hem`) subdirectory — here `fold_2/hem` — up into the single `normal` directory.

In [None]:
source = "/Users/arencarpenter/Desktop/training/fold_2/hem/"
destination = "/Users/arencarpenter/Desktop/training/normal/"

# shutil.move only moves an entry *into* `destination` when it is an existing
# directory, so create it first rather than risk renaming a file to that path.
os.makedirs(destination, exist_ok=True)

# Every entry found in the fold's `hem` folder is moved; os.listdir already
# returns a list, so no extra copy is needed.
files = os.listdir(source)
for f in files:
    shutil.move(path.join(source, f), destination)

Now we can work on creating our validation sets.

In [None]:
# Single root for the project's data folders, so relocating the dataset means
# editing one line instead of every path.
DATA_DIR = '/Users/arencarpenter/Desktop/Detecting_ALL_with_CNN/Data'

# The trailing '' component makes os.path.join keep the trailing separator,
# so the resulting strings are identical to the previously hard-coded paths.
all_train_dir = os.path.join(DATA_DIR, 'training', 'all', '')
normal_train_dir = os.path.join(DATA_DIR, 'training', 'normal', '')
validation_dir = os.path.join(DATA_DIR, 'validation', '')

See how many training images we have for each class so we can split off a validation set.

In [None]:
# os.listdir returns entries in arbitrary order, so the listings are sorted and
# then shuffled with a fixed seed: the "first N" slices taken for the
# validation split further down are then a random but reproducible sample.
# Dot-files (e.g. a macOS .DS_Store) are skipped so they are neither counted
# nor moved as if they were images.
split_rng = np.random.default_rng(42)

all_train_imgs = sorted(
    name for name in os.listdir(all_train_dir) if not name.startswith('.')
)
split_rng.shuffle(all_train_imgs)

normal_train_imgs = sorted(
    name for name in os.listdir(normal_train_dir) if not name.startswith('.')
)
split_rng.shuffle(normal_train_imgs)

In [None]:
# Report the size of each class listing before carving out the validation set.
for heading, image_names in (
    ("Number of ALL Train Images: ", all_train_imgs),
    ("Number of Normal Train Images: ", normal_train_imgs),
):
    print(heading, len(image_names))

Make a new validation folder and subfolders for 'ALL' and 'normal'.

In [None]:
# Create the top-level validation directory. os.mkdir does not create missing
# parents and raises FileExistsError if the directory is already present.
os.mkdir(validation_dir)

In [None]:
# Per-class validation folders, using the same class names as the training folders.
all_val_dir, normal_val_dir = (
    path.join(validation_dir, class_name) for class_name in ('all', 'normal')
)

In [None]:
# Create one validation subfolder per class (ALL first, then normal).
for class_val_dir in (all_val_dir, normal_val_dir):
    os.mkdir(class_val_dir)

Put 25% of the training images for each class into the validation set for the respective class.

In [None]:
# Hold out 25% of the ALL training images (rounded down) for validation. The
# count is derived from the listing rather than hard-coded, so it stays
# correct if the number of training images changes.
n_all_val = int(len(all_train_imgs) * 0.25)
imgs = all_train_imgs[:n_all_val]
for img in imgs:
    origin = os.path.join(all_train_dir, img)
    destination = os.path.join(all_val_dir, img)
    shutil.move(origin, destination)

In [None]:
# Hold out 25% of the normal training images (rounded down) for validation.
# The count is derived from the listing rather than hard-coded, so it stays
# correct if the number of training images changes.
n_normal_val = int(len(normal_train_imgs) * 0.25)
imgs = normal_train_imgs[:n_normal_val]
for img in imgs:
    origin = os.path.join(normal_train_dir, img)
    destination = os.path.join(normal_val_dir, img)
    shutil.move(origin, destination)

Here is the final count for each set (Train/Validation/Test) and each class (ALL and normal).

In [None]:
# One row per split/class: (label, directory, whether a blank line follows).
final_count_rows = (
    ('Train ALL', '/Users/arencarpenter/Desktop/Detecting_ALL_with_CNN/Data/training/all/', False),
    ('Train Normal', '/Users/arencarpenter/Desktop/Detecting_ALL_with_CNN/Data/training/normal/', True),
    ('Val ALL', '/Users/arencarpenter/Desktop/Detecting_ALL_with_CNN/Data/validation/all/', False),
    ('Val Normal', '/Users/arencarpenter/Desktop/Detecting_ALL_with_CNN/Data/validation/normal/', True),
    ('Test ALL', '/Users/arencarpenter/Desktop/Detecting_ALL_with_CNN/Data/testing/all/', False),
    ('Test Normal', '/Users/arencarpenter/Desktop/Detecting_ALL_with_CNN/Data/testing/normal/', False),
)

for label, folder, ends_group in final_count_rows:
    n_entries = len(os.listdir(folder))
    if ends_group:
        # The extra '\n' argument visually separates the Train / Val / Test groups.
        print(label, n_entries, '\n')
    else:
        print(label, n_entries)