In [27]:
#Parts of notebook inspired by: https://www.kaggle.com/linjustin/cnn-keras

#Imports
#%%
import os
import numpy as np
import pathlib
import matplotlib.pyplot as plt
import tensorflow  as tf
from tensorflow import keras
from keras.models import Sequential
from keras.layers import (Dense,Conv2D, Dropout, Activation, Flatten, MaxPooling2D, BatchNormalization)
np.random.seed(987123)
#%%

In [28]:
#Dataset already split into training test and validation sets
DATA_PATH = './tcga_coad_msi_mss'
training_set_dir = os.path.join(DATA_PATH, 'train')
training_root = pathlib.Path(training_set_dir)

test_set_dir = os.path.join(DATA_PATH, 'test')
test_root = pathlib.Path(test_set_dir)

validation_set_dir = os.path.join(DATA_PATH,'val')
validation_root = pathlib.Path(validation_set_dir)

train_MSIMUT_dir = os.path.join(training_set_dir, 'MSIMUT')
train_MSS_dir = os.path.join(training_set_dir, 'MSS')

test_MSIMUT_dir = os.path.join(test_set_dir, 'MSIMUT')
test_MSS_dir = os.path.join(test_set_dir, 'MSS')

validation_MSIMUT_dir = os.path.join(validation_set_dir, 'MSIMUT')
validation_MSS_dir = os.path.join(validation_set_dir, 'MSS')

In [29]:
#Files in root training folder - same folder structure in every set is the same.
for image in training_root.glob("*"):
 print(image.name)

MSIMUT
MSS


In [30]:
# Amount of images in each set

#os.listdir() used to get the list of all files and directories in the specified directory
#https://www.geeksforgeeks.org/python-os-listdir-method/

training_set_size_MSIMUT, training_set_size_MSS = len(os.listdir(train_MSIMUT_dir)), len(os.listdir(train_MSS_dir))
test_set_size_MSIMUT, test_set_size_MSS = len(os.listdir(test_MSIMUT_dir)), len(os.listdir(test_MSS_dir))
validation_set_size_MSIMUT, validation_set_size_MSS = len(os.listdir(validation_MSIMUT_dir)), len(os.listdir(validation_MSS_dir))

print("Amount of MSIMUT images in training set: ", training_set_size_MSIMUT)
print("Amount of MSS images in training set: ", training_set_size_MSS)

print("Amount of MSIMUT images in training set: ", test_set_size_MSIMUT)
print("Amount of MSS images in training set: ", test_set_size_MSS)

print("Amount of MSIMUT images in validation set: ", validation_set_size_MSIMUT)
print("Amount of MSS images in validation set: ", validation_set_size_MSS)

Amount of MSIMUT images in training set:  60031
Amount of MSS images in training set:  93818
Amount of MSIMUT images in training set:  7505
Amount of MSS images in training set:  11728
Amount of MSIMUT images in validation set:  7503
Amount of MSS images in validation set:  11727


In [31]:
#A list of files returned as strings inside of both the MSIMUT and MSS folders.
training_ds_list = tf.data.Dataset.list_files(str(training_root/'*/*'))

for image in training_ds_list.take(10):
    print(image.numpy())

b'tcga_coad_msi_mss\\train\\MSIMUT\\blk-QDTECMYTEINN-TCGA-NH-A5IV-01Z-00-DX1.jpg'
b'tcga_coad_msi_mss\\train\\MSS\\blk-WCSKTTQGYTHG-TCGA-F5-6813-01Z-00-DX1.jpg'
b'tcga_coad_msi_mss\\train\\MSIMUT\\blk-FCAPEQLDQHQY-TCGA-A6-5661-01Z-00-DX1.jpg'
b'tcga_coad_msi_mss\\train\\MSS\\blk-NHEHPHDTRNKT-TCGA-AA-A02O-01Z-00-DX1.jpg'
b'tcga_coad_msi_mss\\train\\MSIMUT\\blk-GEFRCPNKTQYF-TCGA-D5-6530-01Z-00-DX1.jpg'
b'tcga_coad_msi_mss\\train\\MSIMUT\\blk-SWACLQWHEADM-TCGA-CM-4746-01Z-00-DX1.jpg'
b'tcga_coad_msi_mss\\train\\MSS\\blk-YLWNRDITRKHE-TCGA-DC-5869-01Z-00-DX1.jpg'
b'tcga_coad_msi_mss\\train\\MSS\\blk-YFFYRFSMFSSG-TCGA-NH-A50U-01Z-00-DX1.jpg'
b'tcga_coad_msi_mss\\train\\MSS\\blk-WCEGHYRMDGMP-TCGA-DM-A1D6-01Z-00-DX1.jpg'
b'tcga_coad_msi_mss\\train\\MSIMUT\\blk-CEWINHLGNCKQ-TCGA-EI-6882-01Z-00-DX1.jpg'


In [32]:
#Pipeline: extracting the label and image pairs. https://www.tensorflow.org/guide/data#consuming_sets_of_files
#labels are based on the folder names, so all images in the MSS folder will have the label MSS and the same for MSIMUT
def process_file_path(file_path):
    label = tf.strings.split(file_path, os.sep)[-2]
    return tf.io.read_file(file_path), label

training_ds_labeled = training_ds_list.map(process_file_path)

In [33]:
#displaying an image displaying raw binary data and the label. 
for image_raw, label_text in training_ds_labeled.take(1):
    print(repr(image_raw.numpy()[:100]))
    print()
    print(label_text.numpy())


b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x01\x00H\x00H\x00\x00\xff\xdb\x00C\x00\x03\x02\x02\x03\x02\x02\x03\x03\x03\x03\x04\x03\x03\x04\x05\x08\x05\x05\x04\x04\x05\n\x07\x07\x06\x08\x0c\n\x0c\x0c\x0b\n\x0b\x0b\r\x0e\x12\x10\r\x0e\x11\x0e\x0b\x0b\x10\x16\x10\x11\x13\x14\x15\x15\x15\x0c\x0f\x17\x18\x16\x14\x18\x12\x14\x15\x14\xff\xdb\x00C\x01\x03\x04\x04\x05\x04\x05'

b'MSIMUT'


In [37]:
plt.figure(figsize=(10,10))
for i in training_ds_labeled.take(1):
    plt.subplot(3,3,i+1)
    plt.imshow(i)
    plt.colorbar()
    plt.grid(False)
plt.show()

TypeError: can only concatenate tuple (not "int") to tuple

<Figure size 720x720 with 0 Axes>