Import required modules

In [1]:
import json
import os

import numpy as np
import tensorflow as tf
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from tensorflow import keras
from tensorflow.keras.layers import Activation, Dense, Conv2D, MaxPool2D, Flatten
from tensorflow.keras.metrics import categorical_crossentropy
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam

import cnn
import patient_data


Load the folder paths for the cancerous and non-cancerous images

In [2]:
# Read the path configuration from paths.json. The context manager closes the
# file handle as soon as parsing is done instead of leaving it open until
# garbage collection.
with open("./paths.json") as paths_file:
    all_paths = json.load(paths_file)

# Dataset folders are built by plain string concatenation onto the personal
# root, so the separators must already be present in the JSON values.
personal_path = all_paths['personal_path']
non_cancerous_path = personal_path + all_paths['non_cancerous_path']
cancerous_path = personal_path + all_paths['cancerous_path']

Load in all the DICOM files and preprocess/label images

In [3]:
def load_all_patients(path, add_label=False):
    """Load every entry of ``path`` as a patient and return them keyed by entry name.

    Each name returned by ``os.listdir(path)`` is joined onto ``path`` and
    passed to ``patient_data.Patient``. Entries are visited in sorted order so
    the resulting dictionary has the same ordering on every run and platform.

    Args:
        path: Directory whose entries are each treated as one patient folder.
        add_label: When True, ``label_imgs()`` is called on every patient whose
            ``segpath`` is set. Patients whose ``segpath`` is None are reported
            with a printed message and are left unlabelled.

    Returns:
        dict mapping each entry name to its ``patient_data.Patient`` object.
    """
    patients = {}
    # os.listdir gives no ordering guarantee; sorting keeps runs reproducible.
    for name in sorted(os.listdir(path)):
        patient = patient_data.Patient(os.path.join(path, name))
        patients[name] = patient
        if not add_label:
            continue
        # Identity comparison is the correct None test; `== None` can be
        # overridden by a custom __eq__ on whatever segpath holds.
        if patient.segpath is None:
            print(name, "was not processed correctly")
        else:
            patient.label_imgs()
    return patients

# nc_patients = load_all_patients(non_cancerous_path)
# Cancerous cohort: load every patient folder and label its images right away.
c_patients = load_all_patients(cancerous_path, add_label=True)


Setting up train/test data

In [4]:
# # not sure if we need this
# # create a list for the merged data
# x = []
# y = []

# create a list for only the cancerous dataset data
x_c = []
y_c = []
# # create a list for only the non-cancerous dataset data
# x_nc = []
# y_nc = []

# Pair every CT image with the label stored at the same position.
# NOTE(review): this assumes patient.ct.data iterates in the same order that
# patient.labels is indexed, and that every patient has labels -- patients
# reported as "was not processed correctly" never had label_imgs() called.
# Confirm against patient_data.
for patient in c_patients.values():
    for i, img in enumerate(patient.ct.data.values()):
        x_c.append(img)
        y_c.append(patient.labels[i])
        # # not sure if we need this
        # x.append(img)
        # y.append(patient.labels[i])

# Seed the shuffle so the sample order (and everything derived from it) is
# reproducible across kernel restarts; 42 matches the seed used for the split.
x_c, y_c = shuffle(x_c, y_c, random_state=42)
# for patient in nc_patients.values():
#     for i, img in enumerate(patient.ct.images):
#         x_nc.append(img)
#         y_nc.append(patient.labels[i])
#         # # not sure if we need this
#         # x.append(img)
#         # y.append(patient.labels[i])

# # not sure if we need this
# # Shuffle the merged data
# combined = list(zip(x, y))
# np.random.shuffle(combined)
# x2, y2 = zip(*combined)

def generate_train_test(test_size=0.2, random_state=42, stratify=False):
    """Split the cancerous dataset (module-level ``x_c`` / ``y_c``) into train and test arrays.

    Args:
        test_size: Fraction of the samples held out for testing.
        random_state: Seed forwarded to ``train_test_split``.
        stratify: When True, the split preserves the class proportions of
            ``y_c`` in both partitions. Off by default so the default call
            produces the same split as before.

    Returns:
        Tuple ``(x_train, x_test, y_train, y_test, sample)`` of NumPy arrays,
        where ``sample`` is the first training image (``x_train[0]``).
    """
    # to ensure equal distribution of non-cancer to cancer data, split the data before merging it
    x_train, x_test, y_train, y_test = train_test_split(
        x_c,
        y_c,
        test_size=test_size,
        random_state=random_state,
        stratify=y_c if stratify else None,
    )

    # x_train_add, x_test_add, y_train_add, y_test_add = train_test_split(x_nc, y_nc, test_size=0.2, random_state=42)
    # x_train.extend(x_train_add) 
    # x_test.extend(x_test_add) 
    # y_train.extend(y_train_add) 
    # y_test.extend(y_test_add) 

    # Convert lists to arrays
    # NOTE(review): dividing by 255 presumes 8-bit pixel intensities -- confirm
    # against the preprocessing done in patient_data.
    x_train = np.array(x_train)/255
    x_test = np.array(x_test)/255
    y_train = np.array(y_train)
    y_test = np.array(y_test)

    return x_train, x_test, y_train, y_test, x_train[0]

# `test` receives the first training image (the fifth return value).
x_train, x_test, y_train, y_test, test = generate_train_test()

In [5]:
# Enumerate the GPUs TensorFlow can see (stable, non-experimental API).
physical_devices = tf.config.list_physical_devices('GPU')
print('Num GPUs Available: ', len(physical_devices))
# Memory growth must be configured identically on every GPU, so enable it on
# all of them rather than only the first; the loop is a no-op on CPU-only hosts.
for gpu in physical_devices:
    tf.config.experimental.set_memory_growth(gpu, True)

Num GPUs Available:  0


In [10]:
# Small binary classifier: two conv + max-pool stages, then a single sigmoid unit.
# The input shape is declared with an explicit Input layer; passing
# `input_shape=` to the first layer triggers a warning in Keras 3.
model = Sequential([
    keras.Input(shape=(512, 512, 1)),
    Conv2D(filters=32, kernel_size=(3,3), activation='relu', padding='same'),
    MaxPool2D(pool_size=(2,2), strides=2),
    Conv2D(filters=64, kernel_size=(3,3), activation='relu', padding='same'),
    MaxPool2D(pool_size=(2,2), strides=2),
    Flatten(),
    # One sigmoid output: the predicted probability of the positive class.
    Dense(units=1, activation='sigmoid')
])

In [12]:
# Show the layer-by-layer architecture before configuring training.
model.summary()

# Training configuration: Adam with a small step size, binary cross-entropy
# for the single sigmoid output, and accuracy reported per epoch.
compile_options = dict(
    optimizer=Adam(learning_rate=1e-4),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
model.compile(**compile_options)

In [13]:
# Keep the returned History object so the per-epoch loss/accuracy can be
# inspected or plotted later (history.history) instead of being discarded.
# NOTE(review): no validation data is passed, so the logged accuracy is
# training accuracy only.
history = model.fit(
    x=x_train,
    y=y_train,
    batch_size=10,
    epochs=20,
    verbose=2
)

Epoch 1/20
28/28 - 35s - 1s/step - accuracy: 0.8225 - loss: 0.4241
Epoch 2/20
28/28 - 32s - 1s/step - accuracy: 0.8442 - loss: 0.3350
Epoch 3/20
28/28 - 33s - 1s/step - accuracy: 0.8804 - loss: 0.2627
Epoch 4/20
28/28 - 33s - 1s/step - accuracy: 0.9167 - loss: 0.1853
Epoch 5/20
28/28 - 36s - 1s/step - accuracy: 0.9493 - loss: 0.1474
Epoch 6/20
28/28 - 35s - 1s/step - accuracy: 0.9601 - loss: 0.1251
Epoch 7/20
28/28 - 36s - 1s/step - accuracy: 0.9746 - loss: 0.0996
Epoch 8/20
28/28 - 39s - 1s/step - accuracy: 0.9891 - loss: 0.0725
Epoch 9/20
28/28 - 38s - 1s/step - accuracy: 0.9638 - loss: 0.0751
Epoch 10/20
28/28 - 40s - 1s/step - accuracy: 0.9928 - loss: 0.0483
Epoch 11/20
28/28 - 38s - 1s/step - accuracy: 0.9855 - loss: 0.0491
Epoch 12/20
28/28 - 35s - 1s/step - accuracy: 0.9891 - loss: 0.0369
Epoch 13/20
28/28 - 42s - 1s/step - accuracy: 1.0000 - loss: 0.0262
Epoch 14/20
28/28 - 38s - 1s/step - accuracy: 0.9928 - loss: 0.0347
Epoch 15/20
28/28 - 35s - 1s/step - accuracy: 0.9746 - lo

<keras.src.callbacks.history.History at 0x24089c75610>

In [None]:
# Sigmoid outputs for the held-out images, one row per sample.
probabilities = model.predict(x_test)
# Threshold at 0.5 to turn probabilities into hard 0/1 class labels.
predictions = (probabilities > 0.5).astype("int32")


In [16]:
# Per-class precision / recall / F1 on the held-out test split.
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99        61
           1       1.00      0.89      0.94         9

    accuracy                           0.99        70
   macro avg       0.99      0.94      0.97        70
weighted avg       0.99      0.99      0.99        70



In [21]:
# Print each misclassified test sample as "<true label> <predicted label>".
for actual, row in zip(y_test, predictions):
    # Each prediction row holds a single value (one output unit).
    predicted = row[0]
    if actual != predicted:
        print(actual, predicted)

1 0


Train and test CNN model

In [None]:

# num_tests = 1
# cnns = []
# for i in range(num_tests):
# cnns.append(cnn.CNN(x_train, x_test, y_train, y_test))

Cross-validation and bootstrapping

In [None]:
# print(cnns[0].test_acc)