Import required modules

In [1]:
import os
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
import json
import patient_data
import cnn
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Activation, Dense, Conv2D, MaxPool2D, Flatten
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import categorical_crossentropy


2024-11-29 05:56:44.968870: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-11-29 05:56:44.985052: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1732859805.003999   63991 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1732859805.009724   63991 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-29 05:56:45.029756: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

Access the folder path for the cancer and the non-cancer images

In [2]:
all_paths = json.loads(open("./paths.json").read())

personal_path = all_paths['personal_path']
non_cancerous_path = personal_path + all_paths['non_cancerous_path']
cancerous_path = personal_path + all_paths['cancerous_path']

Load in all the DICOM files and preprocess/label images

In [3]:
# Using the patient_data data structure, load in all the patient data and save it in a dictionary with the folder name as the key
def load_all_patients(path, add_label = False):
    patients = {}
    folder = os.listdir(path)
    for name in folder:
        patients[name] = patient_data.Patient(os.path.join(path, name))
        if add_label:
            if patients[name].segpath == None:
                print(name, "was not processed correctly")
            else:
                patients[name].label_imgs()
    return patients

# nc_patients = load_all_patients(non_cancerous_path)
c_patients = load_all_patients(cancerous_path, True)


Setting up train/test data

In [4]:
# # not sure if we need this
# # create a list for the merged data
# x = []
# y = []

# create a list for only the cancerous dataset data
x_c = []
y_c = []
# # create a list for only the non-cancerous dataset data
# x_nc = []
# y_nc = []

for patient in c_patients.values():
    for i, img in enumerate(patient.ct.data.values()):
        x_c.append(img)
        y_c.append(patient.labels[i])
        # # not sure if we need this
        # x.append(img)
        # y.append(patient.labels[i])

x_c, y_c = shuffle(x_c, y_c)
c_patients = None

# for patient in nc_patients.values():
#     for i, img in enumerate(patient.ct.images):
#         x_nc.append(img)
#         y_nc.append(patient.labels[i])
#         # # not sure if we need this
#         # x.append(img)
#         # y.append(patient.labels[i])

# # not sure if we need this
# # Shuffle the merged data
# combined = list(zip(x, y))
# np.random.shuffle(combined)
# x2, y2 = zip(*combined)

def generate_train_test():
    # to ensure equal distribution of non-cancer to cancer data, split the data before merging it
    x_train, x_test, y_train, y_test = train_test_split(x_c, y_c, test_size=0.2, random_state=42)

    # x_train_add, x_test_add, y_train_add, y_test_add = train_test_split(x_nc, y_nc, test_size=0.2, random_state=42)
    # x_train.extend(x_train_add) 
    # x_test.extend(x_test_add) 
    # y_train.extend(y_train_add) 
    # y_test.extend(y_test_add) 

    # Convert lists to arrays
    x_train = np.array(x_train)/255
    x_test = np.array(x_test)/255
    y_train = np.array(y_train)
    y_test = np.array(y_test)

    return x_train, x_test, y_train, y_test, x_train[0]

x_train, x_test, y_train, y_test, test = generate_train_test()

In [5]:
physical_devices = tf.config.experimental.list_physical_devices('GPU')
print('Num GPUs Available: ', len(physical_devices))
if len(physical_devices) > 0:
    tf.config.experimental.set_memory_growth(physical_devices[0], True)

Num GPUs Available:  1


In [6]:
model = Sequential([
    Conv2D(filters=32, kernel_size=(3,3), activation='relu', padding='same', input_shape=(512,512,1)),
    MaxPool2D(pool_size=(2,2), strides=2),
    Conv2D(filters=64, kernel_size=(3,3), activation='relu', padding='same'),
    MaxPool2D(pool_size=(2,2), strides=2),
    Flatten(),
    Dense(units=1, activation='sigmoid')
])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
I0000 00:00:1732859997.162589   63991 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 43598 MB memory:  -> device: 0, name: NVIDIA A40, pci bus id: 0000:ca:00.0, compute capability: 8.6


In [7]:
model.summary()
model.compile(
    optimizer=Adam(learning_rate=0.0001),
    loss='binary_crossentropy',
    metrics=['accuracy']
)

In [8]:
model.fit(
    x=x_train,
    y=y_train,
    validation_data=(x_test, y_test),
    batch_size=10,
    epochs=20,
    verbose=1
)

Epoch 1/20


I0000 00:00:1732860042.741068   64734 service.cc:148] XLA service 0x7f93540098a0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1732860042.741937   64734 service.cc:156]   StreamExecutor device (0): NVIDIA A40, Compute Capability 8.6
2024-11-29 06:00:42.850848: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
I0000 00:00:1732860043.050635   64734 cuda_dnn.cc:529] Loaded cuDNN version 90300


[1m   7/3021[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m1:05[0m 22ms/step - accuracy: 0.5966 - loss: 0.6111

I0000 00:00:1732860046.347139   64734 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m3021/3021[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m85s[0m 27ms/step - accuracy: 0.8646 - loss: 0.3156 - val_accuracy: 0.9027 - val_loss: 0.2160
Epoch 2/20
[1m3021/3021[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m67s[0m 22ms/step - accuracy: 0.9208 - loss: 0.1839 - val_accuracy: 0.9366 - val_loss: 0.1529
Epoch 3/20
[1m3021/3021[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m69s[0m 23ms/step - accuracy: 0.9491 - loss: 0.1258 - val_accuracy: 0.9486 - val_loss: 0.1247
Epoch 4/20
[1m3021/3021[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m69s[0m 23ms/step - accuracy: 0.9668 - loss: 0.0889 - val_accuracy: 0.9519 - val_loss: 0.1228
Epoch 5/20
[1m3021/3021[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m69s[0m 23ms/step - accuracy: 0.9766 - loss: 0.0655 - val_accuracy: 0.9619 - val_loss: 0.0937
Epoch 6/20
[1m3021/3021[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m69s[0m 23ms/step - accuracy: 0.9843 - loss: 0.0486 - val_accuracy: 0.9674 - val_loss: 0.0814
Epoch 7/20
[1m

<keras.src.callbacks.history.History at 0x7faae0221280>

In [9]:
predictions = (model.predict(x_test) > 0.5).astype("int32")


[1m236/236[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 20ms/step


In [10]:
from sklearn.metrics import classification_report
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.98      0.99      0.99      6438
           1       0.96      0.87      0.91      1113

    accuracy                           0.98      7551
   macro avg       0.97      0.93      0.95      7551
weighted avg       0.98      0.98      0.98      7551



In [14]:
print(len(y_test))
for i, j in enumerate(y_test):
    k = predictions[i][0]
    if j != k:
        print(j, k)

7551
1 0
1 0
0 1
1 0
0 1
1 0
1 0
1 0
0 1
0 1
0 1
0 1
1 0
1 0
0 1
1 0
1 0
0 1
1 0
1 0
1 0
1 0
1 0
1 0
1 0
1 0
1 0
1 0
1 0
1 0
1 0
1 0
1 0
1 0
1 0
0 1
1 0
1 0
1 0
1 0
0 1
0 1
1 0
1 0
1 0
0 1
1 0
1 0
1 0
1 0
0 1
1 0
1 0
1 0
0 1
1 0
1 0
0 1
1 0
1 0
0 1
1 0
1 0
1 0
1 0
0 1
1 0
1 0
1 0
1 0
1 0
0 1
1 0
1 0
0 1
1 0
1 0
1 0
1 0
1 0
1 0
1 0
0 1
1 0
1 0
1 0
1 0
1 0
1 0
1 0
1 0
1 0
0 1
1 0
1 0
1 0
1 0
1 0
1 0
1 0
1 0
1 0
1 0
1 0
1 0
1 0
1 0
1 0
1 0
1 0
1 0
1 0
0 1
0 1
0 1
0 1
1 0
1 0
1 0
1 0
1 0
0 1
1 0
1 0
1 0
1 0
1 0
0 1
1 0
1 0
1 0
1 0
0 1
1 0
1 0
1 0
1 0
0 1
1 0
1 0
1 0
1 0
1 0
1 0
1 0
0 1
1 0
0 1
0 1
1 0
1 0
1 0
1 0
0 1
1 0
1 0
1 0
1 0
0 1
1 0
1 0
1 0
1 0
1 0
0 1
1 0
1 0
1 0
0 1
1 0
1 0
1 0
1 0
1 0
1 0
1 0
0 1
1 0
1 0
1 0
1 0
0 1
1 0
0 1


Train and test CNN model

In [12]:

# num_tests = 1
# cnns = []
# for i in range(num_tests):
# cnns.append(cnn.CNN(x_train, x_test, y_train, y_test))

Cross validation and bootstrapping

In [13]:
# print(cnns[0].test_acc)