Import required modules

In [1]:
import os
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
import json
import patient_data
import cnn
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Activation, Dense, Conv2D, MaxPool2D, Flatten
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import categorical_crossentropy


2024-11-30 18:27:51.554786: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-11-30 18:27:52.218259: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1732991272.464488  124001 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1732991272.532452  124001 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-30 18:27:53.159465: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

Access the folder path for the cancer and the non-cancer images

In [2]:
all_paths = json.loads(open("./paths.json").read())

personal_path = all_paths['personal_path']
non_cancerous_path = personal_path + all_paths['non_cancerous_path']
cancerous_path = personal_path + all_paths['cancerous_path']

Load in all the DICOM files and preprocess/label images

In [3]:
# Using the patient_data data structure, load in all the patient data and save it in a dictionary with the folder name as the key
def load_all_patients(path, add_label = False):
    patients = {}
    folder = os.listdir(path)
    for name in folder:
        patients[name] = patient_data.Patient(os.path.join(path, name))
        if add_label:
            if patients[name].segpath == None:
                print(name, "was not processed correctly")
            else:
                patients[name].label_imgs()
    return patients

# nc_patients = load_all_patients(non_cancerous_path)
c_patients = load_all_patients(cancerous_path, True)


Setting up train/test data

In [4]:
# # not sure if we need this
# # create a list for the merged data
# x = []
# y = []

# create a list for only the cancerous dataset data
x_c = []
y_c = []
# # create a list for only the non-cancerous dataset data
# x_nc = []
# y_nc = []

for patient in c_patients.values():
    for i, img in enumerate(patient.ct.data.values()):
        x_c.append(img)
        y_c.append(patient.labels[i])
        # # not sure if we need this
        # x.append(img)
        # y.append(patient.labels[i])

x_c, y_c = shuffle(x_c, y_c)
c_patients = None

# for patient in nc_patients.values():
#     for i, img in enumerate(patient.ct.images):
#         x_nc.append(img)
#         y_nc.append(patient.labels[i])
#         # # not sure if we need this
#         # x.append(img)
#         # y.append(patient.labels[i])

# # not sure if we need this
# # Shuffle the merged data
# combined = list(zip(x, y))
# np.random.shuffle(combined)
# x2, y2 = zip(*combined)

# def generate_train_test():
#     # to ensure equal distribution of non-cancer to cancer data, split the data before merging it
#     x_train, x_test, y_train, y_test = train_test_split(x_c, y_c, test_size=0.2, random_state=42)

#     # x_train_add, x_test_add, y_train_add, y_test_add = train_test_split(x_nc, y_nc, test_size=0.2, random_state=42)
#     # x_train.extend(x_train_add) 
#     # x_test.extend(x_test_add) 
#     # y_train.extend(y_train_add) 
#     # y_test.extend(y_test_add) 

#     # Convert lists to arrays
#     x_train = np.array(x_train)/255
#     x_test = np.array(x_test)/255
#     y_train = np.array(y_train)
#     y_test = np.array(y_test)

#     return x_train, x_test, y_train, y_test, x_train[0]

# x_train, x_test, y_train, y_test, test = generate_train_test()

In [5]:
physical_devices = tf.config.experimental.list_physical_devices('GPU')
print('Num GPUs Available: ', len(physical_devices))
if len(physical_devices) > 0:
    tf.config.experimental.set_memory_growth(physical_devices[0], True)

Num GPUs Available:  1


In [13]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report
# Define K-Fold Cross-Validation
n_splits = 5
kfold = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
x_data = np.array(x_c)/255  # Normalize the images
y_data = np.array(y_c)
# Model training and evaluation loop
fold_results = []

for fold, (train_idx, val_idx) in enumerate(kfold.split(x_c, y_c)):
    print(f"\nTraining fold {fold + 1}/{n_splits}")
    
    # Split data
    x_train, x_val = x_data[train_idx], x_data[val_idx]
    y_train, y_val = y_data[train_idx], y_data[val_idx]

    # Build the model
    model = Sequential([
        Conv2D(filters=32, kernel_size=(3, 3), activation='relu', padding='same', input_shape=(512, 512, 1)),
        MaxPool2D(pool_size=(2, 2), strides=2),
        Conv2D(filters=64, kernel_size=(3, 3), activation='relu', padding='same'),
        MaxPool2D(pool_size=(2, 2), strides=2),
        Flatten(),
        Dense(units=1, activation='sigmoid')
    ])
    model.compile(
        optimizer=Adam(learning_rate=0.0001),
        loss='binary_crossentropy',
        metrics=['accuracy']
    )

    # Train the model
    model.fit(
        x=x_train,
        y=y_train,
        validation_data=(x_val, y_val),
        batch_size=10,
        epochs=20,
        verbose=1
    )

    # Evaluate the model
    predictions = (model.predict(x_val) > 0.5).astype("int32")
    report = classification_report(y_val, predictions, output_dict=True)
    print(classification_report(y_val, predictions))
    
    # Save fold results
    fold_results.append(report)

# Aggregate results
avg_accuracy = np.mean([fold['accuracy'] for fold in fold_results])
print(f"\nAverage Accuracy Across {n_splits} Folds: {avg_accuracy:.4f}")


Training fold 1/5


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/20
[1m3021/3021[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m77s[0m 25ms/step - accuracy: 0.8449 - loss: 6.3929 - val_accuracy: 0.9246 - val_loss: 0.1996
Epoch 2/20
[1m3021/3021[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m69s[0m 23ms/step - accuracy: 0.9286 - loss: 0.1838 - val_accuracy: 0.9333 - val_loss: 0.1661
Epoch 3/20
[1m3021/3021[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m70s[0m 23ms/step - accuracy: 0.9487 - loss: 0.1306 - val_accuracy: 0.9542 - val_loss: 0.1226
Epoch 4/20
[1m3021/3021[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m72s[0m 24ms/step - accuracy: 0.9585 - loss: 0.1120 - val_accuracy: 0.9536 - val_loss: 0.1298
Epoch 5/20
[1m3021/3021[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m73s[0m 24ms/step - accuracy: 0.9768 - loss: 0.0660 - val_accuracy: 0.9633 - val_loss: 0.1043
Epoch 6/20
[1m3021/3021[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m71s[0m 23ms/step - accuracy: 0.9820 - loss: 0.0513 - val_accuracy: 0.9587 - val_loss: 0.1236
Epoc

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/20
[1m3021/3021[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m85s[0m 28ms/step - accuracy: 0.8449 - loss: 6.8852 - val_accuracy: 0.9111 - val_loss: 0.2148
Epoch 2/20
[1m3021/3021[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m71s[0m 23ms/step - accuracy: 0.9265 - loss: 0.1773 - val_accuracy: 0.9277 - val_loss: 0.1772
Epoch 3/20
[1m3021/3021[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m71s[0m 24ms/step - accuracy: 0.9540 - loss: 0.1151 - val_accuracy: 0.9499 - val_loss: 0.1336
Epoch 4/20
[1m3021/3021[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m72s[0m 24ms/step - accuracy: 0.9709 - loss: 0.0781 - val_accuracy: 0.9517 - val_loss: 0.1478
Epoch 5/20
[1m3021/3021[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m71s[0m 23ms/step - accuracy: 0.9803 - loss: 0.0607 - val_accuracy: 0.9505 - val_loss: 0.1373
Epoch 6/20
[1m3021/3021[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m71s[0m 24ms/step - accuracy: 0.9853 - loss: 0.0419 - val_accuracy: 0.9550 - val_loss: 0.1417
Epoc

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/20
[1m3021/3021[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m79s[0m 26ms/step - accuracy: 0.8414 - loss: 9.7533 - val_accuracy: 0.9333 - val_loss: 0.1663
Epoch 2/20
[1m3021/3021[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m72s[0m 24ms/step - accuracy: 0.9346 - loss: 0.1623 - val_accuracy: 0.9278 - val_loss: 0.1780
Epoch 3/20
[1m3021/3021[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m72s[0m 24ms/step - accuracy: 0.9573 - loss: 0.1143 - val_accuracy: 0.9550 - val_loss: 0.1191
Epoch 4/20
[1m3021/3021[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m72s[0m 24ms/step - accuracy: 0.9738 - loss: 0.0734 - val_accuracy: 0.9650 - val_loss: 0.0971
Epoch 5/20
[1m3021/3021[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m71s[0m 24ms/step - accuracy: 0.9813 - loss: 0.0538 - val_accuracy: 0.9682 - val_loss: 0.0938
Epoch 6/20
[1m3021/3021[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m72s[0m 24ms/step - accuracy: 0.9840 - loss: 0.0431 - val_accuracy: 0.9634 - val_loss: 0.1056
Epoc

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/20
[1m3021/3021[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m76s[0m 25ms/step - accuracy: 0.8468 - loss: 5.4625 - val_accuracy: 0.9286 - val_loss: 0.1775
Epoch 2/20
[1m3021/3021[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m70s[0m 23ms/step - accuracy: 0.9382 - loss: 0.1573 - val_accuracy: 0.9498 - val_loss: 0.1329
Epoch 3/20
[1m3021/3021[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m73s[0m 24ms/step - accuracy: 0.9635 - loss: 0.0962 - val_accuracy: 0.9605 - val_loss: 0.1055
Epoch 4/20
[1m3021/3021[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m71s[0m 24ms/step - accuracy: 0.9788 - loss: 0.0590 - val_accuracy: 0.9656 - val_loss: 0.1053
Epoch 5/20
[1m3021/3021[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m73s[0m 24ms/step - accuracy: 0.9860 - loss: 0.0434 - val_accuracy: 0.9669 - val_loss: 0.1026
Epoch 6/20
[1m3021/3021[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m73s[0m 24ms/step - accuracy: 0.9900 - loss: 0.0295 - val_accuracy: 0.9695 - val_loss: 0.0984
Epoc

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/20
[1m3021/3021[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m77s[0m 25ms/step - accuracy: 0.8440 - loss: 6.4074 - val_accuracy: 0.9004 - val_loss: 0.2308
Epoch 2/20
[1m3021/3021[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m71s[0m 24ms/step - accuracy: 0.9227 - loss: 0.1900 - val_accuracy: 0.8901 - val_loss: 0.2608
Epoch 3/20
[1m3021/3021[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m70s[0m 23ms/step - accuracy: 0.9383 - loss: 0.1508 - val_accuracy: 0.9477 - val_loss: 0.1454
Epoch 4/20
[1m3021/3021[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m71s[0m 24ms/step - accuracy: 0.9676 - loss: 0.0878 - val_accuracy: 0.9619 - val_loss: 0.1077
Epoch 5/20
[1m3021/3021[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m70s[0m 23ms/step - accuracy: 0.9788 - loss: 0.0609 - val_accuracy: 0.9653 - val_loss: 0.1071
Epoch 6/20
[1m3021/3021[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m72s[0m 24ms/step - accuracy: 0.9845 - loss: 0.0494 - val_accuracy: 0.9657 - val_loss: 0.0962
Epoc

In [14]:
    # Train the model
    model.fit(
        x=x_train,
        y=y_train,
        validation_data=(x_val, y_val),
        batch_size=10,
        epochs=20,
        verbose=1
    )

    # Evaluate the model
    predictions = (model.predict(x_val) > 0.5).astype("int32")
    report = classification_report(y_val, predictions, output_dict=True)
    print(classification_report(y_val, predictions))
    
    # Save fold results
    fold_results.append(report)

# Aggregate results
avg_accuracy = np.mean([fold['accuracy'] for fold in fold_results])
print(f"\nAverage Accuracy Across {n_splits} Folds: {avg_accuracy:.4f}")

Epoch 1/20
[1m3021/3021[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m72s[0m 24ms/step - accuracy: 0.9984 - loss: 0.0053 - val_accuracy: 0.9748 - val_loss: 0.1350
Epoch 2/20
[1m3021/3021[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m73s[0m 24ms/step - accuracy: 0.9972 - loss: 0.0093 - val_accuracy: 0.9721 - val_loss: 0.1504
Epoch 3/20
[1m3021/3021[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m74s[0m 24ms/step - accuracy: 0.9980 - loss: 0.0078 - val_accuracy: 0.9742 - val_loss: 0.1395
Epoch 4/20
[1m3021/3021[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m72s[0m 24ms/step - accuracy: 0.9982 - loss: 0.0054 - val_accuracy: 0.9746 - val_loss: 0.1434
Epoch 5/20
[1m3021/3021[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m72s[0m 24ms/step - accuracy: 0.9986 - loss: 0.0057 - val_accuracy: 0.9750 - val_loss: 0.1338
Epoch 6/20
[1m3021/3021[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m71s[0m 23ms/step - accuracy: 0.9991 - loss: 0.0037 - val_accuracy: 0.9750 - val_loss: 0.1422
Epoc

In [6]:
model = Sequential([
    Conv2D(filters=32, kernel_size=(3,3), activation='relu', padding='same', input_shape=(512,512,1)),
    MaxPool2D(pool_size=(2,2), strides=2),
    Conv2D(filters=64, kernel_size=(3,3), activation='relu', padding='same'),
    MaxPool2D(pool_size=(2,2), strides=2),
    Flatten(),
    Dense(units=1, activation='sigmoid')
])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
I0000 00:00:1732859997.162589   63991 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 43598 MB memory:  -> device: 0, name: NVIDIA A40, pci bus id: 0000:ca:00.0, compute capability: 8.6


In [7]:
model.summary()
model.compile(
    optimizer=Adam(learning_rate=0.0001),
    loss='binary_crossentropy',
    metrics=['accuracy']
)

In [8]:
model.fit(
    x=x_train,
    y=y_train,
    validation_data=(x_test, y_test),
    batch_size=10,
    epochs=20,
    verbose=1
)

Epoch 1/20


I0000 00:00:1732860042.741068   64734 service.cc:148] XLA service 0x7f93540098a0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1732860042.741937   64734 service.cc:156]   StreamExecutor device (0): NVIDIA A40, Compute Capability 8.6
2024-11-29 06:00:42.850848: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
I0000 00:00:1732860043.050635   64734 cuda_dnn.cc:529] Loaded cuDNN version 90300


[1m   7/3021[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m1:05[0m 22ms/step - accuracy: 0.5966 - loss: 0.6111

I0000 00:00:1732860046.347139   64734 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m3021/3021[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m85s[0m 27ms/step - accuracy: 0.8646 - loss: 0.3156 - val_accuracy: 0.9027 - val_loss: 0.2160
Epoch 2/20
[1m3021/3021[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m67s[0m 22ms/step - accuracy: 0.9208 - loss: 0.1839 - val_accuracy: 0.9366 - val_loss: 0.1529
Epoch 3/20
[1m3021/3021[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m69s[0m 23ms/step - accuracy: 0.9491 - loss: 0.1258 - val_accuracy: 0.9486 - val_loss: 0.1247
Epoch 4/20
[1m3021/3021[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m69s[0m 23ms/step - accuracy: 0.9668 - loss: 0.0889 - val_accuracy: 0.9519 - val_loss: 0.1228
Epoch 5/20
[1m3021/3021[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m69s[0m 23ms/step - accuracy: 0.9766 - loss: 0.0655 - val_accuracy: 0.9619 - val_loss: 0.0937
Epoch 6/20
[1m3021/3021[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m69s[0m 23ms/step - accuracy: 0.9843 - loss: 0.0486 - val_accuracy: 0.9674 - val_loss: 0.0814
Epoch 7/20
[1m

<keras.src.callbacks.history.History at 0x7faae0221280>

In [9]:
predictions = (model.predict(x_test) > 0.5).astype("int32")


[1m236/236[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 20ms/step


In [15]:
from sklearn.metrics import classification_report
print(classification_report(y_test, predictions))

NameError: name 'y_test' is not defined

In [14]:
print(len(y_test))
for i, j in enumerate(y_test):
    k = predictions[i][0]
    if j != k:
        print(j, k)

7551
1 0
1 0
0 1
1 0
0 1
1 0
1 0
1 0
0 1
0 1
0 1
0 1
1 0
1 0
0 1
1 0
1 0
0 1
1 0
1 0
1 0
1 0
1 0
1 0
1 0
1 0
1 0
1 0
1 0
1 0
1 0
1 0
1 0
1 0
1 0
0 1
1 0
1 0
1 0
1 0
0 1
0 1
1 0
1 0
1 0
0 1
1 0
1 0
1 0
1 0
0 1
1 0
1 0
1 0
0 1
1 0
1 0
0 1
1 0
1 0
0 1
1 0
1 0
1 0
1 0
0 1
1 0
1 0
1 0
1 0
1 0
0 1
1 0
1 0
0 1
1 0
1 0
1 0
1 0
1 0
1 0
1 0
0 1
1 0
1 0
1 0
1 0
1 0
1 0
1 0
1 0
1 0
0 1
1 0
1 0
1 0
1 0
1 0
1 0
1 0
1 0
1 0
1 0
1 0
1 0
1 0
1 0
1 0
1 0
1 0
1 0
1 0
0 1
0 1
0 1
0 1
1 0
1 0
1 0
1 0
1 0
0 1
1 0
1 0
1 0
1 0
1 0
0 1
1 0
1 0
1 0
1 0
0 1
1 0
1 0
1 0
1 0
0 1
1 0
1 0
1 0
1 0
1 0
1 0
1 0
0 1
1 0
0 1
0 1
1 0
1 0
1 0
1 0
0 1
1 0
1 0
1 0
1 0
0 1
1 0
1 0
1 0
1 0
1 0
0 1
1 0
1 0
1 0
0 1
1 0
1 0
1 0
1 0
1 0
1 0
1 0
0 1
1 0
1 0
1 0
1 0
0 1
1 0
0 1


Train and test CNN model

In [12]:

# num_tests = 1
# cnns = []
# for i in range(num_tests):
# cnns.append(cnn.CNN(x_train, x_test, y_train, y_test))

Cross validation and bootstrapping

In [13]:
# print(cnns[0].test_acc)