Import required modules

In [1]:
import gc
import json
import math
import numpy as np
import os
import patient_data
import tensorflow as tf

from sklearn.metrics import classification_report
from tensorflow.keras import backend as K
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Dense, Conv2D, MaxPool2D, Flatten
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import StratifiedKFold
from sklearn.utils import shuffle
from tensorflow.keras.preprocessing.image import ImageDataGenerator

2024-12-14 22:04:11.363769: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-12-14 22:04:11.380100: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1734213851.398453   14089 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1734213851.404016   14089 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-12-14 22:04:11.423282: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

Access the folder path for the cancer and the non-cancer images

In [2]:
# Read the machine-specific directory layout from paths.json.
# A context manager is used so the file handle is closed deterministically
# instead of being left for the garbage collector.
with open("./paths.json", encoding="utf-8") as paths_file:
    all_paths = json.load(paths_file)

# 'cancerous_path' is stored relative to 'personal_path'; the two are joined by
# plain string concatenation, so the separator must already be part of the config values.
personal_path = all_paths['personal_path']
cancerous_path = personal_path + all_paths['cancerous_path']

Configure GPUs if applicable

In [3]:
# Enable on-demand GPU memory allocation instead of letting TensorFlow reserve
# all device memory up front.
physical_devices = tf.config.list_physical_devices('GPU')
print('Num GPUs Available: ', len(physical_devices))
# The memory-growth setting has to be identical on every visible GPU, so it is
# applied to all of them rather than only to the first device.
for gpu in physical_devices:
    tf.config.experimental.set_memory_growth(gpu, True)

Num GPUs Available:  1


Load in all the DICOM files and preprocess/label images

In [4]:
# Using the patient_data data structure, load every patient folder under `path`
# and return the successfully loaded patients as a 1-D NumPy object array.
def load_all_patients(path, add_label = False):
    """Load one patient_data.Patient per entry of the directory `path`.

    Parameters
    ----------
    path : str
        Directory whose entries are individual patient folders.
    add_label : bool, optional
        When True, `label_imgs()` is called on every patient that loaded
        successfully.

    Returns
    -------
    numpy.ndarray
        1-D object array of the patients whose `segpath` is not None. Entries
        are visited in sorted name order so the result is reproducible across
        runs and file systems.
    """
    loaded = []
    # os.listdir gives no ordering guarantee; sort for a deterministic result
    for name in sorted(os.listdir(path)):
        patient = patient_data.Patient(os.path.join(path, name))
        # A missing segmentation path marks a patient that failed to load
        if patient.segpath is None:
            print(name, "was not processed correctly")
            continue
        if add_label:
            patient.label_imgs()
        loaded.append(patient)

    # Fill a pre-sized object array element by element: this avoids the
    # quadratic cost of np.append in a loop and stops NumPy from trying to
    # interpret a Patient as a nested sequence.
    patients = np.empty(len(loaded), dtype=object)
    for index, patient in enumerate(loaded):
        patients[index] = patient
    return patients

# Load every patient under the cancerous-image folder and label their images
patients = load_all_patients(cancerous_path, add_label=True)

KeyboardInterrupt: 

In [None]:
# Tally the per-image labels over all loaded patients (0 = non-cancerous, 1 = cancerous)
num_nc = 0
num_c = 0
for patient in patients:
    patient_labels = patient.labels
    num_nc += sum(1 for label in patient_labels if label == 0)
    num_c += sum(1 for label in patient_labels if label == 1)

print("number of patients loaded:", len(patients))
print("number of non-cancerous images in this dataset:", num_nc)
print("number of cancerous images in this dataset:", num_c)

Setting up train/test data

In [None]:
# Patient-level 80/20 train/test split, followed by flattening each side into
# image and label arrays.
SPLIT_SEED = 42  # fixed seed so the split and every downstream metric are reproducible


def _stack_patients(patient_group):
    """Return (images, labels) arrays gathered from every patient in the group.

    Images come from `patient.ct.data.values()` and labels from `patient.labels`.
    Labels are flattened to a 1-D float64 array.
    """
    images, label_chunks = [], []
    for patient in patient_group:
        images.extend(patient.ct.data.values())
        label_chunks.append(np.asarray(patient.labels, dtype=np.float64).ravel())
    # One concatenate at the end instead of np.append per patient (which copies every time)
    labels = np.concatenate(label_chunks) if label_chunks else np.array([])
    return np.asarray(images), labels


# mix up the data
patients = shuffle(patients, random_state=SPLIT_SEED)

# Train-test split should be 80-20.
# Since the data has been shuffled, the first 80% of the patients form the train set
# and the remainder the test set.
split_at = math.floor(len(patients) * 0.8)
# .copy() detaches the slice from the full array so `del patients` below can
# actually release the test patients.
train_patients = patients[:split_at].copy()
test_patients = patients[split_at:]

# split the test patients up into images and labels
x_test, y_test = shuffle(*_stack_patients(test_patients), random_state=SPLIT_SEED)

# split the train patients up into images and labels
x_train, y_train = shuffle(*_stack_patients(train_patients), random_state=SPLIT_SEED)

# train_patients is intentionally kept: the patient-level cross-validation cell
# later in the notebook iterates over it. Only the objects that are no longer
# needed are dropped.
del test_patients
del patients

Create a custom callback to clear any memory that is no longer being used

In [None]:
class MyCustomCallback(tf.keras.callbacks.Callback):
    """Keras callback that tries to release unused memory after every epoch."""

    def on_epoch_end(self, epoch, logs=None):
        """Run the garbage collector and clear the Keras backend session.

        `epoch` and `logs` are supplied by Keras and are not used here.
        """
        # gc.collect has to be *called*; a bare `gc.collect` only references
        # the function and collects nothing.
        gc.collect()
        tf.keras.backend.clear_session()

In [None]:
# Stratified 5-fold cross-validation of an ImageNet-initialised ResNet50 that is
# fine-tuned end to end as a binary classifier on the image-level training set.
n_splits = 5
kfold = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

fold_results = []

for fold, (train_idx, val_idx) in enumerate(kfold.split(x_train, y_train)):
    print(f"\nTraining fold {fold + 1}/{n_splits}")

    # A fresh pre-trained backbone per fold, without the ImageNet classification head
    # NOTE(review): the input is declared as 512x512 with 3 channels — confirm the loaded images match
    backbone = ResNet50(weights='imagenet', include_top=False, input_tensor=Input(shape=(512, 512, 3)))

    # Classification head: flatten -> 128-unit ReLU -> single sigmoid output
    head = tf.keras.layers.Flatten()(backbone.output)
    head = tf.keras.layers.Dense(128, activation='relu')(head)
    head = tf.keras.layers.Dense(1, activation='sigmoid')(head)

    # Backbone and head combined into the model that is actually trained
    model = tf.keras.models.Model(inputs=backbone.input, outputs=head)

    # Small learning rate because every layer is trained end to end
    model.compile(
        optimizer=Adam(learning_rate=0.00001),
        loss='binary_crossentropy',
        metrics=['accuracy']
    )

    # Stop when val_loss stalls for 5 epochs; halve the learning rate after 3 stalled epochs
    early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
    lr_scheduler = ReduceLROnPlateau(factor=0.5, patience=3)
    fold_callbacks = [early_stopping, lr_scheduler]

    # Fit on this fold's training indices, validating on its held-out indices
    history = model.fit(
        x=x_train[train_idx],
        y=y_train[train_idx],
        validation_data=(x_train[val_idx], y_train[val_idx]),
        batch_size=32,
        epochs=50,
        callbacks=fold_callbacks,
        verbose=1
    )

    # Threshold the sigmoid outputs at 0.5 and score them against the test labels
    probabilities = model.predict(x_test)
    predictions = (probabilities > 0.5).astype("int32")
    print(classification_report(y_test, predictions))
    report = classification_report(y_test, predictions, output_dict=True)

    # Keep the per-fold report, then reset Keras global state before the next fold
    fold_results.append(report)
    K.clear_session()

# Mean test accuracy over all folds
avg_accuracy = np.mean([result['accuracy'] for result in fold_results])
print(f"\nAverage Accuracy Across {n_splits} Folds: {avg_accuracy:.4f}")


In [None]:
# Patient-level K-fold cross-validation of a small CNN trained from scratch.
# Folds are formed over patients, so all images of one patient land on the same
# side of a train/validation split.
# Requires `train_patients` (the patient-level training split) to still be in
# scope when this cell runs.
n_splits = 5
# Every patient gets the same dummy label below, so the stratification has no effect here
kfold = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
fold_results = []


def _patients_to_arrays(patient_group):
    """Return shuffled (images, labels) arrays built from a group of patients.

    Images come from `patient.ct.data.values()` and labels from `patient.labels`.
    """
    images, labels = [], np.array([])
    for patient in patient_group:
        images.extend(patient.ct.data.values())
        labels = np.append(labels, patient.labels)
    return shuffle(np.asarray(images), labels)


# Model training and evaluation loop
for fold, (train_idx, val_idx) in enumerate(kfold.split(train_patients, [0]*len(train_patients))):
    print(f"\nTraining fold {fold + 1}/{n_splits}")

    # Split the patients of this fold
    p_train, p_val = train_patients[train_idx], train_patients[val_idx]
    print('point a passed')

    # Build the model: two conv/pool stages followed by a single sigmoid unit
    model = Sequential([
        Conv2D(filters=32, kernel_size=(3, 3), activation='relu', padding='same', input_shape=(512, 512, 1)),
        MaxPool2D(pool_size=(2, 2), strides=2),
        Conv2D(filters=64, kernel_size=(3, 3), activation='relu', padding='same'),
        MaxPool2D(pool_size=(2, 2), strides=2),
        Flatten(),
        Dense(units=1, activation='sigmoid')
    ])
    model.compile(
        optimizer=Adam(learning_rate=0.0001),
        loss='binary_crossentropy',
        metrics=['accuracy']
    )
    print('point b passed')

    early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

    # Fold-local names: the global x_train / y_train used by the image-level
    # cross-validation cell must not be overwritten from inside this loop.
    x_fold_train, y_fold_train = _patients_to_arrays(p_train)
    x_fold_val, y_fold_val = _patients_to_arrays(p_val)

    # Train the model; early_stopping is now actually passed to fit
    model.fit(
        x=x_fold_train,
        y=y_fold_train,
        validation_data=(x_fold_val, y_fold_val),
        batch_size=32,
        epochs=20,
        callbacks=[early_stopping, MyCustomCallback()],
        verbose=1
    )
    print("Model architecture built")

    # Evaluate the model on the held-out test set (0.5 threshold on the sigmoid output)
    predictions = (model.predict(x_test) > 0.5).astype("int32")
    report = classification_report(y_test, predictions, output_dict=True)
    print(classification_report(y_test, predictions))

    # Save fold results and reset Keras global state before the next fold
    fold_results.append(report)
    K.clear_session()

# Aggregate results
avg_accuracy = np.mean([fold_report['accuracy'] for fold_report in fold_results])
print(f"\nAverage Accuracy Across {n_splits} Folds: {avg_accuracy:.4f}")

In [None]:
# Print every test sample whose predicted class differs from its true label,
# as "<true label> <predicted label>" pairs.
print(len(y_test))
for index, actual in enumerate(y_test):
    # each prediction row holds a single value; take it out of its row
    predicted = predictions[index][0]
    if actual != predicted:
        print(actual, predicted)

Train and test CNN model

In [None]:

# num_tests = 1
# cnns = []
# for i in range(num_tests):
# cnns.append(cnn.CNN(x_train, x_test, y_train, y_test))

Cross validation and bootstrapping

In [None]:
# print(cnns[0].test_acc)