Import required modules

In [1]:
import pydicom
import os
import cv2
import matplotlib.pyplot as plt
import numpy as np
import nibabel as nib
from sklearn.model_selection import train_test_split
import pandas as pd


Access the folder path for the cancer and the non-cancer images

In [2]:
personal_path = "/Users/theon/Documents/Classes/Computing/CISC 471/Data"
non_cancerous_path = personal_path + "/Non-Cancerous-Images/abdominallymphnodes-26828"
non_cancerous_annotations_path = personal_path + "/Non-Cancerous-Images/Annotations-26828"
cancerous_path = personal_path + "/Cancerous-Images/NA-78735"
cancerous_annotations_path = personal_path + "/Cancerous-Images/Annotations-78735"


Load in all the DICOM files

In [3]:
def load_dicoms(folder_path):
    dicom_list = []
    for file_name in os.listdir(folder_path):
        if file_name.endswith(".dcm"):
            path = os.path.join(folder_path, file_name)
            dicom_list.append(pydicom.dcmread(path))
    if dicom_list == []:
        raise ValueError("No DICOM files found in " + folder_path)
    return dicom_list

non_cancerous_ct_dicom = load_dicoms(non_cancerous_path)
non_cancerous_seg_dicom = load_dicoms(non_cancerous_annotations_path)
cancerous_ct_dicom = load_dicoms(cancerous_path)
cancerous_seg_dicom = load_dicoms(cancerous_annotations_path)

DICOM to JPG

In [4]:

def load_dicom_images(dicoms):
    images = []
    for dicom in dicoms:
        images.append(dicom.pixel_array)
    return images

non_cancerous_ct_images = load_dicom_images(non_cancerous_ct_dicom)
non_cancerous_seg_images = load_dicom_images(non_cancerous_seg_dicom)
cancerous_ct_images = load_dicom_images(cancerous_ct_dicom)
cancerous_seg_images = load_dicom_images(cancerous_seg_dicom)

In [5]:

print("Non-cancerous images loaded:", len(non_cancerous_ct_images))
print("Annotated non-cancerous images loaded:", len(non_cancerous_seg_images))
print("Cancerous images loaded:", len(cancerous_ct_images))
print("Annotated cancerous images loaded:", len(cancerous_seg_images))

Non-cancerous images loaded: 661
Annotated non-cancerous images loaded: 1
Cancerous images loaded: 134
Annotated cancerous images loaded: 1


In [6]:
def convert_to_jpg(images, output_folder):
    for i, img in enumerate(images):
        img_normalized = cv2.normalize(img, None, 0, 255, cv2.NORM_MINMAX)
        img_uint8 = img_normalized.astype("uint8")
        cv2.imwrite(f"{output_folder}/image_{i}.jpg", img_uint8)


def convert_annotations_to_jpg(images, output_folder):
    for img in images:
        # grayscale the image
        img_normalized = cv2.normalize(img, None, 0, 255, cv2.NORM_MINMAX)
        img_uint8 = img_normalized.astype("uint8")
        # since this is a 3d array, make a separate image for each layer in the 1st plane
        for i, j in enumerate(img_uint8):
            cv2.imwrite(f"{output_folder}/image_{i}.jpg", j)

convert_to_jpg(cancerous_ct_images, cancerous_path + "/images")
convert_to_jpg(non_cancerous_ct_images, non_cancerous_path + "/images")
convert_annotations_to_jpg(cancerous_seg_images, cancerous_annotations_path + "/images")
convert_annotations_to_jpg(non_cancerous_seg_images, non_cancerous_annotations_path + "/images")

Match CT and SEG Image Positions

In [7]:
def extract_ct_positions(ct_dicoms):
    ct_positions = {}
    for dicom in ct_dicoms:
        sop_instance_uid = dicom.SOPInstanceUID if hasattr(dicom, 'SOPInstanceUID') else None
        image_position = dicom.ImagePositionPatient if hasattr(dicom, 'ImagePositionPatient') else None
        if sop_instance_uid and image_position:
            ct_positions[sop_instance_uid] = image_position
    # return a dictionary of ct image uid : ct image position
    return ct_positions

def extract_segmentation_positions(seg_dicoms):
    # This function assumes there is only 1 segmentation dicom/only the 1st segmentation dicom matters
    segmentation_positions = []
    # Locate the first DICOM file in the segmentation directory
    dicom = seg_dicoms[0]  # Load the first segmentation file
    for i, frame in enumerate(dicom.PerFrameFunctionalGroupsSequence):
        sop_instance_uid = frame.DerivationImageSequence[0].SourceImageSequence[0].ReferencedSOPInstanceUID
        segmentation_positions.append((sop_instance_uid, i))
    # return a list of (referenced ct image uid, seg image position)
    return segmentation_positions

def match_positions(ct_positions, segmentation_positions):
    matched_positions = {}
    for (seg_uid, seg_frame) in segmentation_positions:
        if seg_uid in ct_positions:
            if not seg_uid in matched_positions:
                matched_positions[seg_uid] = {
                    "CT_Position": ct_positions[seg_uid],
                    "Segmentation_frames": [seg_frame]
                }
            else:
                matched_positions[seg_uid]["Segmentation_frames"].append(seg_frame)
    print(matched_positions)
    return matched_positions

# Extract positions
try:
    non_cancerous_positions = extract_ct_positions(non_cancerous_ct_dicom)
    non_cancerous_annotation_positions = extract_segmentation_positions(non_cancerous_seg_dicom)
    cancerous_positions = extract_ct_positions(cancerous_ct_dicom)
    cancerous_annotation_positions = extract_segmentation_positions(cancerous_seg_dicom)
except ValueError as e:
    print(e)

print(non_cancerous_annotation_positions)
# Match positions for non-cancerous and cancerous data
non_cancer_matched_positions = match_positions(non_cancerous_positions, non_cancerous_annotation_positions)
cancer_matched_positions = match_positions(cancerous_positions, cancerous_annotation_positions)

    
# Commented out print statements to reduce output

# # Check if data was extracted
# print("Non-cancerous CT Positions:", non_cancerous_positions)
# print("Non-cancerous Annotation Positions:", non_cancerous_annotation_positions)
# print("\nCancerous CT Positions:", cancerous_positions)
# print("Cancerous Annotation Positions:", cancerous_annotation_positions)

# # Check if there are matching SOP Instance UIDs for non-cancerous data
# print("\nNon-cancerous CT SOPInstanceUIDs:", list(non_cancerous_positions.keys()))
# print("Non-cancerous Segmentation SOPInstanceUIDs:", [uid for uid, _ in non_cancerous_annotation_positions])


# # Check if there are matching SOP Instance UIDs for cancerous data
# print("\nCancerous CT SOPInstanceUIDs:", list(cancerous_positions.keys()))
# print("Cancerous Segmentation SOPInstanceUIDs:", [uid for uid, _ in cancerous_annotation_positions])

# # Check matching data
# print(non_cancer_matched_positions)
# print(cancer_matched_positions)


[('61.7.225671413567587988709540514214837207016', 0), ('61.7.172515939084005252437456394015127410395', 1), ('61.7.311273956813205555334922523590492390726', 2), ('61.7.49239243940168297994111164026998337905', 3), ('61.7.46898277701274339226472024127606534162', 4), ('61.7.269707611950866927630519493863211594541', 5), ('61.7.62418700203484400719700631085943471490', 6), ('61.7.17033528246777316687879153860121593984', 7), ('61.7.172672028699375990388348736984503214630', 8), ('61.7.197257886739675122852389097900610672269', 9), ('61.7.39922166297242178900738436904830470529', 10), ('61.7.216694197039041528108241957020867795488', 11), ('61.7.282374800196595087812314209346209916245', 12), ('61.7.144413207071655021970347709406599477020', 13), ('61.7.219239145993323193706255940924546086765', 14), ('61.7.225671413567587988709540514214837207016', 15), ('61.7.172515939084005252437456394015127410395', 16), ('61.7.311273956813205555334922523590492390726', 17), ('61.7.49239243940168297994111164026998337

Overlay annotations on CT images and produce a JPEG for visualization

In [None]:
def annotate(image, annotation, color):
    img = image
    for x, row in enumerate(annotation):
        for y, col in enumerate(row):
            if col > 0:
                pixel_color = [color[0],color[1], color[2], col]
                img[y,x] = pixel_color
    return img

def overlay(ct_dicoms, ct_images, seg_images, matches, output_folder):
    # list of all images
    images = []
    match_found = 0
    # this function assumes there is only 1 segmentation dicom file/only the first segmentation dicom file matters
    img_normalized = cv2.normalize(seg_images[0], None, 0, 255, cv2.NORM_MINMAX)
    img_uint8 = img_normalized.astype("uint8")
    # itterate through the dicoms
    for i, ct_dicom in enumerate(ct_dicoms):
        # check if this layer should have an annotation
        ct_img = cv2.normalize(ct_images[i], None, 0, 255, cv2.NORM_MINMAX)
        images.append(ct_img.astype("uint8"))
        if ct_dicom.SOPInstanceUID in matches:
            # this fumction assumes the ct and annotations are in the correct order
            img = images[i]
            # for cts that have multiple annotations for one slide, provide a different color for each slide 
            color_options = [[255,0,0],[0,255,0],[0,0,255]]
            if len(img.shape) != 4:
                # the initial loaded images should all be grayscale and so will have 2 channels
                if len(img.shape) == 2:
                    img = cv2.merge([img, img, img, img], None)
                # if the image is not in grayscale or rgba, it should have 3 channels. if it has more or less it is an abnormal format we wont account for
                else:
                    print(len(img.shape))
                    r_channel, g_channel, b_channel = cv2.split(img)
                    if not r_channel or not g_channel or not b_channel:
                        raise ValueError("loaded ct has an abnormal number of channels")
                    img = cv2.merge([r_channel, g_channel, b_channel, 255])
            for j, frame in enumerate(matches[ct_dicom.SOPInstanceUID]["Segmentation_frames"]):
                color = color_options[j%len(color_options)]
                images[i] = annotate(img, img_uint8[frame], color)
                cv2.imwrite(f"{output_folder}/image_{i}.jpg", images[i])
            match_found +=1
    print(match_found)
    if match_found != len(matches):
        raise ValueError("there were " + len(matches) + "matches provided but only " + match_found + "matches found")
    return images

nc_output_folder = personal_path + "/annotated_non-cancerous_images"
c_output_folder = personal_path + "/annotated_cancerous_images"
non_cancerous_annotated_images = overlay(non_cancerous_ct_dicom, non_cancerous_ct_images, non_cancerous_seg_images, non_cancer_matched_positions, nc_output_folder)
cancerous_annotated_images = overlay(cancerous_ct_dicom, cancerous_ct_images, cancerous_seg_images, cancer_matched_positions, c_output_folder)

43
23


Preprocess Images for CNN

In [9]:
#Resize images
def preprocess_image(img, target_size=(224, 224)):
    img_resized = cv2.resize(img, target_size)
    return img_resized

non_cancerous_images_resized = [preprocess_image(img) for img in non_cancerous_images]
cancerous_images_resized = [preprocess_image(img) for img in cancerous_images]


#Normalization
non_cancerous_images_normalized = [img / 255.0 for img in non_cancerous_images_resized]
cancerous_images_normalized = [img / 255.0 for img in cancerous_images_resized]


NameError: name 'non_cancerous_images' is not defined

CNN

In [None]:
import os
import numpy as np

# Define paths
cancerous_path = "data/cancerous"
non_cancerous_path = "data/non_cancerous"

# Collect file paths
cancerous_images = [os.path.join(cancerous_path, img) for img in os.listdir(cancerous_path)]
non_cancerous_images = [os.path.join(non_cancerous_path, img) for img in os.listdir(non_cancerous_path)]

# Create labels
cancerous_labels = [1] * len(cancerous_images)
non_cancerous_labels = [0] * len(non_cancerous_images)

# Combine and shuffle
file_paths = cancerous_images + non_cancerous_images
labels = cancerous_labels + non_cancerous_labels

# Shuffle the data
combined = list(zip(file_paths, labels))
np.random.shuffle(combined)
file_paths, labels = zip(*combined)

In [None]:
from tensorflow.keras.preprocessing.image import load_img, img_to_array

# Define image dimensions
image_height, image_width = 512,512

# Load and preprocess images
X = np.array([img_to_array(load_img(img, target_size=(image_height, image_width))) for img in file_paths])
y = np.array(labels)

# Normalize pixel values (RGB)
X = X / 255.0

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.33, random_state=42)


In [None]:


X = non_cancerous_images_normalized + cancerous_images_normalized
y = [0] * len(non_cancerous_images_normalized) + [1] * len(cancerous_images_normalized)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Convert lists to arrays
X_train = np.array(X_train).reshape(-1, 224, 224, 1)  # Add channel dimension if grayscale
X_test = np.array(X_test).reshape(-1, 224, 224, 1)
y_train = np.array(y_train)
y_test = np.array(y_test)

In [None]:
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

X_train shape: (636, 224, 224, 1)
X_test shape: (159, 224, 224, 1)
y_train shape: (636,)
y_test shape: (159,)


In [None]:
X_train

array([[[[-4.01568627],
         [-4.01568627],
         [-4.01568627],
         ...,
         [-4.01568627],
         [-4.01568627],
         [-4.01568627]],

        [[-4.01568627],
         [-4.01568627],
         [-4.01568627],
         ...,
         [-4.01568627],
         [-4.01568627],
         [-4.01568627]],

        [[-4.01568627],
         [-4.01568627],
         [-4.01568627],
         ...,
         [-4.01568627],
         [-4.01568627],
         [-4.01568627]],

        ...,

        [[-4.01568627],
         [-4.01568627],
         [-4.01568627],
         ...,
         [-4.01568627],
         [-4.01568627],
         [-4.01568627]],

        [[-4.01568627],
         [-4.01568627],
         [-4.01568627],
         ...,
         [-4.01568627],
         [-4.01568627],
         [-4.01568627]],

        [[-4.01568627],
         [-4.01568627],
         [-4.01568627],
         ...,
         [-4.01568627],
         [-4.01568627],
         [-4.01568627]]],


       [[[-4.01568627],


In [None]:
X_test

array([[[[-7.84313725],
         [-7.84313725],
         [-7.84313725],
         ...,
         [-7.84313725],
         [-7.84313725],
         [-7.84313725]],

        [[-7.84313725],
         [-7.84313725],
         [-7.84313725],
         ...,
         [-7.84313725],
         [-7.84313725],
         [-7.84313725]],

        [[-7.84313725],
         [-7.84313725],
         [-7.84313725],
         ...,
         [-7.84313725],
         [-7.84313725],
         [-7.84313725]],

        ...,

        [[-7.84313725],
         [-7.84313725],
         [-7.84313725],
         ...,
         [-7.84313725],
         [-7.84313725],
         [-7.84313725]],

        [[-7.84313725],
         [-7.84313725],
         [-7.84313725],
         ...,
         [-7.84313725],
         [-7.84313725],
         [-7.84313725]],

        [[-7.84313725],
         [-7.84313725],
         [-7.84313725],
         ...,
         [-7.84313725],
         [-7.84313725],
         [-7.84313725]]],


       [[[-7.84313725],


In [None]:
y_train

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,

In [None]:
y_test

array([1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0])

In [None]:
train_test_split(y, shuffle=False)

[[0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,


Check Class Distribution in Train and Test Sets

In [50]:
for j, i in enumerate(test_data):
    # Assuming y_train and y_test are your labels for the train and test sets
    train_class_distribution = pd.Series(i[2]).value_counts(normalize=True)
    test_class_distribution = pd.Series(i[3]).value_counts(normalize=True)

    print(f"Class distribution in training set {j+1}:")
    print(train_class_distribution)
    print(f"\nClass distribution in testing set {j+1}:")
    print(test_class_distribution)

Class distribution in training set 1:
0    0.976096
1    0.023904
Name: proportion, dtype: float64

Class distribution in testing set 1:
0    0.975057
1    0.024943
Name: proportion, dtype: float64
Class distribution in training set 2:
0    0.976096
1    0.023904
Name: proportion, dtype: float64

Class distribution in testing set 2:
0    0.975057
1    0.024943
Name: proportion, dtype: float64
Class distribution in training set 3:
0    0.976096
1    0.023904
Name: proportion, dtype: float64

Class distribution in testing set 3:
0    0.975057
1    0.024943
Name: proportion, dtype: float64


Cross validation and bootstrapping