Import required modules

In [1]:
import os
import cv2
import matplotlib.pyplot as plt
import numpy as np
import nibabel as nib
from sklearn.model_selection import train_test_split
import pandas as pd
import json
import patient_data

Access the folder path for the cancer and the non-cancer images

In [9]:
all_paths = json.loads(open("./paths.json").read())

personal_path = all_paths['personal_path']
non_cancerous_path = personal_path + all_paths['non_cancerous_path']
cancerous_path = personal_path + all_paths['cancerous_path']

Load in all the DICOM files

In [10]:
# Using the patient_data data structure, load in all the patient data and save it in a dictionary with the folder name as the key
def load_all_patients(path):
    patients = {}
    folder = os.listdir(path)
    for name in folder:
        patients[name] = patient_data.Patient(os.path.join(path, name))
    return patients

nc_patients = load_all_patients(non_cancerous_path)
c_patients = load_all_patients(cancerous_path)

Save images from dicoms as JPEGS for visualization

In [12]:
def convert_to_jpg(patients, output_folder):
    for patient in patients:
        path = os.path.join(output_folder, patient)
        patients[patient].save_data_as_images(path)

cip = personal_path + "/all_images/cancerous/"
ncip = personal_path + "/all_images/non_cancerous/"

convert_to_jpg(c_patients, cip)
convert_to_jpg(nc_patients, ncip)

Overlay annotations on CT images and produce JPEGS for visualization

In [None]:
def save_overlays(patients, output_folder):
    for patient in patients.keys():
        path = os.path.join(output_folder, patient)
        patients[patient].overlay_seg_on_ct(path)
    return 

cip = personal_path + "/all_images/cancerous/"
ncip = personal_path + "/all_images/non_cancerous/"

save_overlays(c_patients, cip)
save_overlays(nc_patients, ncip)

[('61.7.225671413567587988709540514214837207016', 0), ('61.7.172515939084005252437456394015127410395', 1), ('61.7.311273956813205555334922523590492390726', 2), ('61.7.49239243940168297994111164026998337905', 3), ('61.7.46898277701274339226472024127606534162', 4), ('61.7.269707611950866927630519493863211594541', 5), ('61.7.62418700203484400719700631085943471490', 6), ('61.7.17033528246777316687879153860121593984', 7), ('61.7.172672028699375990388348736984503214630', 8), ('61.7.197257886739675122852389097900610672269', 9), ('61.7.39922166297242178900738436904830470529', 10), ('61.7.216694197039041528108241957020867795488', 11), ('61.7.282374800196595087812314209346209916245', 12), ('61.7.144413207071655021970347709406599477020', 13), ('61.7.219239145993323193706255940924546086765', 14), ('61.7.225671413567587988709540514214837207016', 15), ('61.7.172515939084005252437456394015127410395', 16), ('61.7.311273956813205555334922523590492390726', 17), ('61.7.49239243940168297994111164026998337

Preprocess Images for CNN

In [9]:
#Resize images
def preprocess_image(img, target_size=(224, 224)):
    img_resized = cv2.resize(img, target_size)
    return img_resized

non_cancerous_images_resized = [preprocess_image(img) for img in non_cancerous_images]
cancerous_images_resized = [preprocess_image(img) for img in cancerous_images]


#Normalization
non_cancerous_images_normalized = [img / 255.0 for img in non_cancerous_images_resized]
cancerous_images_normalized = [img / 255.0 for img in cancerous_images_resized]


NameError: name 'non_cancerous_images' is not defined

CNN

In [14]:
import os
import numpy as np

# Define paths
cancerous_path = "data/cancerous"
non_cancerous_path = "data/non_cancerous"

# Collect file paths
cancerous_images = [os.path.join(cancerous_path, img) for img in os.listdir(cancerous_path)]
non_cancerous_images = [os.path.join(non_cancerous_path, img) for img in os.listdir(non_cancerous_path)]

# Create labels
cancerous_labels = [1] * len(cancerous_images)
non_cancerous_labels = [0] * len(non_cancerous_images)

# Combine and shuffle
file_paths = cancerous_images + non_cancerous_images
labels = cancerous_labels + non_cancerous_labels

# Shuffle the data
combined = list(zip(file_paths, labels))
np.random.shuffle(combined)
file_paths, labels = zip(*combined)

NameError: name 'non_cancerous_images_normalized' is not defined

In [None]:
from tensorflow.keras.preprocessing.image import load_img, img_to_array

# Define image dimensions
image_height, image_width = 512,512

# Load and preprocess images
X = np.array([img_to_array(load_img(img, target_size=(image_height, image_width))) for img in file_paths])
y = np.array(labels)

# Normalize pixel values (RGB)
X = X / 255.0

In [None]:


X = non_cancerous_images_normalized + cancerous_images_normalized
y = [0] * len(non_cancerous_images_normalized) + [1] * len(cancerous_images_normalized)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Convert lists to arrays
X_train = np.array(X_train).reshape(-1, 224, 224, 1)  # Add channel dimension if grayscale
X_test = np.array(X_test).reshape(-1, 224, 224, 1)
y_train = np.array(y_train)
y_test = np.array(y_test)

In [None]:
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

X_train shape: (636, 224, 224, 1)
X_test shape: (159, 224, 224, 1)
y_train shape: (636,)
y_test shape: (159,)


In [None]:
X_train

array([[[[-4.01568627],
         [-4.01568627],
         [-4.01568627],
         ...,
         [-4.01568627],
         [-4.01568627],
         [-4.01568627]],

        [[-4.01568627],
         [-4.01568627],
         [-4.01568627],
         ...,
         [-4.01568627],
         [-4.01568627],
         [-4.01568627]],

        [[-4.01568627],
         [-4.01568627],
         [-4.01568627],
         ...,
         [-4.01568627],
         [-4.01568627],
         [-4.01568627]],

        ...,

        [[-4.01568627],
         [-4.01568627],
         [-4.01568627],
         ...,
         [-4.01568627],
         [-4.01568627],
         [-4.01568627]],

        [[-4.01568627],
         [-4.01568627],
         [-4.01568627],
         ...,
         [-4.01568627],
         [-4.01568627],
         [-4.01568627]],

        [[-4.01568627],
         [-4.01568627],
         [-4.01568627],
         ...,
         [-4.01568627],
         [-4.01568627],
         [-4.01568627]]],


       [[[-4.01568627],


In [None]:
X_test

array([[[[-7.84313725],
         [-7.84313725],
         [-7.84313725],
         ...,
         [-7.84313725],
         [-7.84313725],
         [-7.84313725]],

        [[-7.84313725],
         [-7.84313725],
         [-7.84313725],
         ...,
         [-7.84313725],
         [-7.84313725],
         [-7.84313725]],

        [[-7.84313725],
         [-7.84313725],
         [-7.84313725],
         ...,
         [-7.84313725],
         [-7.84313725],
         [-7.84313725]],

        ...,

        [[-7.84313725],
         [-7.84313725],
         [-7.84313725],
         ...,
         [-7.84313725],
         [-7.84313725],
         [-7.84313725]],

        [[-7.84313725],
         [-7.84313725],
         [-7.84313725],
         ...,
         [-7.84313725],
         [-7.84313725],
         [-7.84313725]],

        [[-7.84313725],
         [-7.84313725],
         [-7.84313725],
         ...,
         [-7.84313725],
         [-7.84313725],
         [-7.84313725]]],


       [[[-7.84313725],


In [None]:
y_train

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,

In [None]:
y_test

array([1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0])

In [None]:
train_test_split(y, shuffle=False)

[[0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,


Check Class Distribution in Train and Test Sets

In [None]:

# Assuming y_train and y_test are your labels for the train and test sets
train_class_distribution = pd.Series(y_train).value_counts(normalize=True)
test_class_distribution = pd.Series(y_test).value_counts(normalize=True)

print("Class distribution in training set:")
print(train_class_distribution)
print("\nClass distribution in testing set:")
print(test_class_distribution)

Class distribution in training set:
0    0.836478
1    0.163522
Name: proportion, dtype: float64

Class distribution in testing set:
0    0.811321
1    0.188679
Name: proportion, dtype: float64


Cross validation and bootstrapping