In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import tensorflow as tf
from utils.preprocessing import *
import sys

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

import matplotlib.pyplot as plt
from keras.backend import *
from tensorflow.python.ops.numpy_ops import np_config
np_config.enable_numpy_behavior() # enables np methods on tensors

In [2]:
#testing_dir = '../data/C-NMC_Leukemia/testing_data/C-NMC_test_final_phase_data' ### WE DO NOT USE THIS
training_dir_base = f'../data/C-NMC_Leukemia/training_data/' 
fold_0_all_path = 'fold_0/all'
fold_0_hem_path = 'fold_0/hem'
fold_1_all_path = 'fold_1/all'
fold_1_hem_path = 'fold_1/hem'

In [9]:
### The code for this lives in notebooks.utils.preprocessing
### Get grayscale images as tensors, and labels for both classes
img_train_cancer = get_gray_images(training_dir_base+fold_0_all_path)
img_train_healthy = get_gray_images(training_dir_base+fold_0_hem_path)
train_health_labels = [0 for _ in range(len(img_train_healthy))]
train_cancer_labels = [1 for _ in range(len(img_train_cancer))]

## Random Forests

In [6]:
def randomForestAccuracy(img_cancer, img_healthy, labels_healthy, labels_cancer, nest=100, mssplit=2, mdepth=None, rstate=100):
    ### For random forests, X is our combined image set and y is our combined label set.
    X =  np.array(img_cancer + img_healthy)
    y = np.array(labels_cancer + labels_healthy)

    ### train-test-split our data, and initialize the forest.
    X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.3, random_state=100)

    forest = RandomForestClassifier(n_estimators=nest, random_state=rstate, min_samples_split=mssplit, max_depth=100)

    ### The random forests algorithm expects 2 dimensional data at the most.
    ### We can accomplish this on our 4-dimensional data by multiplying the last 3 dimensions together to flatten the data
    X_shape = X_train.shape
    X_tshape = X_test.shape
    X_train = X_train.reshape(X_shape[0], X_shape[1] * X_shape[2] * X_shape[3])
    X_test = X_test.reshape(X_tshape[0], X_tshape[1] * X_tshape[2] * X_tshape[3])

    ### Fit the random forest, try it on the test data, and assess the model's accuracy
    forest.fit(X_train,y_train)
    predictions = forest.predict(X_test)
    return metrics.accuracy_score(y_test, predictions)

In [7]:
accuracy_original = randomForestAccuracy(img_train_cancer, img_train_healthy, train_health_labels, train_cancer_labels)

In [8]:
### Random forests, color
### We will repeat the steps above, with color images instead.
img_train_cancer = get_gray_images(training_dir_base+fold_0_all_path, 128)
img_train_healthy = get_gray_images(training_dir_base+fold_0_hem_path, 128)

In [9]:
accuracy_128 = randomForestAccuracy(img_train_cancer, img_train_healthy, train_health_labels, train_cancer_labels)

In [10]:
### I find that there is a negligible (0.4%) difference between full size and 128x128 downscaled images.
### Because downscaled images process faster, I will use those
print(accuracy_original)
print(accuracy_128)

0.8593012275731823
0.8630783758262511


In [11]:
### Assemble combinations of hyperparameters for tuning.
from itertools import product
_n_estimators = [10, 50, 100]
_min_samples_split = [2, 5, 10]
_max_depth = [10, 100, None]
_random_state = [50, 100, 256]
list_of_lists = [_n_estimators, _min_samples_split, _max_depth, _random_state]
all_combinations = list(product(*list_of_lists))

In [12]:
### Find the highest accuracy combination of hyperparameters.
max_accuracy = -1
max_accuracy_params = []
for combination in all_combinations:
    accuracy = randomForestAccuracy(img_train_cancer, img_train_healthy, train_health_labels, train_cancer_labels, 
                                    nest=combination[0], mssplit=combination[1], mdepth=combination[2], rstate=combination[3])
    
    if accuracy > max_accuracy:
        max_accuracy = accuracy
        max_accuracy_params = combination

In [13]:
print(max_accuracy)
print(max_accuracy_params)

0.8649669499527857
(100, 10, 10, 100)


# Perceptron

Perceptron: Grayscale Images

In [10]:
grayCancer = get_gray_images(training_dir_base+fold_0_all_path)
grayHealthy = get_gray_images(training_dir_base+fold_0_hem_path)

trainCancerLabels = [1 for _ in range(len(grayCancer))]
trainHealthLabels = [0 for _ in range(len(grayHealthy))]

images, labels = grayCancer + grayHealthy, trainCancerLabels + trainHealthLabels # concatenate cancer and healthy images, as well as their labels into combined image and label vectors
images, labels = np.array(images), np.array(labels)

# convert to np arrays for training
grayImages = np.array(images).reshape(len(images), 450, 450)
grayLabels = np.array(labels)

In [11]:
# shuffle labels and images in unison so they maintain their image-label correspondance
randomize = np.arange(len(grayImages))
np.random.shuffle(randomize)
grayImages = grayImages[randomize]
grayLabels = grayLabels[randomize]

In [5]:
with tf.device('/CPU:0'): # tf uses GPU by default, use this if your config is out of whack like mine is -C.J.
                            # otherwise use - with tf.device('/GPU:0) or remove code from under with statement
    model = tf.keras.Sequential([
        tf.keras.layers.Flatten(input_shape = [450, 450]),
        tf.keras.layers.Dense(1, activation='sigmoid'), # Dense operation is output = activation(dot(input, kernel) + bias)
    ])

    model.compile(optimizer='adam', 
                loss=tf.keras.losses.BinaryCrossentropy(),                
                metrics=['accuracy'])

    #print(images.shape, labels.shape)

    model.fit(grayImages, grayLabels, epochs = 20, batch_size = 10)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [12]:
### Create testing data
grayCancerTest = get_gray_images(training_dir_base+fold_1_all_path)
grayHealthyTest = get_gray_images(training_dir_base+fold_1_hem_path)

testCancerLabels = [1 for _ in range(len(grayCancerTest))]
testHealthLabels = [0 for _ in range(len(grayHealthyTest))]

images, labels = grayCancerTest + grayHealthyTest, testCancerLabels + testHealthLabels # concatenate cancer and healthy images, as well as their labels into combined image and label vectors
images, labels = np.array(images), np.array(labels)

# convert to np arrays for training
grayImagesTest = np.array(images).reshape(len(images), 450, 450)
grayLabelsTest = np.array(labels)

In [7]:
### Test model on test data
model.evaluate(grayImagesTest, grayLabelsTest)



[173.95147705078125, 0.7052947282791138]

Perceptron: HSV Images

In [5]:
hsvCancer = get_hsv_images(training_dir_base+fold_0_all_path)
hsvHealthy = get_hsv_images(training_dir_base+fold_0_hem_path)

trainCancerLabels = [1 for _ in range(len(hsvCancer))]
trainHealthLabels = [0 for _ in range(len(hsvHealthy))]

images, labels = hsvCancer + hsvHealthy, trainCancerLabels + trainHealthLabels # concatenate cancer and healthy images, as well as their labels into combined image and label vectors
images, labels = np.array(images), np.array(labels)

# convert to np arrays for training
hsvImages = np.array(images).reshape(len(images), 450, 450, 3)
hsvLabels = np.array(labels)
#plt.imshow(hsvImages[0])

In [6]:
# shuffle labels and images in unison so they maintain their image-label correspondance
randomize = np.arange(len(hsvImages))
np.random.shuffle(randomize)
hsvImages = hsvImages[randomize]
hsvLabels = hsvLabels[randomize]

In [7]:
with tf.device('/CPU:0'): 
    model = tf.keras.Sequential([
        tf.keras.layers.Flatten(input_shape = [450, 450, 3]),
        tf.keras.layers.Dense(1, activation='sigmoid'), # Dense operation is output = activation(dot(input, kernel) + bias)
    ])

    model.compile(optimizer='adam', 
                loss=tf.keras.losses.BinaryCrossentropy(),                
                metrics=['accuracy'])

    model.fit(hsvImages, hsvLabels, epochs = 10, batch_size = 10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [3]:
### Create testing data
hsvCancerTest = get_hsv_images(training_dir_base+fold_1_all_path)
hsvHealthyTest = get_hsv_images(training_dir_base+fold_1_hem_path)

testCancerLabels = [1 for _ in range(len(hsvCancerTest))]
testHealthLabels = [0 for _ in range(len(hsvHealthyTest))]

images, labels = hsvCancerTest + hsvHealthyTest, testCancerLabels + testHealthLabels # concatenate cancer and healthy images, as well as their labels into combined image and label vectors
images, labels = np.array(images), np.array(labels)

# convert to np arrays for training
hsvImagesTest = np.array(images).reshape(len(images), 450, 450, 3)
hsvLabelsTest = np.array(labels)

In [8]:
### Test model on test data
model.evaluate(hsvImagesTest, hsvLabelsTest)



[187.02383422851562, 0.5874125957489014]

Perceptron: Saturated Images

In [3]:
satCancer = get_saturated_images(training_dir_base+fold_0_all_path)
satHealthy = get_saturated_images(training_dir_base+fold_0_hem_path)

trainCancerLabels = [1 for _ in range(len(satCancer))]
trainHealthLabels = [0 for _ in range(len(satHealthy))]

images, labels = satCancer + satHealthy, trainCancerLabels + trainHealthLabels # concatenate cancer and healthy images, as well as their labels into combined image and label vectors
images, labels = np.array(images), np.array(labels)

# convert to np arrays for training
satImages = np.array(images).reshape(len(images), 450, 450, 3)
satLabels = np.array(labels)

In [4]:
# shuffle labels and images in unison so they maintain their image-label correspondance
randomize = np.arange(len(satImages))
np.random.shuffle(randomize)
satImages = satImages[randomize]
satLabels = satLabels[randomize]

In [5]:
with tf.device('/CPU:0'): 
    model = tf.keras.Sequential([
        tf.keras.layers.Flatten(input_shape = [450, 450, 3]),
        tf.keras.layers.Dense(1, activation='sigmoid'), # Dense operation is output = activation(dot(input, kernel) + bias)
    ])

    model.compile(optimizer='adam', 
                loss=tf.keras.losses.BinaryCrossentropy(),                
                metrics=['accuracy'])

    model.fit(satImages, satLabels, epochs = 10, batch_size = 10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


Perceptron: Saturated Images

In [6]:
### Create testing data
satCancerTest = get_saturated_images(training_dir_base+fold_1_all_path)
satHealthyTest = get_saturated_images(training_dir_base+fold_1_hem_path)

testCancerLabels = [1 for _ in range(len(satCancerTest))]
testHealthLabels = [0 for _ in range(len(satHealthyTest))]

images, labels = satCancerTest + satHealthyTest, testCancerLabels + testHealthLabels # concatenate cancer and healthy images, as well as their labels into combined image and label vectors
images, labels = np.array(images), np.array(labels)

# convert to np arrays for training
satImagesTest = np.array(images).reshape(len(images), 450, 450, 3)
satLabelsTest = np.array(labels)
### usually works, currently expirencing resource exhausted error- plz try on your machine if you pull this -C.J.

ResourceExhaustedError: {{function_node __wrapped__AdjustSaturation_device_/job:localhost/replica:0/task:0/device:GPU:0}} OOM when allocating tensor with shape[450,450,3] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc [Op:AdjustSaturation]

In [11]:
### Test model on test data
model.evaluate(hsvImagesTest, hsvLabelsTest)

InternalError: Failed copying input tensor from /job:localhost/replica:0/task:0/device:CPU:0 to /job:localhost/replica:0/task:0/device:GPU:0 in order to run _EagerConst: Dst tensor is not initialized.