# Data Preprocessing
### Brendon & Alec Barrios     |     08/08/2020
Using OSIC Pulmonary Fibrosis Progression dataset from Kaggle.com

<https://www.kaggle.com/c/osic-pulmonary-fibrosis-progression/overview>

Adapted from: <https://www.kaggle.com/sentdex/first-pass-through-data-w-3d-convnet>

In [1]:
import os
import pydicom
import pandas as pd

DATA_DIR = "train_dl:"
patients = os.listdir(DATA_DIR)

labels_df = pd.read_csv("train.csv", index_col=0)

labels_df.head(20)

Unnamed: 0_level_0,Weeks,FVC,Percent,Age,Sex,SmokingStatus
Patient,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ID00419637202311204720264,6,3020,70.186855,73,Male,Ex-smoker
ID00421637202311550012437,15,2739,82.045291,68,Male,Ex-smoker
ID00422637202311677017371,6,1930,76.672493,73,Male,Ex-smoker
ID00423637202312137826377,17,3294,79.258903,72,Male,Ex-smoker
ID00426637202313170790466,0,2925,71.824968,73,Male,Never smoked


In [2]:
import matplotlib.pyplot as plt
import cv2
import numpy as np
import math

IMAGE_PX_SIZE = 80  # recommended minimum: 70
NUM_SLICES = 30     # recommended minimum: 20
SAVE_FILE = "traindata-{}x{}x{}.npy".format(IMAGE_PX_SIZE, IMAGE_PX_SIZE, NUM_SLICES)

def chunks(l, n):
    # Credit: Ned Batchelder
    # Link: http://stackoverflow.com/questions/312443/how-do-you-split-a-list-into-evenly-sized-chunks
    """Yield successive n-sized chunks from l."""
    for i in range(0, len(l), n):
        yield l[i:i + n]

def mean(l):
    return sum(l)/len(l)

def normalize(a):
    norm = np.linalg.norm(a)
    if norm == 0:
        return a
    return a / norm

def process_data(patient, labels_df, img_px_size=70, num_slices=20, visualize=False):
    
    path = os.path.join(DATA_DIR, patient)
    slices = [pydicom.read_file(os.path.join(path, file)) for file in os.listdir(path)]
    slices.sort(key = lambda x: int(x.ImagePositionPatient[2])) # sorts dicom files by Image Position
#     print(slices[0])
    
    new_slices = []
    slices = [cv2.resize(normalize(each_slice.pixel_array), (img_px_size, img_px_size)) for each_slice in slices]
           
    chunk_step = math.ceil(len(slices) / num_slices) # (num_slices^2 + 1)/num_slices
    
    # Chunks and averages images for patients with >40 images
    for slice_chunk in chunks(slices, chunk_step):
        avg_slice = list(map(mean, zip(*slice_chunk)))
        new_slices.append(avg_slice)
    
    print(len(new_slices))    
    
    # Handle data with less than num_slices images
    diff = num_slices - len(new_slices)
    if diff:
        for n in range(diff):
            mid = int(len(new_slices) / 2)
            # possibly mirror the duplicate images(?)
            new_slices.append(new_slices[mid])
        
    # Handle data with more than num_slices images
    while len(new_slices) > num_slices:
        new_img = list(map(mean, zip(*[new_slices[-1], new_slices[-2]])))
        del new_slices[num_slices]
        new_slices[num_slices - 1] = new_img
        
    print(len(new_slices))
    
    if visualize:
        cols = int(num_slices / 5)
        fig = plt.figure(figsize=(16,12)) # double the default figsize
        for num, each_slice in enumerate(new_slices):
            y = fig.add_subplot(5, cols, num+1)
            y.imshow(each_slice, cmap = "gray")
        plt.show
        
    '''
    Obtain slope of FVC Score as label
    Possibly check R^2 values as a counterpart to confidence score
    Check if non-linear functions describe FVC trends better than linear functions
    If so, brainstorm a numeric readout (to replace slope) from the non-linear f(x) that works best
    '''
    FVC = np.array(labels_df.at[patient, "FVC"])
    wk = np.array(labels_df.at[patient, "Weeks"])
    m, b = np.polyfit(wk, FVC, 1)
        
    return np.array(new_slices), -m, b

In [4]:
"""
Try Preprocessing Data 
If memory fails: try to process "online", meaning call process_data() while network is training to feed data into network
"""
much_data = []

for num, patient in enumerate(patients[:5]):
    if num%100 == 0 and num != 0:
        print(num)
    
    """
    This is a work-around for files that require GDCM to read. Try using get_pixeldata() method to work.
    """      
    try:
        img_data, label, intercept = process_data(patient, 
                                                  labels_df, 
                                                  img_px_size=IMAGE_PX_SIZE, 
                                                  num_slices=NUM_SLICES, 
                                                  visualize=False)
        
        much_data.append([img_data, label, intercept])
        
    except KeyError:
        print("Unlabeled data!") # error that Sentdex handled
        
    except RuntimeError:
        print("GDCM required!") # error that we are encountering
        
np.save(SAVE_FILE, much_data, allow_pickle=True)
print("Save successful!")

30
30


TypeError: expected 1D vector for x

In [None]:
FVC = np.array(labels_df.at[patient, "FVC"])
wk = np.array(labels_df.at[patient, "Weeks"])
m, b = np.polyfit(wk, FVC, 1)
plt.plot(wk, FVC, 'o')
plt.plot(wk, m*wk + b)
print(m)
print(-m)

In [None]:
"""
Proof of Save/Load states:
"""
images = np.load(SAVE_FILE, allow_pickle=True)

In [None]:
print("Raw Data: ", images[3])

In [None]:
print("Normalized Image Data:\n", images[3,0][1])
plt.figure()
plt.imshow(images[3,0][25], cmap='gray')
plt.show()

### Proof of concept: Loading in training/validation data

In [None]:
x_train = []
y_train = []
b = []
for n in range(len(images)):
    x_train.append(images[n, 0])
    y_train.append(images[n, 1])
    b.append(images[n, 2])
X = np.array(x_train)
Y = np.array(y_train)
print(X.shape)
print(Y.shape)
50**3

### 3D_ConvNet Model: based on sentdex modified by Alec Barrios

https://pythonprogramming.net/cnn-tensorflow-convolutional-nerual-network-machine-learning-tutorial/

In [6]:
import tensorflow as tf
import nunmpy as np

#IMAGE_PX_SIZE = 80  # created in earlier cell
#NUM_SLICES = 30   

n_classes = 2

x = tf.placeholder('float')
y = tf.placeholder('float')

keep_rate = 0.8
keep_prob = tf.placeholder(tf.float32)

def conv3d(x, W):
    return tf.nn.conv3d(x, W, strides=[1,1,1,1,1], padding='SAME')

def maxpool3d(x):
    #                        size of window         movement of window
    return tf.nn.max_pool3d(x, ksize=[1,2,2,2,1], strides=[1,2,2,2,1], padding='SAME')



def convolutional_neural_network(x):
    weights = {'W_conv1':tf.Variable(tf.random_normal([3,3,3,1,32])),
               'W_conv2':tf.Variable(tf.random_normal([3,3,3,32,64])),
               'W_fc':tf.Variable(tf.random_normal([7*7*64,1024])), #first argument is wrong
               'out':tf.Variable(tf.random_normal([1024, n_classes]))}

    biases = {'b_conv1':tf.Variable(tf.random_normal([32])),
               'b_conv2':tf.Variable(tf.random_normal([64])),
               'b_fc':tf.Variable(tf.random_normal([1024])),
               'out':tf.Variable(tf.random_normal([n_classes]))}

    x = tf.reshape(x, shape=[-1, IMAGE_PX_SIZE, IMAGE_PX_SIZE, NUM_SLICES, 1])

    conv1 = tf.nn.relu(conv3d(x, weights['W_conv1']) + biases['b_conv1'])
    conv1 = maxpool3d(conv1)
    
    conv2 = tf.nn.relu(conv3d(conv1, weights['W_conv2']) + biases['b_conv2'])
    conv2 = maxpool3d(conv2)

    fc = tf.reshape(conv2,[-1, 7*7*64]) #second argument is wrong
    fc = tf.nn.relu(tf.matmul(fc, weights['W_fc'])+biases['b_fc'])
    fc = tf.nn.dropout(fc, keep_rate)

    output = tf.matmul(fc, weights['out'])+biases['out']

    return output

def train_neural_network(x):
    
    much_data = np.load("testdata-80x80x30.npy")
    train_data = much_data[:-100]
    validation_data = much_data[-100:]
    
    prediction = convolutional_neural_network(x)
    cost = tf.reduce_mean( tf.nn.softmax_cross_entropy_with_logits(prediction,y) )
    optimizer = tf.train.AdamOptimizer().minimize(cost)
    
    hm_epochs = 3
    with tf.Session() as sess:
        sess.run(tf.initialize_all_variables())

        for epoch in range(hm_epochs):
            epoch_loss = 0
            for data in train_data:
                X = data[0]
                Y = data[1]
                _, c = sess.run([optimizer, cost], feed_dict={x: X, y: Y})
                epoch_loss +=c

            print('Epoch', epoch, 'completed out of',hm_epochs,'loss:',epoch_loss)

        correct = tf.equal(tf.argmax(prediction, 1), tf.argmax(y, 1))

        accuracy = tf.reduce_mean(tf.cast(correct, 'float'))
        print('Accuracy:',accuracy.eval({x:[i[0] for i in validation_data], y:[i[1] for i in validation_data]})

train_neural_network(x)

SyntaxError: invalid syntax (<ipython-input-6-6bfb3f7fd3d8>, line 80)