# Data Loading

In [1]:
import pydicom as dicom
import os 
import pandas as pd
import matplotlib.pyplot as plt
import cv2
import numpy as np
import math
import tensorflow as tf

data_dir = './CTdataset'
patients = os.listdir(data_dir)
labels_df = pd.read_csv('./labels.csv', index_col=0)

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


# Data Preprocessing

In [None]:

IMG_SIZE_PX = 50
SLICE_COUNT = 20

def chunks(l, n):
    """Yield successive n-sized chunks from l."""
    for i in range(0, len(l), n):
        yield l[i:i + n]


def mean(a):
    return sum(a) / len(a)


def process_data(patient,labels_df,img_px_size=50, hm_slices=20, visualize=False):
    
    #label = labels_df.get_value(patient, 'cancer')
    label = [0,1,0,0,1,0,0,1,0,1]
    path = data_dir + patient
    slices = [dicom.read_file(path + '/' + s) for s in os.listdir(path)]
    slices.sort(key = lambda x: int(x.ImagePositionPatient[2]))

    new_slices = []
    slices = [cv2.resize(np.array(each_slice.pixel_array),(img_px_size,img_px_size)) for each_slice in slices]
    
    chunk_sizes = math.ceil(len(slices) / hm_slices)
    for slice_chunk in chunks(slices, chunk_sizes):
        slice_chunk = list(map(mean, zip(*slice_chunk)))
        new_slices.append(slice_chunk)

    if len(new_slices) == hm_slices-1:
        new_slices.append(new_slices[-1])

    if len(new_slices) == hm_slices-2:
        new_slices.append(new_slices[-1])
        new_slices.append(new_slices[-1])

    if len(new_slices) == hm_slices+2:
        new_val = list(map(mean, zip(*[new_slices[hm_slices-1],new_slices[hm_slices],])))
        del new_slices[hm_slices]
        new_slices[hm_slices-1] = new_val
        
    if len(new_slices) == hm_slices+1:
        new_val = list(map(mean, zip(*[new_slices[hm_slices-1],new_slices[hm_slices],])))
        del new_slices[hm_slices]
        new_slices[hm_slices-1] = new_val

    if visualize:
        fig = plt.figure()
        for num,each_slice in enumerate(new_slices):
            y = fig.add_subplot(4,5,num+1)
            y.imshow(each_slice, cmap='gray')
        plt.show()

    if label == 1: label=np.array([0,1])
    elif label == 0: label=np.array([1,0])
        
    return np.array(new_slices),label

much_data = []
for num,patient in enumerate(patients):
    if num % 100 == 0:
        print(num)
    try:
        img_data,label = process_data(patient,label,img_px_size=IMG_SIZE_PX, hm_slices=SLICE_COUNT)
        #print(img_data.shape,label)
        much_data.append([img_data,label])
    except KeyError as e:
        print('This is unlabeled data!')

np.save('muchdata-{}-{}-{}.npy'.format(IMG_SIZE_PX,IMG_SIZE_PX,SLICE_COUNT), much_data)

# 3D-CNN

In [4]:
# 超参数设置
batch_size = 10
keep_rate = 0.8
train_epochs = 3
learning_rate=1e-3


# 普通参数设置
IMG_SIZE_PX = 50
SLICE_COUNT = 20
n_classes = 2
save_step=1


# 定义待输入数据
x = tf.placeholder(tf.float32,name="x")
y = tf.placeholder(tf.float32,name="y")


# 定义变量
#  5 x 5 x 5 patches, 1 channel, 32 features to compute.
weights = {'W_conv1':tf.Variable(tf.random_normal([3,3,3,1,32])),
           # 5 x 5 x 5 patches, 32 channels, 64 features to compute.
           'W_conv2':tf.Variable(tf.random_normal([3,3,3,32,64])),
           # 64 features
           'W_fc':tf.Variable(tf.random_normal([54080,1024])),
           'out':tf.Variable(tf.random_normal([1024, n_classes]))}

biases = {'b_conv1':tf.Variable(tf.random_normal([32])),
          'b_conv2':tf.Variable(tf.random_normal([64])),
          'b_fc':tf.Variable(tf.random_normal([1024])),
          'out':tf.Variable(tf.random_normal([n_classes]))}

#image_shaped_input = tf.reshape(x,[-1,50 ,50 ,1])
#tf.summary.image('input',image_shaped_input, 10)



# 前向计算
def conv3d(x, W):
    return tf.nn.conv3d(x, W, strides=[1,1,1,1,1], padding='SAME')

def maxpool3d(x):
    #  size of window  movement of window as you slide about
    return tf.nn.max_pool3d(x, ksize=[1,2,2,2,1], strides=[1,2,2,2,1], padding='SAME')

def convolutional_neural_network(x):
    
    #  image X  image Y  image Z
    x = tf.reshape(x, shape=[-1, IMG_SIZE_PX, IMG_SIZE_PX, SLICE_COUNT, 1])

    conv1 = tf.nn.relu(conv3d(x, weights['W_conv1']) + biases['b_conv1'])
    conv1 = maxpool3d(conv1)


    conv2 = tf.nn.relu(conv3d(conv1, weights['W_conv2']) + biases['b_conv2'])
    conv2 = maxpool3d(conv2)

    fc = tf.reshape(conv2,[-1, 54080])
    fc = tf.nn.relu(tf.matmul(fc, weights['W_fc'])+biases['b_fc'])
    fc = tf.nn.dropout(fc, keep_rate)

    output = tf.matmul(fc, weights['out'])+biases['out']
    tf.summary.histogram('forward',output)

    return output




# 定义 prediction & loss_function & optimizer
prediction = convolutional_neural_network(x)

loss_function = tf.reduce_mean( tf.nn.softmax_cross_entropy_with_logits(logits=prediction,labels=y) )
tf.summary.scalar('loss',loss_function)
    
optimizer = tf.train.AdamOptimizer(learning_rate).minimize(loss_function)

# 检查预测类别与实际类别的匹配情况
correct_prediction = tf.equal(tf.argmax(prediction, 1), tf.argmax(y, 1))
# 计算准确率
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
tf.summary.scalar('accuracy',accuracy)


Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:

Future major versions of TensorFlow will allow gradients to flow
into the labels input on backprop by default.

See `tf.nn.softmax_cross_entropy_with_logits_v2`.



<tf.Tensor 'accuracy:0' shape=() dtype=string>

In [5]:
def train_neural_network(x):
    
    much_data = np.load('muchdata-50-50-20.npy',allow_pickle= True)
    train_data = much_data[:-5]
    validation_data = much_data[-5:]

    # 创建保存模型目录   
    ckpt_dir="./ckpt_dir/"
    if not os.path.exists(ckpt_dir):
        os.makedirs(ckpt_dir)
        
    
    with tf.Session() as sess:
        
        init = tf.global_variables_initializer()
        sess.run(init)
        
        merged_summary_op = tf.summary.merge_all()
        writer = tf.summary.FileWriter('C:/Users/cheng/Desktop/TFOD/Data_Science_Bowl_2017/log',sess.graph) #创建写入符
        
        successful_runs = 0
        total_runs = 0
        
        saver = tf.train.Saver() # 模型保存
            
        for epoch in range(train_epochs):
            epoch_loss = 0
            for data in train_data:
                total_runs += 1
                try:
                    # 读取批次数据
                    X = data[0]
                    Y = data[1]
                    # 执行批次训练
                    _, c = sess.run([optimizer, loss_function], feed_dict={x: X, y: Y})
                    
                   
                    
                    # 生成summary
                    summary_str = sess.run(merged_summary_op,feed_dict={x: X, y: Y})
                    writer.add_summary(summary_str, epoch)
                    
                    epoch_loss += c
                    successful_runs += 1
                except Exception as e:
                    pass
            
            print('Epoch', epoch+1, 'completed out of',train_epochs,'loss:',epoch_loss)

            
            merged_summary_op = tf.summary.merge_all()
            writer = tf.summary.FileWriter('log/', sess.graph)
            
            

            print('Accuracy:',accuracy.eval({x:[i[0] for i in validation_data], y:[i[1] for i in validation_data]}))
            
            # 模型保存
            if (epoch+1)%save_step == 0:
                saver.save(sess , os.path.join(ckpt_dir,
                                            'model_Lung_cancer{:06d}.ckpt'.format(epoch+1)))
                print('model_Lung_cancer{:06d}.ckpt saved'.format(epoch+1))
                
                
                           
        print('Done. Finishing accuracy:')
        print('Accuracy:',accuracy.eval({x:[i[0] for i in validation_data], y:[i[1] for i in validation_data]}))
        print('fitment percent:',successful_runs/total_runs)
                      

In [6]:
train_neural_network(x)

Epoch 1 completed out of 3 loss: 0
Accuracy: 0.6
model_Lung_cancer000001.ckpt saved
Epoch 2 completed out of 3 loss: 0
Accuracy: 0.2
model_Lung_cancer000002.ckpt saved
Epoch 3 completed out of 3 loss: 0
Accuracy: 0.6
model_Lung_cancer000003.ckpt saved
Done. Finishing accuracy:
Accuracy: 0.8
fitment percent: 0.0
