In [1]:
import os
import datetime
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import  tensorflow as tf
from    tensorflow import keras
from    tensorflow.keras import layers, Sequential
from    tensorflow.keras.callbacks import EarlyStopping 
from tensorflow.keras.applications import DenseNet121
from tensorflow.keras.preprocessing.image import ImageDataGenerator

In [2]:
import pandas as pd
import numpy as np

def return_tumor_or_not(dic, one_id):
    return dic[one_id]

def create_dict():
    df = pd.read_csv("data/train_labels.csv")
    result_dict = {}
    for index in range(df.shape[0]):
        one_id = df.iloc[index,0]
        tumor_or_not = df.iloc[index,1]
        result_dict[one_id] = int(tumor_or_not)
    return result_dict

def find_missing(train_ids, cv_ids):
    all_ids = set(pd.read_csv("data/train_labels.csv")['id'].values)
    wsi_ids = set(train_ids + cv_ids)

    missing_ids = list(all_ids-wsi_ids)
    return missing_ids


def generate_split():
    ids = pd.read_csv("data/patch_id_wsi.csv")
    wsi_dict = {}
    for i in range(ids.shape[0]):
        wsi = ids.iloc[i,1]
        train_id = ids.iloc[i,0]
        wsi_array = wsi.split('_')
        number = int(wsi_array[3])
        if wsi_dict.get(number) is None:
            wsi_dict[number] = [train_id]
        else:
            wsi_dict[number].append(train_id)

    wsi_keys = list(wsi_dict.keys())
    np.random.seed()
    np.random.shuffle(wsi_keys)
    amount_of_keys = len(wsi_keys)

    keys_for_train = wsi_keys[0:int(amount_of_keys*0.8)]
    keys_for_cv = wsi_keys[int(amount_of_keys*0.8):]
    train_ids = []
    cv_ids = []

    for key in keys_for_train:
        train_ids += wsi_dict[key]

    for key in keys_for_cv:
        cv_ids += wsi_dict[key]

    dic = create_dict()

    missing_ids = find_missing(train_ids, cv_ids)
    missing_ids_total = len(missing_ids)
    np.random.seed()
    np.random.shuffle(missing_ids)

    train_missing_ids = missing_ids[0:int(missing_ids_total*0.8)]
    cv_missing_ids = missing_ids[int(missing_ids_total*0.8):]

    train_ids += train_missing_ids
    cv_ids += cv_missing_ids

    train_labels = []
    cv_labels = []

    train_tumor = 0
    for one_id in train_ids:
        temp = return_tumor_or_not(dic, one_id)
        train_tumor += temp
        train_labels.append(temp)

    cv_tumor = 0
    for one_id in cv_ids:
        temp = return_tumor_or_not(dic, one_id)
        cv_tumor += temp
        cv_labels.append(temp)
    total = len(train_ids) + len(cv_ids)

    print("Amount of train labels: {}, {}/{}".format(len(train_ids), train_tumor, len(train_ids)-train_tumor))
    print("Amount of cv labels: {}, {}/{}".format(len(cv_ids), cv_tumor, len(cv_ids) - cv_tumor))
    print("Percentage of cv labels: {}".format(len(cv_ids)/total))

    return train_ids, cv_ids, train_labels, cv_labels

train_ids, cv_ids, train_labels, cv_labels = generate_split()

Amount of train labels: 174865, 71308/103557
Amount of cv labels: 45160, 17809/27351
Percentage of cv labels: 0.20524940347687762


In [2]:
df_train = pd.read_csv('data/train_labels.csv').values
# df_train = df_train.sample(n=20, random_state=2019).values
# kf = KFold(n_splits=2000, shuffle=True, random_state=2019)
df_train.shape

In [10]:
batch_size=64
def get_data_generator(train_df, valid_df):
    datagen_train=ImageDataGenerator(
        rescale=1./255.,
        zoom_range=0.1,
        rotation_range=90,
#         zca_whitening=True,
        horizontal_flip=True,
        vertical_flip=True
    )
    datagen_valid=ImageDataGenerator(
#         zca_whitening=True,
        rescale=1./255.
    )
    train_generator = datagen_train.flow_from_dataframe(
        dataframe=train_df,
        directory=r'e:/cancerDetection/train',
        x_col='id',
        y_col='label',
        target_size=(96,96), 
        class_mode='binary', 
        batch_size=batch_size)

    valid_generator = datagen_valid.flow_from_dataframe(
        dataframe=valid_df,
        directory=r'e:/cancerDetection/train',
        x_col='id',
        y_col='label',
        target_size=(96,96), 
        class_mode='binary', 
        batch_size=batch_size)  #每次生成的样本数，注意：generator是无限循环的，需要在fit_generator中指定steps_per_epoch,才能知道一个epoch 什么时候结束
    return train_generator, valid_generator

In [4]:
IN_SHAPE=(96,96,3)
class BatchNormalization(tf.keras.layers.BatchNormalization):
    """
    Make trainable=False freeze BN for real (the og version is sad)
    """

    def call(self, x, training=False):
        if training is None:
            training = tf.constant(False)
        training = tf.logical_and(training, self.trainable)
        return super().call(x, training)
    
def model_bulid():
    inputs = keras.Input(IN_SHAPE)
    conv_base = DenseNet121(
        weights='imagenet',
#         weights=None,
        include_top=False,
        input_shape=IN_SHAPE
    )
    x = conv_base(inputs)
    out = layers.Flatten()(x)
    out = layers.Dropout(0.2)(out)
    out = layers.Dense(512)(out)
    out = BatchNormalization()(out)
    out = layers.Activation(activation='relu')(out)
    out = layers.Dense(1, activation="sigmoid")(out)
    model = keras.Model(inputs, out)

#     conv_base.trainable = False
#     set_trainable = False
#     for layer in conv_base.layers[:]:
#         layer.trainable = False
#     for layer in conv_base.layers[-40:]:
#         layer.trainable = True


    model.compile(keras.optimizers.RMSprop(),
                  loss=tf.losses.BinaryCrossentropy(from_logits=False), metrics=["accuracy"])

    model.summary()
#     plot_model(model, to_file='NetStruct.png', show_shapes=True)
    return model

model = model_bulid()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 96, 96, 3)]       0         
_________________________________________________________________
densenet121 (Model)          (None, 3, 3, 1024)        7037504   
_________________________________________________________________
flatten (Flatten)            (None, 9216)              0         
_________________________________________________________________
dropout (Dropout)            (None, 9216)              0         
_________________________________________________________________
dense (Dense)                (None, 512)               4719104   
_________________________________________________________________
batch_normalization (BatchNo (None, 512)               2048      
_________________________________________________________________
activation (Activation)      (None, 512)               0     

In [5]:
model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 96, 96, 3)]       0         
_________________________________________________________________
densenet121 (Model)          (None, 3, 3, 1024)        7037504   
_________________________________________________________________
flatten (Flatten)            (None, 9216)              0         
_________________________________________________________________
dropout (Dropout)            (None, 9216)              0         
_________________________________________________________________
dense (Dense)                (None, 512)               4719104   
_________________________________________________________________
batch_normalization (BatchNo (None, 512)               2048      
_________________________________________________________________
activation (Activation)      (None, 512)               0     

In [10]:
for layer in model.layers:
#     print(layer.name)
    if layer.name=='densenet121':
        layer.trainable=True
#         print(len(layer.layers))
        for l in layer.layers[0:80]:
#             print(l.trainable)
            l.trainable=False
            

In [6]:
# train, valid = train_test_split(df_train, test_size=0.05, shuffle=True)
train=np.array(list(zip(train_ids, train_labels)))
valid = np.array(list(zip(cv_ids, cv_labels)))
train_df = pd.DataFrame(train, columns=['id','label']).astype('str')
valid_df = pd.DataFrame(valid, columns=['id','label']).astype('str')
train_df.values[:,0]=train_df.values[:,0]+'.tif'
valid_df.values[:,0]=valid_df.values[:,0]+'.tif'

In [7]:
model.compile(keras.optimizers.Adam(learning_rate=0.0001), loss=tf.losses.BinaryCrossentropy(from_logits=False), metrics=["accuracy"])


In [None]:
# tensorboard = TensorBoard(log_dir='./logs',  # log 目录
#                       # histogram_freq=1,  # 按照何等频率（epoch）来计算直方图，0为不计算
#                       # batch_size=batch_size,     # 用多大量的数据计算直方图
#                       write_graph=True,  # 是否存储网络结构图
#                       write_grads=False,  # 是否可视化梯度直方图
#                       write_images=False,  # 是否可视化参数
#                       embeddings_freq=0,
#                       embeddings_layer_names=None,
#                       embeddings_metadata=None)
# model_checkpoint = ModelCheckpoint(
#     'weights_epoch'+'_'+str(num_fold)+'.h5', monitor='val_loss', save_best_only=True)

# reducel = ReduceLROnPlateau(
#     monitor='val_loss', patience=5, verbose=1, factor=0.1)

In [11]:
train_generator, valid_generator = get_data_generator(train_df, valid_df)

Found 174865 images belonging to 2 classes.
Found 45160 images belonging to 2 classes.


In [12]:
earlystopper = EarlyStopping(
    monitor='val_loss', patience=50)

history = model.fit_generator(train_generator,
      validation_data=valid_generator,
      epochs=50,
      callbacks=[earlystopper],
      steps_per_epoch=train_df.shape[0]//(batch_size),
      validation_steps=valid_df.shape[0]//(batch_size)
     )

# plt.plot(history.history['loss'], label='train')
# plt.plot(history.history['val_loss'], label='valid')
# plt.title("model loss")
# plt.ylabel("loss")
# plt.xlabel("epoch")
# plt.legend(["train", "valid"], loc="upper left")
# plt.savefig('loss_performance'+'.png')
# plt.clf()
# plt.plot(history.history['acc'], label='train')
# plt.plot(history.history['val_acc'], label='valid')
# plt.title("model acc")
# plt.ylabel("acc")
# plt.xlabel("epoch")
# plt.legend(["train", "valid"], loc="upper left")
# plt.savefig('acc_performance'+'.png')

# with open('logs_kfold.txt', 'a+') as f:
#     f.write(str(datetime.datetime.now().strftime("%Y-%m-%d-%H-%M"))+'\n')
# #     f.write(str(num_fold)+'\n')
#     f.write(str(history.history['loss'])+'\n')
#     f.write(str(history.history['val_loss'])+'\n')
#     f.write(str(history.history['acc'])+'\n')
#     f.write(str(history.history['val_acc'])+'\n')

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50

KeyboardInterrupt: 

In [3]:
model.load_weights('model.h5')

In [24]:
import numpy as np
from glob import glob
from skimage.io import imread
testing_files = glob(os.path.join('data/test/','*.tif'))
submission = pd.DataFrame()
for index in range(0, len(testing_files), 128):
    data_frame = pd.DataFrame({'path': testing_files[index:index+128]})
    data_frame['id'] = data_frame.path.map(lambda x: x.split('/')[1].split(".")[0])
    data_frame['image'] = data_frame['path'].map(imread)
    images = np.stack(data_frame.image, axis=0)
    predicted_labels = [model.predict(np.expand_dims(image, axis=0),verbose=0)[0][0] for image in images]
    predictions = np.array(predicted_labels)
    data_frame['label'] = predictions
    submission = pd.concat([submission, data_frame[["id", "label"]]])
submission.to_csv('result.csv', index=False, header=True)

In [28]:
submission['label']=submission['label'].map(lambda x: int(np.round(x)))
submission.id=submission.id.map(lambda x: x.split('\\')[1])
submission.to_csv('result.csv', index=False, header=True)

In [29]:
submission.head()

Unnamed: 0,id,label
0,00006537328c33e284c973d7b39d340809f7271b,1
1,0000ec92553fda4ce39889f9226ace43cae3364e,1
2,00024a6dee61f12f7856b0fc6be20bc7a48ba3d2,0
3,000253dfaa0be9d0d100283b22284ab2f6b643f6,0
4,000270442cc15af719583a8172c87cd2bd9c7746,0


In [13]:
test_datagen=ImageDataGenerator(
#     zca_whitening=True,
    rescale=1./255.
)

test_generator = test_datagen.flow_from_directory(
    directory=r'E:/cancerDetection/test',
    target_size=(96,96), 
    batch_size=2048,
    class_mode=None,
    shuffle=False
) 

Found 57458 images belonging to 1 classes.


In [None]:
submission = pd.DataFrame()

for i in test_generator:

    data_frame = pd.DataFrame()
    
    idx = (test_generator.batch_index - 1) * test_generator.batch_size
    if test_generator.batch_index == 0:
        idx = 57344
    print(test_generator.batch_index, idx)
    files = test_generator.filenames[idx : (idx + test_generator.batch_size) if (idx + test_generator.batch_size<=57458) else 57458] #
    print(len(files))
    print(files[0], files[-1])
    data_frame['id']=np.array([x.split('.')[0][2:] for x in files])
    predictions = model.predict(i,verbose=0)
    print(len(predictions))
    data_frame['label'] = np.array([int(x) for x in list(map(np.rint, predictions))])
    submission = pd.concat([submission, data_frame])
    if test_generator.batch_index == 0:
        break

submission.to_csv('result.csv', index=False, header=True)

1 0
2048
1\00006537328c33e284c973d7b39d340809f7271b.tif 1\0912a4f265fe7f37be287c4333167e59162cccba.tif
2048
2 2048
2048
1\0912be906eeb7f21a21c9378c52640411c63fe92.tif 1\126bf75fb455b0ead4c791b8616f91a7d7e0a23d.tif
2048
