In [1]:
import pandas as pd
import numpy as np
# from sklearn.model_selection import train_test_split

In [2]:
import pandas as pd
import numpy as np

def return_tumor_or_not(dic, one_id):
    return dic[one_id]

def create_dict():
    df = pd.read_csv("data/train_labels.csv")
    result_dict = {}
    for index in range(df.shape[0]):
        one_id = df.iloc[index,0]
        tumor_or_not = df.iloc[index,1]
        result_dict[one_id] = int(tumor_or_not)
    return result_dict

def find_missing(train_ids, cv_ids):
    all_ids = set(pd.read_csv("data/train_labels.csv")['id'].values)
    wsi_ids = set(train_ids + cv_ids)

    missing_ids = list(all_ids-wsi_ids)
    return missing_ids


def generate_split():
    ids = pd.read_csv("data/patch_id_wsi.csv")
    wsi_dict = {}
    for i in range(ids.shape[0]):
        wsi = ids.iloc[i,1]
        train_id = ids.iloc[i,0]
        wsi_array = wsi.split('_')
        number = int(wsi_array[3])
        if wsi_dict.get(number) is None:
            wsi_dict[number] = [train_id]
        else:
            wsi_dict[number].append(train_id)

    wsi_keys = list(wsi_dict.keys())
    np.random.seed()
    np.random.shuffle(wsi_keys)
    amount_of_keys = len(wsi_keys)

    keys_for_train = wsi_keys[0:int(amount_of_keys*0.8)]
    keys_for_cv = wsi_keys[int(amount_of_keys*0.8):]
    train_ids = []
    cv_ids = []

    for key in keys_for_train:
        train_ids += wsi_dict[key]

    for key in keys_for_cv:
        cv_ids += wsi_dict[key]

    dic = create_dict()

    missing_ids = find_missing(train_ids, cv_ids)
    missing_ids_total = len(missing_ids)
    np.random.seed()
    np.random.shuffle(missing_ids)

    train_missing_ids = missing_ids[0:int(missing_ids_total*0.8)]
    cv_missing_ids = missing_ids[int(missing_ids_total*0.8):]

    train_ids += train_missing_ids
    cv_ids += cv_missing_ids

    train_labels = []
    cv_labels = []

    train_tumor = 0
    for one_id in train_ids:
        temp = return_tumor_or_not(dic, one_id)
        train_tumor += temp
        train_labels.append(temp)

    cv_tumor = 0
    for one_id in cv_ids:
        temp = return_tumor_or_not(dic, one_id)
        cv_tumor += temp
        cv_labels.append(temp)
    total = len(train_ids) + len(cv_ids)

    print("Amount of train labels: {}, {}/{}".format(len(train_ids), train_tumor, len(train_ids)-train_tumor))
    print("Amount of cv labels: {}, {}/{}".format(len(cv_ids), cv_tumor, len(cv_ids) - cv_tumor))
    print("Percentage of cv labels: {}".format(len(cv_ids)/total))

    return train_ids, cv_ids, train_labels, cv_labels

train_ids, cv_ids, train_labels, cv_labels = generate_split()

Amount of train labels: 171763, 68937/102826
Amount of cv labels: 48262, 20180/28082
Percentage of cv labels: 0.2193478013862061


In [3]:
import  tensorflow as tf
from    tensorflow import keras
from    tensorflow.keras import layers, Sequential
from    tensorflow.keras.callbacks import EarlyStopping 

In [4]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator

In [5]:
batch_size=64
def get_data_generator(train_df, valid_df):
    datagen_train=ImageDataGenerator(
        rescale=1./255.,
        zoom_range=0.1,
        rotation_range=90,
        zca_whitening=True,
        horizontal_flip=True,
        vertical_flip=True
    )
    datagen_valid=ImageDataGenerator(
        zca_whitening=True,
        rescale=1./255.
    )
    train_generator = datagen_train.flow_from_dataframe(
        dataframe=train_df,
        directory=r'e:/cancerDetection/train',
        x_col='id',
        y_col='label',
        target_size=(96,96), 
        class_mode='binary', 
        batch_size=batch_size)

    valid_generator = datagen_valid.flow_from_dataframe(
        dataframe=valid_df,
        directory=r'e:/cancerDetection/train',
        x_col='id',
        y_col='label',
        target_size=(96,96), 
        class_mode='binary', 
        batch_size=batch_size)  #每次生成的样本数，注意：generator是无限循环的，需要在fit_generator中指定steps_per_epoch,才能知道一个epoch 什么时候结束
    return train_generator, valid_generator

In [6]:
class BatchNormalization(tf.keras.layers.BatchNormalization):
    """
    Make trainable=False freeze BN for real (the og version is sad)
    """

    def call(self, x, training=False):
        if training is None:
            training = tf.constant(False)
        training = tf.logical_and(training, self.trainable)
        return super().call(x, training)

In [7]:
class BasicBlock(layers.Layer):

    def __init__(self, filter_num, stride=1, use_dropout=False):
        super(BasicBlock, self).__init__()
        self.use_dropout = use_dropout
        self.conv1 = layers.Conv2D(filter_num, (3, 3), strides=stride, padding='same', kernel_initializer=keras.initializers.he_normal())
        self.bn1 = BatchNormalization()
        self.dropout1 = layers.Dropout(0.2)
        self.relu = layers.Activation('relu')

        self.conv2 = layers.Conv2D(filter_num, (3, 3), strides=1, padding='same', kernel_initializer=keras.initializers.he_normal())
        self.bn2 = BatchNormalization()
        self.dropout2 = layers.Dropout(0.2)

        if stride != 1:
            self.downsample = Sequential()
            self.downsample.add(layers.Conv2D(filter_num, (1, 1), strides=stride, kernel_initializer=keras.initializers.he_normal()))
        else:
            self.downsample = lambda x:x



    def call(self, inputs, training=None):

        # [b, h, w, c]
        out = self.conv1(inputs)
        out = self.bn1(out)
        if self.use_dropout:
            out = self.dropout1(out)
        out = self.relu(out)

        out = self.conv2(out)
        out = self.bn2(out)
        if self.use_dropout:
            out = self.dropout2(out)

        identity = self.downsample(inputs)

        output = layers.add([out, identity])
        output = tf.nn.relu(output)

        return output


class ResNet(keras.Model):


    def __init__(self, layer_dims, num_classes=100): # [2, 2, 2, 2]
        super(ResNet, self).__init__()

        self.stem = Sequential([layers.Conv2D(64, (3, 3), strides=(1, 1)),
                                BatchNormalization(),
                                layers.Activation('relu'),
                                layers.MaxPool2D(pool_size=(2, 2), strides=(1, 1), padding='same')
                                ])

        self.layer1 = self.build_resblock(64,  layer_dims[0])
        self.layer2 = self.build_resblock(128, layer_dims[1], stride=2)
        self.layer3 = self.build_resblock(256, layer_dims[2], stride=2)
        self.layer4 = self.build_resblock(256, layer_dims[3], stride=2, use_dropout=True)

        # output: [b, 512, h, w],
        self.avgpool = layers.GlobalAveragePooling2D()
        self.dropout = layers.Dropout(0.2)
        self.fc = layers.Dense(num_classes, activation='sigmoid')


    def call(self, inputs, training=None):

        x = self.stem(inputs)

        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)

        # [b, c]
        x = self.avgpool(x)
        x = self.dropout(x)
        # [b, 100]
        x = self.fc(x)

        return x



    def build_resblock(self, filter_num, blocks, stride=1, use_dropout=False):

        res_blocks = Sequential()
        # may down sample
        res_blocks.add(BasicBlock(filter_num, stride, use_dropout=use_dropout))

        for _ in range(1, blocks):
            res_blocks.add(BasicBlock(filter_num, stride=1, use_dropout=use_dropout))

        return res_blocks


def resnet18(num_classes=100):
    return ResNet([2, 2, 2, 2],num_classes=num_classes)


def resnet34():
    return ResNet([3, 4, 6, 3])

In [8]:
model = resnet18(num_classes=1)
model.build(input_shape=(None, 96, 96, 3))

In [9]:
model.summary()

Model: "res_net"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
sequential (Sequential)      multiple                  2048      
_________________________________________________________________
sequential_1 (Sequential)    multiple                  148736    
_________________________________________________________________
sequential_2 (Sequential)    multiple                  526976    
_________________________________________________________________
sequential_4 (Sequential)    multiple                  2102528   
_________________________________________________________________
sequential_6 (Sequential)    multiple                  2430208   
_________________________________________________________________
global_average_pooling2d (Gl multiple                  0         
_________________________________________________________________
dropout_16 (Dropout)         multiple                  0   

In [10]:
model.compile(keras.optimizers.Adam(learning_rate=0.001), loss=tf.losses.BinaryCrossentropy(from_logits=False), metrics=["accuracy"])

In [11]:
model.load_weights('resnet')

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x23e79ca7be0>

In [12]:
# train, valid = train_test_split(df_train, test_size=0.05, shuffle=True)
train=np.array(list(zip(train_ids, train_labels)))
valid = np.array(list(zip(cv_ids, cv_labels)))
train_df = pd.DataFrame(train, columns=['id','label']).astype('str')
valid_df = pd.DataFrame(valid, columns=['id','label']).astype('str')
train_df.values[:,0]=train_df.values[:,0]+'.tif'
valid_df.values[:,0]=valid_df.values[:,0]+'.tif'
train_generator, valid_generator = get_data_generator(train_df, valid_df)
print(train_df.shape)



Found 171763 images belonging to 2 classes.
Found 48262 images belonging to 2 classes.
(171763, 2)


In [13]:
earlystopper = EarlyStopping(
    monitor='val_loss', patience=10)
    
history = model.fit_generator(train_generator,
      validation_data=valid_generator,
      epochs=12,
      callbacks=[earlystopper],
      steps_per_epoch=train_df.shape[0]//batch_size,
      validation_steps=valid_df.shape[0]//batch_size
     )

Epoch 1/12






KeyboardInterrupt: 

In [21]:
model.save_weights('resnet')

In [10]:
data = next(iter(train_generator))
print(model(data[0]))
print(data[1])

NameError: name 'train_generator' is not defined

In [27]:
sample_image=tf.random.normal([1,96,96,3], mean=128, stddev=10)
model(sample_image)

In [18]:
test_datagen=ImageDataGenerator(
    zca_whitening=True,
    rescale=1./255.
)

test_generator = test_datagen.flow_from_directory(
    directory=r'C:/test',
    target_size=(96,96), 
    batch_size=2048,
    class_mode=None,
    shuffle=False
) 

Found 57458 images belonging to 1 classes.


In [19]:
submission = pd.DataFrame()

for i in test_generator:

    data_frame = pd.DataFrame()
    
    idx = (test_generator.batch_index - 1) * test_generator.batch_size
    if test_generator.batch_index == 0:
        idx = 57344
    print(test_generator.batch_index, idx)
    files = test_generator.filenames[idx : (idx + test_generator.batch_size) if (idx + test_generator.batch_size<=57458) else 57458] #
    print(len(files))
    print(files[0], files[-1])
    data_frame['id']=np.array([x.split('.')[0][2:] for x in files])
    predictions = model.predict(i,verbose=0)
    print(len(predictions))
    data_frame['label'] = np.array([int(x) for x in list(map(np.rint, predictions))])
    submission = pd.concat([submission, data_frame])
    if test_generator.batch_index == 0:
        break

submission.to_csv('result.csv', index=False, header=True)

1 0
2048
1\00006537328c33e284c973d7b39d340809f7271b.tif 1\0912a4f265fe7f37be287c4333167e59162cccba.tif
2048
2 2048
2048
1\0912be906eeb7f21a21c9378c52640411c63fe92.tif 1\126bf75fb455b0ead4c791b8616f91a7d7e0a23d.tif
2048
3 4096
2048
1\126cb84577113e79c20ada4488b529e386259df3.tif 1\1b608be7479cf8442c033320a0d449dbad9d7c8f.tif
2048
4 6144
2048
1\1b6159e3e7385d1a936a41f6b85257fbc41ca47d.tif 1\24d6cc1260f47d5cab369227e2b07b1b234f0f81.tif
2048
5 8192
2048
1\24d765206aacf9435972fc93c7787864182edfe7.tif 1\2d89e0f298d15eb26686c2556da95cc3abb80be7.tif
2048
6 10240
2048
1\2d8b7b1e36fb276ef58888926d83aa9f14275877.tif 1\36a510f3444a7641a312d6f0b5565ca932a9934a.tif
2048
7 12288
2048
1\36a6027aad62d423a759fdfc72331af0fedff430.tif 1\3fcbb4a94606d3e281ebf11d7075cdba2fecf96d.tif
2048
8 14336
2048
1\3fcbdf438c17243900cbb3a59fd9d4b3bd7d028e.tif 1\48e90d732baf86e86d3ef1a3e11036e264ad6fb3.tif
2048
9 16384
2048
1\48e926fbd0ac80b3318ba1ec5c61555254d2952f.tif 1\52007e2b1dbf12c81ef93c93b276cad090d2e512.tif
2048


In [121]:
print(test_generator.batch_index, submission.shape)

In [124]:
submission=submission.drop_duplicates(["id"])

In [48]:
import numpy as np
from glob import glob
from skimage.io import imread
import os
testing_files = glob(os.path.join('data/test/','*.tif'))
submission = pd.DataFrame()

for index in range(0, len(testing_files), 2048):
    data_frame = pd.DataFrame({'path': testing_files[index:index+2048]})
    data_frame['id'] = data_frame.path.map(lambda x: x.split(".")[0][10:])
    data_frame['image'] = data_frame['path'].map(imread)
    images = np.stack(data_frame.image, axis=0)
    images = images / 255.
    print(index)
    predicted_labels = np.rint(model.predict(images,verbose=0))
    data_frame['label'] = predicted_labels
    submission = pd.concat([submission, data_frame[["id", "label"]]])
submission.to_csv('result.csv', index=False, header=True)

0
2048
4096
6144
8192
10240
12288
14336
16384
18432
20480
22528
24576
26624
28672
30720
32768
34816
36864
38912
40960
43008
45056
47104
49152
51200
53248
55296
57344


In [101]:
a=[1,2,3,4]
print(a[0:(1+2) if 2<3 else 2])

[1, 2, 3]
