In [1]:
import os
import datetime
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from keras.preprocessing.image import ImageDataGenerator

from keras.applications.resnet50 import ResNet50
from keras.applications.densenet import DenseNet121, DenseNet201
from keras.models import Model
from keras.layers import Dense, Flatten, Dropout, Activation, BatchNormalization, GlobalAveragePooling2D, GlobalMaxPooling2D, Concatenate, Input
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping, ReduceLROnPlateau, TensorBoard, ModelCheckpoint
from keras.utils.vis_utils import plot_model
from sklearn.model_selection import KFold

Using TensorFlow backend.


In [2]:
df_train = pd.read_csv('data/train_labels.csv').values
# df_train = df_train.sample(n=20, random_state=2019).values
# kf = KFold(n_splits=2000, shuffle=True, random_state=2019)

In [3]:
df_train.shape

(220025, 2)

In [4]:
batch_size=32
def get_data_generator(train_df, valid_df):
    datagen=ImageDataGenerator(
        rescale=1./255
    )
    train_generator = datagen.flow_from_dataframe(
        dataframe=train_df,
        directory='data/train',
        x_col='id',
        y_col='label',
        target_size=(96,96), 
        class_mode='binary', 
        batch_size=batch_size)

    valid_generator = datagen.flow_from_dataframe(
        dataframe=valid_df,
        directory='data/train',
        x_col='id',
        y_col='label',
        target_size=(96,96), 
        class_mode='binary', 
        batch_size=batch_size)  #每次生成的样本数，注意：generator是无限循环的，需要在fit_generator中指定steps_per_epoch,才能知道一个epoch 什么时候结束
    return train_generator, valid_generator

In [11]:
IN_SHAPE=(96,96,3)

def model_bulid():
    inputs = Input(IN_SHAPE)
    conv_base = DenseNet201(
#         weights='imagenet',
        weights=None,
        include_top=False,
        input_shape=IN_SHAPE
    )
    x = conv_base(inputs)
    out = Flatten()(x)
    # **如果换成256验证集效果很差，我也不知道为什么**
    out = Dense(512)(out)
    out = BatchNormalization()(out)
    out = Activation(activation='relu')(out)
#     out = Dropout(dropout_rate)(out)
    out = Dense(1, activation="sigmoid")(out)
    model = Model(inputs, out)

#     conv_base.trainable = False
#     set_trainable = False
    for layer in conv_base.layers[-50:]:
        layer.trainable = True
    for layer in conv_base.layers[:-50]:
        layer.trainable = False


    model.compile(Adam(),
                  loss="binary_crossentropy", metrics=["accuracy"])

    model.summary()
#     plot_model(model, to_file='NetStruct.png', show_shapes=True)
    return model

model = model_bulid()

In [27]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         (None, 96, 96, 3)         0         
_________________________________________________________________
densenet201 (Model)          (None, 3, 3, 1920)        18321984  
_________________________________________________________________
flatten_2 (Flatten)          (None, 17280)             0         
_________________________________________________________________
dense_3 (Dense)              (None, 512)               8847872   
_________________________________________________________________
batch_normalization_2 (Batch (None, 512)               2048      
_________________________________________________________________
activation_2 (Activation)    (None, 512)               0         
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 513       
Total para

In [26]:
for layer in model.layers:
    if layer.name=='densenet201':
        layer.trainable=True
#         print(len(layer.layers))
        for l in layer.layers[-51:]:
            l.trainable=True
            

In [30]:
train, valid = train_test_split(df_train, test_size=0.25, shuffle=True)

train_df = pd.DataFrame(train, columns=['id','label']).astype('str')
valid_df = pd.DataFrame(valid, columns=['id','label']).astype('str')
train_df.values[:,0]=train_df.values[:,0]+'.tif'
valid_df.values[:,0]=valid_df.values[:,0]+'.tif'
train_generator, valid_generator = get_data_generator(train_df, valid_df)
    
# tensorboard = TensorBoard(log_dir='./logs',  # log 目录
#                       # histogram_freq=1,  # 按照何等频率（epoch）来计算直方图，0为不计算
#                       # batch_size=batch_size,     # 用多大量的数据计算直方图
#                       write_graph=True,  # 是否存储网络结构图
#                       write_grads=False,  # 是否可视化梯度直方图
#                       write_images=False,  # 是否可视化参数
#                       embeddings_freq=0,
#                       embeddings_layer_names=None,
#                       embeddings_metadata=None)
# model_checkpoint = ModelCheckpoint(
#     'weights_epoch'+'_'+str(num_fold)+'.h5', monitor='val_loss', save_best_only=True)
earlystopper = EarlyStopping(
    monitor='val_loss', patience=5, verbose=1)
reducel = ReduceLROnPlateau(
    monitor='val_loss', patience=5, verbose=1, factor=0.1)

history = model.fit_generator(train_generator,
                              validation_data=valid_generator,
                              epochs=7,
#                               callbacks=[reducel, earlystopper],
#                                          , tensorboard, model_checkpoint],
#                                   steps_per_epoch=train_df.shape[0]/batch_size,
#                                   validation_steps=valid_df.shape[0]/batch_size

                              steps_per_epoch=train_df.shape[0]//batch_size,
                              validation_steps=valid_df.shape[0]//batch_size
                             )

plt.plot(history.history['loss'], label='train')
plt.plot(history.history['val_loss'], label='valid')
plt.title("model loss")
plt.ylabel("loss")
plt.xlabel("epoch")
plt.legend(["train", "valid"], loc="upper left")
plt.savefig('loss_performance'+'.png')
plt.clf()
plt.plot(history.history['acc'], label='train')
plt.plot(history.history['val_acc'], label='valid')
plt.title("model acc")
plt.ylabel("acc")
plt.xlabel("epoch")
plt.legend(["train", "valid"], loc="upper left")
plt.savefig('acc_performance'+'.png')

with open('logs_kfold.txt', 'a+') as f:
    f.write(str(datetime.datetime.now().strftime("%Y-%m-%d-%H-%M"))+'\n')
#     f.write(str(num_fold)+'\n')
    f.write(str(history.history['loss'])+'\n')
    f.write(str(history.history['val_loss'])+'\n')
    f.write(str(history.history['acc'])+'\n')
    f.write(str(history.history['val_acc'])+'\n')

Found 165018 images belonging to 2 classes.
Found 55007 images belonging to 2 classes.


  'Discrepancy between trainable weights and collected trainable'


Epoch 1/7
Epoch 2/7
Epoch 3/7
 552/5156 [==>...........................] - ETA: 55:05 - loss: 0.1178 - acc: 0.9539

KeyboardInterrupt: 

In [22]:
num_fold=0
for train_index, test_index in kf.split(df_train):
    num_fold+=1
#     print("TRAIN:", train_index, "TEST:", test_index)
    train_df = pd.DataFrame(df_train[train_index], columns=['id','label']).astype('str')
    valid_df = pd.DataFrame(df_train[test_index], columns=['id','label']).astype('str')
    train_df.values[:,0]=train_df.values[:,0]+'.tif'
    valid_df.values[:,0]=valid_df.values[:,0]+'.tif'
    
    train_generator, valid_generator = get_data_generator(train_df, valid_df)
    
    tensorboard = TensorBoard(log_dir='./logs_kfold',  # log 目录
                          # histogram_freq=1,  # 按照何等频率（epoch）来计算直方图，0为不计算
                          # batch_size=batch_size,     # 用多大量的数据计算直方图
                          write_graph=True,  # 是否存储网络结构图
                          write_grads=False,  # 是否可视化梯度直方图
                          write_images=False,  # 是否可视化参数
                          embeddings_freq=0,
                          embeddings_layer_names=None,
                          embeddings_metadata=None)
    model_checkpoint = ModelCheckpoint(
        'weights_kfold'+'_'+str(num_fold)+'.h5', monitor='val_loss', save_best_only=True)
    earlystopper = EarlyStopping(
        monitor='val_loss', patience=30, verbose=1)
    reducel = ReduceLROnPlateau(
        monitor='val_loss', patience=30, verbose=1, factor=0.1)

    history = model.fit_generator(train_generator,
                                  validation_data=valid_generator,
                                  epochs=30,
                                  callbacks=[reducel, earlystopper, tensorboard, model_checkpoint],
#                                   steps_per_epoch=train_df.shape[0]/batch_size,
#                                   validation_steps=valid_df.shape[0]/batch_size
                                  steps_per_epoch=batch_size,
                                  validation_steps=2
                                 )

    plt.plot(history.history['loss'], label='train')
    plt.plot(history.history['val_loss'], label='valid')
    plt.title("model loss")
    plt.ylabel("loss")
    plt.xlabel("epoch")
    plt.legend(["train", "valid"], loc="upper left")
    plt.savefig('loss_performance'+'_'+str(num_fold)+'.png')
    plt.clf()
    plt.plot(history.history['acc'], label='train')
    plt.plot(history.history['val_acc'], label='valid')
    plt.title("model acc")
    plt.ylabel("acc")
    plt.xlabel("epoch")
    plt.legend(["train", "valid"], loc="upper left")
    plt.savefig('acc_performance'+'_'+str(num_fold)+'.png')

    with open('logs_kfold.txt', 'a+') as f:
        f.write(str(datetime.datetime.now().strftime("%Y-%m-%d-%H-%M"))+'\n')
        f.write(str(num_fold)+'\n')
        f.write(str(history.history['loss'])+'\n')
        f.write(str(history.history['val_loss'])+'\n')
        f.write(str(history.history['acc'])+'\n')
        f.write(str(history.history['val_acc'])+'\n')
    if num_fold>10:
        break

KeyboardInterrupt: 

In [10]:
model.save('model.h5')

In [3]:
model.load_weights('model.h5')

In [24]:
import numpy as np
from glob import glob
from skimage.io import imread
testing_files = glob(os.path.join('data/test/','*.tif'))
submission = pd.DataFrame()
for index in range(0, len(testing_files), 128):
    data_frame = pd.DataFrame({'path': testing_files[index:index+128]})
    data_frame['id'] = data_frame.path.map(lambda x: x.split('/')[1].split(".")[0])
    data_frame['image'] = data_frame['path'].map(imread)
    images = np.stack(data_frame.image, axis=0)
    predicted_labels = [model.predict(np.expand_dims(image, axis=0),verbose=0)[0][0] for image in images]
    predictions = np.array(predicted_labels)
    data_frame['label'] = predictions
    submission = pd.concat([submission, data_frame[["id", "label"]]])
submission.to_csv('result.csv', index=False, header=True)

In [28]:
submission['label']=submission['label'].map(lambda x: int(np.round(x)))
submission.id=submission.id.map(lambda x: x.split('\\')[1])
submission.to_csv('result.csv', index=False, header=True)

In [29]:
submission.head()

Unnamed: 0,id,label
0,00006537328c33e284c973d7b39d340809f7271b,1
1,0000ec92553fda4ce39889f9226ace43cae3364e,1
2,00024a6dee61f12f7856b0fc6be20bc7a48ba3d2,0
3,000253dfaa0be9d0d100283b22284ab2f6b643f6,0
4,000270442cc15af719583a8172c87cd2bd9c7746,0


In [5]:
A=[1,2,3,4,5]

In [10]:
A[:-2]

[1, 2, 3]