In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
import os
from pathlib import Path
from tqdm import tqdm
import matplotlib.pyplot as plt
import random

In [3]:
import cv2
from fastai import *
from fastai.vision import *
from fastai.callbacks import *

In [4]:
from dataset_spliter import SplitByPatient
from metrics import *#F1Weighted, MCC
from losses import *

In [5]:
import pretrainedmodels

In [6]:
path = Path('/data/Datasets/WhiteBloodCancer/train/')

In [7]:
np.random.seed(42)

In [8]:
fnames = get_image_files(path, recurse=True)
fnames[:5]

[PosixPath('/data/Datasets/WhiteBloodCancer/train/fold_1/hem/UID_H10_43_1_hem.bmp'),
 PosixPath('/data/Datasets/WhiteBloodCancer/train/fold_1/hem/UID_H22_31_15_hem.bmp'),
 PosixPath('/data/Datasets/WhiteBloodCancer/train/fold_1/hem/UID_H14_9_11_hem.bmp'),
 PosixPath('/data/Datasets/WhiteBloodCancer/train/fold_1/hem/UID_H14_28_6_hem.bmp'),
 PosixPath('/data/Datasets/WhiteBloodCancer/train/fold_1/hem/UID_H10_189_1_hem.bmp')]

In [9]:
train_files_regex = re.compile(r'(fold_0|fold_1|fold_2)')

fnames = [fn for fn in fnames if train_files_regex.search(str(fn)) is not None]
len(fnames)

10661

In [10]:
hem_regex = re.compile(r'UID_(H[0-9]+)_', re.IGNORECASE)
all_regex = re.compile(r'UID_([0-9]+)_', re.IGNORECASE)

In [11]:
hem_patient_ids = list(set([hem_regex.search(str(fn)).group(1)
                            for fn in fnames if hem_regex.search(str(fn)) is not None]))
all_patint_ids = list(set([all_regex.search(str(fn)).group(1)
                           for fn in fnames if all_regex.search(str(fn)) is not None]))

hem_patients = dict((k,[]) for k in hem_patient_ids)
all_patints = dict((k,[]) for k in all_patint_ids)

[all_patints[key].append(fn) for key in all_patints.keys() for fn in fnames if 'UID_{0}_'.format(key) in str(fn)]
[hem_patients[key].append(fn) for key in hem_patients.keys() for fn in fnames if 'UID_{0}_'.format(key) in str(fn)]
print()




## Split data into train val 

In [12]:
split_handler = SplitByPatient(hem_patients, all_patints)

### Split by regex

In [13]:
hem_train, all_train, hem_val, all_val = split_handler.random_split(0.1)

In [14]:
print('Train Total: {0}'.format(len(hem_train)+len(all_train)))
print('Val Total: {0}'.format(len(hem_val)+len(all_val)))
print("")
print('Hem train: {}'.format(len(hem_train)))
print('All train: {}'.format(len(all_train)))
print('Hem val: {}'.format(len(hem_val)))
print('All val: {}'.format(len(all_val)))

Train Total: 9596
Val Total: 1065

Hem train: 3051
All train: 6545
Hem val: 338
All val: 727


In [15]:
pat = re.compile(r'^.*(hem|all).bmp$')

def get_label(fn):
    return pat.search(str(fn)).group(1)

### Use complete image

In [16]:
train = ImageList(hem_train + all_train) #optinal scale up classes 
valid = ImageList(hem_val + all_val)

In [17]:
item_list = ItemLists(path, train, valid)
lls = item_list.label_from_func(get_label).add_test_folder('../test')

#### Data augmentation

In [18]:
xtra_tfms=[cutout(n_holes=5, length=0.2)]#squish(scale=0.66), 
tfms = get_transforms(do_flip=True, 
                      flip_vert=True, 
                      #max_rotate=90,  
                      max_lighting=0.05, 
                      max_zoom=1.05, 
                      max_warp=0.05,
                      #p_affine=0.75,
                      #p_lighting=0.75,  
                      #xtra_tfms=xtra_tfms,
                     )

#### Create dataset 

In [25]:
import pandas as pd

file = "/server/born_pix/EPA_DATASETS/WhiteBloodCancer/VAL_ISBI_labelfile_Source_reference_prediction.csv"
dataset = pd.read_csv(file, delimiter=';')
gt_labels = np.array(dataset.loc[:, 'labels'])

In [19]:
def get_data(bs, size):
    data  = ImageDataBunch.create_from_ll(lls, size=size, bs=bs, 
                                      ds_tfms=tfms, padding_mode='zeros',
                                      resize_method=ResizeMethod.PAD)
    data = data.normalize()
    #data = data.normalize((channel_mean, channel_std))
    return data
    

In [20]:
print(pretrainedmodels.model_names)

['fbresnet152', 'bninception', 'resnext101_32x4d', 'resnext101_64x4d', 'inceptionv4', 'inceptionresnetv2', 'alexnet', 'densenet121', 'densenet169', 'densenet201', 'densenet161', 'resnet18', 'resnet34', 'resnet50', 'resnet101', 'resnet152', 'inceptionv3', 'squeezenet1_0', 'squeezenet1_1', 'vgg11', 'vgg11_bn', 'vgg13', 'vgg13_bn', 'vgg16', 'vgg16_bn', 'vgg19_bn', 'vgg19', 'nasnetamobile', 'nasnetalarge', 'dpn68', 'dpn68b', 'dpn92', 'dpn98', 'dpn131', 'dpn107', 'xception', 'senet154', 'se_resnet50', 'se_resnet101', 'se_resnet152', 'se_resnext50_32x4d', 'se_resnext101_32x4d', 'cafferesnet101', 'pnasnet5large', 'polynet']


In [21]:
def get_cadene_model(pretrained=True, model_name='se_resnext50_32x4d'):
    if pretrained:
        arch = pretrainedmodels.__dict__[model_name](num_classes=1000, pretrained='imagenet')
    else:
        arch = pretrainedmodels.__dict__[model_name](num_classes=1000)
    return arch

In [22]:
size = 256
bs = 64
data = get_data(bs, size)

experiment_name = "baseline_se_resnext50_32x4d"
learn = create_cnn(data, get_cadene_model, 
                    cut=-2,
                       metrics=[error_rate, F1Weighted(), MCC()], #  
                       #loss_func=FocalLoss(num_classes=1),
                       #ps=0.75,
                       #wd=0.1,
                       loss_func = LabelSmoothingCrossEntropy(),
                       callback_fns=[partial(SaveModelCallback, name='stage1-{}-{}'.format(experiment_name, size))],

                  )#

  warn("`create_cnn` is deprecated and is now named `cnn_learner`.")


In [23]:
learn.freeze()
lr = 1e-2
learn.fit_one_cycle(5, lr)

learn.unfreeze()
learn.fit_one_cycle(10, slice(1e-5,lr/5))

epoch,train_loss,valid_loss,error_rate,f1_weighted,mcc,time
0,0.554767,0.493320,0.132394,0.866894,0.691550,02:09
1,0.472411,0.447145,0.096714,0.902598,0.774290,02:05
2,0.440756,0.418130,0.075117,0.923752,0.824208,02:07
3,0.416214,0.404563,0.067606,0.931636,0.842089,02:09
4,0.402099,0.399561,0.061972,0.937614,0.855668,02:09


Better model found at epoch 0 with val_loss value: 0.4933198392391205.
Better model found at epoch 1 with val_loss value: 0.44714489579200745.
Better model found at epoch 2 with val_loss value: 0.41812989115715027.
Better model found at epoch 3 with val_loss value: 0.404563307762146.
Better model found at epoch 4 with val_loss value: 0.3995606005191803.


epoch,train_loss,valid_loss,error_rate,f1_weighted,mcc,time
0,0.396744,0.398069,0.057277,0.942415,0.866787,02:44
1,0.397794,0.396890,0.057277,0.942415,0.866787,02:42
2,0.394189,0.392994,0.054460,0.944720,0.873342,02:42
3,0.388349,0.389464,0.053521,0.946371,0.876039,02:41
4,0.383622,0.384978,0.051643,0.947989,0.879800,02:38
5,0.375982,0.383964,0.045070,0.954629,0.895200,02:37
6,0.375897,0.377911,0.041315,0.958336,0.903993,02:37
7,0.367388,0.376278,0.040376,0.959337,0.906178,02:37
8,0.366294,0.376219,0.041315,0.958481,0.904047,02:37
9,0.363150,0.375137,0.038498,0.961159,0.910625,02:37


Better model found at epoch 0 with val_loss value: 0.39806854724884033.
Better model found at epoch 1 with val_loss value: 0.39688971638679504.
Better model found at epoch 2 with val_loss value: 0.3929944336414337.
Better model found at epoch 3 with val_loss value: 0.3894641697406769.
Better model found at epoch 4 with val_loss value: 0.3849775195121765.
Better model found at epoch 5 with val_loss value: 0.3839643597602844.
Better model found at epoch 6 with val_loss value: 0.37791141867637634.
Better model found at epoch 7 with val_loss value: 0.37627774477005005.
Better model found at epoch 8 with val_loss value: 0.37621915340423584.
Better model found at epoch 9 with val_loss value: 0.37513747811317444.


In [26]:
y_pred, y_test_tta = learn.TTA(ds_type=DatasetType.Test, scale=1.)#, beta=0.4, scale=1.3
y_pred = to_np(y_pred)

submission = [0 for i in range(y_pred.shape[0])]

for fn, y in zip(learn.data.test_dl.items, np.argmax(y_pred[:, [1,0]], axis=1)):
    index = int(fn.name.replace(".bmp", '')) - 1
    submission[index] = y
    
f1_score(gt_labels, submission, average='weighted')

0.8163285725129181

In [27]:
size = 384
bs = 32
learn.data = get_data(bs, size)

learn.freeze()
lr = 1e-2
learn.fit_one_cycle(5, lr)

learn.unfreeze()
learn.fit_one_cycle(10, slice(1e-5,lr/5))

epoch,train_loss,valid_loss,error_rate,f1_weighted,mcc,time
0,0.400238,0.395123,0.057277,0.942963,0.869194,04:53
1,0.419313,0.407370,0.071362,0.929166,0.838635,04:42
2,0.397846,0.377491,0.042254,0.957556,0.901887,04:46
3,0.369885,0.366969,0.033803,0.966197,0.921986,04:46
4,0.365285,0.372941,0.029108,0.970833,0.932610,04:45


Better model found at epoch 0 with val_loss value: 0.3951234519481659.
Better model found at epoch 2 with val_loss value: 0.3774907886981964.
Better model found at epoch 3 with val_loss value: 0.36696913838386536.


epoch,train_loss,valid_loss,error_rate,f1_weighted,mcc,time
0,0.371726,0.371817,0.023474,0.976459,0.945632,06:21
1,0.362753,0.357344,0.023474,0.976498,0.945715,06:18
2,0.364655,0.355917,0.026291,0.973645,0.939113,06:18
3,0.360355,0.355541,0.018779,0.981221,0.956659,06:13
4,0.363302,0.355738,0.024413,0.975507,0.943441,06:13
5,0.360268,0.355879,0.023474,0.976420,0.945621,06:13
6,0.357832,0.352685,0.022535,0.977373,0.947802,06:17
7,0.352215,0.349440,0.016901,0.983044,0.960891,06:16
8,0.351013,0.351054,0.017840,0.982079,0.958742,06:16
9,0.348380,0.349167,0.016901,0.983044,0.960891,06:17


Better model found at epoch 0 with val_loss value: 0.37181660532951355.
Better model found at epoch 1 with val_loss value: 0.3573440909385681.
Better model found at epoch 2 with val_loss value: 0.35591748356819153.
Better model found at epoch 3 with val_loss value: 0.3555409014225006.
Better model found at epoch 6 with val_loss value: 0.35268494486808777.
Better model found at epoch 7 with val_loss value: 0.34943994879722595.
Better model found at epoch 9 with val_loss value: 0.3491666615009308.


In [28]:
y_pred, y_test_tta = learn.TTA(ds_type=DatasetType.Test, scale=1.)#, beta=0.4, scale=1.3
y_pred = to_np(y_pred)

submission = [0 for i in range(y_pred.shape[0])]

for fn, y in zip(learn.data.test_dl.items, np.argmax(y_pred[:, [1,0]], axis=1)):
    index = int(fn.name.replace(".bmp", '')) - 1
    submission[index] = y
    
f1_score(gt_labels, submission, average='weighted')

0.8312111121405913

In [29]:
size = 450
bs = 16
learn.data = get_data(bs, size)

learn.freeze()
lr = 1e-2
learn.fit_one_cycle(5, lr)

learn.unfreeze()
learn.fit_one_cycle(10, slice(1e-5,lr/5))

epoch,train_loss,valid_loss,error_rate,f1_weighted,mcc,time
0,0.396716,0.421959,0.062911,0.935308,0.855424,06:39
1,0.425696,0.431918,0.070423,0.930569,0.845718,06:30
2,0.410644,0.369933,0.029108,0.970785,0.932519,06:31
3,0.386140,0.388634,0.028169,0.971716,0.934693,06:32
4,0.369727,0.357343,0.023474,0.976400,0.945644,06:32


Better model found at epoch 0 with val_loss value: 0.4219592809677124.
Better model found at epoch 2 with val_loss value: 0.3699333369731903.
Better model found at epoch 4 with val_loss value: 0.3573426306247711.


epoch,train_loss,valid_loss,error_rate,f1_weighted,mcc,time
0,0.368702,0.371736,0.021596,0.978360,0.950016,08:50
1,0.377440,0.546723,0.037559,0.962613,0.914437,08:46
2,0.380507,0.420996,0.038498,0.961228,0.910574,08:47
3,0.366991,0.359719,0.026291,0.973579,0.939064,08:54
4,0.373037,0.355673,0.019718,0.980226,0.954349,08:55
5,0.366695,0.360215,0.023474,0.976379,0.945685,08:54
6,0.354941,0.362004,0.016901,0.983085,0.960940,08:55
7,0.358803,0.351313,0.015023,0.984928,0.965253,08:49
8,0.349555,0.400863,0.022535,0.977517,0.948316,08:47
9,0.351634,0.355371,0.016901,0.983071,0.960906,08:46


Better model found at epoch 0 with val_loss value: 0.37173616886138916.
Better model found at epoch 3 with val_loss value: 0.3597185015678406.
Better model found at epoch 4 with val_loss value: 0.3556731343269348.
Better model found at epoch 7 with val_loss value: 0.351312518119812.


In [30]:
y_pred, y_test_tta = learn.TTA(ds_type=DatasetType.Test, scale=1.)#, beta=0.4, scale=1.3
y_pred = to_np(y_pred)

submission = [0 for i in range(y_pred.shape[0])]

for fn, y in zip(learn.data.test_dl.items, np.argmax(y_pred[:, [1,0]], axis=1)):
    index = int(fn.name.replace(".bmp", '')) - 1
    submission[index] = y
    
f1_score(gt_labels, submission, average='weighted')

0.8231369747935328