In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
import os
from pathlib import Path
from tqdm import tqdm
import matplotlib.pyplot as plt
import random

In [3]:
import cv2
from fastai import *
from fastai.vision import *
from fastai.callbacks import *

In [4]:
from dataset_spliter import SplitByPatient
from metrics import *#F1Weighted, MCC
from losses import *

In [5]:
import pretrainedmodels

In [6]:
path = Path('/data/Datasets/WhiteBloodCancer/train/')

In [7]:
np.random.seed(42)

In [8]:
fnames = get_image_files(path, recurse=True)
fnames[:5]

[PosixPath('/data/Datasets/WhiteBloodCancer/train/fold_1/hem/UID_H10_43_1_hem.bmp'),
 PosixPath('/data/Datasets/WhiteBloodCancer/train/fold_1/hem/UID_H22_31_15_hem.bmp'),
 PosixPath('/data/Datasets/WhiteBloodCancer/train/fold_1/hem/UID_H14_9_11_hem.bmp'),
 PosixPath('/data/Datasets/WhiteBloodCancer/train/fold_1/hem/UID_H14_28_6_hem.bmp'),
 PosixPath('/data/Datasets/WhiteBloodCancer/train/fold_1/hem/UID_H10_189_1_hem.bmp')]

In [9]:
hem_regex = re.compile(r'UID_(H[0-9]+)_', re.IGNORECASE)
all_regex = re.compile(r'UID_([0-9]+)_', re.IGNORECASE)

In [10]:
hem_patient_ids = list(set([hem_regex.search(str(fn)).group(1)
                            for fn in fnames if hem_regex.search(str(fn)) is not None]))
all_patint_ids = list(set([all_regex.search(str(fn)).group(1)
                           for fn in fnames if all_regex.search(str(fn)) is not None]))

hem_patients = dict((k,[]) for k in hem_patient_ids)
all_patints = dict((k,[]) for k in all_patint_ids)

[all_patints[key].append(fn) for key in all_patints.keys() for fn in fnames if 'UID_{0}_'.format(key) in str(fn)]
[hem_patients[key].append(fn) for key in hem_patients.keys() for fn in fnames if 'UID_{0}_'.format(key) in str(fn)]
print()




## Split data into train val 

In [11]:
split_handler = SplitByPatient(hem_patients, all_patints)

### Split by regex

In [12]:
train_regex = re.compile(r'(fold_0|fold_1|fold_2)')
val_regex = re.compile(r'(fold_3)')

hem_train, all_train, hem_val, all_val = split_handler.split_by_regex(train_regex, val_regex)

In [13]:
print('Train Total: {0}'.format(len(hem_train)+len(all_train)))
print('Val Total: {0}'.format(len(hem_val)+len(all_val)))
print("")
print('Hem train: {}'.format(len(hem_train)))
print('All train: {}'.format(len(all_train)))
print('Hem val: {}'.format(len(hem_val)))
print('All val: {}'.format(len(all_val)))

Train Total: 10661
Val Total: 1867

Hem train: 3389
All train: 7272
Hem val: 648
All val: 1219


In [14]:
pat = re.compile(r'^.*(hem|all).bmp$')

def get_label(fn):
    return pat.search(str(fn)).group(1)

### Use complete image

In [15]:
train = ImageList(hem_train + all_train) #optinal scale up classes 
valid = ImageList(hem_val + all_val)

In [16]:
item_list = ItemLists(path, train, valid)
lls = item_list.label_from_func(get_label)

#### Data augmentation

In [17]:
xtra_tfms=[cutout(n_holes=5, length=0.2)]#squish(scale=0.66), 
tfms = get_transforms(do_flip=True, 
                      flip_vert=True, 
                      #max_rotate=90,  
                      max_lighting=0.05, 
                      max_zoom=1.05, 
                      max_warp=0.05,
                      #p_affine=0.75,
                      #p_lighting=0.75,  
                      #xtra_tfms=xtra_tfms,
                     )

#### Create dataset 

In [18]:
def get_data(bs, size):
    data  = ImageDataBunch.create_from_ll(lls, size=size, bs=bs, 
                                      ds_tfms=tfms, padding_mode='zeros',
                                      resize_method=ResizeMethod.PAD)
    data = data.normalize()
    #data = data.normalize((channel_mean, channel_std))
    return data
    

In [19]:
print(pretrainedmodels.model_names)

['fbresnet152', 'bninception', 'resnext101_32x4d', 'resnext101_64x4d', 'inceptionv4', 'inceptionresnetv2', 'alexnet', 'densenet121', 'densenet169', 'densenet201', 'densenet161', 'resnet18', 'resnet34', 'resnet50', 'resnet101', 'resnet152', 'inceptionv3', 'squeezenet1_0', 'squeezenet1_1', 'vgg11', 'vgg11_bn', 'vgg13', 'vgg13_bn', 'vgg16', 'vgg16_bn', 'vgg19_bn', 'vgg19', 'nasnetamobile', 'nasnetalarge', 'dpn68', 'dpn68b', 'dpn92', 'dpn98', 'dpn131', 'dpn107', 'xception', 'senet154', 'se_resnet50', 'se_resnet101', 'se_resnet152', 'se_resnext50_32x4d', 'se_resnext101_32x4d', 'cafferesnet101', 'pnasnet5large', 'polynet']


In [20]:
def get_cadene_model(pretrained=True, model_name='se_resnext50_32x4d'):
    if pretrained:
        arch = pretrainedmodels.__dict__[model_name](num_classes=1000, pretrained='imagenet')
    else:
        arch = pretrainedmodels.__dict__[model_name](num_classes=1000)
    return arch

In [21]:
size = 256
bs = 64
data = get_data(bs, size)

experiment_name = "baseline_se_resnext50_32x4d"
learn = create_cnn(data, get_cadene_model, 
                    cut=-2,
                       metrics=[error_rate, F1Weighted(), MCC()], #  
                       #loss_func=FocalLoss(num_classes=1),
                       #ps=0.75,
                       #wd=0.1,
                       loss_func = LabelSmoothingCrossEntropy(),
                       callback_fns=[partial(SaveModelCallback, name='stage1-{}-{}'.format(experiment_name, size))],

                  )#

  warn("`create_cnn` is deprecated and is now named `cnn_learner`.")


In [22]:
learn.freeze()
lr = 1e-2
learn.fit_one_cycle(5, lr)

learn.unfreeze()
learn.fit_one_cycle(10, slice(1e-5,lr/5))

epoch,train_loss,valid_loss,error_rate,f1_weighted,mcc,time
0,0.536388,0.613834,0.262989,0.718146,0.383078,02:22
1,0.466669,0.595460,0.235672,0.743774,0.453376,02:24
2,0.423838,0.617161,0.237815,0.738929,0.448691,02:27
3,0.409386,0.541858,0.192287,0.796818,0.561486,02:28
4,0.398699,0.553398,0.194430,0.788157,0.563253,02:28


Better model found at epoch 0 with val_loss value: 0.613834023475647.
Better model found at epoch 1 with val_loss value: 0.5954598784446716.
Better model found at epoch 3 with val_loss value: 0.541857898235321.


epoch,train_loss,valid_loss,error_rate,f1_weighted,mcc,time
0,0.399317,0.540237,0.189073,0.795559,0.574716,03:08
1,0.394206,0.544941,0.202464,0.777333,0.545155,03:04
2,0.393540,0.512159,0.164435,0.829101,0.627673,03:05
3,0.386360,0.526169,0.171398,0.821919,0.611028,03:03
4,0.378318,0.541720,0.190680,0.799289,0.565100,03:02
5,0.375187,0.527252,0.171934,0.819330,0.610849,03:02
6,0.370875,0.531123,0.172469,0.817170,0.611313,03:02
7,0.365996,0.526108,0.174076,0.817569,0.605248,03:03
8,0.367102,0.536797,0.178897,0.809515,0.596304,03:03
9,0.363997,0.524645,0.171934,0.819207,0.610960,03:02


Better model found at epoch 0 with val_loss value: 0.54023677110672.
Better model found at epoch 2 with val_loss value: 0.5121586918830872.


In [23]:
y_pred, y_test_tta = learn.TTA(ds_type=DatasetType.Valid, scale=1.1)#, beta=0.4, scale=1.3
y_pred = to_np(y_pred)
f1_score(y_test_tta, np.argmax(y_pred, axis=1),  average='weighted')

0.8414091063854414

In [24]:
size = 384
bs = 32
learn.data = get_data(bs, size)

learn.freeze()
lr = 1e-2
learn.fit_one_cycle(5, lr)

learn.unfreeze()
learn.fit_one_cycle(10, slice(1e-5,lr/5))

epoch,train_loss,valid_loss,error_rate,f1_weighted,mcc,time
0,0.416124,0.515282,0.173540,0.821578,0.606379,05:31
1,0.414445,0.592366,0.221210,0.754310,0.497070,05:22
2,0.396015,0.527887,0.176219,0.819746,0.600984,05:23
3,0.375123,0.501548,0.152116,0.844838,0.656999,05:23
4,0.371527,0.496737,0.133369,0.862296,0.700647,05:24


Better model found at epoch 0 with val_loss value: 0.5152815580368042.
Better model found at epoch 3 with val_loss value: 0.5015484690666199.
Better model found at epoch 4 with val_loss value: 0.4967368543148041.


epoch,train_loss,valid_loss,error_rate,f1_weighted,mcc,time
0,0.369992,0.490792,0.131227,0.864894,0.705316,07:11
1,0.369689,0.508152,0.151044,0.843541,0.659258,07:09
2,0.377834,0.497178,0.142475,0.853106,0.679071,07:07
3,0.364683,0.512571,0.146224,0.852659,0.673498,07:06
4,0.367209,0.553707,0.164435,0.830023,0.627460,07:07
5,0.357248,0.589048,0.175683,0.817911,0.600805,07:07
6,0.354264,0.559314,0.169255,0.827613,0.617987,07:05
7,0.348691,0.541622,0.161757,0.835883,0.636135,07:04
8,0.346238,0.565393,0.180503,0.818041,0.596472,07:05
9,0.350817,0.613792,0.178897,0.818910,0.598110,07:06


Better model found at epoch 0 with val_loss value: 0.49079224467277527.


In [25]:
y_pred, y_test_tta = learn.TTA(ds_type=DatasetType.Valid, scale=1.1)#, beta=0.4, scale=1.3
y_pred = to_np(y_pred)
f1_score(y_test_tta, np.argmax(y_pred, axis=1),  average='weighted')

0.8723919706598805

In [26]:
size = 450
bs = 16
learn.data = get_data(bs, size)

learn.freeze()
lr = 1e-2
learn.fit_one_cycle(5, lr)

learn.unfreeze()
learn.fit_one_cycle(10, slice(1e-5,lr/5))

epoch,train_loss,valid_loss,error_rate,f1_weighted,mcc,time
0,0.411980,0.513928,0.155865,0.841368,0.648793,07:32
1,0.426865,0.567514,0.200857,0.781905,0.545104,07:22
2,0.409926,0.608666,0.128548,0.867980,0.711325,07:22
3,0.386564,0.593779,0.157472,0.839850,0.645278,07:29
4,0.378266,0.564243,0.151044,0.846460,0.660047,07:30


Better model found at epoch 0 with val_loss value: 0.5139284133911133.


epoch,train_loss,valid_loss,error_rate,f1_weighted,mcc,time
0,0.385768,0.624427,0.167649,0.822600,0.622861,10:05
1,0.404875,0.670856,0.181575,0.814649,0.589128,10:02
2,0.387827,0.624726,0.142475,0.857316,0.684766,09:59
3,0.373853,0.666294,0.166042,0.829001,0.623774,10:00
4,0.376201,0.572546,0.155329,0.844442,0.656320,10:00
5,0.372105,0.640008,0.179432,0.819785,0.601012,10:00
6,0.368230,0.549764,0.167649,0.832501,0.630782,09:57
7,0.365737,0.696244,0.173005,0.826901,0.617876,09:57
8,0.359260,0.552292,0.174612,0.825261,0.614190,09:56
9,0.359445,0.555878,0.174612,0.821249,0.604543,09:56


Better model found at epoch 0 with val_loss value: 0.6244266033172607.
Better model found at epoch 4 with val_loss value: 0.5725460052490234.
Better model found at epoch 6 with val_loss value: 0.5497637391090393.


In [27]:
y_pred, y_test_tta = learn.TTA(ds_type=DatasetType.Valid, scale=1.1)#, beta=0.4, scale=1.3
y_pred = to_np(y_pred)
f1_score(y_test_tta, np.argmax(y_pred, axis=1),  average='weighted')

0.8347965488282988