In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
import os
from pathlib import Path
from tqdm import tqdm
import matplotlib.pyplot as plt
import random

In [3]:
import cv2
from fastai import *
from fastai.vision import *
from fastai.callbacks import *

In [4]:
from dataset_spliter import SplitByPatient
from metrics import *#F1Weighted, MCC
from losses import *

In [5]:
import pretrainedmodels

In [6]:
path = Path('/data/Datasets/WhiteBloodCancer/train/')

In [7]:
np.random.seed(42)

In [8]:
fnames = get_image_files(path, recurse=True)
fnames[:5]

[PosixPath('/data/Datasets/WhiteBloodCancer/train/fold_1/hem/UID_H10_43_1_hem.bmp'),
 PosixPath('/data/Datasets/WhiteBloodCancer/train/fold_1/hem/UID_H22_31_15_hem.bmp'),
 PosixPath('/data/Datasets/WhiteBloodCancer/train/fold_1/hem/UID_H14_9_11_hem.bmp'),
 PosixPath('/data/Datasets/WhiteBloodCancer/train/fold_1/hem/UID_H14_28_6_hem.bmp'),
 PosixPath('/data/Datasets/WhiteBloodCancer/train/fold_1/hem/UID_H10_189_1_hem.bmp')]

In [9]:
hem_regex = re.compile(r'UID_(H[0-9]+)_', re.IGNORECASE)
all_regex = re.compile(r'UID_([0-9]+)_', re.IGNORECASE)

In [10]:
hem_patient_ids = list(set([hem_regex.search(str(fn)).group(1)
                            for fn in fnames if hem_regex.search(str(fn)) is not None]))
all_patint_ids = list(set([all_regex.search(str(fn)).group(1)
                           for fn in fnames if all_regex.search(str(fn)) is not None]))

hem_patients = dict((k,[]) for k in hem_patient_ids)
all_patints = dict((k,[]) for k in all_patint_ids)

[all_patints[key].append(fn) for key in all_patints.keys() for fn in fnames if 'UID_{0}_'.format(key) in str(fn)]
[hem_patients[key].append(fn) for key in hem_patients.keys() for fn in fnames if 'UID_{0}_'.format(key) in str(fn)]
print()




## Split data into train val 

In [11]:
split_handler = SplitByPatient(hem_patients, all_patints)

### Split by regex

In [12]:
train_regex = re.compile(r'(fold_0|fold_1|fold_2)')
val_regex = re.compile(r'(fold_3)')

hem_train, all_train, hem_val, all_val = split_handler.split_by_regex(train_regex, val_regex)

In [13]:
print('Train Total: {0}'.format(len(hem_train)+len(all_train)))
print('Val Total: {0}'.format(len(hem_val)+len(all_val)))
print("")
print('Hem train: {}'.format(len(hem_train)))
print('All train: {}'.format(len(all_train)))
print('Hem val: {}'.format(len(hem_val)))
print('All val: {}'.format(len(all_val)))

Train Total: 10661
Val Total: 1867

Hem train: 3389
All train: 7272
Hem val: 648
All val: 1219


In [14]:
pat = re.compile(r'^.*(hem|all).bmp$')

def get_label(fn):
    return pat.search(str(fn)).group(1)

### Use complete image

In [15]:
train = ImageList(hem_train + all_train) #optinal scale up classes 
valid = ImageList(hem_val + all_val)

In [16]:
item_list = ItemLists(path, train, valid)
lls = item_list.label_from_func(get_label)

#### Data augmentation

In [17]:
xtra_tfms=[cutout(n_holes=5, length=0.2)]#squish(scale=0.66), 
tfms = get_transforms(do_flip=True, 
                      flip_vert=True, 
                      #max_rotate=90,  
                      #max_lighting=0.15, 
                      #max_zoom=1.5, 
                      #max_warp=0.2,
                      #p_affine=0.75,
                      #p_lighting=0.75,  
                      #xtra_tfms=xtra_tfms,
                     )

#### Create dataset 

In [18]:
def get_data(bs, size):
    data  = ImageDataBunch.create_from_ll(lls, size=size, bs=bs, 
                                      ds_tfms=tfms, padding_mode='zeros',
                                      resize_method=ResizeMethod.PAD)
    data = data.normalize()
    #data = data.normalize((channel_mean, channel_std))
    return data
    

In [19]:
print(pretrainedmodels.model_names)

['fbresnet152', 'bninception', 'resnext101_32x4d', 'resnext101_64x4d', 'inceptionv4', 'inceptionresnetv2', 'alexnet', 'densenet121', 'densenet169', 'densenet201', 'densenet161', 'resnet18', 'resnet34', 'resnet50', 'resnet101', 'resnet152', 'inceptionv3', 'squeezenet1_0', 'squeezenet1_1', 'vgg11', 'vgg11_bn', 'vgg13', 'vgg13_bn', 'vgg16', 'vgg16_bn', 'vgg19_bn', 'vgg19', 'nasnetamobile', 'nasnetalarge', 'dpn68', 'dpn68b', 'dpn92', 'dpn98', 'dpn131', 'dpn107', 'xception', 'senet154', 'se_resnet50', 'se_resnet101', 'se_resnet152', 'se_resnext50_32x4d', 'se_resnext101_32x4d', 'cafferesnet101', 'pnasnet5large', 'polynet']


In [20]:
def get_cadene_model(pretrained=True, model_name='resnet50'):
    if pretrained:
        arch = pretrainedmodels.__dict__[model_name](num_classes=1000, pretrained='imagenet')
    else:
        arch = pretrainedmodels.__dict__[model_name](num_classes=1000)
    return arch

In [21]:
size = 256
bs = 64
data = get_data(bs, size)

experiment_name = "baseline_resnet50_normal_loss"
learn = create_cnn(data, models.resnet50, 
                    cut=-2,
                       metrics=[error_rate, F1Weighted(), MCC()], #  
                       #loss_func=FocalLoss(num_classes=1),
                       #ps=0.75,
                       #wd=0.1,
                       #loss_func = LabelSmoothingCrossEntropy(),
                       callback_fns=[partial(SaveModelCallback, name='stage1-{}-{}'.format(experiment_name, size))],

                  )#

  warn("`create_cnn` is deprecated and is now named `cnn_learner`.")


In [22]:
learn.freeze()
lr = 1e-2
learn.fit_one_cycle(5, lr)

learn.unfreeze()
learn.fit_one_cycle(10, slice(1e-5,lr/5))

epoch,train_loss,valid_loss,error_rate,f1_weighted,mcc,time
0,0.376431,1.059684,0.442421,0.549333,0.276889,01:20
1,0.289513,1.168413,0.321907,0.577547,0.205250,01:19
2,0.255528,0.765178,0.249598,0.710617,0.429791,01:19
3,0.210440,0.671337,0.257097,0.705947,0.401898,01:19
4,0.184544,0.604016,0.246920,0.730966,0.423654,01:19


Better model found at epoch 0 with val_loss value: 1.059683918952942.
Better model found at epoch 2 with val_loss value: 0.7651776671409607.
Better model found at epoch 3 with val_loss value: 0.6713372468948364.
Better model found at epoch 4 with val_loss value: 0.6040158271789551.


epoch,train_loss,valid_loss,error_rate,f1_weighted,mcc,time
0,0.178630,0.567301,0.237279,0.744068,0.448780,01:42
1,0.187946,0.670478,0.272094,0.711769,0.363012,01:41
2,0.192514,0.531172,0.235137,0.751187,0.455182,01:41
3,0.163430,0.545012,0.228174,0.775408,0.520377,01:41
4,0.169298,0.464664,0.206213,0.778116,0.528955,01:41
5,0.158911,0.594985,0.232994,0.755001,0.461351,01:41
6,0.148820,0.448740,0.182646,0.812268,0.585163,01:41
7,0.122408,0.423121,0.170327,0.826692,0.615795,01:41
8,0.107843,0.445159,0.161221,0.834380,0.635194,01:41
9,0.104811,0.456508,0.165506,0.831306,0.626404,01:41


Better model found at epoch 0 with val_loss value: 0.5673012733459473.
Better model found at epoch 2 with val_loss value: 0.5311722755432129.
Better model found at epoch 4 with val_loss value: 0.4646637439727783.
Better model found at epoch 6 with val_loss value: 0.44873955845832825.
Better model found at epoch 7 with val_loss value: 0.42312145233154297.


In [23]:
learn.export('Rn50Normal_256.pkl')  

In [35]:
size = 384
bs = 32
learn.data = get_data(bs, size)

learn.freeze()
lr = 1e-2
learn.fit_one_cycle(5, lr)

learn.unfreeze()
learn.fit_one_cycle(10, slice(1e-5,lr/5))

epoch,train_loss,valid_loss,error_rate,f1_weighted,mcc,time
0,0.178923,0.686964,0.279057,0.658265,0.356457,02:52
1,0.214986,0.330662,0.126406,0.871809,0.716509,02:54
2,0.159949,0.588598,0.214247,0.763281,0.514595,02:56
3,0.144969,0.518897,0.199786,0.785031,0.545546,03:50
4,0.122823,0.486274,0.175147,0.814798,0.604080,02:54


Better model found at epoch 0 with val_loss value: 0.6869637966156006.
Better model found at epoch 1 with val_loss value: 0.33066174387931824.


epoch,train_loss,valid_loss,error_rate,f1_weighted,mcc,time
0,0.152799,0.676796,0.271559,0.705385,0.358758,05:49
1,0.171696,0.499288,0.230852,0.753354,0.465288,04:01
2,0.169210,0.790563,0.329941,0.661517,0.243154,04:18
3,0.187626,0.502725,0.228174,0.741819,0.484266,04:35
4,0.149029,0.620102,0.235672,0.748492,0.453104,04:01
5,0.130585,0.589757,0.216390,0.759082,0.511441,04:01
6,0.110019,0.441438,0.179968,0.811950,0.590705,04:55
7,0.109049,0.381998,0.151044,0.845375,0.659078,04:54
8,0.108044,0.450808,0.173005,0.819859,0.607278,04:55
9,0.107514,0.430603,0.170862,0.822639,0.612278,04:01


Better model found at epoch 0 with val_loss value: 0.676796019077301.
Better model found at epoch 1 with val_loss value: 0.4992881417274475.
Better model found at epoch 6 with val_loss value: 0.4414377510547638.
Better model found at epoch 7 with val_loss value: 0.3819984793663025.


In [36]:
learn.export('Rn50Normal_368.pkl')  

In [37]:
size = 450
bs = 16
learn.data = get_data(bs, size)

learn.freeze()
lr = 1e-2
learn.fit_one_cycle(5, lr)

learn.unfreeze()
learn.fit_one_cycle(10, slice(1e-5,lr/5))

epoch,train_loss,valid_loss,error_rate,f1_weighted,mcc,time
0,0.253565,0.598977,0.265131,0.713792,0.376349,04:27
1,0.237381,1.142513,0.389395,0.565405,0.023470,04:23
2,0.212214,0.642428,0.290305,0.683282,0.307940,04:23
3,0.186891,0.524883,0.198715,0.786012,0.548500,04:22
4,0.139751,0.448224,0.179432,0.808438,0.595649,04:22


Better model found at epoch 0 with val_loss value: 0.598976731300354.
Better model found at epoch 3 with val_loss value: 0.5248825550079346.
Better model found at epoch 4 with val_loss value: 0.448224276304245.


epoch,train_loss,valid_loss,error_rate,f1_weighted,mcc,time
0,0.123519,0.485007,0.158543,0.835477,0.641604,06:13
1,0.183330,0.762213,0.229780,0.768752,0.487277,06:12
2,0.210677,0.712831,0.301018,0.684640,0.296687,06:12
3,0.208672,8.009492,0.347081,0.578224,0.099685,06:12
4,0.160464,0.562110,0.230852,0.752974,0.465271,06:12
5,0.189061,0.494708,0.231923,0.750851,0.462553,06:12
6,0.132546,0.656076,0.216926,0.764457,0.502722,06:14
7,0.143084,0.528106,0.191216,0.798093,0.564063,06:12
8,0.113051,0.560659,0.169255,0.826641,0.616854,06:12
9,0.129766,0.426231,0.174612,0.817069,0.603907,06:13


Better model found at epoch 0 with val_loss value: 0.48500701785087585.
Better model found at epoch 9 with val_loss value: 0.4262314736843109.


In [38]:
learn.export('Rn50Normal_450.pkl')  

In [41]:
y_pred, y_test_tta = learn.TTA(ds_type=DatasetType.Valid, scale=1.)#, beta=0.4, scale=1.3
y_pred = to_np(y_pred)

In [40]:
f1_score(y_test_tta, np.argmax(y_pred, axis=1),  average='weighted') # 1.1

0.8294195120287017

In [42]:
f1_score(y_test_tta, np.argmax(y_pred, axis=1),  average='weighted') # 1.0

0.8213079483573761