In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
import os
from pathlib import Path
from tqdm import tqdm
import matplotlib.pyplot as plt
import random

In [3]:
import cv2
from fastai import *
from fastai.vision import *
from fastai.callbacks import *

In [4]:
from dataset_spliter import SplitByPatient
from metrics import *#F1Weighted, MCC
from losses import *

In [5]:
path = Path('/data/Datasets/WhiteBloodCancer/train/')

In [6]:
np.random.seed(42)

In [7]:
fnames = get_image_files(path, recurse=True)
fnames[:5]

[PosixPath('/data/Datasets/WhiteBloodCancer/train/fold_1/hem/UID_H10_43_1_hem.bmp'),
 PosixPath('/data/Datasets/WhiteBloodCancer/train/fold_1/hem/UID_H22_31_15_hem.bmp'),
 PosixPath('/data/Datasets/WhiteBloodCancer/train/fold_1/hem/UID_H14_9_11_hem.bmp'),
 PosixPath('/data/Datasets/WhiteBloodCancer/train/fold_1/hem/UID_H14_28_6_hem.bmp'),
 PosixPath('/data/Datasets/WhiteBloodCancer/train/fold_1/hem/UID_H10_189_1_hem.bmp')]

In [8]:
hem_regex = re.compile(r'UID_(H[0-9]+)_', re.IGNORECASE)
all_regex = re.compile(r'UID_([0-9]+)_', re.IGNORECASE)

In [9]:
hem_patient_ids = list(set([hem_regex.search(str(fn)).group(1)
                            for fn in fnames if hem_regex.search(str(fn)) is not None]))
all_patint_ids = list(set([all_regex.search(str(fn)).group(1)
                           for fn in fnames if all_regex.search(str(fn)) is not None]))

hem_patients = dict((k,[]) for k in hem_patient_ids)
all_patints = dict((k,[]) for k in all_patint_ids)

[all_patints[key].append(fn) for key in all_patints.keys() for fn in fnames if 'UID_{0}_'.format(key) in str(fn)]
[hem_patients[key].append(fn) for key in hem_patients.keys() for fn in fnames if 'UID_{0}_'.format(key) in str(fn)]
print()




## Split data into train val 

In [10]:
split_handler = SplitByPatient(hem_patients, all_patints)

### Split by regex

In [11]:
train_regex = re.compile(r'(fold_0|fold_1|fold_2)')
val_regex = re.compile(r'(fold_3)')

hem_train, all_train, hem_val, all_val = split_handler.split_by_regex(train_regex, val_regex)

In [12]:
print('Train Total: {0}'.format(len(hem_train)+len(all_train)))
print('Val Total: {0}'.format(len(hem_val)+len(all_val)))
print("")
print('Hem train: {}'.format(len(hem_train)))
print('All train: {}'.format(len(all_train)))
print('Hem val: {}'.format(len(hem_val)))
print('All val: {}'.format(len(all_val)))

Train Total: 10661
Val Total: 1867

Hem train: 3389
All train: 7272
Hem val: 648
All val: 1219


In [13]:
pat = re.compile(r'^.*(hem|all).bmp$')

def get_label(fn):
    return pat.search(str(fn)).group(1)

### Use complete image

In [14]:
train = ImageList(hem_train + all_train) #optinal scale up classes 
valid = ImageList(hem_val + all_val)

In [15]:
item_list = ItemLists(path, train, valid)
lls = item_list.label_from_func(get_label).add_test_folder('../test')

#### Data augmentation

In [16]:
xtra_tfms=[cutout(n_holes=5, length=0.2)]#squish(scale=0.66), 
tfms = get_transforms(do_flip=True, 
                      flip_vert=True, 
                      #max_rotate=90,  
                      max_lighting=0.05, 
                      max_zoom=1.0, 
                      max_warp=0.0,
                      #p_affine=0.75,
                      #p_lighting=0.75,  
                      #xtra_tfms=xtra_tfms,
                     )

#### Create dataset 

In [17]:
def get_data(bs, size):
    data  = ImageDataBunch.create_from_ll(lls, size=size, bs=bs, 
                                      ds_tfms=tfms, padding_mode='zeros',
                                      resize_method=ResizeMethod.PAD)
    data = data.normalize()
    #data = data.normalize((channel_mean, channel_std))
    return data
    

In [18]:
size = 256
bs = 96
data = get_data(bs, size)

experiment_name = "baseline_rn50"
learn = create_cnn(data, models.resnet50, 
                       metrics=[error_rate, F1Weighted(), MCC()], #  
                       #loss_func=FocalLoss(num_classes=1),
                       #ps=0.75,
                       #wd=0.1,
                       loss_func = LabelSmoothingCrossEntropy(),
                       callback_fns=[partial(SaveModelCallback, name='stage1-{}-{}'.format(experiment_name, size))],

                  )#

  warn("`create_cnn` is deprecated and is now named `cnn_learner`.")


In [19]:
learn.freeze()
lr = 1e-2
learn.fit_one_cycle(5, lr)

learn.unfreeze()
learn.fit_one_cycle(10, slice(1e-5,lr/5))

epoch,train_loss,valid_loss,error_rate,f1_weighted,mcc,time
0,0.543462,0.834194,0.321907,0.574571,0.210695,01:25
1,0.480572,0.776375,0.311194,0.594410,0.259257,01:22
2,0.451669,0.645867,0.243706,0.730098,0.433825,01:23
3,0.424776,0.739006,0.302625,0.629405,0.266211,01:23
4,0.408837,0.621898,0.248527,0.724103,0.420776,01:23


Better model found at epoch 0 with val_loss value: 0.834194004535675.
Better model found at epoch 1 with val_loss value: 0.7763753533363342.
Better model found at epoch 2 with val_loss value: 0.6458670496940613.
Better model found at epoch 4 with val_loss value: 0.6218984723091125.


epoch,train_loss,valid_loss,error_rate,f1_weighted,mcc,time
0,0.406568,0.647185,0.260846,0.694506,0.398081,01:48
1,0.415829,0.623851,0.247456,0.737868,0.424376,01:47
2,0.411498,0.616195,0.259239,0.721171,0.392133,01:46
3,0.405271,0.688333,0.299946,0.680384,0.290968,01:47
4,0.401826,0.607267,0.238350,0.738459,0.447181,01:48
5,0.400486,0.605336,0.230316,0.751881,0.466792,01:47
6,0.387401,0.532862,0.174612,0.815948,0.604796,01:48
7,0.379939,0.507958,0.155865,0.839254,0.647626,01:47
8,0.374523,0.532064,0.175683,0.817242,0.600824,01:46
9,0.370002,0.528605,0.177290,0.815512,0.596984,01:46


Better model found at epoch 0 with val_loss value: 0.6471847295761108.
Better model found at epoch 1 with val_loss value: 0.6238506436347961.
Better model found at epoch 2 with val_loss value: 0.6161949038505554.
Better model found at epoch 4 with val_loss value: 0.607267439365387.
Better model found at epoch 5 with val_loss value: 0.6053359508514404.
Better model found at epoch 6 with val_loss value: 0.53286212682724.
Better model found at epoch 7 with val_loss value: 0.5079582929611206.


In [20]:
#learn.export('baseline_rn50-1.pkl') 

In [21]:
size = 384
bs = 32
learn.data = get_data(bs, size)

learn.freeze()
lr = 1e-2
learn.fit_one_cycle(5, lr)

learn.unfreeze()
learn.fit_one_cycle(10, slice(1e-5,lr/5))

epoch,train_loss,valid_loss,error_rate,f1_weighted,mcc,time
0,0.441024,0.731384,0.271023,0.671104,0.381027,03:14
1,0.450072,0.652584,0.292448,0.653001,0.296808,03:10
2,0.436639,0.577180,0.245849,0.726945,0.428315,03:10
3,0.404000,0.580802,0.209963,0.778532,0.517710,03:08
4,0.388377,0.513969,0.158543,0.833570,0.643379,03:08


Better model found at epoch 0 with val_loss value: 0.7313843369483948.
Better model found at epoch 1 with val_loss value: 0.6525838971138.
Better model found at epoch 2 with val_loss value: 0.5771800875663757.
Better model found at epoch 4 with val_loss value: 0.5139691829681396.


epoch,train_loss,valid_loss,error_rate,f1_weighted,mcc,time
0,0.389419,0.537585,0.181039,0.803838,0.596510,04:20
1,0.406067,0.569792,0.223353,0.776847,0.508097,04:19
2,0.407945,0.674568,0.286020,0.703727,0.339106,04:18
3,0.411318,0.558612,0.196572,0.782912,0.563045,04:18
4,0.399804,0.617918,0.259775,0.725507,0.394339,04:18
5,0.406525,0.577262,0.220675,0.755951,0.497181,04:19
6,0.390883,0.523354,0.170327,0.821909,0.614075,04:18
7,0.378693,0.562817,0.200321,0.789995,0.541408,04:19
8,0.378478,0.560085,0.170862,0.823379,0.612281,04:20
9,0.364009,0.535001,0.178897,0.815020,0.593288,04:18


Better model found at epoch 0 with val_loss value: 0.537585437297821.
Better model found at epoch 6 with val_loss value: 0.5233539938926697.


In [22]:
#learn.export('baseline_rn50-384-2.pkl') 

In [23]:
size = 450
bs = 16
learn.data = get_data(bs, size)

learn.freeze()
lr = 1e-2
learn.fit_one_cycle(5, lr)

learn.unfreeze()
learn.fit_one_cycle(10, slice(1e-5,lr/5))

epoch,train_loss,valid_loss,error_rate,f1_weighted,mcc,time
0,0.440644,2.231610,0.426888,0.569796,0.285335,04:54
1,0.477800,0.679559,0.344403,0.617485,0.153297,04:47
2,0.450043,0.829899,0.289234,0.649965,0.312867,04:47
3,0.411599,0.630321,0.254419,0.712574,0.406955,04:47
4,0.405160,0.633543,0.204071,0.773346,0.544899,04:52


Better model found at epoch 0 with val_loss value: 2.2316102981567383.
Better model found at epoch 1 with val_loss value: 0.6795588135719299.
Better model found at epoch 3 with val_loss value: 0.6303207278251648.


epoch,train_loss,valid_loss,error_rate,f1_weighted,mcc,time
0,0.420464,0.553765,0.200857,0.787913,0.540222,06:43
1,0.410273,0.537599,0.159079,0.834668,0.640468,06:41
2,0.409543,0.737893,0.273701,0.679838,0.355962,06:40
3,0.427937,0.616124,0.245313,0.732133,0.428018,06:40
4,0.425063,0.705020,0.181575,0.808675,0.587596,06:41
5,0.410730,2.028776,0.171934,0.819084,0.611076,06:42
6,0.404938,0.542223,0.174076,0.818629,0.604740,06:40
7,0.392850,0.626848,0.224424,0.759477,0.481710,06:39
8,0.382816,0.698245,0.202464,0.786441,0.536210,06:39
9,0.377080,0.617847,0.210498,0.774578,0.517230,06:39


Better model found at epoch 0 with val_loss value: 0.5537654757499695.
Better model found at epoch 1 with val_loss value: 0.5375993847846985.


In [24]:
#learn.export('baseline_rn50-450-2.pkl') 

In [60]:
y_pred, y_test_tta = learn.TTA(ds_type=DatasetType.Valid, scale=1.)#, beta=0.4, scale=1.3
y_pred = to_np(y_pred)

In [46]:
y_pred

array([[-5.162198, -1.490607],
       [-0.472721,  0.483008],
       [-1.09513 ,  1.097183],
       [ 0.170305, -0.110243],
       ...,
       [-0.207251,  0.269644],
       [ 1.019599, -1.076509],
       [ 0.99041 , -1.020569],
       [ 0.88745 , -0.929821]], dtype=float32)

In [31]:
preds_test, y_test=learn.get_preds(ds_type=DatasetType.Valid)
preds_test = np.argmax(torch.sigmoid(preds_test), axis=1)
f1_score(y_test, preds_test, average='weighted')

0.8371988263137253

In [39]:
temp, y_test=learn.get_preds(ds_type=DatasetType.Valid)
torch.sigmoid(temp)

tensor([[0.0036, 0.1557],
        [0.4221, 0.5946],
        [0.2340, 0.7610],
        ...,
        [0.7318, 0.2595],
        [0.7258, 0.2686],
        [0.7051, 0.2866]])

In [28]:
y_test_tta

tensor([1, 1, 1,  ..., 0, 0, 0])

In [27]:
np.argmax(y_pred, axis=1)

array([1, 1, 1, 1, ..., 1, 0, 0, 0])

In [61]:
f1_score(y_test_tta, np.argmax(y_pred, axis=1),  average='weighted') # scale=1.

0.8489361299205921

In [53]:
f1_score(y_test_tta, np.argmax(y_pred, axis=1),  average='weighted') # scale=1.05

0.853021900669273

In [55]:
f1_score(y_test_tta, np.argmax(y_pred, axis=1),  average='weighted') # scale=1.1

0.8571500797662559

In [57]:
f1_score(y_test_tta, np.argmax(y_pred, axis=1),  average='weighted') # scale=1.1

0.8542246159029407

In [58]:
y_pred

array([[-2.231739,  2.114458],
       [-0.623906,  0.434116],
       [-1.139656,  1.125034],
       [ 0.107901, -0.033532],
       ...,
       [-0.37961 ,  0.44796 ],
       [ 1.010885, -1.083084],
       [ 0.954223, -0.974958],
       [ 0.880328, -0.924473]], dtype=float32)

In [59]:
f1_score(y_test_tta, y_pred[:, 1] > 0.5,  average='weighted') # scale=1.1

0.6631191427043902