In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
import os
from pathlib import Path
from tqdm import tqdm
import matplotlib.pyplot as plt
import random

In [3]:
import cv2
from fastai import *
from fastai.vision import *
from fastai.callbacks import *

In [4]:
from dataset_spliter import SplitByPatient
from metrics import *#F1Weighted, MCC
from losses import *

In [5]:
path = Path('/data/Datasets/WhiteBloodCancer/train/')

In [6]:
np.random.seed(42)

In [7]:
fnames = get_image_files(path, recurse=True)
fnames[:5]

[PosixPath('/data/Datasets/WhiteBloodCancer/train/fold_1/hem/UID_H10_43_1_hem.bmp'),
 PosixPath('/data/Datasets/WhiteBloodCancer/train/fold_1/hem/UID_H22_31_15_hem.bmp'),
 PosixPath('/data/Datasets/WhiteBloodCancer/train/fold_1/hem/UID_H14_9_11_hem.bmp'),
 PosixPath('/data/Datasets/WhiteBloodCancer/train/fold_1/hem/UID_H14_28_6_hem.bmp'),
 PosixPath('/data/Datasets/WhiteBloodCancer/train/fold_1/hem/UID_H10_189_1_hem.bmp')]

In [8]:
hem_regex = re.compile(r'UID_(H[0-9]+)_', re.IGNORECASE)
all_regex = re.compile(r'UID_([0-9]+)_', re.IGNORECASE)

In [9]:
hem_patient_ids = list(set([hem_regex.search(str(fn)).group(1)
                            for fn in fnames if hem_regex.search(str(fn)) is not None]))
all_patint_ids = list(set([all_regex.search(str(fn)).group(1)
                           for fn in fnames if all_regex.search(str(fn)) is not None]))

hem_patients = dict((k,[]) for k in hem_patient_ids)
all_patints = dict((k,[]) for k in all_patint_ids)

[all_patints[key].append(fn) for key in all_patints.keys() for fn in fnames if 'UID_{0}_'.format(key) in str(fn)]
[hem_patients[key].append(fn) for key in hem_patients.keys() for fn in fnames if 'UID_{0}_'.format(key) in str(fn)]
print()




## Split data into train val 

In [10]:
split_handler = SplitByPatient(hem_patients, all_patints)

### Split by regex

In [11]:
train_regex = re.compile(r'(fold_0|fold_1|fold_2)')
val_regex = re.compile(r'(fold_3)')

hem_train, all_train, hem_val, all_val = split_handler.split_by_regex(train_regex, val_regex)

In [12]:
print('Train Total: {0}'.format(len(hem_train)+len(all_train)))
print('Val Total: {0}'.format(len(hem_val)+len(all_val)))
print("")
print('Hem train: {}'.format(len(hem_train)))
print('All train: {}'.format(len(all_train)))
print('Hem val: {}'.format(len(hem_val)))
print('All val: {}'.format(len(all_val)))

Train Total: 10661
Val Total: 1867

Hem train: 3389
All train: 7272
Hem val: 648
All val: 1219


In [13]:
pat = re.compile(r'^.*(hem|all).bmp$')

def get_label(fn):
    return pat.search(str(fn)).group(1)

### Use complete image

In [14]:
train = ImageList(hem_train + all_train) #optinal scale up classes 
valid = ImageList(hem_val + all_val)

In [15]:
item_list = ItemLists(path, train, valid)
lls = item_list.label_from_func(get_label).add_test_folder('../test')

In [16]:
help(get_transforms)

Help on function get_transforms in module fastai.vision.transform:

get_transforms(do_flip:bool=True, flip_vert:bool=False, max_rotate:float=10.0, max_zoom:float=1.1, max_lighting:float=0.2, max_warp:float=0.2, p_affine:float=0.75, p_lighting:float=0.75, xtra_tfms:Union[Collection[fastai.vision.image.Transform], NoneType]=None) -> Collection[fastai.vision.image.Transform]
    Utility func to easily create a list of flip, rotate, `zoom`, warp, lighting transforms.



#### Data augmentation

In [17]:
xtra_tfms=[cutout(n_holes=5, length=0.2)]#squish(scale=0.66), 
tfms = get_transforms(do_flip=True, 
                      flip_vert=True, 
                      #max_rotate=90,  
                      max_lighting=0.0, 
                      max_zoom=1.0, 
                      max_warp=0.0,
                      #p_affine=0.75,
                      #p_lighting=0.75,  
                      #xtra_tfms=xtra_tfms,
                     )

#### Create dataset 

In [18]:
def get_data(bs, size):
    data  = ImageDataBunch.create_from_ll(lls, size=size, bs=bs, 
                                      ds_tfms=tfms, padding_mode='zeros',
                                      resize_method=ResizeMethod.PAD)
    data = data.normalize()
    #data = data.normalize((channel_mean, channel_std))
    return data
    

In [79]:
size = 256
bs = 96
data = get_data(bs, size)

experiment_name = "baseline_rn18"
learn = create_cnn(data, models.resnet18, 
                       metrics=[error_rate, F1Weighted(), MCC()], #  
                       #loss_func=FocalLoss(num_classes=1),
                       #ps=0.75,
                       #wd=0.1,
                       loss_func = LabelSmoothingCrossEntropy(),
                       callback_fns=[partial(SaveModelCallback, name='stage1-{}-{}'.format(experiment_name, size))],

                  )#

  warn("`create_cnn` is deprecated and is now named `cnn_learner`.")


In [34]:
learn.freeze()
lr = 1e-2
learn.fit_one_cycle(5, lr)

learn.unfreeze()
learn.fit_one_cycle(10, slice(1e-5,lr/5))

epoch,train_loss,valid_loss,error_rate,f1_weighted,mcc,time
0,0.552150,0.644865,0.292983,0.701225,0.333653,00:35
1,0.475226,0.589420,0.273165,0.731339,0.426304,00:35
2,0.436674,0.572046,0.216390,0.769936,0.501785,00:35
3,0.419283,0.557719,0.208356,0.775629,0.523566,00:35
4,0.402398,0.565304,0.217461,0.762960,0.502094,00:35


Better model found at epoch 0 with val_loss value: 0.6448649764060974.
Better model found at epoch 1 with val_loss value: 0.5894197821617126.
Better model found at epoch 2 with val_loss value: 0.5720462203025818.
Better model found at epoch 3 with val_loss value: 0.5577189326286316.


epoch,train_loss,valid_loss,error_rate,f1_weighted,mcc,time
0,0.400078,0.603543,0.244778,0.719572,0.439781,00:40
1,0.402475,0.610016,0.253348,0.734669,0.412646,00:40
2,0.394235,0.598622,0.222282,0.767510,0.488291,00:40
3,0.393461,0.570398,0.209963,0.780323,0.518096,00:40
4,0.383932,0.569813,0.217461,0.767392,0.499367,00:41
5,0.379306,0.504984,0.160150,0.834453,0.637579,00:41
6,0.371309,0.536154,0.172469,0.822777,0.608943,00:40
7,0.366575,0.550523,0.196572,0.794400,0.550553,00:40
8,0.361114,0.540503,0.173540,0.816692,0.607843,00:41
9,0.357518,0.531352,0.170862,0.821978,0.612458,00:41


Better model found at epoch 0 with val_loss value: 0.6035430431365967.
Better model found at epoch 2 with val_loss value: 0.5986219048500061.
Better model found at epoch 3 with val_loss value: 0.5703980326652527.
Better model found at epoch 4 with val_loss value: 0.5698125958442688.
Better model found at epoch 5 with val_loss value: 0.5049842000007629.


In [35]:
y_pred, y_test_tta = learn.TTA(ds_type=DatasetType.Valid, scale=1.1)#, beta=0.4, scale=1.3
y_pred = to_np(y_pred)
f1_score(y_test_tta, np.argmax(y_pred, axis=1),  average='weighted')

0.8510626411586184

In [80]:
size = 384
bs = 32
learn.data = get_data(bs, size)

learn.freeze()
lr = 1e-2
learn.fit_one_cycle(5, lr)

learn.unfreeze()
learn.fit_one_cycle(10, slice(1e-5,lr/5))

epoch,train_loss,valid_loss,error_rate,f1_weighted,mcc,time
0,0.504805,0.655996,0.347616,0.627155,0.166099,01:08
1,0.447777,0.575684,0.222817,0.765940,0.486400,01:08
2,0.436602,0.674188,0.291912,0.714797,0.455537,01:08
3,0.410782,0.582243,0.195501,0.791600,0.554651,01:09
4,0.401369,0.572411,0.215319,0.767469,0.505980,01:09


Better model found at epoch 0 with val_loss value: 0.6559962630271912.
Better model found at epoch 1 with val_loss value: 0.5756844878196716.
Better model found at epoch 4 with val_loss value: 0.5724114179611206.


epoch,train_loss,valid_loss,error_rate,f1_weighted,mcc,time
0,0.400704,0.625886,0.247991,0.717443,0.428165,01:27
1,0.398873,0.537139,0.164435,0.826371,0.630287,01:27
2,0.401298,0.679890,0.279593,0.662550,0.346815,01:27
3,0.391429,0.647324,0.258704,0.706500,0.395216,01:27
4,0.385348,0.562008,0.200857,0.785565,0.541339,01:27
5,0.388988,0.588072,0.216390,0.756917,0.515257,01:27
6,0.382556,0.540338,0.185324,0.801304,0.581821,01:27
7,0.376487,0.591339,0.222817,0.760917,0.485888,01:27
8,0.377130,0.545738,0.191216,0.797662,0.564307,01:27
9,0.366375,0.549655,0.189609,0.797027,0.570466,01:27


Better model found at epoch 0 with val_loss value: 0.6258857250213623.
Better model found at epoch 1 with val_loss value: 0.5371392369270325.


In [81]:
y_pred, y_test_tta = learn.TTA(ds_type=DatasetType.Valid, scale=1.1)#, beta=0.4, scale=1.3
y_pred = to_np(y_pred)
f1_score(y_test_tta, np.argmax(y_pred, axis=1),  average='weighted')

0.8449543268449125

In [82]:
size = 450
bs = 16
learn.data = get_data(bs, size)

learn.freeze()
lr = 1e-2
learn.fit_one_cycle(5, lr)

learn.unfreeze()
learn.fit_one_cycle(10, slice(1e-5,lr/5))

epoch,train_loss,valid_loss,error_rate,f1_weighted,mcc,time
0,0.457580,0.652675,0.337975,0.669931,0.353548,01:48
1,0.465873,0.551959,0.192823,0.794687,0.561215,01:49
2,0.448698,0.531597,0.177825,0.824930,0.627577,01:52
3,0.424695,0.562158,0.212641,0.764279,0.520238,01:52
4,0.412568,0.605090,0.213712,0.763092,0.517264,01:48


Better model found at epoch 0 with val_loss value: 0.652675449848175.
Better model found at epoch 1 with val_loss value: 0.5519586205482483.
Better model found at epoch 2 with val_loss value: 0.5315967798233032.


epoch,train_loss,valid_loss,error_rate,f1_weighted,mcc,time
0,0.419806,0.604426,0.230852,0.759776,0.468692,02:18
1,0.411353,0.573124,0.191751,0.795828,0.563914,02:18
2,0.418104,0.581377,0.229780,0.754120,0.468000,02:17
3,0.422758,0.626671,0.268345,0.699313,0.365719,02:17
4,0.400484,0.546839,0.177825,0.820075,0.600723,02:18
5,0.410840,0.570020,0.215854,0.760027,0.512381,02:18
6,0.390066,0.569081,0.211034,0.771719,0.517454,02:18
7,0.396136,0.564687,0.206749,0.780831,0.525798,02:18
8,0.384900,0.605694,0.222817,0.758366,0.486993,02:18
9,0.381079,0.591652,0.222282,0.756961,0.489868,02:18


Better model found at epoch 0 with val_loss value: 0.6044260263442993.
Better model found at epoch 1 with val_loss value: 0.5731244087219238.
Better model found at epoch 4 with val_loss value: 0.5468387007713318.


In [84]:
y_pred, y_test_tta = learn.TTA(ds_type=DatasetType.Valid, scale=1.15)#, beta=0.4, scale=1.3
y_pred = to_np(y_pred)
f1_score(y_test_tta, np.argmax(y_pred, axis=1),  average='weighted')

0.7780326512525405

In [39]:
#learn.export('baseline_rn50-450-2.pkl') 

In [76]:
y_pred_list = []
scores = []
for i in range(10):
    y_pred, y_test_tta = learn.TTA(ds_type=DatasetType.Valid, scale=1.15)#, beta=0.4, scale=1.3
    y_pred = to_np(y_pred)
    scores.append(f1_score(y_test_tta, np.argmax(y_pred, axis=1),  average='weighted'))
    y_pred_list.append(y_pred)

In [78]:
result = y_pred.copy()
result[:, 0] = np.mean([y_temp[:, 0] for y_temp in y_pred_list], axis=0)
result[:, 1] = np.mean([y_temp[:, 1] for y_temp in y_pred_list], axis=0)

f1_score(y_test_tta, np.argmax(result, axis=1),  average='weighted')

0.8657238559840487

In [77]:
# scale=1.2
scores

[0.8668572530257317,
 0.8664000560949924,
 0.8654982610522881,
 0.8668572530257317,
 0.8635668094938325,
 0.8642452182900731,
 0.8668023792291565,
 0.866345601085412,
 0.8651004724922897,
 0.8669657242412542]

In [75]:
# scale=1.15
scores

[0.867276947154552,
 0.8683648410303343,
 0.8674693440225665,
 0.8671501800090472,
 0.866889558959182,
 0.8676628019063917,
 0.8694527349061166,
 0.8681755728836665,
 0.8699037213621822,
 0.867852174341416]

In [69]:
# scale=1.1
scores

[0.8617516385749614,
 0.862409019853733,
 0.8634335448515753,
 0.8612397062146528,
 0.8612397062146528,
 0.8628488477926204,
 0.8617516385749614,
 0.8605059055715976,
 0.8605804189491731,
 0.8597048935005746]

In [73]:
# scale=1.05
scores

[0.8482498256359229,
 0.850639560768928,
 0.8498672045749301,
 0.8527666774416518,
 0.8539586816407294,
 0.8511495199583856,
 0.8510626411586184,
 0.8522562149772824,
 0.852341887678882,
 0.8509816037351615]

In [71]:
# scale=1.
scores

[0.8418560196071883,
 0.8397275214640779,
 0.8413481784481376,
 0.8424615245642911,
 0.8426546201768866,
 0.8390219446199435,
 0.8453873797746116,
 0.8409380957393486,
 0.84316273775698,
 0.8438615516945739]

In [62]:
f1_score(y_test_tta, np.argmax(result, axis=1),  average='weighted') # scale=1.

0.8616782109259512

In [28]:
f1_score(y_test_tta, np.argmax(y_pred, axis=1),  average='weighted') # scale=1.05

0.8470065158507172

In [29]:
f1_score(y_test_tta, np.argmax(y_pred, axis=1),  average='weighted') # scale=1.1

0.8470065158507172

In [30]:
f1_score(y_test_tta, np.argmax(y_pred, axis=1),  average='weighted') # scale=1.1

0.8470065158507172

In [31]:
y_pred

array([[-0.803526,  0.718772],
       [ 0.755342, -0.701467],
       [-1.451886,  1.574913],
       [-0.749864,  0.639907],
       ...,
       [ 1.200242, -1.036347],
       [ 1.18679 , -1.059086],
       [ 1.072764, -0.950764],
       [ 0.734014, -0.62426 ]], dtype=float32)

In [32]:
f1_score(y_test_tta, y_pred[:, 1] > 0.5,  average='weighted') # scale=1.1

0.8013814951791834