In [1]:
# Reload edited local modules automatically before each cell is executed.
%reload_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [2]:
import os
from pathlib import Path
from tqdm import tqdm
import matplotlib.pyplot as plt
import random

In [3]:
import cv2
from fastai import *
from fastai.vision import *
from fastai.callbacks import *

In [4]:
from dataset_spliter import SplitByPatient
from metrics import *#F1Weighted, MCC
from losses import *

In [5]:
import pretrainedmodels

In [27]:
# Root folders of the training folds and of the unlabeled test images.
# NOTE(review): these are absolute, machine-specific paths, and `test_path` is not
# referenced by any of the cells visible in this notebook (the test folder is attached
# via the relative '../test' instead).
path = Path('/data/Datasets/WhiteBloodCancer/train/')
test_path = Path('/data/Datasets/WhiteBloodCancer/test/')

In [7]:
# Seed every RNG this notebook imports directly, not only NumPy: the stdlib `random`
# module is imported above and would otherwise stay unseeded.
# NOTE(review): PyTorch / CUDA RNGs used during training are still not seeded here.
np.random.seed(42)
random.seed(42)

In [8]:
# Recursively collect all image files below the training root and preview the first five.
fnames = get_image_files(path, recurse=True)
fnames[:5]

[PosixPath('/data/Datasets/WhiteBloodCancer/train/fold_1/hem/UID_H10_43_1_hem.bmp'),
 PosixPath('/data/Datasets/WhiteBloodCancer/train/fold_1/hem/UID_H22_31_15_hem.bmp'),
 PosixPath('/data/Datasets/WhiteBloodCancer/train/fold_1/hem/UID_H14_9_11_hem.bmp'),
 PosixPath('/data/Datasets/WhiteBloodCancer/train/fold_1/hem/UID_H14_28_6_hem.bmp'),
 PosixPath('/data/Datasets/WhiteBloodCancer/train/fold_1/hem/UID_H10_189_1_hem.bmp')]

In [9]:
# Number of image files found before any fold filtering.
len(fnames)

12528

In [10]:
# Keep only files whose path mentions one of the three training folds.
# NOTE(review): the pattern is searched anywhere in the full path and is not anchored,
# so e.g. 'fold_10' would match as well — confirm no such folders exist.
train_files_regex = re.compile(r'(fold_0|fold_1|fold_2)')

fnames = [fn for fn in fnames if train_files_regex.search(str(fn))]
len(fnames)

10661

In [11]:
# Patient-ID extractors; the ID is the token that directly follows 'UID_' in a file name.
# hem_regex captures IDs of the form 'H<digits>' (e.g. 'H10' in 'UID_H10_43_1_hem.bmp').
# all_regex captures purely numeric IDs, so it does not match the 'H'-prefixed names.
hem_regex = re.compile(r'UID_(H[0-9]+)_', re.IGNORECASE)
all_regex = re.compile(r'UID_([0-9]+)_', re.IGNORECASE)

In [12]:
# Group the image files by patient ID in a single pass over `fnames`.
#
# Each dict maps patient ID -> list of that patient's files, in `fnames` order.
# Keys are inserted in first-seen order, so the layout no longer depends on `set`
# iteration order (which can differ between kernel sessions for strings).
# The misspelled names `all_patints` / `all_patint_ids` are kept because later
# cells reference them.
hem_patients = {}
all_patints = {}

for fn in fnames:
    fn_str = str(fn)

    hem_match = hem_regex.search(fn_str)
    if hem_match is not None:
        hem_patients.setdefault(hem_match.group(1), []).append(fn)

    all_match = all_regex.search(fn_str)
    if all_match is not None:
        all_patints.setdefault(all_match.group(1), []).append(fn)

# Unique patient IDs per class.
hem_patient_ids = list(hem_patients)
all_patint_ids = list(all_patints)




## Split data into train and validation sets

In [13]:
# Splitting helper from the local `dataset_spliter` module; it receives both
# patient-ID -> files dictionaries built above.
split_handler = SplitByPatient(hem_patients, all_patints)

### Random split (RND)

In [14]:
# Random train/validation split with a 0.1 argument; the counts printed below show
# roughly 10% of the images ending up in validation.
# NOTE(review): presumably whole patients are assigned to one side — confirm in SplitByPatient.
hem_train, all_train, hem_val, all_val = split_handler.random_split(0.1)

In [15]:
# Summarize the split: overall totals first, then the per-class counts.
n_train = len(hem_train) + len(all_train)
n_val = len(hem_val) + len(all_val)

print('Train Total: {0}'.format(n_train))
print('Val Total: {0}'.format(n_val))
print("")

for caption, subset in (('Hem train: {}', hem_train),
                        ('All train: {}', all_train),
                        ('Hem val: {}', hem_val),
                        ('All val: {}', all_val)):
    print(caption.format(len(subset)))

Train Total: 9596
Val Total: 1065

Hem train: 3051
All train: 6545
Hem val: 338
All val: 727


In [16]:
# The class label is the 'hem' / 'all' token that directly precedes the '.bmp' extension.
# The dot is escaped so that only a literal '.bmp' suffix is accepted.
pat = re.compile(r'^.*(hem|all)\.bmp$')

def get_label(fn):
    """Return the class label ('hem' or 'all') encoded at the end of a file name.

    Args:
        fn: File name or path (anything convertible with ``str``) ending in
            ``hem.bmp`` or ``all.bmp``.

    Returns:
        The string ``'hem'`` or ``'all'``.

    Raises:
        ValueError: If the name does not end in ``hem.bmp`` or ``all.bmp``.
    """
    match = pat.search(str(fn))
    if match is None:
        # Fail with the offending name instead of an AttributeError on None.
        raise ValueError('Cannot derive a label from file name: {}'.format(fn))
    return match.group(1)

### Use complete image

In [17]:
# Wrap the split file lists as fastai ImageLists (hem files first, then all files).
train = ImageList(hem_train + all_train)  # optional: classes could be scaled up here (TODO confirm intent)
valid = ImageList(hem_val + all_val)

In [30]:
# Combine the train/valid lists, label every item with get_label and attach the
# unlabeled test images from '../test'.
# NOTE(review): presumably '../test' is resolved relative to `path` (the train folder);
# the `test_path` variable defined earlier is not used here — confirm they agree.
item_list = ItemLists(path, train, valid)
lls = item_list.label_from_func(get_label).add_test_folder('../test')

In [31]:
# Display a summary of the labelled train / valid / test lists.
lls

LabelLists;

Train: LabelList (9596 items)
x: ImageList
Image (3, 450, 450),Image (3, 450, 450),Image (3, 450, 450),Image (3, 450, 450),Image (3, 450, 450)
y: CategoryList
hem,hem,hem,hem,hem
Path: .;

Valid: LabelList (1065 items)
x: ImageList
Image (3, 450, 450),Image (3, 450, 450),Image (3, 450, 450),Image (3, 450, 450),Image (3, 450, 450)
y: CategoryList
hem,hem,hem,hem,hem
Path: .;

Test: LabelList (1867 items)
x: ImageList
Image (3, 450, 450),Image (3, 450, 450),Image (3, 450, 450),Image (3, 450, 450),Image (3, 450, 450)
y: EmptyLabelList
,,,,
Path: .

#### Data augmentation

In [32]:
# Cutout augmentation is prepared here but NOT passed to get_transforms below;
# it is kept only so it can be switched on for experiments.
xtra_tfms = [cutout(n_holes=5, length=0.2)]

# Mild augmentation: horizontal + vertical flips, small lighting / zoom / warp changes.
tfms = get_transforms(
    do_flip=True,
    flip_vert=True,
    max_lighting=0.05,
    max_zoom=1.05,
    max_warp=0.05,
)

#### Create dataset 

In [38]:
import pandas as pd

# Ground-truth labels used further down to score the test-set predictions.
# The CSV is ';'-separated and must contain a 'labels' column.
# NOTE(review): absolute, machine-specific path.
file = "/server/born_pix/EPA_DATASETS/WhiteBloodCancer/VAL_ISBI_labelfile_Source_reference_prediction.csv"
dataset = pd.read_csv(file, sep=';')
gt_labels = np.array(dataset['labels'])

In [33]:
def get_data(bs, size):
    """Build a normalized ImageDataBunch from the notebook-level `lls` and `tfms`.

    Args:
        bs: Batch size.
        size: Target image size; images are resized with ResizeMethod.PAD and
            padding mode 'zeros'.

    Returns:
        The data bunch after calling ``normalize()`` on it.
    """
    databunch = ImageDataBunch.create_from_ll(
        lls,
        bs=bs,
        size=size,
        ds_tfms=tfms,
        resize_method=ResizeMethod.PAD,
        padding_mode='zeros',
    )
    # normalize() is called without explicit stats.
    # NOTE(review): presumably fastai then derives the stats from a data batch — confirm.
    return databunch.normalize()
    

In [34]:
# Initial resolution and batch size for the first training round.
size = 256
bs = 64
data = get_data(bs, size)

experiment_name = "baseline_resnet18_rnd"

# `cnn_learner` is the current name of the deprecated `create_cnn`
# (same arguments; avoids the deprecation warning).
# NOTE: the checkpoint name is formatted once, here, with size=256, so later
# training rounds at other sizes reuse the same checkpoint name.
learn = cnn_learner(
    data,
    models.resnet18,
    metrics=[error_rate, F1Weighted(), MCC()],
    loss_func=LabelSmoothingCrossEntropy(),
    callback_fns=[partial(SaveModelCallback,
                          name='stage1-{}-{}'.format(experiment_name, size))],
)

  warn("`create_cnn` is deprecated and is now named `cnn_learner`.")


In [36]:
# Phase 1: train with the learner frozen for 5 one-cycle epochs.
lr = 1e-2
learn.freeze()
learn.fit_one_cycle(5, lr)

# Phase 2: unfreeze everything and fine-tune for 10 epochs with a
# learning-rate slice from 1e-5 up to lr/5.
learn.unfreeze()
learn.fit_one_cycle(10, slice(1e-5, lr / 5))

epoch,train_loss,valid_loss,error_rate,f1_weighted,mcc,time
0,0.188646,0.210076,0.074178,0.924520,0.826499,00:43
1,0.232753,0.254399,0.095775,0.901783,0.774903,00:38
2,0.212372,0.220207,0.088263,0.908339,0.795570,00:43
3,0.175112,0.186329,0.069484,0.931124,0.843728,00:43
4,0.147075,0.139581,0.053521,0.946237,0.875647,00:43


Better model found at epoch 0 with val_loss value: 0.21007630228996277.
Better model found at epoch 3 with val_loss value: 0.1863294392824173.
Better model found at epoch 4 with val_loss value: 0.1395806074142456.


epoch,train_loss,valid_loss,error_rate,f1_weighted,mcc,time
0,0.144757,0.143202,0.055399,0.944444,0.871533,00:41
1,0.161585,0.148324,0.057277,0.942700,0.867707,00:45
2,0.160265,0.155618,0.057277,0.943127,0.870485,00:45
3,0.155733,0.153532,0.049765,0.950407,0.886125,00:40
4,0.138970,0.164983,0.056338,0.942349,0.870175,00:46
5,0.125898,0.165138,0.062911,0.937489,0.857302,00:49
6,0.112679,0.138441,0.053521,0.946742,0.878078,00:50
7,0.106739,0.127999,0.047887,0.952348,0.890953,00:39
8,0.084717,0.113543,0.040376,0.959608,0.906745,00:31
9,0.090900,0.118803,0.039437,0.960594,0.909136,00:44


Better model found at epoch 0 with val_loss value: 0.14320197701454163.
Better model found at epoch 6 with val_loss value: 0.1384405791759491.
Better model found at epoch 7 with val_loss value: 0.12799926102161407.
Better model found at epoch 8 with val_loss value: 0.11354298889636993.


In [42]:
# Test-time-augmented predictions for the unlabeled test set.
y_pred, y_test_tta = learn.TTA(ds_type=DatasetType.Test, scale=1.)
y_pred = to_np(y_pred)

# The two probability columns are swapped before argmax, which flips the class index.
# NOTE(review): presumably this maps the learner's class order onto the encoding
# used in the ground-truth file — verify.
pred_labels = np.argmax(y_pred[:, [1, 0]], axis=1)

# Test files are expected to be named '<1-based index>.bmp'; place each prediction
# at the position given by its file name.
submission = [0] * y_pred.shape[0]
for fn, y in zip(learn.data.test_dl.items, pred_labels):
    index = int(fn.name.replace(".bmp", '')) - 1
    submission[index] = y

f1_score(gt_labels, submission, average='weighted')

0.8319007775235077

In [43]:
# Progressive resizing: continue training the same learner on 384px images.
size, bs = 384, 32
learn.data = get_data(bs, size)

# Phase 1: frozen learner, 5 one-cycle epochs.
lr = 1e-2
learn.freeze()
learn.fit_one_cycle(5, lr)

# Phase 2: unfrozen fine-tuning, 10 epochs, learning-rate slice 1e-5 .. lr/5.
learn.unfreeze()
learn.fit_one_cycle(10, slice(1e-5, lr / 5))

epoch,train_loss,valid_loss,error_rate,f1_weighted,mcc,time
0,0.193258,0.467333,0.184038,0.789758,0.574010,01:19
1,0.211390,0.425641,0.205634,0.799919,0.570227,01:21
2,0.175473,0.202042,0.072300,0.926053,0.831494,01:20
3,0.140331,0.131320,0.055399,0.944005,0.870894,01:22
4,0.114972,0.094662,0.033803,0.966143,0.921773,01:20


Better model found at epoch 0 with val_loss value: 0.467333048582077.
Better model found at epoch 1 with val_loss value: 0.42564108967781067.
Better model found at epoch 2 with val_loss value: 0.20204240083694458.
Better model found at epoch 3 with val_loss value: 0.13132040202617645.
Better model found at epoch 4 with val_loss value: 0.09466152638196945.


epoch,train_loss,valid_loss,error_rate,f1_weighted,mcc,time
0,0.128158,0.097994,0.031925,0.968050,0.926212,01:24
1,0.123745,0.134486,0.048826,0.951501,0.889555,01:25
2,0.141623,0.132505,0.050704,0.949131,0.882368,01:23
3,0.140137,0.166885,0.061972,0.938883,0.864421,01:25
4,0.125439,0.124252,0.043192,0.956973,0.901307,01:22
5,0.115962,0.088244,0.031925,0.968075,0.926320,01:23
6,0.084442,0.081775,0.033803,0.966250,0.922267,01:21
7,0.084105,0.083198,0.027230,0.972802,0.937324,01:26
8,0.085765,0.070628,0.025352,0.974658,0.941538,01:26
9,0.068428,0.071783,0.025352,0.974678,0.941648,01:26


Better model found at epoch 0 with val_loss value: 0.09799405932426453.
Better model found at epoch 5 with val_loss value: 0.0882440134882927.
Better model found at epoch 6 with val_loss value: 0.0817754790186882.
Better model found at epoch 8 with val_loss value: 0.07062783092260361.


In [44]:
# Test-time-augmented predictions for the unlabeled test set (384px model).
y_pred, y_test_tta = learn.TTA(ds_type=DatasetType.Test, scale=1.)
y_pred = to_np(y_pred)

# The two probability columns are swapped before argmax, which flips the class index.
# NOTE(review): presumably this maps the learner's class order onto the encoding
# used in the ground-truth file — verify.
pred_labels = np.argmax(y_pred[:, [1, 0]], axis=1)

# Test files are expected to be named '<1-based index>.bmp'; place each prediction
# at the position given by its file name.
submission = [0] * y_pred.shape[0]
for fn, y in zip(learn.data.test_dl.items, pred_labels):
    index = int(fn.name.replace(".bmp", '')) - 1
    submission[index] = y

f1_score(gt_labels, submission, average='weighted')

0.8276056258495332

In [45]:
# Progressive resizing: final round at 450px with a smaller batch size.
size, bs = 450, 16
learn.data = get_data(bs, size)

# Phase 1: frozen learner, 5 one-cycle epochs.
# NOTE(review): the recorded run shows a validation loss of ~56 in the first epoch
# of this phase — lr = 1e-2 may be too high at this batch size; worth checking.
lr = 1e-2
learn.freeze()
learn.fit_one_cycle(5, lr)

# Phase 2: unfrozen fine-tuning, 10 epochs, learning-rate slice 1e-5 .. lr/5.
learn.unfreeze()
learn.fit_one_cycle(10, slice(1e-5, lr / 5))

epoch,train_loss,valid_loss,error_rate,f1_weighted,mcc,time
0,0.274634,56.262306,0.203756,0.799464,0.549272,01:46
1,0.268295,0.361244,0.133333,0.866560,0.691798,01:44
2,0.245222,0.409132,0.088263,0.910664,0.793209,01:44
3,0.170512,0.185284,0.069484,0.928589,0.839236,01:44
4,0.148739,0.148212,0.046948,0.952345,0.891209,01:44


Better model found at epoch 0 with val_loss value: 56.262306213378906.
Better model found at epoch 1 with val_loss value: 0.3612443804740906.
Better model found at epoch 3 with val_loss value: 0.1852840632200241.
Better model found at epoch 4 with val_loss value: 0.14821158349514008.


epoch,train_loss,valid_loss,error_rate,f1_weighted,mcc,time
0,0.150865,0.123339,0.050704,0.948380,0.882697,02:02
1,0.147849,0.252645,0.060094,0.938307,0.861853,02:01
2,0.151301,0.160575,0.055399,0.943457,0.871891,02:00
3,0.239783,0.228781,0.071362,0.928346,0.834189,01:59
4,0.156537,0.120652,0.041315,0.958336,0.903993,02:02
5,0.150757,0.141288,0.044131,0.955353,0.897530,02:02
6,0.152627,0.132429,0.050704,0.948328,0.882877,02:00
7,0.100508,0.117719,0.037559,0.962020,0.913009,01:59
8,0.106819,0.106255,0.033803,0.965911,0.921626,01:59
9,0.100673,0.102616,0.032864,0.966902,0.923763,02:00


Better model found at epoch 0 with val_loss value: 0.12333854287862778.
Better model found at epoch 4 with val_loss value: 0.12065206468105316.
Better model found at epoch 7 with val_loss value: 0.117718905210495.
Better model found at epoch 8 with val_loss value: 0.10625522583723068.
Better model found at epoch 9 with val_loss value: 0.10261551290750504.


In [46]:
# Test-time-augmented predictions for the unlabeled test set (450px model),
# converted to a NumPy array for the scoring cell below.
y_pred, y_test_tta = learn.TTA(ds_type=DatasetType.Test, scale=1.)
y_pred = to_np(y_pred)

In [47]:
# The two probability columns are swapped before argmax, which flips the class index.
# NOTE(review): presumably this maps the learner's class order onto the encoding
# used in the ground-truth file — verify.
pred_labels = np.argmax(y_pred[:, [1, 0]], axis=1)

# Test files are expected to be named '<1-based index>.bmp'; place each prediction
# at the position given by its file name.
submission = [0] * y_pred.shape[0]
for fn, y in zip(learn.data.test_dl.items, pred_labels):
    index = int(fn.name.replace(".bmp", '')) - 1
    submission[index] = y

f1_score(gt_labels, submission, average='weighted')

0.7919132298033592