In [1]:
# Load the autoreload extension and use mode 2, so imported modules are
# re-imported automatically before each cell runs (edits to local helper
# modules take effect without a kernel restart).
%reload_ext autoreload
%autoreload 2
# Render matplotlib figures inline, directly below the cell that creates them.
%matplotlib inline

In [2]:
import os
from pathlib import Path
from tqdm import tqdm
import matplotlib.pyplot as plt
import random

In [3]:
import cv2
from fastai import *
from fastai.vision import *
from fastai.callbacks import *

In [4]:
from dataset_spliter import SplitByPatient
from metrics import *#F1Weighted, MCC
from losses import *

In [5]:
import pretrainedmodels

In [6]:
# Dataset locations. These are absolute, machine-specific paths; adjust them
# for the environment the notebook runs in.
# `path` is the root used for the training image list.
path = Path('/data/Datasets/WhiteBloodCancer/train/')
# NOTE(review): `test_path` is not referenced by the cells visible here;
# get_data() attaches the test folder through the relative path '../test'.
test_path = Path('/data/Datasets/WhiteBloodCancer/test/')

In [7]:
# Seed every RNG this notebook imports directly so repeated runs draw the same
# numbers. Previously only NumPy was seeded although `random` is imported too.
# NOTE(review): PyTorch's generator is not seeded in this cell, so weight
# initialisation / batch shuffling may still differ between runs - TODO confirm.
SEED = 42
np.random.seed(SEED)
random.seed(SEED)

In [8]:
# Collect every image file below `path` (recursing into sub-folders) and
# preview the first five entries.
fnames = get_image_files(path, recurse=True)
fnames[:5]

[PosixPath('/data/Datasets/WhiteBloodCancer/train/fold_1/hem/UID_H10_43_1_hem.bmp'),
 PosixPath('/data/Datasets/WhiteBloodCancer/train/fold_1/hem/UID_H22_31_15_hem.bmp'),
 PosixPath('/data/Datasets/WhiteBloodCancer/train/fold_1/hem/UID_H14_9_11_hem.bmp'),
 PosixPath('/data/Datasets/WhiteBloodCancer/train/fold_1/hem/UID_H14_28_6_hem.bmp'),
 PosixPath('/data/Datasets/WhiteBloodCancer/train/fold_1/hem/UID_H10_189_1_hem.bmp')]

In [9]:
# Total number of image files found before any fold filtering.
len(fnames)

12528

In [10]:
# Keep only the images whose path mentions one of the three training folds
# (fold_0, fold_1 or fold_2); everything else is dropped from `fnames`.
train_files_regex = re.compile(r'(fold_0|fold_1|fold_2)')

fnames = [image_path
          for image_path in fnames
          if train_files_regex.search(str(image_path))]
len(fnames)

10661

In [11]:
# Patient-id extractors for filenames such as "UID_H10_43_1_hem.bmp".
# hem_regex captures ids of the form "H<digits>" that follow "UID_";
# all_regex captures purely numeric ids that follow "UID_"
# (presumably the ALL/leukemia patients - verify against the dataset naming).
# Both patterns are compiled case-insensitively.
hem_regex = re.compile(r'UID_(H[0-9]+)_', re.IGNORECASE)
all_regex = re.compile(r'UID_([0-9]+)_', re.IGNORECASE)

In [12]:
def _group_by_patient(files, pattern):
    """Group image paths by the patient id that `pattern` captures.

    Parameters
    ----------
    files : iterable of path-like
        Image paths to group.
    pattern : compiled regex
        Pattern whose first capture group is the patient id.

    Returns
    -------
    dict
        Maps patient id -> list of paths, in the order the files were seen.
        Files that do not match `pattern` are skipped.
    """
    groups = {}
    for fn in files:
        match = pattern.search(str(fn))
        if match is not None:
            groups.setdefault(match.group(1), []).append(fn)
    return groups


# One pass per class instead of rescanning every file for every patient id.
# Keys follow first appearance in `fnames`, so the ordering is deterministic
# (the previous list(set(...)) ordering of string ids varied between kernels).
hem_patients = _group_by_patient(fnames, hem_regex)
all_patints = _group_by_patient(fnames, all_regex)

# Id lists kept under their original names for any cell that relies on them.
hem_patient_ids = list(hem_patients)
all_patint_ids = list(all_patints)




## Split data into train and validation sets

In [13]:
# Splitter built from the per-patient file lists of both classes; it is used
# later via split_handler.split_by_num_patients(fnames) to draw index splits.
split_handler = SplitByPatient(hem_patients, all_patints)

In [14]:
# The class label is the filename suffix: "..._hem.bmp" or "..._all.bmp".
# The dot is escaped so that only a literal ".bmp" extension is accepted.
pat = re.compile(r'^.*(hem|all)\.bmp$')


def get_label(fn):
    """Return the class label encoded at the end of an image filename.

    Parameters
    ----------
    fn : str or path-like
        Image path ending in "hem.bmp" or "all.bmp".

    Returns
    -------
    str
        Either 'hem' or 'all'.

    Raises
    ------
    ValueError
        If the filename does not end in "hem.bmp" or "all.bmp".
    """
    match = pat.search(str(fn))
    if match is None:
        # Fail with a readable message instead of AttributeError on None.
        raise ValueError("Cannot infer label from filename: {}".format(fn))
    return match.group(1)

#### Data augmentation

In [15]:
# Extra transform list containing a cutout transform.
# NOTE(review): it is currently NOT applied - the `xtra_tfms=` argument of
# get_transforms below is commented out, so this variable is unused here.
xtra_tfms=[cutout(n_holes=5, length=0.2)]#squish(scale=0.66), 
# Augmentation pipeline passed to the data bunch as `ds_tfms`.
# Flips (including vertical) are enabled; lighting and warp magnitudes are 0
# and max_zoom is 1.0 - presumably this turns those augmentations off, confirm
# against the installed fastai version. Rotation and the p_* probabilities are
# left at the library defaults because their overrides are commented out.
tfms = get_transforms(do_flip=True, 
                      flip_vert=True, 
                      #max_rotate=90,  
                      max_lighting=0., 
                      max_zoom=1.0, 
                      max_warp=0.0,
                      #p_affine=0.75,
                      #p_lighting=0.75,  
                      #xtra_tfms=xtra_tfms,
                     )

#### Create dataset 

In [16]:
import pandas as pd

# Reference labels, read from a semicolon-separated CSV; the 'labels' column is
# kept as a NumPy array and later compared against the test-set predictions.
# The location is an absolute, machine-specific path.
file = "/server/born_pix/EPA_DATASETS/WhiteBloodCancer/VAL_ISBI_labelfile_Source_reference_prediction.csv"
dataset = pd.read_csv(file, sep=';')
gt_labels = np.array(dataset['labels'])

In [17]:
def get_data(bs, size, train_index, test_index):
    """Build the normalized ImageDataBunch for one train/validation split.

    Parameters
    ----------
    bs : int
        Batch size forwarded to the data bunch.
    size : int
        Image size forwarded to the data bunch.
    train_index, test_index :
        Index collections into the global `fnames`, handed to
        `split_by_idxs` in that order (the second one is presumably the
        validation split - confirm against the fastai API).

    Returns
    -------
    The data bunch built from `fnames`, labelled with `get_label`, with the
    '../test' folder attached as test set, using the global `tfms`.
    """
    image_list = ImageList(fnames, path=path)
    split_lists = image_list.split_by_idxs(train_index, test_index)
    labelled = split_lists.label_from_func(get_label)
    lls = labelled.add_test_folder('../test')

    bunch = ImageDataBunch.create_from_ll(lls, size=size, bs=bs, ds_tfms=tfms)
    return bunch.normalize()

In [18]:
# Repeated patient-wise validation with progressive resizing.
# For each of 10 repetitions a new split is requested from `split_handler`.
# For every (train, val) index pair a ResNet-18 learner is created and trained
# at 256 -> 384 -> 450 px; after each size stage the test-set predictions are
# scored against `gt_labels` with weighted F1.
# `result` maps "<repetition>-<size>-<run counter>" to that weighted F1 score.
result = {}
# `y_pred_list` uses the same keys and stores the raw TTA predictions (NumPy).
y_pred_list = {}
# Global run counter, incremented after every size stage.
idx = 0
for i in tqdm(range(10)):
    # Progress dump of the scores collected so far.
    print(result)
    # NOTE(review): `keys` is not used anywhere in the visible cells.
    train, val, keys = split_handler.split_by_num_patients(fnames)
    
    for train_index, test_index in zip(train, val):
        # Initial data bunch, only needed to construct the learner.
        # NOTE(review): the first stage of the size loop below rebuilds the
        # same 256 px / bs 96 data again.
        size = 256
        bs = 96

        data = get_data(bs, size, train_index, test_index)

        gc.collect()
        
        # NOTE(review): the name depends only on the repetition `i`, and the
        # callback name below is formatted once with size == 256. Every fold
        # and every size stage of a repetition therefore shares one checkpoint
        # name (presumably overwriting the same file - confirm).
        experiment_name = "baseline_resnet18_pat_{}".format(i)
        learn = create_cnn(data, models.resnet18, 
                    #cut=-2,
                       metrics=[error_rate, F1Weighted(), MCC()], #  
                       #loss_func=FocalLoss(num_classes=1),
                       #ps=0.75,
                       #wd=0.1,
                       loss_func = LabelSmoothingCrossEntropy(),
                       callback_fns=[partial(SaveModelCallback, name='stage1-{}-{}'.format(experiment_name, size))],

                  )#
        
        # Progressive resizing: larger images are paired with smaller batches.
        for size, bs in [[256, 96], [384, 32], [450, 16]]:
            learn.data = get_data(bs, size, train_index, test_index)
            # Stage 1: 5 epochs with the learner frozen.
            learn.freeze()
            
            lr = 1e-2
            learn.fit_one_cycle(5, lr)

            # Stage 2: 10 epochs unfrozen, learning rates from 1e-5 up to lr/5.
            learn.unfreeze()
            learn.fit_one_cycle(10, slice(1e-5,lr/5))
            
            # Test-time-augmented predictions on the test set.
            # `y_test_tta` is not used afterwards.
            y_pred, y_test_tta = learn.TTA(ds_type=DatasetType.Test, scale=1.15)#, beta=0.4, scale=1.3
            y_pred = to_np(y_pred)
            
            # Convert predictions into a label list ordered by test file number.
            # The two columns are swapped before argmax, so the label is 1 when
            # column 0 holds the larger score (presumably column 0 is the 'all'
            # class - confirm against learn.data.classes).
            submission = [0 for i in range(y_pred.shape[0])]
            for fn, y in zip(learn.data.test_dl.items, np.argmax(y_pred[:, [1,0]], axis=1)):
                # Assumes test files are named "<1-based number>.bmp".
                index = int(fn.name.replace(".bmp", '')) - 1
                submission[index] = y

            # NOTE(review): `f1_score` is not imported explicitly in the visible
            # cells; presumably it arrives through one of the star imports.
            score = f1_score(gt_labels, submission, average='weighted')
            result["{}-{}-{}".format(i, size, idx)] = score  
            y_pred_list["{}-{}-{}".format(i, size, idx)] = y_pred
            idx += 1

epoch,train_loss,valid_loss,error_rate,f1_weighted,mcc,time
0,0.419453,0.480169,0.084577,0.914025,0.826759,01:35
1,0.451287,0.454713,0.100498,0.898600,0.790382,01:35
2,0.427822,0.445488,0.072637,0.927844,0.854876,01:35
3,0.392716,0.487338,0.106468,0.890547,0.785477,01:35
4,0.383485,0.439962,0.079602,0.919163,0.836960,01:35


Better model found at epoch 0 with val_loss value: 0.48016855120658875.
Better model found at epoch 1 with val_loss value: 0.4547134041786194.
Better model found at epoch 2 with val_loss value: 0.4454878568649292.
Better model found at epoch 4 with val_loss value: 0.4399624168872833.


epoch,train_loss,valid_loss,error_rate,f1_weighted,mcc,time
0,0.391645,0.433313,0.081592,0.917652,0.830455,02:02
1,0.417613,0.411563,0.067662,0.932679,0.862678,02:01
2,0.400364,0.554820,0.112438,0.885384,0.767988,02:02
3,0.409757,0.474728,0.122388,0.873076,0.755930,02:02
4,0.405241,0.448819,0.066667,0.932696,0.862147,02:01


Better model found at epoch 0 with val_loss value: 0.4333128333091736.
Better model found at epoch 1 with val_loss value: 0.41156280040740967.


KeyboardInterrupt: 

In [19]:
# Weighted F1 on the test set for every finished run,
# keyed "<repetition>-<size>-<run counter>".
result

{'0-256-0': 0.7995936902072432,
 '0-384-1': 0.8489862238092865,
 '0-450-2': 0.8723428478349992,
 '0-256-3': 0.8285340015124422,
 '0-384-4': 0.8427879157212518,
 '0-450-5': 0.8399238883700976,
 '1-256-6': 0.8069987210387433,
 '1-384-7': 0.8684544697155888,
 '1-450-8': 0.8047994689709497,
 '1-256-9': 0.8164101675144312,
 '1-384-10': 0.8535520075158467,
 '1-450-11': 0.8357555006644669,
 '2-256-12': 0.7300292909503615,
 '2-384-13': 0.8068206427092612,
 '2-450-14': 0.8434720235986691,
 '2-256-15': 0.8106300441395914,
 '2-384-16': 0.848887224231825,
 '2-450-17': 0.8408258026250278,
 '3-256-18': 0.839548445830632,
 '3-384-19': 0.8669657242412542,
 '3-450-20': 0.8655365552040435,
 '3-256-21': 0.8339658241852698,
 '3-384-22': 0.8392564522228275,
 '3-450-23': 0.8347522587059969,
 '4-256-24': 0.8119369623223438,
 '4-384-25': 0.8376105149695994,
 '4-450-26': 0.7508485326251219,
 '4-256-27': 0.7976507732153674,
 '4-384-28': 0.8604458725984706,
 '4-450-29': 0.8494024906817047,
 '5-256-30': 0.8214393

In [20]:
# First prediction column of every stored run whose key contains "-384"
# (i.e. the runs evaluated at 384 px).
[y_pred_list[y][:, 0] for y in y_pred_list if "-384" in y]

[array([ 1.268102,  1.160591,  1.209893,  0.201621, ..., -0.785243,  1.003542,  0.697981,  1.015688], dtype=float32),
 array([ 1.286036,  1.269605,  1.257903, -0.28675 , ..., -1.217229,  0.938857, -0.150239,  0.339273], dtype=float32),
 array([ 1.130489,  1.219871,  0.94832 , -0.172522, ..., -1.146649,  0.922788,  0.021436,  0.86742 ], dtype=float32),
 array([ 1.164097,  1.068487,  0.748056, -0.374015, ..., -1.178558,  0.909217,  0.641134,  0.845949], dtype=float32),
 array([ 0.809478,  0.928793,  0.53833 ,  0.233227, ..., -0.486744,  0.725852, -0.12288 ,  0.160403], dtype=float32),
 array([ 1.236744,  1.296595,  1.120246,  0.493771, ..., -0.729537,  1.126822,  3.166527,  0.968362], dtype=float32),
 array([ 1.055087,  1.105675,  0.971905, -0.298365, ..., -1.214161,  0.905849, -1.255883,  1.06277 ], dtype=float32),
 array([ 1.203771,  1.181524,  1.083446, -0.127713, ..., -1.253654,  1.119643, -0.308445,  0.651389], dtype=float32),
 array([ 1.145688,  1.10831 ,  0.92883 ,  0.339994, ...,

In [22]:
# Number of stored runs whose key contains "-384" (384 px runs so far).
len([y_pred_list[y][:, 0] for y in y_pred_list if "-384" in y])

17

In [33]:
# Voting ensemble over the first `n_models` runs evaluated at 384 px.
#
# The counter is deliberately not called `models`: that name is the fastai
# model collection used by the training cell (`models.resnet18`), and
# rebinding it to an int breaks any later re-run of that cell.

# Predictions of all 384 px runs, selected once by the exact size field of the
# "<repetition>-<size>-<run counter>" key.
preds_384 = [y_pred_list[key] for key in y_pred_list if key.split("-")[1] == "384"]
n_models = 10  # set to len(preds_384) to ensemble every available run

# submission[k] counts how many models scored column 0 above column 1 for test
# image k+1 (column 0 is presumably the 'all' class - confirm against
# learn.data.classes). Its length follows the ground-truth labels rather than
# a hard-coded 1867.
submission = [0] * len(gt_labels)
for model_idx in range(n_models):
    preds = preds_384[model_idx]
    for fn, all_score, hem_score in zip(learn.data.test_dl.items, preds[:, 0], preds[:, 1]):
        # Assumes test files are named "<1-based number>.bmp".
        index = int(fn.name.replace(".bmp", '')) - 1
        submission[index] += 1 if all_score > hem_score else 0

# Weighted F1 for every vote threshold: predict 1 when more than `i` models
# voted for column 0. The builtin `int` replaces the removed `np.int` alias.
votes = np.array(submission)
for i in range(n_models):
    print("{} {}".format(i, f1_score(gt_labels, (votes > i).astype(int), average='weighted')))

0 0.7485627492375221
1 0.8197842253357601
2 0.8474963752179598
3 0.8728306520378393
4 0.8861094008426075
5 0.8825064520297676
6 0.8756334572862109
7 0.8663064892338689
8 0.850240979426935
9 0.8039284581596575
