In [1]:
# Notebook setup: (re)load the autoreload extension, reload all modules before
# each cell execution (mode 2), and render matplotlib figures inline.
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
import os
from pathlib import Path
from tqdm import tqdm
import matplotlib.pyplot as plt
import random

In [3]:
import cv2
from fastai import *
from fastai.vision import *
from fastai.callbacks import *

In [4]:
from dataset_spliter import SplitByPatient
from metrics import *#F1Weighted, MCC
from losses import *

In [5]:
import pretrainedmodels

In [6]:
# Root folders of the dataset: `path` holds the training folds, `test_path`
# the test images.
# NOTE(review): absolute, machine-specific locations - adjust when running elsewhere.
path, test_path = (
    Path('/data/Datasets/WhiteBloodCancer/train/'),
    Path('/data/Datasets/WhiteBloodCancer/test/'),
)

In [7]:
# Seed every random number generator this notebook can draw from, not only
# NumPy: `random` is imported above and the training itself runs on torch.
import torch

SEED = 42
random.seed(SEED)
torch.manual_seed(SEED)
np.random.seed(SEED)  # kept last: returns None, so the cell shows no output

In [8]:
# Collect the image files below the training root, descending into sub-folders.
fnames = get_image_files(path, recurse=True)
# Show the first five paths as a quick sanity check of the folder layout.
fnames[:5]

[PosixPath('/data/Datasets/WhiteBloodCancer/train/fold_1/hem/UID_H10_43_1_hem.bmp'),
 PosixPath('/data/Datasets/WhiteBloodCancer/train/fold_1/hem/UID_H22_31_15_hem.bmp'),
 PosixPath('/data/Datasets/WhiteBloodCancer/train/fold_1/hem/UID_H14_9_11_hem.bmp'),
 PosixPath('/data/Datasets/WhiteBloodCancer/train/fold_1/hem/UID_H14_28_6_hem.bmp'),
 PosixPath('/data/Datasets/WhiteBloodCancer/train/fold_1/hem/UID_H10_189_1_hem.bmp')]

In [9]:
# Number of image files found, before any fold filtering is applied.
len(fnames)

12528

In [10]:
# Keep only the images whose path mentions one of the three training folds.
# NOTE: this rebinds `fnames`; re-run the listing cell to get the full set back.
train_files_regex = re.compile(r'(fold_0|fold_1|fold_2)')

fnames = list(filter(lambda fn: train_files_regex.search(str(fn)) is not None, fnames))
len(fnames)

10661

In [11]:
# Patient id is the token right after "UID_":
#   healthy (hem) patients -> "H<digits>", ALL patients -> "<digits>".
# Group 1 of each pattern captures that id; matching is case-insensitive.
_ID_FLAGS = re.I
hem_regex = re.compile(r'UID_(H[0-9]+)_', _ID_FLAGS)
all_regex = re.compile(r'UID_([0-9]+)_', _ID_FLAGS)

In [12]:
def _group_files_by_patient(files, id_regex):
    """Return ``{patient_id: [files]}`` for every file whose path matches `id_regex`.

    The patient id is capture group 1 of the first match in ``str(file)``;
    files that do not match are ignored. Within a patient the files keep the
    order they have in `files`. Patient ids are inserted in sorted order so the
    resulting dict does not depend on set/hash iteration order, which would
    otherwise vary between kernel restarts.
    """
    grouped = {}
    for fn in files:
        match = id_regex.search(str(fn))
        if match is not None:
            grouped.setdefault(match.group(1), []).append(fn)
    return {pid: grouped[pid] for pid in sorted(grouped)}


# Single pass over the files per class instead of re-scanning every file for
# every patient id.
hem_patients = _group_files_by_patient(fnames, hem_regex)
# The misspelled name `all_patints` is kept on purpose: the split cell below
# passes it to SplitByPatient.
all_patints = _group_files_by_patient(fnames, all_regex)

# Id lists kept under their original names for any cell that still reads them.
hem_patient_ids = list(hem_patients)
all_patint_ids = list(all_patints)




## Split data into training and validation folds

In [13]:
# Build the splitter from the two {patient id -> image files} dictionaries
# (healthy "hem" patients and "all" patients).
# NOTE(review): presumably keeps all images of a patient in the same fold -
# confirm in dataset_spliter.SplitByPatient.
split_handler = SplitByPatient(hem_patients, all_patints)

In [14]:
# Ask the splitter for 10 folds.
# NOTE(review): the cross-validation cell calls split_by_folds() again and
# rebinds `folds`, so this result is only used if that cell is skipped.
folds = split_handler.split_by_folds(10)

In [15]:
# The class label is the token right before the extension:
# "..._hem.bmp" -> "hem", "..._all.bmp" -> "all".
# The dot is escaped so that only a literal ".bmp" extension is accepted.
pat = re.compile(r'^.*(hem|all)\.bmp$')

def get_label(fn):
    """Return the class label ("hem" or "all") encoded in an image file name.

    Args:
        fn: path-like or string; it is converted with ``str()`` before matching.

    Returns:
        The string "hem" or "all".

    Raises:
        ValueError: if the name does not end in "hem.bmp" or "all.bmp".
    """
    match = pat.search(str(fn))
    if match is None:
        raise ValueError("Cannot derive a hem/all label from file name: {}".format(fn))
    return match.group(1)

#### Data augmentation

In [16]:
# Cutout augmentation is prepared here but is currently NOT handed to
# get_transforms (no xtra_tfms argument is passed below).
xtra_tfms = [cutout(n_holes=5, length=0.2)]

# Augmentation policy: horizontal and vertical flips only. Lighting changes
# and warping are switched off and max_zoom is 1.0; rotation and the
# p_affine / p_lighting probabilities are left at the library defaults.
_tfm_options = dict(
    do_flip=True,
    flip_vert=True,
    max_lighting=0.,
    max_zoom=1.0,
    max_warp=0.0,
)
tfms = get_transforms(**_tfm_options)

#### Create dataset 

In [17]:
import pandas as pd

# Ground-truth labels used to score test-set predictions.
# The CSV is semicolon separated; only its 'labels' column is used.
# NOTE(review): absolute, machine-specific location - adjust when running elsewhere.
file = "/server/born_pix/EPA_DATASETS/WhiteBloodCancer/VAL_ISBI_labelfile_Source_reference_prediction.csv"
dataset = pd.read_csv(file, delimiter=';')
_label_column = dataset.loc[:, 'labels']
gt_labels = np.array(_label_column)

In [18]:
def get_data(bs, size, train, val):
    """Build a normalized ImageDataBunch from explicit train/validation file lists.

    Args:
        bs: batch size passed to the data bunch.
        size: image size passed to the data bunch.
        train: image files used for training.
        val: image files used for validation.

    Returns:
        The data bunch after ``normalize()`` (called without explicit statistics).

    Relies on the notebook globals `path`, `get_label` and `tfms`. The test set
    is attached from '../test'.
    """
    splits = ItemLists(path, ImageList(train), ImageList(val))
    labelled = splits.label_from_func(get_label).add_test_folder('../test')

    bunch = ImageDataBunch.create_from_ll(labelled, size=size, bs=bs, ds_tfms=tfms)
    return bunch.normalize()

In [24]:
# 10-fold cross-validation with progressive resizing (256 -> 384 -> 450 px).
# For every fold and every image size the test set is predicted with TTA and
# scored against `gt_labels`.
#   result:      "<fold>-<size>" -> weighted F1 score on the test set
#   y_pred_list: "<fold>-<size>" -> TTA predictions as a numpy array
result = {}
y_pred_list = {}
folds_nr = 10
# Rebinds `folds`, replacing the value computed in the earlier split cell.
folds = split_handler.split_by_folds(folds_nr)

for i in tqdm(range(folds_nr)):
    # Progress report: scores collected so far.
    print(result)
    
    # Fold i is the validation set; all remaining folds form the training set.
    val_files = folds[i]
    train_files = []
    for sub in range(folds_nr):
        if sub != i:
            train_files.extend(folds[sub])
            
    size = 256
    bs = 96

    # NOTE(review): the first pass of the size loop below builds the same
    # 256 px / bs 96 data again and assigns it to learn.data.
    data = get_data(bs, size, train_files, val_files)
    
    
    gc.collect()
        
    experiment_name = "baseline_resnet18_fold_{}".format(i)
    # NOTE(review): the checkpoint name is formatted once, here, while size is
    # still 256, so the 384 and 450 px stages use the same "...-256" name.
    learn = create_cnn(data, models.resnet18, 
                    #cut=-2,
                       metrics=[error_rate, F1Weighted(), MCC()], #  
                       #loss_func=FocalLoss(num_classes=1),
                       #ps=0.75,
                       #wd=0.1,
                       loss_func = LabelSmoothingCrossEntropy(),
                       callback_fns=[partial(SaveModelCallback, name='stage1-{}-{}'.format(experiment_name, size))],

                  )#

    # Progressive resizing: the same learner is trained further at each
    # resolution, with a smaller batch size as the images grow.
    for size, bs in [[256, 96], [384, 32], [450, 16]]:
        learn.data = get_data(bs, size, train_files, val_files)
        # Stage 1: 5 epochs after freeze().
        learn.freeze()
            
        lr = 1e-2
        learn.fit_one_cycle(5, lr)

        # Stage 2: 10 epochs after unfreeze(), learning rates from 1e-5 to lr/5.
        learn.unfreeze()
        learn.fit_one_cycle(10, slice(1e-5,lr/5))
            
        # Test-time augmentation on the test set; only the predictions are used.
        y_pred, y_test_tta = learn.TTA(ds_type=DatasetType.Test, scale=1.15)#, beta=0.4, scale=1.3
        y_pred = to_np(y_pred)
            
        # Test files are expected to be named "<n>.bmp" with n starting at 1;
        # n - 1 is the row of that image in the submission / gt_labels.
        # (The comprehension's `i` is local to it and does not touch the fold index.)
        submission = [0 for i in range(y_pred.shape[0])]
        # The two prediction columns are swapped before argmax, so a higher
        # value in model column 0 yields label 1.
        # NOTE(review): presumably column 0 is "all" and gt label 1 means ALL - confirm.
        for fn, y in zip(learn.data.test_dl.items, np.argmax(y_pred[:, [1,0]], axis=1)):
            index = int(fn.name.replace(".bmp", '')) - 1
            submission[index] = y

        score = f1_score(gt_labels, submission, average='weighted')
        result["{}-{}".format(i, size)] = score  
        y_pred_list["{}-{}".format(i, size)] = y_pred


100%|██████████| 10/10 [9:22:50<00:00, 3377.10s/it][A
[A

In [25]:
# Weighted F1 score on the test set for every "<fold>-<image size>" run.
result

{'0-256': 0.8370529829806416,
 '0-384': 0.8697258190762203,
 '0-450': 0.8308378050691472,
 '1-256': 0.675588458089577,
 '1-384': 0.8102829401712479,
 '1-450': 0.7670019893147089,
 '2-256': 0.7842377471624046,
 '2-384': 0.8468194837889288,
 '2-450': 0.8346422983651186,
 '3-256': 0.8428357659664729,
 '3-384': 0.8505368598972092,
 '3-450': 0.8018243211849668,
 '4-256': 0.8359292104494735,
 '4-384': 0.8508969329349557,
 '4-450': 0.8267653269462155,
 '5-256': 0.8461245726645835,
 '5-384': 0.826759441936637,
 '5-450': 0.8085176343645681,
 '6-256': 0.8305478736826435,
 '6-384': 0.8308788300680963,
 '6-450': 0.8121659542826831,
 '7-256': 0.806590954702244,
 '7-384': 0.8056751765550001,
 '7-450': 0.835070821239372,
 '8-256': 0.761617375800744,
 '8-384': 0.7905933536770098,
 '8-450': 0.8574544200942865,
 '9-256': 0.8341006845829522,
 '9-384': 0.8640455099266026,
 '9-450': 0.8064825678382013}

In [29]:
# Column 0 of the stored predictions for every fold's 384 px model.
[preds[:, 0] for key, preds in y_pred_list.items() if "-384" in key]

[array([ 1.152336,  1.198438,  1.158704, -0.355514, ..., -1.079647,  1.059237,  0.979079,  0.800436], dtype=float32),
 array([ 1.122132,  1.057638,  1.039858, -0.659149, ..., -1.363052,  0.95965 ,  0.675043,  0.402386], dtype=float32),
 array([ 1.069058,  0.936479,  1.034025,  0.319316, ..., -1.318618,  1.008389, 11.391863,  0.903609], dtype=float32),
 array([ 1.022039,  1.085352,  0.77216 ,  0.403385, ..., -0.979829,  0.646975,  1.188286,  0.892357], dtype=float32),
 array([ 1.156031,  1.0662  ,  1.103018,  0.418935, ..., -0.641226,  1.201954, -0.062556,  0.783939], dtype=float32),
 array([ 1.059199,  1.044363,  0.960469,  0.723418, ..., -1.05967 ,  0.716692,  0.704864,  0.804772], dtype=float32),
 array([ 1.182051,  1.221186,  0.861363, -0.605115, ..., -1.481576,  0.330372,  1.02202 ,  1.022837], dtype=float32),
 array([ 0.502023,  0.97385 ,  0.47103 , -0.887582, ..., -0.787231,  0.457213,  1.099175,  0.84196 ], dtype=float32),
 array([ 1.132401,  0.926239,  0.963534, -0.116692, ...,

In [37]:
# Column 1 of the stored predictions for every fold's 384 px model.
[preds[:, 1] for key, preds in y_pred_list.items() if "-384" in key]

[array([-1.107084, -1.047893, -1.065722,  0.412624, ...,  0.942149, -1.018772, -1.052707, -0.808027], dtype=float32),
 array([-1.134725, -1.203874, -1.154676,  0.285788, ...,  1.076863, -1.039023, -0.853121, -0.235551], dtype=float32),
 array([-1.277142, -1.189646, -1.201974, -0.546039, ...,  0.970671, -1.158995,  0.956551, -0.765684], dtype=float32),
 array([-1.020854, -1.195745, -0.684705, -0.31572 , ...,  1.255536, -0.52855 , -1.236445, -0.948467], dtype=float32),
 array([-1.122465, -1.047056, -1.007839, -0.342365, ...,  0.623645, -1.119219,  0.027102, -0.96882 ], dtype=float32),
 array([-1.178167, -1.229694, -1.023287, -0.733979, ...,  1.164055, -0.792297, -0.98898 , -1.109339], dtype=float32),
 array([-0.912006, -0.721835, -0.678884,  1.107955, ...,  1.46137 , -0.147097, -0.804596, -0.76707 ], dtype=float32),
 array([-0.619791, -1.068007, -0.656412,  0.635198, ...,  1.04207 , -0.572898, -0.999246, -0.906741], dtype=float32),
 array([-1.157214, -1.114029, -1.139676, -0.247202, ...,

In [64]:
# Vote ensemble over the per-fold models trained at 384 px.
# Each model casts one vote per test image when its column-0 score is larger
# than its column-1 score; `submission[k]` ends up holding the vote count.

# Collected once (dict insertion order, i.e. one array per fold) instead of
# being rebuilt twice per loop iteration.
preds_384 = [preds for key, preds in y_pred_list.items() if "-384" in key]

# NOTE(review): relies on `learn` left behind by the training loop (last fold);
# its test items are assumed to be in the same order for every fold.
test_items = learn.data.test_dl.items

# One slot per ground-truth label (f1_score needs both to have the same length).
submission = [0] * len(gt_labels)
for preds in preds_384:
    for fn, all_score, normal_score in zip(test_items, preds[:, 0], preds[:, 1]):
        # Test files are named "<n>.bmp" with n starting at 1.
        index = int(fn.name.replace(".bmp", '')) - 1
        submission[index] += 1 if all_score > normal_score else 0

# Sweep the decision threshold: predict 1 when more than `i` models voted for it.
votes = np.array(submission)
for i in range(len(preds_384)):
    # Builtin `int` replaces the `np.int` alias, which newer NumPy no longer provides.
    print("{} {}".format(i, f1_score(gt_labels, (votes > i).astype(int), average='weighted')))

0 0.7412660331221728
1 0.7900103434473054
2 0.8212678492194531
3 0.8523867668662103
4 0.8692954702557585
5 0.8743362381917729
6 0.8744958698758001
7 0.8668204923992221
8 0.8508937482520892
9 0.779906623377701
