In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
import os
from pathlib import Path
from tqdm import tqdm
import matplotlib.pyplot as plt
import random

In [3]:
import cv2
from fastai import *
from fastai.vision import *
from fastai.callbacks import *

In [4]:
from dataset_spliter import SplitByPatient
from metrics import *#F1Weighted, MCC
from losses import *

In [5]:
import pretrainedmodels

In [6]:
# Root folder of the training images.
path = Path('/data/Datasets/WhiteBloodCancer/train/')
# Folder for the test images.
# NOTE(review): `test_path` is not referenced by any other visible cell; get_data()
# passes a relative test-folder string instead. Confirm which location is intended.
test_path = Path('/data/Datasets/WhiteBloodCancer/test/')

In [7]:
# Seed both global generators this notebook imports, so that anything drawing from
# them behaves the same after a kernel restart.
# NOTE(review): PyTorch keeps its own RNG, which is not seeded here.
SEED = 42
random.seed(SEED)     # Python's built-in `random` module
np.random.seed(SEED)  # NumPy's legacy global generator

In [8]:
# Collect the image files under `path`, descending into sub-folders (recurse=True).
fnames = get_image_files(path, recurse=True)
# Show the first five entries as a sanity check.
fnames[:5]

[PosixPath('/data/Datasets/WhiteBloodCancer/train/fold_1/hem/UID_H10_43_1_hem.bmp'),
 PosixPath('/data/Datasets/WhiteBloodCancer/train/fold_1/hem/UID_H22_31_15_hem.bmp'),
 PosixPath('/data/Datasets/WhiteBloodCancer/train/fold_1/hem/UID_H14_9_11_hem.bmp'),
 PosixPath('/data/Datasets/WhiteBloodCancer/train/fold_1/hem/UID_H14_28_6_hem.bmp'),
 PosixPath('/data/Datasets/WhiteBloodCancer/train/fold_1/hem/UID_H10_189_1_hem.bmp')]

In [9]:
# Number of image files that were found.
len(fnames)

12528

In [10]:
#train_files_regex = re.compile(r'(fold_0|fold_1|fold_2)')

#fnames = [fn for fn in fnames if train_files_regex.search(str(fn)) is not None]
#len(fnames)

In [11]:
# Patient-ID extractors applied to file paths (both are case-insensitive):
#   hem_regex captures an ID of the form "H<digits>" that follows "UID_"
#   all_regex captures a purely numeric ID that follows "UID_"
hem_regex = re.compile(r'UID_(H[0-9]+)_', re.IGNORECASE)
all_regex = re.compile(r'UID_([0-9]+)_', re.IGNORECASE)

In [12]:
def _group_files_by_patient(files, id_regex):
    """Map each patient ID captured by `id_regex` to the files that carry it.

    Args:
        files: iterable of file paths; each one is converted with ``str()`` before matching.
        id_regex: compiled pattern whose first group is the patient ID.

    Returns:
        dict of patient ID -> list of matching files. Files keep the order they
        have in `files`; files that do not match the pattern are ignored.
    """
    grouped = {}
    for fn in files:
        match = id_regex.search(str(fn))
        if match is not None:
            grouped.setdefault(match.group(1), []).append(fn)
    # Emit the keys in sorted order: taking them from a `set` of strings gives an
    # order that can differ between interpreter runs, which would make everything
    # derived from this mapping non-reproducible.
    return {patient_id: grouped[patient_id] for patient_id in sorted(grouped)}


# One pass over the file list per class instead of rescanning all files for every patient.
# (The variable names, including their spelling, are kept because later cells use them.)
hem_patients = _group_files_by_patient(fnames, hem_regex)
all_patints = _group_files_by_patient(fnames, all_regex)

hem_patient_ids = list(hem_patients)
all_patint_ids = list(all_patints)




## Split data into training and validation sets

In [13]:
# Build the splitter from the two patient-ID -> files mappings.
# NOTE(review): presumably it keeps all images of one patient in the same fold — confirm in dataset_spliter.
split_handler = SplitByPatient(hem_patients, all_patints)

In [14]:
# Request 10 folds from the splitter (the training cell calls this again with its own fold count).
folds = split_handler.split_by_folds(10)

In [15]:
# The class label is the token directly in front of the ".bmp" extension.
# The dot is escaped so only a literal ".bmp" suffix is accepted.
pat = re.compile(r'^.*(hem|all)\.bmp$')

def get_label(fn):
    """Return the class label encoded at the end of an image file name.

    Args:
        fn: file name or path (anything ``str()`` can render) ending in
            ``hem.bmp`` or ``all.bmp``.

    Returns:
        ``'hem'`` or ``'all'``.

    Raises:
        ValueError: if the name does not end in ``hem.bmp`` / ``all.bmp``.
    """
    match = pat.search(str(fn))
    if match is None:
        # Fail with the offending name instead of an AttributeError on None.
        raise ValueError("Cannot derive a hem/all label from file name: {!r}".format(str(fn)))
    return match.group(1)

#### Data augmentation

In [16]:
# Extra cutout transform; it is built here but not passed to get_transforms
# below (the xtra_tfms argument stays commented out).
xtra_tfms = [cutout(n_holes=5, length=0.2)]  # squish(scale=0.66),

# Augmentation settings: horizontal and vertical flips, no lighting changes,
# slight zoom and warp. Every other option keeps the library default.
_aug_options = dict(
    do_flip=True,
    flip_vert=True,
    max_zoom=1.05,
    max_warp=0.05,
    max_lighting=0.,
)
# Options tried earlier and currently disabled:
#   max_rotate=90, p_affine=0.75, p_lighting=0.75, xtra_tfms=xtra_tfms
tfms = get_transforms(**_aug_options)

#### Create dataset 

In [17]:
def get_data(bs, size, train, val):
    """Assemble a normalized ImageDataBunch from explicit train/validation file lists.

    Args:
        bs: batch size.
        size: image size handed to the data bunch.
        train: files used for training.
        val: files used for validation.

    Returns:
        The data bunch after ``normalize()`` was called without explicit statistics.
        Labels come from `get_label`, augmentation from the notebook-level `tfms`,
        and the test set from the folder '../C-NMC_test_final_phase_data'.
    """
    # Optional idea noted by the author: scale up classes in the training list.
    splits = ItemLists(path, ImageList(train), ImageList(val))
    labelled = splits.label_from_func(get_label)
    labelled = labelled.add_test_folder('../C-NMC_test_final_phase_data')

    bunch = ImageDataBunch.create_from_ll(labelled, size=size, bs=bs, ds_tfms=tfms)
    # Alternative kept for reference: bunch.normalize((channel_mean, channel_std))
    return bunch.normalize()

In [18]:
# Test-set predictions gathered by the training loop, keyed "<fold index>-<image size>".
y_pred_list = {}

In [20]:
# Cross-validation: one learner is trained per fold and its test-set predictions are stored.
folds_nr = 10
folds = split_handler.split_by_folds(folds_nr)

for i in tqdm(range(folds_nr)):
    
    # Fold i is the validation set; every other fold goes into the training set.
    val_files = folds[i]
    train_files = []
    for sub in range(folds_nr):
        if sub != i:
            train_files.extend(folds[sub])
            
    # Initial data bunch, needed only to construct the learner; the stage loop
    # below replaces learn.data before any training happens.
    size = 256
    bs = 96

    data = get_data(bs, size, train_files, val_files)
    
    
    gc.collect()
        
    experiment_name = "baseline_resnet18_fold_final_{}".format(i)
    # `size` is still 256 at this point, so the SaveModelCallback checkpoint name is
    # fixed here and is the same for both training stages below.
    learn = create_cnn(data, models.resnet18, 
                    #cut=-2,
                       metrics=[error_rate, F1Weighted(), MCC()], #  
                       #loss_func=FocalLoss(num_classes=1),
                       #ps=0.75,
                       #wd=0.1,
                       loss_func = LabelSmoothingCrossEntropy(),
                       callback_fns=[partial(SaveModelCallback, name='stage1-{}-{}'.format(experiment_name, size))],

                  )#

    # Two stages with growing image size and shrinking batch size: (256, 64), then (384, 32).
    for size, bs in [[256, 64], [384, 32]]:
        learn.data = get_data(bs, size, train_files, val_files)
        # Phase 1: frozen learner, 5 one-cycle epochs at lr.
        learn.freeze()
            
        lr = 1e-2
        learn.fit_one_cycle(5, lr)
        
        # Phase 2: unfrozen learner, 10 one-cycle epochs with learning rates from 1e-5 up to lr/5.
        learn.unfreeze()
        learn.fit_one_cycle(10, slice(1e-5,lr/5))
        
        if size == 384:
            # Metrics are emptied before exporting.
            # NOTE(review): presumably so the exported learner does not depend on the custom metric classes — confirm.
            learn.metrics = []
            # The file name is "<experiment_name>384.pkl": no separator between the fold index and the size.
            learn.export(experiment_name+"{}.pkl".format(size))
            
        # Test-time augmentation on the test set; only the first returned value is kept.
        y_pred, y_test_tta = learn.TTA(ds_type=DatasetType.Test, scale=1.15)#, beta=0.4, scale=1.3
        y_pred = to_np(y_pred)
            
        # Stored for both sizes; the later cells only read the "-384" entries.
        y_pred_list["{}-{}".format(i, size)] = y_pred


100%|██████████| 10/10 [7:06:27<00:00, 2558.80s/it][A
[A

In [22]:
# Prediction column 0 of every entry whose key contains "-384".
[y_pred_list[y][:, 0] for y in y_pred_list if "-384" in y]

[array([ 1.122151, -0.786144, -0.826846,  1.159764, ..., -0.946228,  0.387286, -0.400768,  1.069081], dtype=float32),
 array([ 1.141573, -0.212165, -0.722597,  1.100984, ..., -0.89316 ,  0.953539, -0.950037,  1.186849], dtype=float32),
 array([ 1.095617, -0.833071, -0.76372 ,  1.041327, ..., -0.93292 ,  0.998978,  0.968456,  1.045836], dtype=float32),
 array([ 1.17834 , -0.10271 , -0.845651,  1.103608, ..., -1.049124,  0.991997,  0.214556,  1.152188], dtype=float32),
 array([ 1.232345, -0.527102, -0.731775,  0.733967, ..., -1.014817, -0.154475, -0.068103,  1.196201], dtype=float32),
 array([ 0.950814, -0.838171, -0.750795, -0.018545, ..., -0.853879,  0.952596, -0.465433,  0.893747], dtype=float32),
 array([ 1.096282, -0.073721, -0.360142,  1.21197 , ..., -1.055748,  1.229632,  0.485677,  1.07387 ], dtype=float32),
 array([ 0.989235, -0.966362, -0.713748,  1.094096, ..., -0.973844,  0.731986, -0.694344,  0.872988], dtype=float32),
 array([ 1.000748, -0.409537, -0.669747,  0.980642, ...,

In [None]:
# Prediction column 1 of every entry whose key contains "-384".
[y_pred_list[y][:, 1] for y in y_pred_list if "-384" in y]

In [23]:
# Vote counter with 2586 slots; the slot is derived from the numeric test file name.
submission = [0] * 2586

# Predictions of the entries whose key contains "-384", in insertion order.
preds_384 = [y_pred_list[key] for key in y_pred_list if "-384" in key]
test_items = learn.data.test_dl.items

for fold_idx in range(10):
    fold_pred = preds_384[fold_idx]
    scores_col0 = fold_pred[:, 0]
    scores_col1 = fold_pred[:, 1]

    for fn, score0, score1 in zip(test_items, scores_col0, scores_col1):
        # File "<k>.bmp" is counted in slot k-1.
        index = int(fn.name.replace(".bmp", '')) - 1
        # One vote whenever column 0 beats column 1 for this image.
        submission[index] += int(score0 > score1)

In [24]:
# Per-image vote counts accumulated by the previous cell.
submission

[10,
 0,
 0,
 10,
 10,
 10,
 10,
 10,
 8,
 0,
 2,
 3,
 5,
 4,
 0,
 10,
 0,
 10,
 10,
 0,
 4,
 0,
 10,
 0,
 10,
 0,
 10,
 10,
 10,
 10,
 0,
 9,
 9,
 0,
 0,
 3,
 10,
 10,
 0,
 0,
 0,
 0,
 9,
 10,
 10,
 0,
 3,
 1,
 9,
 8,
 6,
 9,
 10,
 2,
 8,
 2,
 8,
 9,
 0,
 2,
 0,
 0,
 10,
 7,
 0,
 10,
 10,
 7,
 8,
 7,
 1,
 0,
 10,
 0,
 3,
 2,
 3,
 10,
 6,
 10,
 0,
 0,
 0,
 2,
 4,
 8,
 0,
 0,
 0,
 10,
 0,
 4,
 0,
 9,
 10,
 10,
 3,
 5,
 4,
 10,
 0,
 9,
 0,
 0,
 10,
 10,
 10,
 7,
 2,
 0,
 3,
 10,
 0,
 0,
 0,
 0,
 7,
 10,
 7,
 1,
 10,
 10,
 10,
 10,
 0,
 1,
 0,
 10,
 8,
 8,
 10,
 9,
 0,
 10,
 9,
 0,
 7,
 0,
 0,
 4,
 0,
 10,
 10,
 10,
 10,
 5,
 10,
 9,
 0,
 10,
 10,
 2,
 0,
 9,
 10,
 4,
 2,
 10,
 0,
 10,
 8,
 10,
 10,
 0,
 8,
 10,
 10,
 0,
 0,
 1,
 0,
 1,
 10,
 0,
 0,
 0,
 0,
 0,
 10,
 10,
 0,
 10,
 1,
 2,
 10,
 10,
 5,
 0,
 0,
 7,
 0,
 10,
 0,
 0,
 10,
 0,
 7,
 10,
 10,
 1,
 10,
 7,
 10,
 10,
 10,
 10,
 3,
 10,
 9,
 10,
 9,
 10,
 10,
 0,
 4,
 0,
 0,
 5,
 10,
 2,
 0,
 0,
 10,
 0,
 10,
 10,
 0,
 3,
 9,
 9,
 

In [26]:
# Turn the vote counts into 0/1 labels: 1 when an image collected more than 4 votes.
# The builtin `int` is used as dtype; the `np.int` alias was deprecated in NumPy 1.20
# and removed in 1.24, and it referred to the same builtin type.
submission_final = (np.array(submission) > 4).astype(int)

In [27]:
# Write the final labels to the submission file, one value per line.
with open('isbi_valid.predict', 'w') as out_file:
    out_file.writelines("{}\n".format(label) for label in submission_final)