In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
import os
from pathlib import Path
from tqdm import tqdm
import matplotlib.pyplot as plt
import random

In [3]:
import cv2
from fastai import *
from fastai.vision import *
from fastai.callbacks import *

In [4]:
from dataset_spliter import SplitByPatient
from metrics import *#F1Weighted, MCC
from losses import *

In [5]:
import pretrainedmodels

In [6]:
path = Path('/data/Datasets/WhiteBloodCancer/train/')
test_path = Path('/data/Datasets/WhiteBloodCancer/C-NMC_test_final_phase_data/')

In [7]:
np.random.seed(42)

In [8]:
fnames = get_image_files(path, recurse=True)
fnames[:5]

[PosixPath('/data/Datasets/WhiteBloodCancer/train/fold_1/hem/UID_H10_43_1_hem.bmp'),
 PosixPath('/data/Datasets/WhiteBloodCancer/train/fold_1/hem/UID_H22_31_15_hem.bmp'),
 PosixPath('/data/Datasets/WhiteBloodCancer/train/fold_1/hem/UID_H14_9_11_hem.bmp'),
 PosixPath('/data/Datasets/WhiteBloodCancer/train/fold_1/hem/UID_H14_28_6_hem.bmp'),
 PosixPath('/data/Datasets/WhiteBloodCancer/train/fold_1/hem/UID_H10_189_1_hem.bmp')]

In [9]:
len(fnames)

12528

In [10]:
#train_files_regex = re.compile(r'(fold_0|fold_1|fold_2)')

#fnames = [fn for fn in fnames if train_files_regex.search(str(fn)) is not None]
#len(fnames)

In [11]:
hem_regex = re.compile(r'UID_(H[0-9]+)_', re.IGNORECASE)
all_regex = re.compile(r'UID_([0-9]+)_', re.IGNORECASE)

In [12]:
hem_patient_ids = list(set([hem_regex.search(str(fn)).group(1)
                            for fn in fnames if hem_regex.search(str(fn)) is not None]))
all_patint_ids = list(set([all_regex.search(str(fn)).group(1)
                           for fn in fnames if all_regex.search(str(fn)) is not None]))

hem_patients = dict((k,[]) for k in hem_patient_ids)
all_patints = dict((k,[]) for k in all_patint_ids)

[all_patints[key].append(fn) for key in all_patints.keys() for fn in fnames if 'UID_{0}_'.format(key) in str(fn)]
[hem_patients[key].append(fn) for key in hem_patients.keys() for fn in fnames if 'UID_{0}_'.format(key) in str(fn)]
print()




## Split data into train val 

In [13]:
split_handler = SplitByPatient(hem_patients, all_patints)

In [14]:
pat = re.compile(r'^.*(hem|all).bmp$')

def get_label(fn):
    return pat.search(str(fn)).group(1)

#### Data augmentation

In [15]:
xtra_tfms=[cutout(n_holes=5, length=0.2)]#squish(scale=0.66), 
tfms = get_transforms(do_flip=True, 
                      flip_vert=True, 
                      #max_rotate=90,  
                      max_lighting=0., 
                      max_zoom=1.0, 
                      max_warp=0.0,
                      #p_affine=0.75,
                      #p_lighting=0.75,  
                      #xtra_tfms=xtra_tfms,
                     )

#### Create dataset 

In [16]:
def get_data(bs, size, train_index, test_index):

    lls = ImageList(fnames, path=path)\
        .split_by_idxs(train_index, test_index)\
        .label_from_func(get_label).add_test_folder('../C-NMC_test_final_phase_data')
    data = ImageDataBunch.create_from_ll(lls, size=size, bs=bs, ds_tfms=tfms).normalize()
    
    return data    

In [17]:
y_pred_list = {}

In [19]:
idx = 0
for i in tqdm(range(10)):
    train, val, keys = split_handler.split_by_num_patients(fnames)
    
    for train_index, test_index in zip(train, val):
        size = 256
        bs = 96

        data = get_data(bs, size, train_index, test_index)

        gc.collect()
        
        experiment_name = "baseline_resnet18_pat_final_{}".format(i)
        learn = create_cnn(data, models.resnet18, 
                    #cut=-2,
                       metrics=[error_rate, F1Weighted(), MCC()], #  
                       #loss_func=FocalLoss(num_classes=1),
                       #ps=0.75,
                       #wd=0.1,
                       loss_func = LabelSmoothingCrossEntropy(),
                       callback_fns=[partial(SaveModelCallback, name='stage1-{}-{}'.format(experiment_name, size))],

                  )#
        
        for size, bs in [[256, 96], [384, 32]]:
            learn.data = get_data(bs, size, train_index, test_index)
            learn.freeze()
            
            lr = 1e-2
            learn.fit_one_cycle(5, lr)

            learn.unfreeze()
            learn.fit_one_cycle(10, slice(1e-5,lr/5))
            
            y_pred, y_test_tta = learn.TTA(ds_type=DatasetType.Test, scale=1.15)#, beta=0.4, scale=1.3
            y_pred = to_np(y_pred)
             
            y_pred_list["{}-{}-{}".format(i, size, idx)] = y_pred
            idx += 1

100%|██████████| 10/10 [14:20:46<00:00, 5164.62s/it] 


In [19]:
result

{'0-256-0': 0.7995936902072432,
 '0-384-1': 0.8489862238092865,
 '0-450-2': 0.8723428478349992,
 '0-256-3': 0.8285340015124422,
 '0-384-4': 0.8427879157212518,
 '0-450-5': 0.8399238883700976,
 '1-256-6': 0.8069987210387433,
 '1-384-7': 0.8684544697155888,
 '1-450-8': 0.8047994689709497,
 '1-256-9': 0.8164101675144312,
 '1-384-10': 0.8535520075158467,
 '1-450-11': 0.8357555006644669,
 '2-256-12': 0.7300292909503615,
 '2-384-13': 0.8068206427092612,
 '2-450-14': 0.8434720235986691,
 '2-256-15': 0.8106300441395914,
 '2-384-16': 0.848887224231825,
 '2-450-17': 0.8408258026250278,
 '3-256-18': 0.839548445830632,
 '3-384-19': 0.8669657242412542,
 '3-450-20': 0.8655365552040435,
 '3-256-21': 0.8339658241852698,
 '3-384-22': 0.8392564522228275,
 '3-450-23': 0.8347522587059969,
 '4-256-24': 0.8119369623223438,
 '4-384-25': 0.8376105149695994,
 '4-450-26': 0.7508485326251219,
 '4-256-27': 0.7976507732153674,
 '4-384-28': 0.8604458725984706,
 '4-450-29': 0.8494024906817047,
 '5-256-30': 0.8214393

In [20]:
[y_pred_list[y][:, 0] for y in y_pred_list if "-384" in y]

[array([ 1.17305 , -1.078827, -0.587471,  1.059145, ..., -0.986551,  1.055845, -0.27743 ,  1.103063], dtype=float32),
 array([ 1.034356, -0.880579, -0.42716 ,  0.691032, ..., -0.930529,  0.967228, -0.505835,  1.035036], dtype=float32),
 array([ 1.094852, -0.924338, -0.736209,  0.893698, ..., -1.137465,  1.004641,  0.393359,  1.13653 ], dtype=float32),
 array([ 0.881661, -0.880843, -0.504006,  0.735202, ..., -0.942648,  0.572962, -0.528113,  0.902582], dtype=float32),
 array([ 1.257224, -0.653726, -0.867302,  1.052966, ..., -1.34411 ,  1.09541 , -0.314992,  1.411245], dtype=float32),
 array([ 1.189002,  0.401337, -0.707318,  0.722549, ..., -1.271924,  1.204975,  0.531803,  1.182332], dtype=float32),
 array([ 0.974816, -0.470859, -0.415062,  0.875301, ..., -0.956167,  1.007345,  0.574717,  1.06136 ], dtype=float32),
 array([ 1.165715, -0.34995 , -0.877867,  1.167225, ..., -1.069686,  1.101481,  0.044122,  1.132237], dtype=float32),
 array([ 1.406365, -0.331139, -0.248698,  1.16936 , ...,

In [21]:
len([y_pred_list[y][:, 0] for y in y_pred_list if "-384" in y])

20

In [22]:
models = len([y_pred_list[y][:, 0] for y in y_pred_list if "-384" in y])
submission = [0 for i in range(2586)]
for i in range(models):
    ALL = [y_pred_list[y][:, 0] for y in y_pred_list if "-384" in y][i]
    normal = [y_pred_list[y][:, 1] for y in y_pred_list if "-384" in y][i]


    for fn, a, normal in zip(learn.data.test_dl.items, ALL, normal):
        index = int(fn.name.replace(".bmp", '')) - 1
        submission[index] += 1 if a > normal else 0

In [23]:
submission

[20,
 0,
 0,
 20,
 20,
 20,
 20,
 19,
 20,
 0,
 5,
 9,
 15,
 17,
 0,
 9,
 0,
 20,
 20,
 0,
 0,
 2,
 11,
 0,
 16,
 2,
 1,
 20,
 20,
 20,
 0,
 11,
 19,
 1,
 0,
 0,
 20,
 20,
 0,
 0,
 0,
 0,
 11,
 20,
 20,
 0,
 0,
 0,
 20,
 4,
 18,
 20,
 19,
 1,
 18,
 3,
 13,
 12,
 0,
 6,
 0,
 0,
 20,
 20,
 0,
 20,
 19,
 14,
 0,
 2,
 1,
 6,
 20,
 0,
 10,
 18,
 1,
 20,
 1,
 20,
 1,
 0,
 4,
 18,
 18,
 20,
 0,
 0,
 2,
 20,
 0,
 18,
 3,
 18,
 20,
 20,
 0,
 20,
 3,
 19,
 0,
 6,
 1,
 0,
 20,
 20,
 20,
 15,
 0,
 3,
 0,
 20,
 0,
 0,
 0,
 0,
 16,
 19,
 16,
 1,
 20,
 20,
 20,
 20,
 0,
 5,
 2,
 20,
 5,
 14,
 20,
 13,
 0,
 20,
 10,
 0,
 9,
 0,
 9,
 17,
 0,
 20,
 6,
 20,
 20,
 20,
 20,
 13,
 0,
 20,
 20,
 11,
 0,
 16,
 20,
 9,
 0,
 20,
 0,
 18,
 20,
 20,
 20,
 0,
 19,
 20,
 20,
 0,
 0,
 1,
 4,
 17,
 20,
 0,
 1,
 0,
 1,
 4,
 20,
 20,
 2,
 20,
 3,
 2,
 10,
 17,
 16,
 0,
 0,
 16,
 0,
 20,
 0,
 0,
 20,
 0,
 7,
 19,
 20,
 2,
 20,
 0,
 19,
 20,
 20,
 20,
 1,
 20,
 20,
 20,
 20,
 20,
 16,
 0,
 6,
 0,
 0,
 15,
 20,
 18,
 1,
 

In [45]:
np.histogram(submission, bins=20)

(array([733, 141,  65,  55,  46,  37,  49,  21,  37,  29,  34,  32,  28,  38,  46,  35,  55,  46,  85, 974]),
 array([ 0.,  1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20.]))

In [34]:
submission_final = (np.array(submission) > 9).astype(np.int)

In [30]:
len((np.array(submission) > 10).astype(np.int))

2586

In [48]:
for i in range(20):
    print("{}  {}".format(i, np.count_nonzero((np.array(submission) >= i).astype(np.int))))

0  2586
1  1853
2  1712
3  1647
4  1592
5  1546
6  1509
7  1460
8  1439
9  1402
10  1373
11  1339
12  1307
13  1279
14  1241
15  1195
16  1160
17  1105
18  1059
19  974


In [35]:
with open('isbi_valid.predict', 'w') as f:
    for item in submission_final:
        f.write("{}\n".format(item))

In [50]:
learn.export("Final_rn-18-Patient.pkl")