In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
import sqlite3
import numpy as np
from SlideRunner.dataAccess.database import Database
from tqdm import tqdm
from pathlib import Path
import openslide
from random import randint
from Detection.data_loader import *
import pickle

In [3]:
path = Path('/data/Datasets/EIPH_WSI/')

database = Database()
database.open(str(path/'EIPH.sqlite'))

files = []
lbl_bbox = []
size = 512

In [4]:
getslides = """SELECT uid, filename FROM Slides"""
for currslide, filename in tqdm(database.execute(getslides).fetchall()):
    database.loadIntoMemory(currslide)

    check = True if 'erliner' in filename else False
    slidetype = 'Berliner Blau/' if check else 'Turnbull Blue/'

    slide_path = path / slidetype / filename

    slide = openslide.open_slide(str(slide_path))
    level = 1#slide.level_count - 1
    level_dimension = slide.level_dimensions[level]
    down_factor = slide.level_downsamples[level]

    classes = {3: 0, 4: 1, 5: 2, 6: 3, 7: 4}
    labels, bboxes = [], []
    for id, annotation in database.annotations.items():
        if annotation.labels[0].classId in classes:
            d = 2 * annotation.r / down_factor
            x_min = (annotation.x1 - annotation.r) / down_factor
            y_min = (annotation.y1 - annotation.r) / down_factor
            x_max = x_min + d
            y_max = y_min + d
            label = classes[annotation.labels[0].classId]

            bboxes.append([int(x_min), int(y_min), int(x_max), int(y_max)])
            labels.append(label)

    if len(bboxes) > 0:
        lbl_bbox.append([bboxes, labels])
        files.append(SlideContainer(slide_path, level, size, size))

  0%|          | 0/24 [00:00<?, ?it/s]

Loading DB into memory ...


  4%|▍         | 1/24 [00:00<00:07,  3.22it/s]

Loading DB into memory ...


  8%|▊         | 2/24 [00:01<00:17,  1.23it/s]

Loading DB into memory ...


 12%|█▎        | 3/24 [00:02<00:14,  1.41it/s]

Loading DB into memory ...


 17%|█▋        | 4/24 [00:02<00:12,  1.60it/s]

Loading DB into memory ...


 21%|██        | 5/24 [00:02<00:10,  1.85it/s]

Loading DB into memory ...
Loading DB into memory ...


 33%|███▎      | 8/24 [00:03<00:06,  2.41it/s]

Loading DB into memory ...
Loading DB into memory ...


 46%|████▌     | 11/24 [00:03<00:04,  3.00it/s]

Loading DB into memory ...
Loading DB into memory ...
Loading DB into memory ...


 50%|█████     | 12/24 [00:03<00:03,  3.02it/s]

Loading DB into memory ...
Loading DB into memory ...
Loading DB into memory ...
Loading DB into memory ...


 71%|███████   | 17/24 [00:04<00:01,  3.56it/s]

Loading DB into memory ...
Loading DB into memory ...


 79%|███████▉  | 19/24 [00:04<00:01,  3.84it/s]

Loading DB into memory ...
Loading DB into memory ...
Loading DB into memory ...


 88%|████████▊ | 21/24 [00:05<00:00,  3.78it/s]

Loading DB into memory ...
Loading DB into memory ...


100%|██████████| 24/24 [00:06<00:00,  3.66it/s]

Loading DB into memory ...





In [6]:
img2bbox = dict(zip(files, np.array(lbl_bbox)))
get_y_func = lambda o:img2bbox[o]
w, h = size, size

num_examples_per_image = 100
train_files = files[4:]
valid_files = files[4:]

In [7]:
def extract_histogram_and_score(file: SlideContainer, boxes, labels, classes, num_examples_per_image):
    image_x, image_y = [], []

    for i in range(num_examples_per_image):
        class_id = np.random.choice(classes, 1)[0]
        ids = labels == class_id
        xmin, ymin, xmax, ymax = np.array(boxes)[ids][randint(0, np.count_nonzero(ids) - 1)]

        x = int(xmin - w / 2)
        y = int(ymin - h / 2)

        # select_boxes
        select_boxes = np.copy(boxes)
        select_boxes[:, [0, 2]] = select_boxes[:, [0, 2]] - x
        select_boxes[:, [1, 3]] = select_boxes[:, [1, 3]] - y

        bb_widths = (select_boxes[:, 2] - select_boxes[:, 0]) / 2
        bb_heights = (select_boxes[:, 3] - select_boxes[:, 1]) / 2

        ids = ((select_boxes[:, 0] + bb_widths) > 0) \
              & ((select_boxes[:, 1] + bb_heights) > 0) \
              & ((select_boxes[:, 2] - bb_widths) < w) \
              & ((select_boxes[:, 3] - bb_heights) < h)

        select_labels = np.copy(labels)[ids]

        patch = file.get_patch(x,y)
        score = np.mean(select_labels)

        histogram = np.concatenate((np.histogram(patch[:,:,0], bins=256)[0],
                                    np.histogram(patch[:,:,1], bins=256)[0],
                                    np.histogram(patch[:,:,2], bins=256)[0]))

        image_x.append(histogram)
        image_y.append(score)

    return image_x, image_y

In [12]:
train_x, train_y = [], []
val_x, val_y = [], []

In [13]:
for file in tqdm(train_files):

    boxes, labels = get_y_func(file)
    boxes = np.array(boxes)
    labels = np.array(labels)
    classes = list(set(labels))

    x, y = extract_histogram_and_score(file,
                                       boxes,
                                       labels,
                                       classes,
                                       num_examples_per_image)
    train_x.extend(x)
    train_y.extend(y)

100%|██████████| 13/13 [01:08<00:00,  5.29s/it]


In [14]:
for file in tqdm(valid_files):

    boxes, labels = get_y_func(file)
    boxes = np.array(boxes)
    labels = np.array(labels)
    classes = list(set(labels))

    x, y = extract_histogram_and_score(file, boxes, labels,classes, num_examples_per_image)
    val_x.extend(x)
    val_y.extend(y)

100%|██████████| 13/13 [01:10<00:00,  5.40s/it]


In [15]:
result = {'train_x': train_x, 'train_y': train_y, 'val_x': val_x, 'val_y': val_y}
pickle.dump( result, open( "train_histo.p", "wb" ) )