In [2]:
# Reload imported modules automatically before each cell is executed, so that
# edits to local helper code are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
from SlideRunner_dataAccess.database import Database
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np
import openslide
import pandas as pd
import pyvips
import os
import re

In [4]:
# Directory layout used throughout the notebook:
#   root     - the current user's home directory
#   data_dir - folder holding the TMA slide files
#   proj_dir - project folder below the home directory
#   database - SQLite annotation database opened further down
root = Path.home()
data_dir = Path('/data/department/aubreville/datasets/TMAs')
proj_dir = root.joinpath('projects/Cox_AMIL')
database = root.joinpath('projects/Cox_AMIL/tma_databases/Mainz_HE.sqlite')

In [5]:
# Fail early with a clear message when the database path is wrong, instead of
# hitting an obscure error on the first query.
if not database.is_file():
    raise FileNotFoundError('SlideRunner database not found: {}'.format(database))

DB = Database().open(str(database))

# join annoId with classes
# LEFT JOIN keeps every class, so classes without any label get a NULL annoId
# (presumably the reason anno_id is displayed as a float further down).
get_cores = 'SELECT Classes.uid, Classes.name, Annotations_label.annoId FROM Classes LEFT JOIN Annotations_label ON Classes.uid=Annotations_label.class'
cores = pd.DataFrame(DB.execute(get_cores).fetchall(), columns=['core_id', 'core_label', 'anno_id'])

# get coordinates
# The WHERE clause on Annotations.deleted only keeps coordinates that belong
# to an existing, non-deleted annotation.
get_coords = 'SELECT annoId, coordinateX, coordinateY, Annotations_coordinates.slide FROM Annotations_coordinates LEFT JOIN Annotations ON Annotations_coordinates.annoId=Annotations.uid WHERE Annotations.deleted=0'
coords = pd.DataFrame(DB.execute(get_coords).fetchall(), columns=['anno_id', 'x', 'y', 'slide_id'])

# join cores and coords
# Inner merge: only annotations present in both tables survive, giving one row
# per annotation coordinate together with its core id and label.
df = pd.merge(cores, coords, how='inner', on='anno_id')

# get slides
get_slides = 'SELECT uid, filename, directory FROM Slides'
slides = pd.DataFrame(DB.execute(get_slides).fetchall(), columns=['slide_id', 'filename', 'dir'])

In [6]:
# ============================================ #
#TODO: Change location and filenames to remove these manual steps!!!

# The only two directories a slide can end up in after remapping.
_primary_dir = str(data_dir)
_delivery2_dir = os.path.join(data_dir, 'Delivery2')


def _remap_slide_dir(raw_dir: str) -> str:
    """Translate a directory stored in the database to its local location.

    Backslashes are normalised to forward slashes first. A directory whose
    last path component is '01' maps to ``data_dir``; every other directory
    maps to ``data_dir/Delivery2``.

    A directory that already equals one of these two targets is returned
    unchanged. Without this guard, running the cell a second time would send
    every slide to ``Delivery2``, because no remapped path ends in '01'.
    """
    cleaned = raw_dir.replace('\\', '/')
    if cleaned in (_primary_dir, _delivery2_dir):
        return cleaned
    if cleaned.split('/')[-1] == '01':
        return _primary_dir
    return _delivery2_dir


# clean dirs + change dirs
slides = slides.assign(dir=[_remap_slide_dir(d) for d in slides.dir])

# add level
# Slide 5 gets level 1, all other slides level 0; the value is later passed
# on as `level` when cores are displayed.
slides = slides.assign(level=[1 if i == 5 else 0 for i in slides.slide_id])

# add patient_id
# Core labels look like 'P21_3'; the number between 'P' and '_' is taken as
# the patient id.
df = df.assign(patient_id=[int(re.split('P|_', s)[1]) for s in df.core_label])
# # ============================================ #

In [7]:
# Preview the merged table: one row per annotation coordinate, with the core
# id/label, slide id and the derived patient id.
df.head()

Unnamed: 0,core_id,core_label,anno_id,x,y,slide_id,patient_id
0,63,P21_3,2.0,63416,72291,2,21
1,63,P21_3,2.0,63670,72291,2,21
2,63,P21_3,2.0,63712,72334,2,21
3,63,P21_3,2.0,63755,72334,2,21
4,63,P21_3,2.0,63797,72376,2,21


In [8]:
# Show the slide table with the remapped directories and the added level column.
slides

Unnamed: 0,slide_id,filename,dir,level
0,1,DigitalSlide_A1M_7S_1.mrxs,/data/department/aubreville/datasets/TMAs/Deli...,0
1,2,706930-3 TMA III.mrxs,/data/department/aubreville/datasets/TMAs,0
2,3,706930-2 TMA VI.mrxs,/data/department/aubreville/datasets/TMAs,0
3,4,DigitalSlide_A1M_6S_1.mrxs,/data/department/aubreville/datasets/TMAs/Deli...,0
4,5,DigitalSlide_A1M_7S_1_1.mrxs,/data/department/aubreville/datasets/TMAs,1
5,6,DigitalSlide_A1M_1S_1.mrxs,/data/department/aubreville/datasets/TMAs/Deli...,0
6,7,DigitalSlide_A1M_2S_1.mrxs,/data/department/aubreville/datasets/TMAs/Deli...,0
7,8,DigitalSlide_A1M_3S_1.mrxs,/data/department/aubreville/datasets/TMAs/Deli...,0
8,9,DigitalSlide_A1M_9S_1.mrxs,/data/department/aubreville/datasets/TMAs/Deli...,0
9,10,DigitalSlide_A1M_10S_1.mrxs,/data/department/aubreville/datasets/TMAs/Deli...,0


## A few sanity checks


In [9]:
import sys
sys.path.append('..')

from tma_utils.tma_utils import check_coreID_range, check_patientID_range

In [10]:
# Presumably verifies that all patient ids annotated on slide 2 lie within
# [21, 40] - confirm against tma_utils.check_patientID_range.
check_patientID_range(slide_id=2, low=21, high=40, df=df)

slide_id 2: True


In [11]:
# Same check for slide 3 with the patient id range 81-100.
check_patientID_range(slide_id=3, low=81, high=100, df=df)

slide_id 3: True


In [12]:
# NOTE(review): this reuses the 81-100 range of slide 3 and the check reports
# False - confirm which patient id range is actually expected on slide 4.
check_patientID_range(slide_id=4, low=81, high=100, df=df)

slide_id 4: False


In [13]:
# Bounds handed to check_coreID_range as `min` / `max`; annotations outside of
# them are reported as suspiciously small / large.
# NOTE(review): presumably the x/y extent of a core annotation in pixels - confirm.
MIN = 2000
MAX = 8000

separator = '=' * 50
for slide_id in sorted(df.slide_id.unique()):
    x_range, y_range, too_big, too_small = check_coreID_range(
        slide_id=slide_id,
        df=df,
        min=MIN,
        max=MAX,
        verbose=True,
    )
    # Summarise the typical annotation extent of this slide.
    median_x = np.median(x_range)
    median_y = np.median(y_range)
    print('\nSlide {}: median x {}, median y {}\n'.format(slide_id, median_x, median_y))
    print(separator)
    


Slide 1 with 1 suspicously large core annotations.
Core 177, x_range 8239, y_range 6710

Slide 1 without suspicously small core annotations.

Slide 1: median x 6822.0, median y 6580.0


Slide 2 without suspicously large core annotations.

Slide 2 without suspicously small core annotations.

Slide 2: median x 6903.0, median y 5929.0


Slide 3 without suspicously large core annotations.

Slide 3 without suspicously small core annotations.

Slide 3: median x 6664.0, median y 6291.0


Slide 4 with 1 suspicously large core annotations.
Core 78, x_range 8052, y_range 6413

Slide 4 without suspicously small core annotations.

Slide 4: median x 6897.0, median y 5816.0


Slide 5 with 51 suspicously large core annotations.
Core 63, x_range 12317, y_range 12915
Core 64, x_range 13139, y_range 11796
Core 65, x_range 14035, y_range 11348
Core 66, x_range 11496, y_range 13288
Core 67, x_range 13960, y_range 10601
Core 68, x_range 13736, y_range 10899
Core 69, x_range 11870, y_range 13811
Core 70, x

### Visualize suspicious cases

In [14]:
from tma_utils.tma_utils import extract_core

def show_core(slide: openslide.OpenSlide,
              df: pd.DataFrame,
              level: int=0,
              core_id:int=None,
              core_name: str=None,
              figsize: tuple=(10, 10)) -> None:
    """Display a single core of a slide with matplotlib.

    The core image is obtained from ``extract_core``; ``slide``, ``df``,
    ``level``, ``core_id`` and ``core_name`` are forwarded to it unchanged.
    When ``extract_core`` returns ``None`` nothing is drawn.

    Args:
        slide: Opened slide the core is taken from.
        df: Annotation table passed on to ``extract_core``.
        level: Forwarded to ``extract_core`` (presumably the pyramid level
            to read from - confirm in tma_utils).
        core_id: Forwarded to ``extract_core`` to select the core.
        core_name: Forwarded to ``extract_core`` to select the core.
        figsize: Size of the matplotlib figure.
    """
    core_image = extract_core(slide, df, level, core_id, core_name)

    # Nothing to show when no core could be extracted.
    if core_image is None:
        return

    plt.figure(figsize=figsize)
    plt.imshow(core_image)
    plt.show()

In [15]:
# Walk over the slides in ascending id order (same order as the size report
# above) and display every core whose annotation is suspiciously small.
for slide_id in sorted(df.slide_id.unique()):
    x_ranges, y_ranges, too_big, too_small = check_coreID_range(slide_id=slide_id, df=df, min=MIN, max=MAX)

    # Only undersized annotations are inspected here; assign `too_big` instead
    # to review the oversized ones.
    suspicious = too_small
    if len(suspicious) == 0:
        # Nothing to display: do not open the (large) slide file at all.
        continue

    # Look up the slide's row once; .item() raises if the id is not unique.
    is_current = slides.slide_id == slide_id
    slide_fn = Path(slides.dir[is_current].item()) / slides.filename[is_current].item()
    slide_level = slides.level[is_current].item()
    slide_df = df.loc[df.slide_id == slide_id]

    # The context manager closes the slide handle once all cores are shown,
    # so handles do not accumulate over the loop.
    with openslide.open_slide(str(slide_fn)) as slide:
        for core_id, width, height in suspicious:
            print('Slide {}, core {}, width {}, height {}'.format(slide_id, core_id, width, height))
            show_core(slide, slide_df, core_id=core_id, level=slide_level)
