# Imports

In [4]:
import os
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"]="1"

%reload_ext autoreload
%autoreload 2
#%matplotlib notebook
%matplotlib inline

from typing import List, Callable, Dict

import sys
import os

#https://github.com/FAU-DLM/wsi_processing_pipeline
sys.path.append("../")
sys.path.append("../wsi_processing_pipeline/")
sys.path.append("../wsi_processing_pipeline/tile_extraction")
sys.path.append("../wsi_processing_pipeline/shared")
import wsi_processing_pipeline
import tile_extraction
import preprocessing
import postprocessing
import shared
from wsi_processing_pipeline.tile_extraction import tiles, util, slide, filter
from wsi_processing_pipeline import shared
from wsi_processing_pipeline.preprocessing import *
import wsi_processing_pipeline.preprocessing.files_getter
import wsi_processing_pipeline.preprocessing.name_getter
from wsi_processing_pipeline.preprocessing.name_getter import NameGetter
from wsi_processing_pipeline.preprocessing.tile_image_block import TileImage, label_tl_image, tile_image, TileTransform
from wsi_processing_pipeline.preprocessing.tile_image_block import show_batch, TileImageBlock
from shared.patient_manager import PatientManager
import shared

sys.path.append('../models-pytorch/pretrained-models.pytorch')
import pretrainedmodels
from pretrainedmodels import *


import fastai
from fastai.vision.all import *
from typing import Dict
import pandas as pd
import numpy as np
import os
import torch
torch.backends.cudnn.benchmark=True
import torchvision
from torchvision.models import *
from torchsummary import summary
from functools import partial, update_wrapper
from tqdm import tqdm_notebook as tqdm
import matplotlib.image as mpimg
import shutil
from PIL import Image
import sklearn
from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold, KFold
from tqdm.notebook import tqdm

import pathlib
from pathlib import Path
Path.ls = lambda x: [p for p in list(x.iterdir()) if '.ipynb_checkpoints' not in p.name]


from concurrent.futures import as_completed, ProcessPoolExecutor
from tqdm import tqdm

#against DecompressionBombWarning
Image.MAX_IMAGE_PIXELS = 10000000000   

PATH = Path('/home/Deep_Learner/private/datasets/glioblastoma_methylation/')
LABELS = PATH/'labels.xlsx'

METHYLATED_1 = PATH/'methylated'
METHYLATED_2 = PATH/'data2018_19'/'methylated'
NON_METHYLATED_1 = PATH/'non_methylated'
NON_METHYLATED_2 = PATH/'data2018_19'/'non_methylated'



seed = 42
np.random.seed(seed)

# Data

## tile paths

In [2]:
tile_paths=wsi_processing_pipeline.preprocessing.files_getter.FilesGetter().get_dirs_and_files(path=PATH, 
                                                                        get_files=True, 
                                                                        suffix='.png', 
                                                                        recursive=True)

213it [01:26,  2.46it/s]


In [3]:
len(tile_paths)

1938980

## check images

In [5]:
def check_image(path):
    try:
        open_image(path)
    except Exception as e:
        return path
        
    return None

In [None]:
with ProcessPoolExecutor(max_workers=32) as pool:
    with tqdm(total=len(tile_paths)) as progress:
        futures = []

        for p in tile_paths:
            future = pool.submit(check_image, p)
            future.add_done_callback(lambda p: progress.update())
            futures.append(future)

        results = []
        for future in as_completed(futures):
            result = future.result()
            if result is not None:
                results.append(result)

In [None]:
failed = []
for p in tqdm(tile_paths_1819):
    r = check_image(p)
    if r is not None:
        failed.append(r)

## id and label funcs

In [5]:
labels_df = pd.read_excel(LABELS)

In [6]:
labels_df

Unnamed: 0,patient_id,case_id,slide_id,methylation_status,Unnamed: 4
0,Schwemmer-Andreas-1974-03-14,N296-08,N296-08-I,0.0,
1,Schwemmer-Andreas-1974-03-14,N296-08,N296-08-II,0.0,
2,Ballmann-Artur-1934-06-04,N320-08,N320-08,0.0,
3,Ballmann-Artur-1934-06-04,N320-08,N320-08,0.0,
4,Brandt-Ernst-1941-07-14,N608-08,N608-08,0.0,
...,...,...,...,...,...
206,Härtl_Kaczinski-Anita-1943-01-13,N1606-18,N1606-18,1.0,
207,Heinrich-Dieter-1961-12-21,N1416-18,N1416-18,1.0,
208,Herold-Walter-1953-11-08,N1393-18,N1393-18,1.0,
209,Novic-Stanislav_Valentinovych-1974-02-23,N1300-18,N1300-18,1.0,


In [7]:
def get_slide_id_from_parent_folder_name(parent_folder_name:str)->str:
    split = parent_folder_name.split(sep='.')
    if((len(split) == 3) or (len(split) == 2 and 'I' in split[1])):
        return f'{split[0]}-{split[1]}'
    elif(len(split) == 2):
        return split[0]#
    elif(len(split) == 1):
        return parent_folder_name
    else:
        assert False

def get_patient_id_from_parent_folder_name(parent_folder_name:str)->str:
    slide_id = get_slide_id_from_parent_folder_name(parent_folder_name)
    return list(labels_df.loc[labels_df['slide_id'] == slide_id]['patient_id'])[0]

def get_patient_id_from_tile_path(tile_path:pathlib.Path)->str:
    slide_id = get_slide_id_from_parent_folder_name(tile_path.parent.name)
    return list(labels_df.loc[labels_df['slide_id'] == slide_id]['patient_id'])[0]

def get_case_id_from_tile_path(tile_path:pathlib.Path)->str:
    slide_id = get_slide_id_from_parent_folder_name(tile_path.parent.name)
    return list(labels_df.loc[labels_df['slide_id'] == slide_id]['case_id'])[0]

def get_slide_id_from_tile_path(tile_path:pathlib.Path)->str:
    return get_slide_id_from_parent_folder_name(tile_path.parent.name)

def get_label_from_tile_path(tile_path:pathlib.Path)->List[int]:
    if('non_methylated' in str(tile_path)):
        return [0]
    if('methylated' in str(tile_path)):
        return [1]
    assert False

## patient manager

In [8]:
patient_manager = PatientManager()

In [9]:
patient_manager.create_from_preextracted_tiles(tile_paths=tile_paths, 
                                               patient_id_getter=get_patient_id_from_tile_path,
                                               case_id_getter=get_case_id_from_tile_path, 
                                               slide_id_getter=get_slide_id_from_tile_path, 
                                               labels_getter=get_label_from_tile_path)

100%|██████████| 1938980/1938980 [28:11<00:00, 1146.48it/s] 


In [10]:
train_size = 0.8
validation_size = 0.2
test_size = 0
patient_manager.split(train_size=train_size, validation_size=validation_size, test_size=test_size, random_state=seed)