In [1]:
import os
import sys
sys.version_info

sys.version_info(major=3, minor=6, micro=3, releaselevel='final', serial=0)

In [11]:
import pandas as pd
import numpy as np
print(pd.__version__)
print(np.__version__)

import tqdm
import openslide
import cv2 #opencv for image segmentation
import matplotlib.pyplot as plt
import matplotlib

from PIL import Image

0.21.1
1.14.0


* Go through the slides.csv file
* Split into test/validate/train groups 
* using train, apply class labels (10 equal groups of outcomes)
    *    use groups to apply to valid/test
  


In [3]:
np.random.seed(20180502) #so we get the same groupings of train/valid/test
df = pd.read_csv('slides.csv')

# From Olivier to use just tissue type 01
def get_tissue_type(x):
    return x.split('-')[4]

tissue_type = df.slide_file_name.apply(get_tissue_type)
df = df.loc[tissue_type == '01',:].copy()


## equal groups?  is that ok or should it be biased to train?
valid_days_to_death = df['days_to_death'].dropna()
train_groups = np.random.choice(['train','train','valid','test'],len(valid_days_to_death))
df.loc[valid_days_to_death.index,'train_val_test'] = train_groups

In [4]:
cond = df.loc[valid_days_to_death.index,'train_val_test']=='train'

df['class'] =  np.nan
df.loc[df.loc[:,'days_to_death'] < 180,'class'] = 0
df.loc[df.loc[:,'days_to_death'] > 1500,'class'] = 1

In [5]:
df['class'].value_counts()

0.0    41
1.0    33
Name: class, dtype: int64

In [28]:
df.loc[df['class']==0,'train_val_test'].value_counts()


train    19
valid    12
test     10
Name: train_val_test, dtype: int64

Now groups and labels are defined in the dataframe.

Iterate over all the data directories and generate training tree structure:

`
./data/
  /train
    /0
    /1
    ...
    /9
  /valid
    /0
    /1
    ...
    /9
`
We can have different sized images and fast.ai will downsize to 256x256 for us.  We will use this feature.


In [30]:
def get_set_class_dir(filename,base='./'):
    img_class = df.loc[df['slide_file_name'].apply(str.upper)==filename.upper(),'class'].values[0]
    img_group = df.loc[df['slide_file_name'].apply(str.upper)==filename.upper(),'train_val_test'].values[0]
    
    if pd.isnull(img_group):
        img_group='other'
    set_class_dir = '/'.join([img_group,str(img_class)]) + '/'
    return base+set_class_dir

In [7]:
def pull_samples(slide_fn, level=1):
    slide = openslide.open_slide(slide_fn)
    
    keep = []
    empty = []
    bad = []
    tiles = []

    max_empty=0.50
    slide_dim = slide.level_dimensions[level]
    slide_dim_0 = slide.level_dimensions[0]


    slide_width, slide_height = slide_dim
    slide_width_0, slide_height_0 = slide_dim_0

    tile_rgba = np.array(slide.read_region((0,0),level,slide_dim))
    gray = cv2.cvtColor(tile_rgba,cv2.COLOR_BGR2GRAY)
    ret,thresh = cv2.threshold(gray,0,255,cv2.THRESH_BINARY_INV+cv2.THRESH_OTSU)

    #plt.imshow(tile_rgba)
    len(range(0, slide_height, 256)), len(range(0, slide_width, 256))

    pct_e = []
    for row in tqdm.tnrange(0, slide_height, 256):
        for col in range(0, slide_width, 256):
            tile_width = min(256, slide_width - col)
            tile_height = min(256, slide_height - row)

            tile_loc_0 = ( col * slide_width_0 // slide_width, row * slide_height_0 // slide_height  )
            tile = slide.read_region(tile_loc_0, level, (tile_width, tile_height))

            thresh_area = thresh[row:(row+tile_height), col:(col+tile_width)]
            area_sum = (thresh_area > -1).sum()
            if area_sum == 0:
                bad.append(tile)
            else:
                pct_empty = (thresh_area == 0).sum()/area_sum
                pct_e.append(pct_empty)
                
                if pct_empty < max_empty: 
                    keep.append(tile)
                else:
                    empty.append(tile)

    slide.close()
    return keep, empty

In [21]:
def tile_slides(slide_array,display=True):
    '''
    Create single 3x3 slide of each subsample
    
    '''
    out_side_length = 3
    num_slides = out_side_length**2

    
    (img_height,img_width) = slide_array[0].size #first image H x W
    total_width  = out_side_length * img_width
    total_height = out_side_length * img_height
    new_im = Image.new('RGB', (total_width, total_height))
    
    slide_sample_idx = np.random.choice(range(len(slide_array)), size=num_slides)

    x_offset,y_offset = (0,0)
#    for i, slide_idx in tqdm.tqdm_notebook(enumerate(slide_sample_idx), total=num_slides):
    for i, slide_idx in enumerate(slide_sample_idx[:num_slides]):

        new_im.paste(slide_array[slide_idx],(x_offset,y_offset))
        x_offset =(x_offset + img_width) % total_width
        if ((i+1) % out_side_length) == 0:
            y_offset += img_height
    if display:
        plt.imshow(new_im)
        
    return new_im

In [29]:
def get_images_put_in_dir(slide_filename,save_path,
                          display=False):
    
    keep,empty = pull_samples(slide_filename)
    for j in range(20):
        tiled_img = tile_slides(keep,display)
        save_name = os.sep.join(path+[file]).split(os.sep)[-1].split('.')[0]
        matplotlib.image.imsave(save_path+f'{save_name}_{j}.png',tiled_img)

In [31]:
BASE = './tmp_tiled/' #base to put in the selected images

counter = 0
for root, dirs, files in os.walk("./data/"):
    path = root.split(os.sep)
    for file in files:
        if file.endswith('.svs') and os.path.basename(file).split('-')[4] == '01':
            directory = get_set_class_dir(file,BASE)
            
            if (not 'nan' in directory) and ('train' in directory or 'valid' in directory):
                print(directory)
                print(os.sep.join(path+[file]))
                if not os.path.exists(directory):
                    os.makedirs(directory)
                get_images_put_in_dir(os.sep.join(path+[file]),directory)
            
#            counter += 1 
#    if counter == 50:
#        break

print(counter)

./tmp_tiled/train/1.0/
./data/58ae7226-5ec1-4f94-a49f-e9fa6893ec96/TCGA-DD-A3A5-11A-01-TS1.7E6F0BBB-FC7E-42F7-9F85-063057B2F417.svs



./tmp_tiled/valid/1.0/
./data/d4dbb21c-2943-436a-a51b-8ead4dfcbdde/TCGA-DD-A1E9-11A-01-TSA.fcc3e0b9-359d-4455-91e9-954e58f28b4e.svs



./tmp_tiled/train/0.0/
./data/bed239b1-8983-41b5-8b9b-56da502201a1/TCGA-KR-A7K0-01A-01-TS1.76635723-DAD6-4012-81CB-709DD7348018.svs



./tmp_tiled/train/0.0/
./data/841d04e4-4059-4d5c-bc0c-5705325beb37/TCGA-DD-A11B-11A-01-TSA.c795fc59-9efa-439c-8a41-fa26dfc50163.svs



./tmp_tiled/valid/1.0/
./data/dd6be1ec-1bda-4f8a-958b-7ffa76e6027a/TCGA-DD-A3A2-01A-01-TS1.CFD4366B-BF5F-4EDC-ABD7-F26DD27E3AC3.svs



./tmp_tiled/train/0.0/
./data/48070128-d0f0-40fc-b6df-a7b45a4dc87f/TCGA-BC-A10W-11A-01-TS1.cbeb048c-0334-4eb2-9cd8-b2aee00f519e.svs



./tmp_tiled/train/1.0/
./data/bfd27064-177d-4ba8-8597-7e14d3df2f32/TCGA-BC-A110-01A-01-TS1.0167447d-b826-4b82-b625-a55ae96e6691.svs



./tmp_tiled/train/1.0/
./data/5539b26d-4942-4d53-9cb1-6b164f8a1f63/TCGA-DD-A3A5-01A-01-TS1.DC714E73-41A8-4844-BEE1-37A2954E2016.svs



./tmp_tiled/train/0.0/
./data/9b4526e5-cb6d-4ce2-815f-89c9a270f9f3/TCGA-DD-A11B-01A-01-TSA.972d3cc6-c2d2-40e2-b374-bf443aabf003.svs



./tmp_tiled/train/1.0/
./data/323d7a05-43fb-4c5a-95ca-d8e8216671c7/TCGA-2Y-A9GV-01A-01-TS1.358BACC7-2281-4653-89DC-3B1688A0492E.svs



./tmp_tiled/valid/0.0/
./data/87f1db8d-5a13-4f0c-91bb-d38827781064/TCGA-DD-AAC8-01A-01-TSA.E1E791FA-90C7-4D55-8BF7-7F26BA90EB1E.svs



./tmp_tiled/train/1.0/
./data/3a0312ba-ede7-4e2a-a4c5-528ce74d7811/TCGA-FV-A4ZP-01A-01-TS1.7FD29675-0E75-4837-B78C-3FFB60C20D78.svs



./tmp_tiled/valid/0.0/
./data/274c9110-6a25-482f-846e-58ffac46a028/TCGA-BC-A10W-01A-01-BS1.295e1919-7da3-454a-ac81-a50abcf635a5.svs



./tmp_tiled/train/0.0/
./data/88f3814e-0651-42db-bcc1-1423ff62455a/TCGA-BC-A112-11A-01-TS1.2acbbef2-4844-4ba0-b3f6-fc76bcf100a7.svs



./tmp_tiled/train/0.0/
./data/e232818d-8ca1-48d9-a636-d00da0a1c020/TCGA-UB-A7MD-01A-01-TS1.57F02824-3041-424C-A957-FDB843648038.svs



./tmp_tiled/train/0.0/
./data/890305da-6c14-413b-a8dd-3eb1d40b849b/TCGA-DD-A3A8-11A-01-TS1.BAF87ED8-D96C-481A-9D30-0A20A8B60803.svs



./tmp_tiled/train/0.0/
./data/6e697dcf-cb00-4b67-b6d9-9069e714633f/TCGA-CC-5261-01A-01-BS1.f0d38315-4180-4ab6-be13-4ce789129f9c.svs



./tmp_tiled/valid/1.0/
./data/5413d241-e09e-4daa-935f-2ed4745b6814/TCGA-2Y-A9GT-01A-01-TS1.BFE1248B-BB25-4828-AE6E-A4973F4A116C.svs



./tmp_tiled/train/0.0/
./data/2627c118-edb2-4272-9ec4-c3d5e7d035d7/TCGA-DD-A39Y-01A-01-TS1.420E31BE-005F-4149-A7BA-8E24C985D85D.svs



./tmp_tiled/train/1.0/
./data/8d5f97e3-3211-48ca-a125-85cdffdffed5/TCGA-BC-A110-11A-01-TS1.08bd00bd-2b69-448f-80e9-fa0bae6b37ca.svs



./tmp_tiled/train/0.0/
./data/60417601-d870-4075-8c63-6185328881bb/TCGA-5R-AAAM-01A-01-TS1.13E28797-EA17-4400-9FBE-7509DF2BFE80.svs



./tmp_tiled/valid/0.0/
./data/47959efe-48f1-418f-9214-9e2bd85a1083/TCGA-CC-A1HT-01A-01-TSA.2400d8e5-f45f-48ab-b6d6-47efb5e6dfff.svs



./tmp_tiled/valid/0.0/
./data/164ecda6-8c69-4551-a361-9613990fc488/TCGA-GJ-A6C0-01A-01-TS1.1E332335-569F-44DD-B501-EA192C11F10D.svs



./tmp_tiled/train/1.0/
./data/855c8487-f437-4759-bd10-159d34c063a8/TCGA-DD-AACC-01A-01-TS1.15BF9548-D49B-4C1C-BCF5-346FC12D4A22.svs



./tmp_tiled/train/0.0/
./data/79450f6c-44dd-4674-9097-cb27701bd63c/TCGA-CC-5262-01A-01-BS1.51acfff7-5429-489c-bbe2-d665ba21977f.svs



./tmp_tiled/train/0.0/
./data/6fbbe4c2-733a-403f-9e2d-ff7ae737318a/TCGA-BC-A10Z-01A-01-BS1.90757e0f-9cd3-43d8-a777-f91b0a806bfd.svs



./tmp_tiled/valid/0.0/
./data/857ec1c7-f380-4426-b2d8-beb4a5b5dfd7/TCGA-BC-A10Z-01A-01-TS1.146fde04-6d55-4072-ac45-20bd4ebdedc6.svs



./tmp_tiled/train/0.0/
./data/ad0bf2f8-0cbc-4527-aef1-b6eb6907b5fc/TCGA-DD-AADF-01A-01-TS1.0C90B7FF-7AF6-4BC8-8BE4-0179B3B6EFE0.svs



./tmp_tiled/valid/1.0/
./data/af7650ae-fc47-4a56-995b-8572a0a1fa4f/TCGA-DD-A11D-11A-01-TSA.6c33ecbf-c190-4514-bef9-95144905f3b8.svs



./tmp_tiled/valid/0.0/
./data/a81a6b58-e4b0-469f-aa43-5a8149e656c8/TCGA-EP-A2KC-01A-01-TSA.F85E5978-9FA0-4783-8AF2-D3108A6D2081.svs



./tmp_tiled/train/0.0/
./data/98d41aba-53e3-4a54-9ec2-1dd6eadc52e7/TCGA-GJ-A9DB-01A-01-TS1.8CC8AD70-5C14-4999-939E-5FEE7D06EDE3.svs



./tmp_tiled/valid/1.0/
./data/82c4ed3f-aa90-4620-a1d0-faf514040e5d/TCGA-DD-A116-11A-01-TS1.459e75e9-20fe-4e3b-8a49-62ffe4d37991.svs



./tmp_tiled/valid/0.0/
./data/613065d0-7a3e-4c21-a342-119c1cd31971/TCGA-ED-A8O6-01A-01-TSA.15FB0207-C303-4DB0-AEAD-A1C927873963.svs



./tmp_tiled/valid/1.0/
./data/9331ae22-3a27-42b8-8aac-4daeca802ab5/TCGA-DD-A3A6-01A-01-TS1.617435DE-4B82-4206-AC4F-9695F3ECE9E4.svs



./tmp_tiled/train/1.0/
./data/9dcf3f33-6540-4ad9-aa52-0ebb8af1d8e6/TCGA-NI-A4U2-01A-01-TS1.405C6DE0-E40F-45BD-B25C-419EA4596780.svs



./tmp_tiled/train/1.0/
./data/7f8e9c4f-c675-4515-9e29-8b3a781551a1/TCGA-NI-A4U2-11A-01-TS1.CAFE9F20-895C-44D8-9876-1AFA26D41A23.svs



./tmp_tiled/valid/0.0/
./data/db14aba1-ab19-4495-a4b3-f2112a7d41a7/TCGA-GJ-A6C0-11A-01-TSA.16CA7876-4635-4449-89A9-E55F39FDD9DA.svs



./tmp_tiled/train/0.0/
./data/19b64c81-bbc2-47e6-88f5-58d9c98fdda2/TCGA-BC-A10Z-11A-01-TS1.f37d6139-cd3f-4b74-a79b-f549b190affa.svs



./tmp_tiled/train/1.0/
./data/c9ccf7f6-c9b2-4c3e-af48-53e0c948481b/TCGA-DD-A4NS-01A-01-TS1.954E6109-C168-430C-BB03-A65EA378967B.svs



./tmp_tiled/train/1.0/
./data/e9e4606f-7fd7-400c-86f0-77812dead722/TCGA-DD-A115-01A-01-TS1.010a967c-4419-4bd5-9afd-7a8ba23906dc.svs



./tmp_tiled/train/0.0/
./data/b815d560-58d1-4cb2-a387-5238a83ca6cb/TCGA-DD-AACL-01A-01-TS1.3975F3BD-B7B1-46B9-8BE5-66E761C500EA.svs



./tmp_tiled/train/0.0/
./data/0daf9fde-edd0-4994-b64e-6286d1a6b9dd/TCGA-DD-A39Y-11A-01-TS1.3FBA1D22-203E-426B-A94E-7DF665BE864D.svs



./tmp_tiled/valid/1.0/
./data/3e902f60-64da-4810-82d5-9e66e1d59c6b/TCGA-DD-A3A2-11A-01-TS1.B5A50812-E525-4410-ADB1-61E884B093EC.svs



./tmp_tiled/train/1.0/
./data/145871c5-ae68-4ee6-be97-b70794c17d27/TCGA-BC-A110-01A-01-BS1.61d822ba-c177-4fad-b5b7-d71a73b9127d.svs



./tmp_tiled/train/1.0/
./data/60d9b59b-9bc1-45e9-9b24-837c68a4522a/TCGA-DD-A4NS-11A-01-TS1.E249C55A-92A0-4945-A983-99ADFBCA46AF.svs



./tmp_tiled/train/1.0/
./data/3c734479-14c1-4f82-b005-1e7ad7f6eefa/TCGA-DD-A3A6-11A-01-TSA.BCD0DE1F-CF1E-465E-BA64-C7EC2D4F4C62.svs



./tmp_tiled/valid/0.0/
./data/90b75c00-59f6-4f34-ad9a-57da693f402f/TCGA-BC-A10W-01A-01-TS1.69facfc2-1915-4190-a679-fa0c43c8d909.svs



./tmp_tiled/train/0.0/
./data/f55a3f0f-c018-4f5d-822c-7221b666b59a/TCGA-CC-5264-01A-01-BS1.4f52c6b3-9028-4338-abfc-7aee8db4e8c1.svs



./tmp_tiled/valid/1.0/
./data/9def26cf-5175-452c-aedb-912924b643d0/TCGA-DD-A116-01A-01-TSA.10a5eadc-55da-49d1-9455-62db79a55f81.svs



./tmp_tiled/train/1.0/
./data/f8be1156-08cd-4e1f-9e35-2e7997f4d392/TCGA-DD-A115-11A-01-TSA.222f45b7-e53f-494d-936a-20d55857181a.svs



./tmp_tiled/valid/0.0/
./data/92bff9ae-58c3-4613-9ae4-fe51924a58f3/TCGA-DD-A3A8-01A-01-TS1.1E2355DA-F465-40D3-B0DB-1DBAB4225F58.svs



./tmp_tiled/valid/0.0/
./data/5214d058-eddd-4cb3-a75c-3973abcb6ea0/TCGA-DD-AACZ-01A-01-TS1.5DBBC6C3-ABFB-48BF-8908-005E65AC0231.svs



./tmp_tiled/train/0.0/
./data/80ced2b0-058b-4fab-aeeb-397945ee356e/TCGA-CC-5263-01A-01-BS1.af815254-cfa9-4c81-9ef6-5438b9f5e85b.svs



./tmp_tiled/valid/0.0/
./data/b830b77d-e5e1-462c-8ef8-3adbc3796845/TCGA-DD-AADM-01A-01-TS1.BFD06147-1675-446B-B48D-800EB504A477.svs



./tmp_tiled/train/1.0/
./data/1824d29c-21df-466e-ae6b-1cd209baa8da/TCGA-DD-A115-01A-01-BS1.0839a9e5-5b00-4722-9b9a-04ea65382018.svs



0
