In [1]:
import os
import sys
# Display the interpreter version so the run environment is recorded with the notebook.
sys.version_info

sys.version_info(major=3, minor=6, micro=3, releaselevel='final', serial=0)

In [2]:
import pandas as pd
import numpy as np
import tqdm
# Report the library versions used for this run: pandas first, then numpy, one per line.
print(pd.__version__, np.__version__, sep='\n')

0.22.0
1.14.2


* Go through the slides.csv file
* Split the slides into train/valid/test groups
* Using the train group only, define class labels (10 equal-sized groups of outcomes)
    *    Apply the same group boundaries to the valid/test groups
  


In [4]:
# Seed NumPy's global RNG so the random train/valid/test assignment and the random
# tile sampling further down give the same result on every run.
np.random.seed(20180502) #so we get the same groupings of train/valid/test

# Slide metadata table; the cells below read its slide_file_name, days_to_death and
# days_to_last_follow_up columns.
df = pd.read_csv('../download/slides.csv')
def get_tissue_type(x):
    """Return the fifth dash-separated field (index 4) of a slide file name.

    Raises IndexError when the name has fewer than five dash-separated fields.

    NOTE(review): the file names printed by the tile-extraction loop later in
    this notebook all have '01' at index 4 while index 3 varies (01A, 01B,
    02A, 11A) -- confirm that index 4 is the field meant to identify the
    tissue type.
    """
    fields = x.split('-')
    return fields[4]

# Keep only the rows whose file-name field equals '01'.  The result is copied so the
# column assignments in later cells act on an independent frame rather than a view.
tissue_type = df['slide_file_name'].apply(get_tissue_type)
df = df[tissue_type == '01'].copy()

In [7]:
# Where days_to_death is missing, fall back to days_to_last_follow_up so the row still
# has an outcome value.
# NOTE(review): after this step the column mixes both kinds of duration -- confirm that
# downstream use treats them the same on purpose.
df['days_to_death'] = df['days_to_death'].fillna(df['days_to_last_follow_up'])

In [10]:
## Randomly assign every row that has an outcome value to a split.  'train' is listed
## twice in the pool, so the draw is weighted 2:1:1 (train : valid : test).
## NOTE(review): each row is drawn independently; if several rows can belong to the same
## patient they may land in different splits -- confirm that is acceptable.
valid_days_to_death = df['days_to_death'].dropna()
train_groups = np.random.choice(
    ['train', 'train', 'valid', 'test'],
    size=len(valid_days_to_death),
)
df.loc[valid_days_to_death.index, 'train_val_test'] = train_groups

In [11]:
# Boolean mask over the rows that have an outcome value: True where the row was
# assigned to the train split.
cond = df.loc[valid_days_to_death.index,'train_val_test']=='train'

# Decile boundaries are learned from the train rows only; the same boundaries are
# then applied to every split below.
class_labels,class_bins = pd.qcut(valid_days_to_death[cond],10,labels=range(10),retbins=True)
## widen the outer edges to 0 and +inf so values outside the train range still get a class
class_bins[0] = 0
class_bins[-1] = np.inf

# pd.cut bins are right-closed, i.e. (left, right].  Without include_lowest=True a value
# exactly equal to the first edge (0) falls outside every bin and gets a NaN class.
# Values below 0 are still left without a class.
df.loc[valid_days_to_death[~cond].index,'class'] = \
    pd.cut(valid_days_to_death[~cond],bins=class_bins,labels=range(10),include_lowest=True)
df.loc[valid_days_to_death[cond].index,'class'] = \
    pd.cut(valid_days_to_death[cond],bins=class_bins,labels=range(10),include_lowest=True)


In [12]:
# Number of rows per class label; rows without a class (NaN) are not counted.
df['class'].value_counts()

6.0    54
0.0    54
1.0    50
8.0    49
4.0    49
7.0    48
9.0    46
3.0    41
5.0    39
2.0    35
Name: class, dtype: int64

Now groups and labels are defined in the dataframe.

Iterate over all the data directories and generate training tree structure:

```
./data/
  /train
    /0
    /1
    ...
    /9
  /valid
    /0
    /1
    ...
    /9
```
The sampled images can have different sizes: fast.ai will downsize them all to 256x256 for us, and we rely on that feature here.


In [13]:
def get_set_class_dir(filename,base='./',slides=None):
    """Build the output directory ``<base><group>/<class>/`` for one slide file.

    Parameters
    ----------
    filename : str
        Slide file name; compared case-insensitively with the
        ``slide_file_name`` column.
    base : str
        Prefix prepended verbatim to the result (include the trailing separator).
    slides : pandas.DataFrame, optional
        Table with ``slide_file_name``, ``class`` and ``train_val_test``
        columns.  Defaults to the notebook-level ``df``.

    Returns
    -------
    str
        ``base + '<group>/<class>/'``.  A missing group becomes ``'other'``;
        a missing class is rendered as ``'nan'``.

    Raises
    ------
    IndexError
        If no row of the table matches ``filename``.
    """
    if slides is None:
        slides = df  # notebook-level slide table

    # Build the case-insensitive match once and reuse it for both columns.
    matches = slides.loc[slides['slide_file_name'].str.upper() == filename.upper()]
    if matches.empty:
        raise IndexError(f'slide file {filename!r} is not listed in the slide table')

    img_class = matches['class'].values[0]
    img_group = matches['train_val_test'].values[0]
    if pd.isnull(img_group):
        img_group = 'other'

    # The class column holds floats (NaN marks "no class"), so a whole-number label
    # would otherwise be written as '6.0' instead of '6'.  NaN is left untouched.
    if isinstance(img_class, float) and img_class.is_integer():
        img_class = int(img_class)

    set_class_dir = '/'.join([img_group,str(img_class)]) + '/'
    return base+set_class_dir

In [14]:
import openslide
import cv2 #opencv for image segmentation
import matplotlib.pyplot as plt
import matplotlib

def get_images_put_in_dir(slide_filename,save_path,
                          min_empty=0.3,max_empty=0.75,min_std=2.,
                          max_sample_mult=10,
                          display=False,
                          n_samples=40):
    """Sample random square tiles from a slide and save the acceptable ones as PNG.

    One tile edge length is drawn per call, uniformly from
    ``[256, 256*int(max_sample_mult))``.  ``n_samples`` tiles of that size are
    then read at random level-0 positions.  A tile is written to ``save_path``
    only when the fraction of its pixels brighter than the Otsu threshold lies
    strictly between ``min_empty`` and ``max_empty`` and the standard
    deviation of its grayscale values exceeds ``min_std``.

    Parameters
    ----------
    slide_filename : str
        Path of the slide file to open with OpenSlide.
    save_path : str
        Directory that receives the tiles, named ``<stem>_<j>.png`` where
        ``<stem>`` is the slide's base name up to its first ``'.'`` and ``j``
        is the sample index.
    min_empty, max_empty : float
        Exclusive bounds on the bright-pixel fraction of an accepted tile.
    min_std : float
        Exclusive lower bound on the grayscale standard deviation.
    max_sample_mult : int
        Upper multiple of 256 for the tile edge length.  Must be at least 2,
        because ``np.random.randint`` needs ``high > low``.
    display : bool
        When True, plot every accepted tile next to its threshold mask.
    n_samples : int
        Number of random positions tried per slide.

    Returns
    -------
    None
    """
    # Derive the output stem from the argument itself rather than from notebook-level
    # loop variables, so the function works no matter which cell calls it.
    save_name = os.path.basename(slide_filename).split('.')[0]

    slide = openslide.OpenSlide(slide_filename)
    try:
        (x_size,y_size) = slide.dimensions
        max_dim = np.random.randint(256,256*int(max_sample_mult)) # fast.ai will scale all to 256

        if x_size <= max_dim or y_size <= max_dim:
            # No valid top-left corner exists; np.random.randint would raise below.
            print(f'skipping {slide_filename}: slide {x_size}x{y_size} is smaller than tile {max_dim}')
            return

        for j in range(n_samples):
            x_loc = np.random.randint(x_size-max_dim)
            y_loc = np.random.randint(y_size-max_dim)

            # read_region yields an RGBA image; the alpha channel is dropped when saving.
            rgba_levels = np.array(slide.read_region((x_loc,y_loc),0,(max_dim,max_dim)))

            # The array is in RGBA order, so use the RGBA conversion code; the BGR code
            # would swap the red and blue weights.
            gray = cv2.cvtColor(rgba_levels,cv2.COLOR_RGBA2GRAY)
            ret,thresh = cv2.threshold(gray,0,255,cv2.THRESH_BINARY_INV+cv2.THRESH_OTSU)

            # With THRESH_BINARY_INV a zero in the mask marks a pixel brighter than the
            # Otsu threshold; those pixels are counted as "empty".
            pct_empty = (thresh == 0).sum()/thresh.size

            if not (min_empty < pct_empty < max_empty and gray.std() > min_std):
                continue

            if display:
                plt.figure()
                plt.subplot(1,2,1)
                plt.imshow(rgba_levels)
                plt.subplot(1,2,2)
                plt.imshow(thresh)
                plt.title(str(x_loc) +  " x " + str(y_loc))
                plt.suptitle(pct_empty)

            matplotlib.image.imsave(os.path.join(save_path, f'{save_name}_{j}.png'),
                                    rgba_levels[:,:,:3])
    finally:
        # Release the slide's file handle even if reading or saving fails.
        slide.close()

In [15]:
BASE = '/DATA/GDC/lana/samples/' #base to put in the selected images

# Walk the download tree, and for every matching .svs slide create its
# <BASE><group>/<class>/ directory and fill it with sampled tiles.
counter = 0
for root, dirs, files in os.walk("/DATA/GDC/lana"):
    # BASE lies inside the walked tree: prune it in place so the generated tiles are
    # not re-scanned on this or later runs.
    dirs[:] = [d for d in dirs
               if os.path.normpath(os.path.join(root, d)) != os.path.normpath(BASE)]

    # `path` and `file` keep their original names: they are notebook-level variables
    # that other cells may read.
    path = root.split(os.sep)
    for file in files:
        name_fields = os.path.basename(file).split('-')
        # Names with fewer than five dash-separated fields cannot match; skip them
        # instead of raising IndexError.
        # NOTE(review): names such as ...-11A-01-... also have '01' at index 4 --
        # confirm this is the intended field for the filter.
        if file.endswith('.svs') and len(name_fields) > 4 and name_fields[4] == '01':
            print(os.sep.join(path+[file]))
            directory = get_set_class_dir(file,BASE)

            # exist_ok avoids the separate check-then-create step.
            os.makedirs(directory, exist_ok=True)
            get_images_put_in_dir(os.sep.join(path+[file]),directory,max_sample_mult=2)

            counter += 1

# Number of slides processed.
print(counter)

/DATA/GDC/lana/f6e54abc-1f02-4a5d-b48d-3f8207fb1ea4/TCGA-DD-AADD-01A-01-TS1.3A49647D-393D-40CE-8B9F-3B2F779D14C3.svs
/DATA/GDC/lana/b81b18ff-f9a7-412f-ba40-57cb63a73eba/TCGA-DD-AADR-01A-01-TS1.E8604D51-F569-4872-977A-0547481DA34D.svs
/DATA/GDC/lana/4a5e7e89-db7e-40dd-b6fb-fa3a70abbc47/TCGA-BC-A10X-11A-01-TS1.a10e3fb3-3138-4be4-88c8-8c9caf6e93df.svs
/DATA/GDC/lana/7026c9fc-63ba-4729-aec0-8cfe92a32172/TCGA-LG-A9QD-01A-01-TSA.0E90AD01-6489-4BB0-A0DA-D622F3E8682F.svs
/DATA/GDC/lana/0e056a2f-9de2-4c0f-a15e-30ac36720c00/TCGA-DD-A73E-01A-01-TSA.83219588-13BC-457C-9239-7424C7400509.svs
/DATA/GDC/lana/53db4b1b-fb5e-4002-8d5b-aa81d92bb6b2/TCGA-2Y-A9HB-01A-01-TS1.5767FBE4-1C24-43F5-B13B-D484888D5EEE.svs
/DATA/GDC/lana/3e902f60-64da-4810-82d5-9e66e1d59c6b/TCGA-DD-A3A2-11A-01-TS1.B5A50812-E525-4410-ADB1-61E884B093EC.svs
/DATA/GDC/lana/dc2ae5df-fc57-4ece-9a2e-0798555e2814/TCGA-CC-A3MB-01A-01-TSA.75944F1B-5B76-4635-B5E1-415837BB30FB.svs
/DATA/GDC/lana/3a0312ba-ede7-4e2a-a4c5-528ce74d7811/TCGA-FV-A4ZP

/DATA/GDC/lana/bfe943e2-2dcb-4fba-9f92-04757e9c7228/TCGA-ZS-A9CD-01A-01-TS1.7147A689-BCDB-404A-99EA-A0E900E24C5B.svs
/DATA/GDC/lana/ddde9cb5-b61c-4383-b7fe-90ded5d0547f/TCGA-DD-AAD0-01A-01-TS1.77054260-4170-4628-B32E-763326244BF9.svs
/DATA/GDC/lana/06823786-5d6f-4cc4-a00a-e52499765008/TCGA-ES-A2HS-11A-01-TSA.0F0A1F59-FF9D-43D8-9DBE-FEE01197806A.svs
/DATA/GDC/lana/4842a136-b2bf-4c11-b38d-576c97c872c6/TCGA-BC-4072-01B-01-TS1.cdb88c23-d360-4400-9017-d8e2f253ddee.svs
/DATA/GDC/lana/19dd7e06-55af-45ba-ba44-987b781f23fd/TCGA-DD-A113-01A-01-TS1.16cc95a5-e64d-47ea-9d87-8e2ec382192a.svs
/DATA/GDC/lana/5472c4ee-8366-4088-9a36-1a24e4ea0c10/TCGA-DD-A3A3-11A-01-TS1.2C007308-0CC7-48D8-B582-C9A68F5B0321.svs
/DATA/GDC/lana/903dcacb-8e04-4639-b1a7-c62777836f5e/TCGA-DD-AAEH-01A-01-TS1.1152D49E-5195-448D-8906-F95398C55D1F.svs
/DATA/GDC/lana/ecf9d2e5-fa60-4ec3-a923-5bc262cc9037/TCGA-DD-AADP-01A-01-TS1.D5894AA9-4BD8-4CAF-9301-C090FFBEF499.svs
/DATA/GDC/lana/b13f7da7-a2cb-41ca-a8d7-0c92d334674a/TCGA-DD-AAE0

/DATA/GDC/lana/2138a9e7-9a00-4d1c-9aae-287f9bf2435d/TCGA-BC-4073-01B-01-BS1.e54f54f3-c984-4935-b2f9-a074b6e255e0.svs
/DATA/GDC/lana/c06d6cc8-b639-4306-9e8a-2211b356ba62/TCGA-DD-A4NB-01A-01-TS1.D7195FFB-1981-4337-BC94-EE70FC6FC67B.svs
/DATA/GDC/lana/3cd62718-0b5a-4d88-bdbb-b5207bf9c423/TCGA-FV-A3I0-01A-01-TS1.50B6F5C8-80A4-4884-B169-698E44C8AB1D.svs
/DATA/GDC/lana/ee6007a3-e100-4bc2-9678-3d3a3c413b67/TCGA-DD-AACG-01A-01-TSA.5838C953-3E20-444A-A748-61C40B259432.svs
/DATA/GDC/lana/6af5400d-991b-45ed-ba07-e22f135be64d/TCGA-UB-A7MC-01A-01-TSA.4EC5DB39-BC40-4687-BFFD-258B7E7BBA6B.svs
/DATA/GDC/lana/8376a582-c214-4c88-bb16-caa545760676/TCGA-DD-A1EH-11A-01-TSA.de8f9703-b70c-4c31-8f04-f8a2246737b9.svs
/DATA/GDC/lana/3ca46237-80aa-49a3-aa25-c7faa07d0a67/TCGA-BC-A10R-01A-01-TS1.04d01a32-cbc3-401a-9381-b65c75e90355.svs
/DATA/GDC/lana/08d09009-b807-48cd-8a7d-a06fd0e41c57/TCGA-T1-A6J8-01A-01-TS1.8578F500-AD47-4754-B25D-811181EA0313.svs
/DATA/GDC/lana/8a1cbd96-1689-436d-b7ca-3e2e83ff399c/TCGA-DD-AADY

/DATA/GDC/lana/ad0bf2f8-0cbc-4527-aef1-b6eb6907b5fc/TCGA-DD-AADF-01A-01-TS1.0C90B7FF-7AF6-4BC8-8BE4-0179B3B6EFE0.svs
/DATA/GDC/lana/e5ef35d5-540e-4b3c-b5c8-c4076c928db0/TCGA-FV-A23B-11A-01-TS1.B1103028-122A-485A-B65B-25285DBCA8E5.svs
/DATA/GDC/lana/8a426966-b9c3-46d5-8be4-bc7ce1b3a8ee/TCGA-ZS-A9CE-01A-01-TS1.819EDFAA-4944-41F7-9811-10F384B1059F.svs
/DATA/GDC/lana/47959efe-48f1-418f-9214-9e2bd85a1083/TCGA-CC-A1HT-01A-01-TSA.2400d8e5-f45f-48ab-b6d6-47efb5e6dfff.svs
/DATA/GDC/lana/d9f094d4-fff1-4bde-bf80-ba30c6ca441a/TCGA-RC-A7SH-01A-01-TS1.C5DDA158-4317-4D05-BEFA-5EFC82CF2F4E.svs
/DATA/GDC/lana/3cd37014-9076-4041-8799-9c59b2d9b5d1/TCGA-DD-AACO-01A-01-TS1.D92FB7AF-1ED3-44B8-BD9A-76DEFCE0D163.svs
/DATA/GDC/lana/cf84ff5a-f1a2-4a33-a7ce-aa29bb9f2106/TCGA-G3-A25Y-01A-01-TSA.F76EC89B-AE96-4E49-A7C6-901F4FFCBEF0.svs
/DATA/GDC/lana/d7e1c168-054e-4ba7-9f77-80728772248c/TCGA-DD-A1EF-11A-01-TSA.f665409b-9947-406a-814e-c780a737e1e6.svs
/DATA/GDC/lana/b1fa0bc6-b324-4216-ad74-c01b48c593cc/TCGA-DD-AADC

/DATA/GDC/lana/5214d058-eddd-4cb3-a75c-3973abcb6ea0/TCGA-DD-AACZ-01A-01-TS1.5DBBC6C3-ABFB-48BF-8908-005E65AC0231.svs
/DATA/GDC/lana/db7993c5-00c6-4ea9-a7ab-13227ac63c43/TCGA-DD-A1EG-01A-01-TSA.0BC94007-69D3-4ED1-8E6C-9D88C27ED705.svs
/DATA/GDC/lana/2c9c96f0-c290-4027-84e4-f0123a79fd3d/TCGA-ED-A5KG-01A-01-TSA.01174EE3-DED8-4F7B-AB3E-11806D391579.svs
/DATA/GDC/lana/7af36238-6949-4548-8d82-25c7d4c1bb05/TCGA-BC-A10Y-11A-01-TS1.81c40b2f-1516-4800-b865-d30fc84d11f0.svs
/DATA/GDC/lana/cd1dc9b8-57f8-4a76-ba8f-0c4f577c2aea/TCGA-DD-AAE6-01A-01-TS1.EFF126E1-2E80-4620-8634-0A0665D77207.svs
/DATA/GDC/lana/ee39951b-7712-43e8-af54-9dfd7d4c0b6d/TCGA-DD-AAW3-01A-01-TSA.D5DA7E65-77C7-4D19-992D-ACC0A8AA84A5.svs
/DATA/GDC/lana/f8be1156-08cd-4e1f-9e35-2e7997f4d392/TCGA-DD-A115-11A-01-TSA.222f45b7-e53f-494d-936a-20d55857181a.svs
/DATA/GDC/lana/c9eb9a2e-5d83-47ca-951c-74c51e4f9b2f/TCGA-FV-A3I1-11A-01-TS1.53AE8FAE-76C8-4DF8-A7A4-D2D7341AD540.svs
/DATA/GDC/lana/ba1cd1f9-0c84-4261-b97a-5bfd3d42bc7b/TCGA-DD-A4NG

/DATA/GDC/lana/5b3ef31a-d3a1-4b7d-a384-538eab6bbe84/TCGA-MR-A520-01A-01-TS1.AB205181-4866-4642-8B1C-F175B47F3D01.svs
/DATA/GDC/lana/7e373525-1146-46bd-9bdb-bc70b0007c50/TCGA-DD-A1EG-11A-01-TSA.ADAA8B6E-C7DB-4EB9-A0EC-250F9EA9B96E.svs
/DATA/GDC/lana/27a1dfdc-52c6-4a5a-82cd-8852f08eb03a/TCGA-DD-A1EH-01A-01-TS1.a8728f9e-ae4c-4c71-aabc-c055f5115669.svs
/DATA/GDC/lana/0daf9fde-edd0-4994-b64e-6286d1a6b9dd/TCGA-DD-A39Y-11A-01-TS1.3FBA1D22-203E-426B-A94E-7DF665BE864D.svs
/DATA/GDC/lana/42aaaf00-5009-4997-98ce-8918c3a2e854/TCGA-DD-A11C-11A-01-TSA.c8fbc9d2-e344-4efd-91b6-bfa81cd5efee.svs
/DATA/GDC/lana/973a827d-a120-4fa4-aaa4-0c34b05964a8/TCGA-MI-A75H-01A-01-TS1.0CCAF184-5D4E-4411-A071-F356F92FC0A9.svs
/DATA/GDC/lana/d8aecc05-d616-4e2c-9616-ccbf3b1eee51/TCGA-BC-A10T-01A-01-TSA.18069ef7-abd8-408b-b2cf-40211f07f341.svs
/DATA/GDC/lana/3cbea932-f147-4037-8c4b-7946f7e1a984/TCGA-DD-A1EC-11A-01-TSA.6b3a2e2d-dee8-40c6-b4ab-a782e177dfe7.svs
/DATA/GDC/lana/e5b68616-2bab-4009-b5f6-8308caf42277/TCGA-DD-AADQ

/DATA/GDC/lana/586f68a9-bada-438c-a241-7e8cb69fed01/TCGA-ZS-A9CF-02A-01-TS1.CFD8CF47-72D9-4143-9A64-03BA89F14D57.svs
/DATA/GDC/lana/edf422f3-ea12-4050-b8bf-3acfecddb14c/TCGA-RC-A6M5-01A-01-TS1.94E4716B-5725-4FA7-B6FF-DB8BA37F7454.svs
/DATA/GDC/lana/bd5435df-136c-4541-9bdf-8a7d037f6593/TCGA-WQ-AB4B-01A-01-TS1.6A53BDE2-A140-41C6-8E65-F14C59B33453.svs
/DATA/GDC/lana/80981fd3-20d8-4d1d-aed0-6d90ee247bd7/TCGA-2Y-A9HA-01A-01-TS1.A9F7A6F0-6286-4B08-AA12-A3D959EB54C0.svs
/DATA/GDC/lana/d2a91246-0e6f-42f7-a932-5bbcd273eab1/TCGA-DD-A1EJ-01A-01-TSA.5472bfa9-8d10-4d2c-8df5-e056acd74216.svs
/DATA/GDC/lana/88f3814e-0651-42db-bcc1-1423ff62455a/TCGA-BC-A112-11A-01-TS1.2acbbef2-4844-4ba0-b3f6-fc76bcf100a7.svs
/DATA/GDC/lana/98996519-3dec-41cd-988b-6dafb705fc5f/TCGA-DD-A118-11A-01-TS1.631ac2ee-22b2-4bae-b40e-32ad82d19758.svs
/DATA/GDC/lana/a0c98803-af7b-46f4-accd-543f7b640fc5/TCGA-DD-A3A1-11A-01-TS1.40CA02B0-CBF2-40A0-9265-6765685B5818.svs
/DATA/GDC/lana/2bba609b-6c4a-4150-ad59-bd54f3f86b29/TCGA-DD-A73B

In [16]:
# Presumably an end-of-run marker: prints once the cells above have finished.
print("cool")

cool
