In [83]:
import os 
import pandas as pd
import subprocess
import glob
import shutil
# Total download budget across all subsets, in gigabytes.
N_GB = 120
# Same budget in megabytes (decimal convention: 1 GB = 1000 MB). The download
# loop divides manifest sizes by 10**6 before comparing against this budget.
TOTAL_MB_SIZE = N_GB*1000

# Root folder that holds both the input manifests and the downloaded data.
ROOT_DIR ='datasets'

MANIFEST_DIR = 'dataset_manifest'   # sub-folder of ROOT_DIR with the input manifests
DEST_SPLITS = 'id_subsets_large'    # sub-folder receiving the truncated manifests
DATASET_DIR = 'dataset_large'       # sub-folder receiving the downloaded files

# change this to 'Lung' to obtain only its related subset
PRIMARY_SITE_TYPE = '*'

# One manifest (.txt) per subset, located at
# ROOT_DIR/MANIFEST_DIR/<primary site>/<name>.txt.
# Sorted so that the processing order does not depend on the filesystem.
MANIFEST_FILEPATHS = sorted(
    glob.glob(os.path.join(ROOT_DIR, MANIFEST_DIR, PRIMARY_SITE_TYPE, '*.txt'))
)
N_SUBSETS = len(MANIFEST_FILEPATHS)

# The budget is shared equally between subsets. Guard against an empty glob
# result, which would otherwise surface as an opaque ZeroDivisionError.
if N_SUBSETS == 0:
    print(
        'WARNING: no manifest found under '
        f'{os.path.join(ROOT_DIR, MANIFEST_DIR, PRIMARY_SITE_TYPE)}; nothing will be downloaded.'
    )
    SUBSET_SIZE = 0
else:
    SUBSET_SIZE = TOTAL_MB_SIZE // N_SUBSETS

# Requires the gdc-client executable to be downloaded and placed in the same directory as this notebook.

#https://docs.gdc.cancer.gov/Data_Transfer_Tool/Users_Guide/Data_Download_and_Upload/
# Folder that receives the truncated per-subset manifests. os.makedirs accepts
# a single path (its 2nd/3rd positional parameters are `mode` and `exist_ok`),
# so the components have to be joined first. Created once, before the loop.
subset_manifest_dir = os.path.join(ROOT_DIR, DATASET_DIR, DEST_SPLITS)
os.makedirs(subset_manifest_dir, exist_ok=True)

for path in MANIFEST_FILEPATHS:
    # The manifest file name is expected to look like '<repo>-<subset>.txt'
    # (or '<repo>_<subset>.txt'); any other number of parts makes the
    # unpacking below raise ValueError.
    filename = os.path.basename(path).replace('-', '_')
    repo_name, subset_name = filename.replace('.txt', '').split('_')
    dest_path = os.path.join(ROOT_DIR, DATASET_DIR, repo_name, subset_name)
    tmp_dest_path = os.path.join(dest_path, 'tmp')

    # Creating the temporary folder also creates dest_path, its parent.
    os.makedirs(tmp_dest_path, exist_ok=True)

    df = pd.read_csv(path, sep='\t')
    # Convert 'size' to whole megabytes (presumably bytes in the input manifest
    # - TODO confirm) so it is comparable with SUBSET_SIZE.
    # NOTE(review): the converted values are what gets written to the subset
    # manifest handed to gdc-client - confirm that this is intended.
    df['size'] = (df['size']/10**6).astype(int)
    # Keep the leading rows whose cumulative size fits the per-subset budget.
    subset = df[df['size'].cumsum() <= SUBSET_SIZE]
    subset_manifest = os.path.join(subset_manifest_dir, filename)
    # The index is written as the first column; the split cell reads it back
    # with index_col=0.
    subset.to_csv(subset_manifest, sep='\t')

    return_code = subprocess.call(
        ['./gdc-client', 'download', '-m', subset_manifest, '-d', tmp_dest_path, '-n', '16']
    )
    if return_code != 0:
        # Do not abort: whatever was downloaded is still collected below.
        print(f'WARNING: gdc-client exited with code {return_code} for {subset_manifest}')

    # Slides are looked up exactly one directory level below the temporary
    # folder and flattened into dest_path. An explicit target file path lets a
    # re-run replace an existing slide instead of failing with
    # "Destination path ... already exists".
    for file in glob.glob(os.path.join(tmp_dest_path, '*/*.svs')):
        shutil.move(file, os.path.join(dest_path, os.path.basename(file)))
    shutil.rmtree(tmp_dest_path)


100% [############################################] Time:  0:01:01  13.7 MiB/s 
100% [############################################] Time:  0:00:14   8.9 MiB/s 
100% [############################################] Time:  0:00:06  14.6 MiB/s 
100% [############################################] Time:  0:00:16   4.6 MiB/s 
100% [######################################] Time: -1 day, 23:00:13   0.0 s/B 
100% [############################################] Time:  0:00:23   3.7 MiB/s 
100% [############################################] Time:  0:00:05   8.7 MiB/s 
100% [############################################] Time:  0:00:21   3.6 MiB/s 
100% [############################################] Time:  0:00:19   4.0 MiB/s 
100% [############################################] Time:  0:00:12  14.7 MiB/s 
100% [############################################] Time:  0:00:31   1.4 MiB/s 
100% [############################################] Time:  0:00:03  11.0 MiB/s 
100% [##################################

100% [############################################] Time:  0:00:12  16.3 MiB/s 
100% [############################################] Time:  0:00:29   6.8 MiB/s 
100% [############################################] Time:  0:03:39  13.3 MiB/s 
100% [############################################] Time:  0:00:49   7.9 MiB/s 
100% [############################################] Time:  0:00:08   7.9 MiB/s 
100% [############################################] Time:  0:00:18   7.1 MiB/s 
100% [############################################] Time:  0:00:09   7.2 MiB/s 
100% [############################################] Time:  0:00:15   6.4 MiB/s 
100% [############################################] Time:  0:01:29  13.3 MiB/s 
100% [############################################] Time:  0:00:15   5.7 MiB/s 
100% [############################################] Time:  0:00:07   8.2 MiB/s 
100% [############################################] Time:  0:00:05  11.5 MiB/s 
100% [##################################

100% [############################################] Time:  0:00:05   9.4 MiB/s 
100% [############################################] Time:  0:00:07  10.5 MiB/s 
100% [############################################] Time:  0:00:07  17.6 MiB/s 
100% [############################################] Time:  0:00:33   7.1 MiB/s 
100% [############################################] Time:  0:00:15   5.1 MiB/s 
100% [############################################] Time:  0:00:31  10.9 MiB/s 
100% [############################################] Time:  0:00:16   2.6 MiB/s 
100% [############################################] Time:  0:00:15   7.3 MiB/s 
100% [############################################] Time:  0:00:13   6.4 MiB/s 
100% [############################################] Time:  0:00:05   7.6 MiB/s 
100% [############################################] Time:  0:00:28  11.0 MiB/s 
100% [############################################] Time:  0:00:14  11.7 MiB/s 
100% [##################################

100% [############################################] Time:  0:00:10   5.1 MiB/s 
100% [############################################] Time:  0:00:23   2.6 MiB/s 
100% [############################################] Time:  0:00:08   4.5 MiB/s 
100% [############################################] Time:  0:00:30   1.7 MiB/s 
100% [############################################] Time:  0:00:15   6.1 MiB/s 
100% [############################################] Time:  0:00:10   8.3 MiB/s 
100% [############################################] Time:  0:00:05   6.7 MiB/s 
100% [############################################] Time:  0:00:38   1.8 MiB/s 
100% [############################################] Time:  0:00:06   5.5 MiB/s 
100% [############################################] Time:  0:00:09   4.4 MiB/s 
100% [############################################] Time:  0:00:22   8.3 MiB/s 
100% [############################################] Time:  0:00:26   2.8 MiB/s 
100% [##################################

# Train / validation / test split

In [11]:
import glob
import os
import pandas as pd
import numpy as np


# Load every truncated subset manifest and tag its rows with the class label
# taken from the file name ('<project>_<label>.txt').
# The paths are sorted because glob returns them in arbitrary, filesystem-
# dependent order: the row order of the concatenated frame feeds the seeded
# train/val/test split, which is only reproducible if that order is stable.
dfs = []
for filepath in sorted(glob.glob(os.path.join(ROOT_DIR, DATASET_DIR, DEST_SPLITS, '*.txt'))):
    project, label = os.path.basename(filepath).replace('.txt','').split('_')
    # index_col=0: the first column is the index written by to_csv in the download cell.
    df = pd.read_csv(filepath, sep='\t', index_col=0)
    df['label'] = label
    dfs.append(df)

In [12]:
# Stack all per-subset frames into one frame with a fresh 0..n-1 index.
df = pd.concat(dfs, axis=0, ignore_index=True)

# Column vectors of shape (n, 1): file names as samples, labels as targets.
X = df['filename'].values.reshape(-1, 1)
y = df['label'].values.reshape(-1, 1)

In [13]:
from sklearn.model_selection import train_test_split

# Options shared by both splitting stages: fixed seed, shuffled.
_split_options = dict(random_state=42, shuffle=True)

# Stage 1: hold out 10% of all samples as the test set, stratified by label.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, stratify=y, **_split_options
)

# Stage 2: carve the validation set out of the remaining 90%.
# 0.111 of 0.9 is roughly 0.1 of the full data.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.111, stratify=y_train, **_split_options
)


In [14]:
def write_split(split_name, X, y):
    """Write one split as a tab-separated text file, one sample per line.

    Each line has the form ``<name>\\t<label>``, where ``<name>`` is the sample
    name with every ".svs" occurrence removed.

    Args:
        split_name: Path of the file to create (overwritten if it exists).
        X: Iterable of sample names; each element must expose ``.item()``
            returning a string (e.g. rows of an (n, 1) numpy array).
        y: Iterable of labels, aligned with ``X``; each element must expose
            ``.item()``.
    """
    with open(split_name, 'w') as out:
        out.writelines(
            f'{sample.item().replace(".svs", "")}\t{target.item()}\n'
            for sample, target in zip(X, y)
        )


# TODO: map labels to integer ids (the split files store the raw label values).
TRAIN_SPLIT_NAME = 'train_set.txt'
VAL_SPLIT_NAME = 'val_set.txt'
TEST_SPLIT_NAME = 'test_set.txt'
split_names = [TRAIN_SPLIT_NAME, VAL_SPLIT_NAME, TEST_SPLIT_NAME]

# Folder holding the split files: ROOT_DIR/DATASET_DIR/<DATASET_DIR>_splits.
DATASET_METADATA_DIR = os.path.join(ROOT_DIR, DATASET_DIR, f'{DATASET_DIR}_splits')
os.makedirs(DATASET_METADATA_DIR, exist_ok=True)

# Write the train, validation and test files, in that order.
_split_arrays = [(X_train, y_train), (X_val, y_val), (X_test, y_test)]
for _split_file, (_X_part, _y_part) in zip(split_names, _split_arrays):
    write_split(os.path.join(DATASET_METADATA_DIR, _split_file), _X_part, _y_part)

In [15]:
import pickle

# Give every distinct label in y a consecutive integer id, following the
# sorted order returned by np.unique.
label_map = {label: index for index, label in enumerate(np.unique(y))}

# Persist the mapping in the same folder as the split files.
LABEL_MAP_PATH = os.path.join(DATASET_METADATA_DIR, 'label_map.pkl')
with open(LABEL_MAP_PATH, 'wb') as f:
    pickle.dump(label_map, f)

In [66]:
# Sanity check: read the label map back from disk and display it.
# The context manager closes the file handle, which a bare open() call leaves
# to the garbage collector.
# NOTE: pickle.load can execute arbitrary code - only load files you created.
with open(LABEL_MAP_PATH, 'rb') as f:
    loaded_label_map = pickle.load(f)
loaded_label_map

{'LUAD': 0, 'LUSC': 1}