In [2]:
from time import sleep

import contextlib
import sys

from tqdm import tqdm

class DummyFile(object):
    """Minimal file-like stand-in for ``sys.stdout`` that routes text via ``tqdm.write``.

    Args:
        file: the stream passed to ``tqdm.write`` as its ``file`` argument.
    """
    file = None

    def __init__(self, file):
        self.file = file

    def write(self, x):
        """Forward ``x`` to ``tqdm.write`` unless it is empty or whitespace-only.

        Whitespace-only chunks are dropped on purpose: ``print`` makes a second
        ``write`` call just for its trailing newline, which would be redundant.
        """
        # Avoid print() second call (useless \n)
        if len(x.rstrip()) > 0:
            tqdm.write(x, file=self.file)

    def flush(self):
        """Flush the wrapped stream when it supports it; otherwise do nothing.

        Needed so that ``print(..., flush=True)`` and explicit
        ``sys.stdout.flush()`` calls keep working while this object is installed
        as ``sys.stdout``.
        """
        flush = getattr(self.file, "flush", None)
        if flush is not None:
            return flush()
        return None

@contextlib.contextmanager
def nostdout():
    """Temporarily replace ``sys.stdout`` with a ``DummyFile`` wrapping it.

    The previous ``sys.stdout`` is restored when the ``with`` block exits,
    including when the block raises, so a failing body cannot leave the
    process (or the notebook kernel) with a redirected stdout.
    """
    save_stdout = sys.stdout
    sys.stdout = DummyFile(save_stdout)
    try:
        yield
    finally:
        # Always undo the redirection, even on error.
        sys.stdout = save_stdout

def blabla():
    """Print the fixed demo line used to exercise stdout redirection."""
    message = "Foo blabla"
    print(message)

# The bar has to be created on sys.stdout BEFORE nostdout() swaps the stream,
# and sys.stdout must be passed explicitly because tqdm defaults to sys.stderr.
progress_bar = tqdm(range(3), file=sys.stdout)
for _ in progress_bar:
    with nostdout():
        blabla()
        sleep(.5)

print('Done!')

AttributeError: 'str' object has no attribute 'write'

In [8]:
import sys
from time import sleep
from tqdm import tqdm

values = range(3)
with tqdm(total=len(values), file=sys.stdout) as pbar:
    for i in values:
        # Show the 1-based index of the current item, then advance one step
        # (update() advances by 1 when called without an argument).
        pbar.set_description('processed: {:d}'.format(i + 1))
        pbar.update()
        sleep(1)


  0%|          | 0/3 [00:00<?, ?it/s][A
processed: 1:   0%|          | 0/3 [00:00<?, ?it/s][A
processed: 2:  33%|███▎      | 1/3 [00:01<00:02,  1.00s/it][A
processed: 2:  67%|██████▋   | 2/3 [00:01<00:00,  1.99it/s][A
processed: 3:  67%|██████▋   | 2/3 [00:02<00:00,  1.99it/s][A
processed: 3: 100%|██████████| 3/3 [00:03<00:00,  1.01s/it][A


In [26]:
import h5py
import numpy as np

sample_training_size, sample_valid_size, sample_test_size = 3200, 400, 400

# Seed for the sampling RNG so that re-running this cell selects the same rows.
sample_seed = 0

# Not referenced by the cells visible here; kept in case another cell relies on it.
sample_h5_file = {}


def _copy_split_sample(src, dst, split, label, sample_size, rng):
    """Copy a random subsample of one split from ``src`` into ``dst``.

    The same row positions are taken from ``<split>_in`` and ``<split>_out`` so
    inputs and targets stay aligned.

    Args:
        src: open HDF5 file to read ``<split>_in`` / ``<split>_out`` from.
        dst: open, writable HDF5 file that receives the sampled datasets.
        split: dataset-name prefix, e.g. ``'train'``.
        label: wording used for this split in the printed shape report.
        sample_size: number of rows to draw (without replacement).
        rng: ``numpy.random.RandomState`` used for drawing.

    Returns:
        The sampled row positions as an increasing list of ints.

    Raises:
        ValueError: from ``rng.choice`` when ``sample_size`` exceeds the number
            of available rows.
    """
    split_in = src.get('{}_in'.format(split))
    split_out = src.get('{}_out'.format(split))
    print("Shape of the input {} data is:".format(label), split_in.shape)
    print("Shape of the output {} data is:".format(label), split_out.shape)

    # Draw once and sort once; the index list is passed to h5py in increasing
    # order, as the original code did with sorted().
    picked = rng.choice(len(split_in), sample_size, replace=False)
    picked = np.sort(picked).tolist()

    dst.create_dataset('{}_in'.format(split), data=split_in[picked])
    dst.create_dataset('{}_out'.format(split), data=split_out[picked])
    return picked


sample_rng = np.random.RandomState(sample_seed)

# Open the source first so a missing/unreadable source does not truncate an
# existing sample file.
with h5py.File('data/er.h5', 'r') as hdf:
    with h5py.File('data/sample_dataset.h5', 'w') as hdf_w:
        ls = list(hdf.keys())
        print('List of datasets in this file:\n  {}\n'.format("\n  ".join(ls)))

        # Target labels are copied in full; only the rows of each split are sampled.
        hdf_w.create_dataset('target_labels', data=list(hdf['target_labels']))

        _copy_split_sample(hdf, hdf_w, 'train', 'training', sample_training_size, sample_rng)
        _copy_split_sample(hdf, hdf_w, 'valid', 'velidation', sample_valid_size, sample_rng)
        test_indx = _copy_split_sample(hdf, hdf_w, 'test', 'test', sample_test_size, sample_rng)

        # Keep the headers that belong to the sampled test rows, in the same order.
        test_headers_org = list(hdf['test_headers'])
        test_headers = [test_headers_org[i] for i in test_indx]
        hdf_w.create_dataset('test_headers', data=test_headers)

List of datasets in this file:
  target_labels
  test_headers
  test_in
  test_out
  train_in
  train_out
  valid_in
  valid_out

Shape of the input training data is: (530925, 4, 1, 600)
Shape of the output training data is: (530925, 164)
Shape of the input velidation data is: (70000, 4, 1, 600)
Shape of the output velidation data is: (70000, 164)
Shape of the input test data is: (71886, 4, 1, 600)
Shape of the output test data is: (71886, 164)


In [28]:
# Re-open the sampled file read-only and report what it contains.
with h5py.File('data/sample_dataset.h5', 'r') as hdf:
    ls = list(hdf.keys())
    listing = "\n  ".join(ls)
    print('List of datasets in this file:\n  {}\n'.format(listing))

    train_in, train_out = hdf.get('train_in'), hdf.get('train_out')
    valid_in, valid_out = hdf.get('valid_in'), hdf.get('valid_out')
    test_in, test_out = hdf.get('test_in'), hdf.get('test_out')

    # (caption, dataset) pairs, reported in the order train -> valid -> test.
    shape_report = (
        ("Shape of the input training data is:", train_in),
        ("Shape of the output training data is:", train_out),
        ("Shape of the input velidation data is:", valid_in),
        ("Shape of the output velidation data is:", valid_out),
        ("Shape of the input test data is:", test_in),
        ("Shape of the output test data is:", test_out),
    )
    for caption, dataset in shape_report:
        print(caption, dataset.shape)

List of datasets in this file:
  target_labels
  test_headers
  test_in
  test_out
  train_in
  train_out
  valid_in
  valid_out

Shape of the input training data is: (3200, 4, 1, 600)
Shape of the output training data is: (3200, 164)
Shape of the input velidation data is: (400, 4, 1, 600)
Shape of the output velidation data is: (400, 164)
Shape of the input test data is: (400, 4, 1, 600)
Shape of the output test data is: (400, 164)


In [67]:
import os

import h5py
import numpy as np
from torch.utils.data import Dataset, DataLoader

class BassetDataset(Dataset):
    """Map-style dataset over one split of an HDF5 file of inputs and targets.

    The file is read through the keys ``<split>_in``, ``<split>_out`` and
    ``target_labels``; for the test split ``test_headers`` is read as well.

    Attributes set by ``__init__``:
        split: the requested split name.
        dataset: the open ``h5py.File`` handle (see ``close``).
        target_labels: the ``target_labels`` dataset of the file.
        inputs / outputs: the input and target datasets of the split.
        ids: ``list(range(n))`` for train/valid; the decoded ``test_headers``
            for the test split.
    """

    # Initializes the BassetDataset
    def __init__(self, path='./data/', f5name='sample_dataset.h5', split='train', transform=None):
        """
        Args:
            :param path: directory that contains the HDF5 file
            :param f5name: HDF5 file name; it is passed through ``str.format(split)``,
                so a ``{}`` placeholder in the name is replaced by the split
            :param split: split that we are interested to work with
                ('train', 'valid' or 'test')
            :param transform (callable, optional): accepted for API compatibility;
                this class does not currently apply it
        """

        self.split = split

        split_dict = {'train': ['train_in', 'train_out'],
                      'test': ['test_in', 'test_out'],
                      'valid': ['valid_in', 'valid_out']}

        assert self.split in split_dict, "'split' argument can be only defined as 'train', 'valid' or 'test'"

        # Open hdf5 file where one-hoted data are stored
        self.dataset = h5py.File(os.path.join(path, f5name.format(self.split)), 'r')

        # Keeping track of the names of the target labels
        self.target_labels = self.dataset['target_labels']

        # Input and target datasets of the requested split
        input_key, output_key = split_dict[split]
        self.inputs = self.dataset[input_key]
        self.outputs = self.dataset[output_key]
        if self.split != 'test':
            self.ids = list(range(len(self.inputs)))
        else:
            # Test samples are identified by their header strings.
            self.ids = np.char.decode(self.dataset['test_headers'])

    def __getitem__(self, i):
        """Return the ``(sequence, target)`` pair stored at position ``i``.

        Raises:
            IndexError: if an integer ``i`` is outside ``[-len(self), len(self))``.
        """
        # ``self.ids`` holds header strings for the test split, so it cannot be
        # used to index the HDF5 datasets; always index by row position.
        if isinstance(i, slice):
            position = i
        else:
            # range() normalises negative positions and raises IndexError past
            # the end, matching what indexing the old id list did.
            position = range(len(self.ids))[i]

        # Sequence & Target
        sequence, target = self.inputs[position], self.outputs[position]

        return sequence, target

    def __len__(self):
        """Number of samples in the split."""
        return len(self.ids)

    def close(self):
        """Close the underlying HDF5 file; the dataset must not be used afterwards."""
        self.dataset.close()

In [75]:
# Build one BassetDataset per split from the sampled file and report its size.
# Each split gets its own variable so none of them overwrites another.
basset_dataset_train = BassetDataset(path='./data/', f5name='sample_dataset.h5', split='train')
print("The number of samples in {} split is {}.\n".format('train', len(basset_dataset_train)))

basset_dataset_valid = BassetDataset(path='./data/', f5name='sample_dataset.h5', split='valid')
print("The number of samples in {} split is {}.\n".format('valid', len(basset_dataset_valid)))

basset_dataset_test = BassetDataset(path='./data/', f5name='sample_dataset.h5', split='test')
print("The number of samples in {} split is {}.".format('test', len(basset_dataset_test)))
print("The first 10 ids of test samples are:\n  {}\n".format("\n  ".join(basset_dataset_test.ids[:10])))

batch_size = 64
# The training loader wraps the training dataset defined in this cell rather
# than a name left over from an earlier kernel session.
basset_dataloader_train = DataLoader(basset_dataset_train, batch_size=batch_size, drop_last=True, shuffle=True, num_workers=1)

The number of samples in train split is 3200.

The number of samples in valid split is 400.

The number of samples in test split is 400.
The first 10 ids of test samples are:
  chr4:160811195-160811795(+)
  chr4:110154618-110155218(+)
  chr1:182525355-182525955(+)
  chr4:104167020-104167620(+)
  chr13:45964352-45964952(+)
  chr12:107869980-107870580(+)
  chr12:3902440-3903040(+)
  chr1:143171000-143171600(+)
  chr12:125819735-125820335(+)
  chr1:23439640-23440240(+)



In [80]:
# Pull a single mini-batch from the training loader; training, validation and
# testing would be done on batches shaped like `seqs` and `trgs`.
seqs, trgs = next(iter(basset_dataloader_train))

In [81]:
# Report the shape of the fetched input batch and target batch, in that order.
batch_report = (
    ("Shape of the batch for input: {}", seqs),
    ("Shape of the batch for output: {}", trgs),
)
for template, batch in batch_report:
    print(template.format(batch.shape))

Shape of the batch for input: torch.Size([64, 4, 1, 600])
Shape of the batch for output: torch.Size([64, 164])
