## HCA BDBag export proof-of-principle

Create a BDBag with two files residing in `azul/rfc/bdbag` (see below). Then compress the bag and upload it to S3. Finally, generate a signed URL for the bag on S3 and download the bag through that URL.

In [1]:
import os, bagit, boto3, sys, tempfile, uuid, urllib.request
from shutil import rmtree, copy, copyfileobj
from zipfile import ZipFile, ZIP_DEFLATED
from filecmp import dircmp

In order to upload data to a FireCloud workspace we need two files, named `participant.tsv` and `sample.tsv`. The first file holds a unique list of donor UUIDs, and the second file holds the same donor UUIDs together with their corresponding specimen UUIDs. Together, the values in these two columns of `sample.tsv` form the composite primary key. The third column in `sample.tsv` holds cell-suspension UUIDs, followed by other columns which I copied from the manifest downloaded from the Explorer of the HCA browser.

In [2]:
# Lay out a scratch directory tree: <tempd>/<bag_name>/data
bag_name = 'hca_manifest'
# mkdtemp's first positional argument is the suffix, so the generated
# directory name ends in 'bag_tmpd'.
tempd = tempfile.mkdtemp('bag_tmpd')
bag_dir = os.path.join(tempd, bag_name)
data_path = os.path.join(bag_dir, 'data')
os.makedirs(data_path)  # creates bag_dir and its 'data' subdirectory in one call
print('bag_dir: {},\ndata_path: {}'.format(bag_dir, data_path))

bag = bagit.make_bag(bag_dir, {'info': 'some info'})  # use bagit module to create a bag instance

def listfiles(basepath):
    """Return the sorted names of the non-directory entries in basepath.

    Only the immediate children of ``basepath`` are considered; entries that
    are directories are skipped. ``os.listdir`` yields names in arbitrary
    order, so the result is sorted to give callers a deterministic list they
    can compare against an expected value.
    """
    return sorted(
        fname for fname in os.listdir(basepath)
        if not os.path.isdir(os.path.join(basepath, fname))
    )

# Copy TSV files from current directory into the data directory of the bag.
files = [fname for fname in listfiles(os.getcwd()) if fname.endswith('.tsv')]
for file in files:
    copy(file, data_path)
# Directory listings come back in arbitrary order, so compare sorted names
# rather than relying on the order the file system happens to report.
assert ['participant.tsv', 'sample.tsv'] == sorted(listfiles(data_path))

# Write the manifests for the copied payload, then verify the bag.
bag.save(manifests=True)
assert bag.is_valid()
print('bag_dir: {},\ndata_path: {}'.format(os.listdir(bag_dir), listfiles(data_path)))

bag_dir: /tmp/tmpq2wojnjibag_tmpd/hca_manifest,
data_path: /tmp/tmpq2wojnjibag_tmpd/hca_manifest/data
bag_dir: ['bagit.txt', 'tagmanifest-sha256.txt', 'manifest-sha512.txt', 'tagmanifest-sha512.txt', 'bag-info.txt', 'manifest-sha256.txt', 'data'],
data_path: ['participant.tsv', 'sample.tsv']


### Create a zip-file from a directory
(Inspired by: https://stackoverflow.com/questions/1855095/how-to-create-a-zip-archive-of-a-directory)

In [3]:
def zipdir(path, zip_fh):
    """Add every file found below ``path`` to the open zip handle ``zip_fh``.

    Archive member names are the file paths with the leading ``path`` prefix
    cut off, so the archive mirrors the directory layout beneath ``path``.
    Directories themselves are not written, only the files they contain.
    """
    prefix_len = len(path)
    for current_dir, _subdirs, filenames in os.walk(path):
        # Portion of the walked directory that lies below ``path``.
        relative_dir = current_dir[prefix_len:]
        for filename in filenames:
            source = os.path.join(current_dir, filename)
            member = os.path.join(relative_dir, filename)
            zip_fh.write(source, arcname=member)

In [4]:
# delete=False keeps the archive on disk after its handle is closed, so it can
# still be referenced by name afterwards.
zipfile_tmp = tempfile.NamedTemporaryFile(suffix='.zip', delete=False)
# Closing a ZipFile does not close a file object that was passed in, so the
# temporary file is managed by the same `with` to avoid leaking its handle.
with zipfile_tmp, ZipFile(zipfile_tmp.file, 'w', ZIP_DEFLATED) as zipfile_handle:
    zipdir(tempd, zipfile_handle)
print(zipfile_tmp.name)

/tmp/tmpm9edy60r.zip


### Upload zipped bag to S3

In [5]:
aws_profile = os.getenv('AWS_PROFILE')
bucket_name = os.getenv('AZUL_S3_BUCKET')
key = str(uuid.uuid4()) + '.zip'  # random object key for this upload
if aws_profile is None or bucket_name is None:
    sys.exit("Check env vars - aborting.")
session = boto3.Session(profile_name=aws_profile)
s3 = session.resource('s3')
try:
    s3.meta.client.upload_file(Filename=zipfile_tmp.name,
                               Bucket=bucket_name,
                               Key=key)
except Exception as e:
    # Show the error, then propagate it: the local copies are removed below,
    # so continuing after a failed upload would leave nothing to work with.
    print(e)
    raise
finally:
    # The local bag and archive are removed whether or not the upload worked.
    rmtree(tempd, True)
    os.remove(zipfile_tmp.name)

### Download the compressed bag (identified by `key`) from the S3 bucket

In [6]:
my_bucket = s3.Bucket(bucket_name)
# Only list objects whose key starts with the uploaded key instead of
# enumerating every object in the bucket.
for item in my_bucket.objects.filter(Prefix=key):
    if item.key == key:
        bucket_name = item.bucket_name  # should be same as above
try:
    s3.meta.client.download_file(bucket_name, key, './bag.zip')
except Exception as e:
    # Show the error, then propagate it; otherwise a stale 'bag.zip' left in
    # the working directory would let the assertion below pass.
    print(e)
    raise
assert 'bag.zip' in os.listdir()

### Unzip bag (name is still `hca_manifest`) and list its content

In [7]:
# Extract the downloaded archive into the current working directory.
with ZipFile('bag.zip','r') as zip_ref:
    zip_ref.extractall('.')
# Last expression of the cell: lists the entries of the extracted bag directory.
os.listdir(bag_name)

['bagit.txt',
 'tagmanifest-sha256.txt',
 'manifest-sha512.txt',
 'tagmanifest-sha512.txt',
 'bag-info.txt',
 'manifest-sha256.txt',
 'data']

### Generate signed URL

In [8]:
# Create an S3 client for the region named in AWS_DEFAULT_REGION and ask it
# for a pre-signed GET URL for the uploaded object.
aws_region = os.getenv('AWS_DEFAULT_REGION')
session = boto3.session.Session(region_name=aws_region)
s3Client = session.client('s3')
params = {'Bucket': bucket_name, 'Key': key}
expiration_in_secs = 60
url = s3Client.generate_presigned_url(
    'get_object',
    Params=params,
    ExpiresIn=expiration_in_secs,
)

### Download file using signed URL

In [9]:
# Move the previously extracted bag aside so the copy fetched through the
# signed URL can be unpacked under the original directory name.
os.rename('hca_manifest', 'hca_manifest_original')
# Stream the HTTP response body straight into a local file.
with urllib.request.urlopen(url) as response:
    with open('bag_from_url.zip', 'wb') as out_file:
        copyfileobj(response, out_file)
assert 'bag_from_url.zip' in os.listdir()

In [10]:
# Extract the archive fetched through the signed URL into the current working directory.
with ZipFile('bag_from_url.zip','r') as zip_ref:
    zip_ref.extractall('.')
# Last expression of the cell: lists the entries of the extracted bag directory.
os.listdir(bag_name)

['bagit.txt',
 'tagmanifest-sha256.txt',
 'manifest-sha512.txt',
 'tagmanifest-sha512.txt',
 'bag-info.txt',
 'manifest-sha256.txt',
 'data']

### Compare the original bag with the one downloaded using the signed URL

In [11]:
def print_diff_files(dcmp):
    """Print one line for every file whose content differs between two trees.

    ``dcmp`` is a directory comparison object; its ``diff_files`` are reported
    first, followed by those of each common subdirectory, depth first.
    Nothing is returned.
    """
    pending = [dcmp]
    while pending:
        current = pending.pop()
        for name in current.diff_files:
            print("diff_file %s found in %s and %s" % (name, current.left,
                   current.right))
        # Push children in reverse so they are visited in their original
        # order, each subtree being finished before its next sibling.
        pending.extend(reversed(list(current.subdirs.values())))
def _trees_differ(cmp):
    """Return True if the two compared directory trees differ anywhere.

    A difference is a file with different content, an entry present on only
    one side, or an entry that could not be compared, at any depth.
    """
    if (cmp.diff_files or cmp.left_only or cmp.right_only
            or cmp.funny_files or cmp.common_funny):
        return True
    return any(_trees_differ(sub) for sub in cmp.subdirs.values())


dcmp = dircmp('hca_manifest', 'hca_manifest_original')
print_diff_files(dcmp)  # report differing files, if any
# print_diff_files always returns None, so asserting on its return value can
# never fail; check the comparison result itself instead.
assert not _trees_differ(dcmp)

## Clean up

In [12]:
# NOTE(review): this removes every 'hca_mani*' and '*.zip' entry in the current
# working directory, not only the artifacts created by this notebook - confirm
# before running it in a directory that holds unrelated files.
# Each matching entry is collected once; an entry matching both patterns would
# otherwise be listed twice and fail on the second removal.
_dirs = [x for x in os.listdir() if x.startswith('hca_mani') or x.endswith('.zip')]
for _dir in _dirs:
    if os.path.isdir(_dir):
        rmtree(_dir)
    else:
        os.remove(_dir)