## HCA BDBag export proof-of-principle

Create a BDBag with two files residing in `azul/rfc/bdbag` (see below). Then compress the bag and upload it to S3. Generate a signed URL for the bag on S3 and download the bag through that URL.

In [1]:
import os, bagit, boto3, sys, tempfile, uuid, urllib.request
from bdbag import bdbag_api
from shutil import rmtree, copy, copyfileobj
from zipfile import ZipFile, ZIP_DEFLATED
from filecmp import dircmp

In order to upload data to a FireCloud workspace we need two files, named `participant.tsv` and `sample.tsv`. The first file holds a unique list of donor UUIDs, and the second file the same donor UUIDs with their corresponding specimen UUIDs. The values in these two columns of the table are the composite primary key. The third column in `sample.tsv` holds cell-suspension UUIDs, followed by other columns which I copied from the manifest downloaded from the Explorer of the HCA browser.

### Create instance of a `bdbag_api` _bag_ object

In [2]:
# Snapshot the working directory so the clean-up cell can verify that
# nothing is left behind.
original_dir_list = os.listdir()

# The string handed to mkdtemp is the suffix of the generated directory name.
bag_path = tempfile.mkdtemp(suffix='_bdbag')
bag = bdbag_api.make_bag(bag_path)

# A freshly made bag has a `data` directory, but it holds no entries yet.
assert not os.listdir(os.path.join(bag_path, 'data'))
print(bag_path)

/tmp/tmp68mhhvwq_bdbag


In [3]:
# `data` directory inside the bag; the TSV files are copied into it below.
data_path = os.path.join(bag_path, 'data')

def listfiles(basepath):
    """Return the sorted names of all non-directory entries in ``basepath``.

    Only the immediate children of ``basepath`` are inspected; entries that
    are directories are skipped. The names are sorted because ``os.listdir``
    yields entries in arbitrary order, which would make comparisons against
    an expected list depend on the file system.

    :param basepath: path of the directory to list
    :return: list of entry names (not full paths), in ascending order
    :raises OSError: if ``basepath`` cannot be listed
    """
    return sorted(
        fname for fname in os.listdir(basepath)
        if not os.path.isdir(os.path.join(basepath, fname))
    )

# Copy the TSV files from the current directory into the data directory of
# the bag.
files = [name for name in listfiles(os.getcwd()) if name.endswith('.tsv')]
for file in files:
    copy(file, data_path)

# Directory listings come back in arbitrary order, so compare sorted names
# instead of relying on the order the file system happens to return.
assert ['participant.tsv', 'sample.tsv'] == sorted(listfiles(data_path))

bag = bdbag_api.make_bag(bag_path, update=True)  # write checksums into respective files
assert bdbag_api.is_bag(bag_path)
bdbag_api.validate_bag(bag_path)
assert bdbag_api.check_payload_consistency(bag)

### Compress bag

In [4]:
# Zip the bag directory; the archive is expected next to the bag, carrying
# the bag's name plus a `.zip` extension.
arc_path = bdbag_api.archive_bag(bag_path, 'zip')
expected_arc_path = bag_path + '.zip'
assert arc_path == expected_arc_path
print(arc_path)

/tmp/tmp68mhhvwq_bdbag.zip


### Upload zipped bag to S3

In [5]:
# Both the AWS profile and the target bucket come from the environment.
aws_profile = os.getenv('AWS_PROFILE')
bucket_name = os.getenv('AZUL_S3_BUCKET')
if aws_profile is None or bucket_name is None:
    sys.exit("Check env vars - aborting.")

# A random object key avoids clashing with anything already in the bucket.
key = str(uuid.uuid4()) + '.zip'
session = boto3.Session(profile_name=aws_profile)
s3 = session.resource('s3')
try:
    s3.meta.client.upload_file(Filename=arc_path,
                               Bucket=bucket_name,
                               Key=key)
except Exception as e:
    # Report the failure and stop here: the following cells depend on the
    # object being in the bucket, and the local copies must survive a failed
    # upload so that this cell can be re-run.
    print(e)
    raise
else:
    # The bag now lives in S3; remove the local bag directory and archive.
    rmtree(bag_path, ignore_errors=True)
    os.remove(arc_path)

### Download compressed bag from bucket (identified by `key`) from S3
The original bag at `bag_path` was created in `/tmp`, then zipped and uploaded to S3 (the local copies were removed after the upload). Here we download the zipped bag and write it to the _current working directory_.

In [6]:
my_bucket = s3.Bucket(bucket_name)

# Confirm that the uploaded object exists. Filtering by prefix asks S3 only
# for keys starting with `key` instead of listing every object in the bucket.
assert any(item.key == key for item in my_bucket.objects.filter(Prefix=key)), \
    "object %s not found in bucket %s" % (key, bucket_name)

try:
    s3.meta.client.download_file(bucket_name, key, './bag.zip')
except Exception as e:
    # Report and re-raise: a stale `bag.zip` from an earlier run would
    # otherwise let the assertion below pass despite the failed download.
    print(e)
    raise
assert 'bag.zip' in os.listdir()

### Unzip bag and list its content
The archive extracts into a directory that has the same base name as `bag_path`.

In [7]:
# Unpack the downloaded archive into the current working directory.
with ZipFile('bag.zip', mode='r') as archive:
    archive.extractall(path='.')

# The extracted directory carries the base name of the original bag path.
extracted_name = os.path.basename(bag_path)
assert extracted_name in os.listdir()

### Generate signed URL

In [8]:
# Build an S3 client for the region named in the environment.
aws_region = os.getenv('AWS_DEFAULT_REGION')
session = boto3.session.Session(region_name=aws_region)
s3Client = session.client('s3')

# The URL grants read access to the uploaded object and is valid for
# `expiration_in_secs` seconds.
expiration_in_secs = 60
params = dict(Bucket=bucket_name, Key=key)
url = s3Client.generate_presigned_url(
    'get_object',
    Params=params,
    ExpiresIn=expiration_in_secs,
)

### Download file using signed URL

In [9]:
# The bag extracted from the direct download must be present before it is
# renamed in the next cell. The original membership test discarded its
# result, so it verified nothing; assert on it instead.
assert os.path.basename(bag_path) in os.listdir()

In [10]:
import urllib.error  # provides HTTPError for the handler below

# Move the bag extracted from the direct download out of the way, because the
# archive fetched through the signed URL extracts into a directory with the
# same name. `bag_path_original` must point at the renamed directory so that
# the later comparison looks at two different directories.
bag_path = os.path.basename(bag_path)
bag_path_original = bag_path + '_original'
os.rename(bag_path, bag_path_original)

try:
    with urllib.request.urlopen(url) as response, open('bag_from_url.zip', 'wb') as out_file:
        copyfileobj(response, out_file)
except urllib.error.HTTPError as err:
    # Typically an expired or invalid signed URL; report it and stop.
    print(err)
    raise
assert 'bag_from_url.zip' in os.listdir()

In [11]:
# Unpack the archive fetched through the signed URL into the current
# working directory.
with ZipFile('bag_from_url.zip', mode='r') as archive:
    archive.extractall(path='.')

### Compare the original bag with the one downloaded using the signed URL

In [12]:
def print_diff_files(dcmp):
    """Print every file that differs between the two compared trees.

    Walks the given ``filecmp.dircmp`` object and the comparison objects of
    all common subdirectories, parents before children, and prints one line
    per entry listed in ``diff_files``.

    :param dcmp: ``filecmp.dircmp`` instance for the two root directories
    :return: None
    """
    pending = [dcmp]
    while pending:
        current = pending.pop()
        for name in current.diff_files:
            print("diff_file %s found in %s and %s" % (name, current.left,
                   current.right))
        # Push in reverse so subdirectories are visited in their original
        # order, each one completely before its next sibling.
        pending.extend(reversed(list(current.subdirs.values())))
dcmp = dircmp(bag_path, bag_path_original)
print_diff_files(dcmp)

# print_diff_files() always returns None, so asserting on its return value
# can never fail. Walk the comparison tree and assert on the actual results:
# no differing files and no entries present on one side only.
pending_cmps = [dcmp]
while pending_cmps:
    node = pending_cmps.pop()
    assert not node.diff_files, \
        "files differ between %s and %s: %s" % (node.left, node.right, node.diff_files)
    assert not node.left_only and not node.right_only, \
        "entries found on one side only: %s / %s" % (node.left_only, node.right_only)
    pending_cmps.extend(node.subdirs.values())

## Clean up

In [13]:
bdbag_api.cleanup_bag(bag_path)

# Remove only what this notebook created: entries that were absent from the
# initial snapshot and that look like a temporary bag directory or a zip
# archive. Working on a set also keeps an entry such as `tmpX.zip` from being
# scheduled for removal twice.
preexisting = set(original_dir_list)
leftovers = sorted(
    name for name in set(os.listdir()) - preexisting
    if name.startswith('tmp') or name.endswith('.zip')
)
for name in leftovers:
    if os.path.isdir(name):
        rmtree(name)
    else:
        os.remove(name)

# Directory listings have no guaranteed order, so compare them sorted.
current_dir_list = os.listdir()
assert sorted(original_dir_list) == sorted(current_dir_list)