Merge 8b3777d into 33d37ae
netsettler committed Jul 18, 2020
2 parents 33d37ae + 8b3777d commit ea10818
Showing 4 changed files with 51 additions and 41 deletions.
29 changes: 15 additions & 14 deletions dcicutils/s3_utils.py
@@ -17,13 +17,13 @@
 logger = logging.getLogger(__name__)


-class s3Utils(object):
+class s3Utils(object):  # NOQA - This class name violates style rules, but a lot of things might break if we change it.

     def __init__(self, outfile_bucket=None, sys_bucket=None, raw_file_bucket=None,
                  blob_bucket=None, env=None):
-        '''
+        """
         if we pass in env set the outfile and sys bucket from the environment
-        '''
+        """
         # avoid circular ref
         from .beanstalk_utils import get_beanstalk_real_url
         self.url = ''
@@ -101,10 +101,10 @@ def does_key_exist(self, key, bucket=None, print_error=True):

     def get_file_size(self, key, bucket=None, add_bytes=0, add_gb=0,
                       size_in_gb=False):
-        '''
+        """
         default returns file size in bytes,
         unless size_in_gb = True
-        '''
+        """
         meta = self.does_key_exist(key, bucket)
         if not meta:
             raise Exception("key not found")
@@ -120,15 +120,16 @@ def delete_key(self, key, bucket=None):
             bucket = self.outfile_bucket
         self.s3.delete_object(Bucket=bucket, Key=key)

-    def size(self, bucket):
+    @classmethod
+    def size(cls, bucket):
         sbuck = boto3.resource('s3').Bucket(bucket)
         # get only head of objects so we can count them
         return sum(1 for _ in sbuck.objects.all())

     def s3_put(self, obj, upload_key, acl=None):
-        '''
+        """
         try to guess content type
-        '''
+        """
         content_type = mimetypes.guess_type(upload_key)[0]
         if content_type is None:
             content_type = 'binary/octet-stream'
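
Since size is now a classmethod, a bucket's object count can be obtained without constructing a full s3Utils instance. A minimal usage sketch, assuming AWS credentials are configured; the bucket name is hypothetical:

    # Minimal usage sketch (hypothetical bucket name; requires AWS credentials).
    from dcicutils.s3_utils import s3Utils

    object_count = s3Utils.size('my-example-bucket')  # callable on the class, no instance needed
    print("objects in bucket:", object_count)
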
@@ -165,9 +166,9 @@ def s3_delete_dir(self, prefix):
         files = obj_list.get('Contents', [])

         # morph file list into format that boto3 wants
-        delete_keys = {'Objects': []}
-        delete_keys['Objects'] = [{'Key': k} for k in
-                                  [obj['Key'] for obj in files]]
+        delete_keys = {'Objects': [{'Key': k}
+                                   for k in [obj['Key']
+                                             for obj in files]]}

         # second query deletes all the files, NOTE: Max 1000 files
         if delete_keys['Objects']:
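
The reworked comprehension still builds the Delete payload that boto3's delete_objects expects, i.e. {'Objects': [{'Key': ...}, ...]}, subject to the 1000-key limit noted in the comment. A small sketch of that shape; the bucket name and keys are hypothetical:

    # Illustrative sketch only; bucket name and keys are hypothetical.
    import boto3

    files = [{'Key': '__test_data/extracted/a.txt'},
             {'Key': '__test_data/extracted/b.txt'}]  # as listed under 'Contents'
    delete_keys = {'Objects': [{'Key': obj['Key']} for obj in files]}
    if delete_keys['Objects']:  # delete_objects accepts at most 1000 keys per call
        boto3.client('s3').delete_objects(Bucket='my-example-bucket', Delete=delete_keys)
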
@@ -189,7 +190,7 @@ def read_s3_zipfile(self, s3key, files_to_extract):

     def unzip_s3_to_s3(self, zipped_s3key, dest_dir, acl=None, do_not_store=False):
         """stream the content of a zipped key on S3 to another location on S3.
-        if do_not_store=false, it saves the content and return it in the dictionary format
+        if do_not_store=false, it saves the content and returns it in the dictionary format
         (default)
         """

@@ -222,9 +223,9 @@ def unzip_s3_to_s3(self, zipped_s3key, dest_dir, acl=None, do_not_store=False):
                 s3_file_name = dest_dir + file_name
                 s3_key = "https://s3.amazonaws.com/%s/%s" % (self.outfile_bucket, s3_file_name)
                 # just perf optimization so we don't have to copy
-                # files twice that we want to further interogate
+                # files twice that we want to further interrogate
                 the_file = zipstream.open(file_name, 'r').read()
-                file_to_find = file_name.split('/')[-1]
+                file_to_find = os.path.basename(file_name)
                 if not do_not_store:
                     ret_files[file_to_find] = {'s3key': s3_key,
                                                'data': the_file}
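
For forward-slash S3 keys like these, os.path.basename gives the same result as splitting on '/' and taking the last element, just more readably. A quick check, using one of the report names from the tests:

    # Equivalence check for forward-slash separated S3 keys.
    import os

    key = '__test_data/extracted/fastqc_report.html'
    assert os.path.basename(key) == key.split('/')[-1] == 'fastqc_report.html'
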
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "dcicutils"
version = "0.34.2"
version = "0.34.0"
 description = "Utility package for interacting with the 4DN Data Portal and other 4DN resources"
 authors = ["4DN-DCIC Team <support@4dnucleome.org>"]
 license = "MIT"
7 changes: 4 additions & 3 deletions test/conftest.py
@@ -9,6 +9,8 @@
 INTEGRATED_ES = 'https://search-fourfront-mastertest-wusehbixktyxtbagz5wzefffp4.us-east-1.es.amazonaws.com'


+TEST_DIR = os.path.join(os.path.dirname(__file__))
+
 @pytest.fixture(scope='session')
 def basestring():
     try:
@@ -36,7 +38,6 @@ def integrated_ff():
                         'the homepage gave status of: %s' % (INTEGRATED_ENV, res.status_code))
     return integrated

-
 @pytest.fixture(scope='session')
 def integrated_s3_info():
     """
@@ -50,9 +51,9 @@ def integrated_s3_info():
     # for now, always upload these files
     s3Obj.s3.put_object(Bucket=s3Obj.outfile_bucket, Key=test_filename,
                         Body=str.encode('thisisatest'))
-    zip_path = os.path.join('test', 'data_files', zip_filename.split('/')[-1])
+    zip_path = os.path.join(TEST_DIR, 'data_files', os.path.basename(zip_filename))
     s3Obj.s3.upload_file(Filename=str(zip_path), Bucket=s3Obj.outfile_bucket, Key=zip_filename)
-    zip_path2 = os.path.join('test', 'data_files', zip_filename2.split('/')[-1])
+    zip_path2 = os.path.join(TEST_DIR, 'data_files', os.path.basename(zip_filename2))
     s3Obj.s3.upload_file(Filename=str(zip_path2), Bucket=s3Obj.outfile_bucket, Key=zip_filename2)

     return {'s3Obj': s3Obj, 'filename': test_filename, 'zip_filename': zip_filename,
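
Building the fixture paths from TEST_DIR keeps them valid no matter which directory pytest is invoked from. A small sketch of the resulting path; the checkout location and zip key below are hypothetical:

    # Sketch only; the checkout location and zip key are hypothetical.
    import os

    TEST_DIR = os.path.join(os.path.dirname(__file__))  # e.g. '/home/user/dcicutils/test'
    zip_filename = '__test_data/fastqc_report.zip'       # hypothetical S3 key
    zip_path = os.path.join(TEST_DIR, 'data_files', os.path.basename(zip_filename))
    # -> '/home/user/dcicutils/test/data_files/fastqc_report.zip', independent of os.getcwd()
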
54 changes: 31 additions & 23 deletions test/test_s3_utils.py
@@ -184,59 +184,67 @@ def test_read_s3_zip(integrated_s3_info):


 def test_unzip_s3_to_s3(integrated_s3_info):
-    '''test for unzip_s3_to_s3 with case where there is a basdir'''
+    """test for unzip_s3_to_s3 with case where there is a basdir"""

     prefix = '__test_data/extracted'
     filename = integrated_s3_info['zip_filename']
-    integrated_s3_info['s3Obj'].s3_delete_dir(prefix)
+    s3_connection = integrated_s3_info['s3Obj']
+
+    # start with a clean test space
+    s3_connection.s3_delete_dir(prefix)

     # ensure this thing was deleted
     # if no files there will be no Contents in response
-    objs = integrated_s3_info['s3Obj'].s3_read_dir(prefix)
-    assert [] == objs.get('Contents', [])
+    objs = s3_connection.s3_read_dir(prefix)
+    assert not objs.get('Contents')

     # now copy to that dir we just deleted
-    ret_files = integrated_s3_info['s3Obj'].unzip_s3_to_s3(filename, prefix)
+    ret_files = s3_connection.unzip_s3_to_s3(filename, prefix)
     assert ret_files['fastqc_report.html']['s3key'].startswith("https://s3.amazonaws.com")

-    objs = integrated_s3_info['s3Obj'].s3_read_dir(prefix)
-    assert objs.get('Contents', None)
+    objs = s3_connection.s3_read_dir(prefix)
+    assert objs.get('Contents')


 def test_unzip_s3_to_s3_2(integrated_s3_info):
-    '''test for unzip_s3_to_s3 with case where there is no basdir'''
+    """test for unzip_s3_to_s3 with case where there is no basdir"""

     prefix = '__test_data/extracted'
     filename = integrated_s3_info['zip_filename2']
-    integrated_s3_info['s3Obj'].s3_delete_dir(prefix)
+    s3_connection = integrated_s3_info['s3Obj']
+
+    s3_connection.s3_delete_dir(prefix)

     # ensure this thing was deleted
     # if no files there will be no Contents in response
-    objs = integrated_s3_info['s3Obj'].s3_read_dir(prefix)
-    assert [] == objs.get('Contents', [])
+    objs = s3_connection.s3_read_dir(prefix)
+    assert not objs.get('Contents')

     # now copy to that dir we just deleted
-    ret_files = integrated_s3_info['s3Obj'].unzip_s3_to_s3(filename, prefix)
+    ret_files = s3_connection.unzip_s3_to_s3(filename, prefix)
     assert ret_files['qc_report.html']['s3key'].startswith("https://s3.amazonaws.com")
     assert ret_files['qc_report.html']['s3key'].endswith("qc_report.html")

-    objs = integrated_s3_info['s3Obj'].s3_read_dir(prefix)
-    assert objs.get('Contents', None)
+    objs = s3_connection.s3_read_dir(prefix)
+    assert objs.get('Contents')


-def test_unzip_s3_to_s3(integrated_s3_info):
-    '''test for unzip_s3_to_s3 with case where there is a basdir and
-    do_not_store=True'''
+def test_unzip_s3_to_s3_do_not_store(integrated_s3_info):
+    """test for unzip_s3_to_s3 with case where there is a basdir and do_not_store=True"""
     prefix = '__test_data/extracted'
     filename = integrated_s3_info['zip_filename']
-    integrated_s3_info['s3Obj'].s3_delete_dir(prefix)
+    s3_connection = integrated_s3_info['s3Obj']
+
+    s3_connection.s3_delete_dir(prefix)

     # ensure this thing was deleted
     # if no files there will be no Contents in response
-    objs = integrated_s3_info['s3Obj'].s3_read_dir(prefix)
-    assert [] == objs.get('Contents', [])
+    objs = s3_connection.s3_read_dir(prefix)
+    assert not objs.get('Contents')

     # now copy to that dir we just deleted
-    ret_files = integrated_s3_info['s3Obj'].unzip_s3_to_s3(filename, prefix, do_not_store=True)
+    ret_files = s3_connection.unzip_s3_to_s3(filename, prefix, do_not_store=True)
     assert len(ret_files) == 0  # no returned content

-    objs = integrated_s3_info['s3Obj'].s3_read_dir(prefix)
-    assert objs.get('Contents', None)
+    objs = s3_connection.s3_read_dir(prefix)
+    assert objs.get('Contents')
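
These tests run against live S3, so they assume AWS credentials for the integrated test buckets are available. One way to run just this group of tests programmatically, using pytest's -k filter (a sketch, not part of the change):

    # Sketch: run only the unzip tests; assumes credentials for the integrated buckets.
    import pytest

    pytest.main(['test/test_s3_utils.py', '-k', 'unzip_s3_to_s3'])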
