Skip to content

Commit

Permalink
WASAPI Include 'open' WARCs (#754)
Browse files Browse the repository at this point in the history
* wasapi tweaks:
- if recording is active, set 'is_active' file field to true, indicating WARC *may* change in the future.
- if recording is active and there is pending traffic (pending size > 0), also append '.open' to the filename (giving '.warc.gz.open') to indicate WARC *will* change in the future.
- for active WARCs include only local path, md5 hash
- rename 'etag' -> 's3etag' hash type for clarity
- if S3 etag has no '-', indicate that it's really an md5; otherwise report it as s3etag
  • Loading branch information
ikreymer committed Oct 10, 2019
1 parent 1326190 commit 3100054
Show file tree
Hide file tree
Showing 6 changed files with 37 additions and 21 deletions.
2 changes: 1 addition & 1 deletion webrecorder/test/test_api_user_login.py
Expand Up @@ -493,7 +493,7 @@ def test_openapi_spec_json(self):

get = web_data.get('get')
assert get is not None
assert len(get.get('parameters')) == 3
assert len(get.get('parameters')) == 2
assert len(get.get('tags')) == 1
assert 'WASAPI' in get.get('responses').get('200').get('description')
assert get.get('tags')[0] == 'WASAPI (Downloads)'
Expand Down
11 changes: 9 additions & 2 deletions webrecorder/test/test_register_migrate.py
Expand Up @@ -304,10 +304,15 @@ def test_logged_in_download(self):

def test_wasapi_list(self):
res = self.testapp.get('/api/v1/download/webdata')
assert len(res.json['files']) == 1
assert len(res.json['files']) == 2
assert res.json['files'][0]['checksums']
assert res.json['files'][0]['locations']

assert res.json['files'][1]['checksums']
assert res.json['files'][1]['locations']

assert sum((1 if val['is_active'] else 0 for val in res.json['files']), 0) == 1

wasapi_filename = res.json['files'][0]['locations'][0]
res = self.testapp.head(urlsplit(wasapi_filename).path)

Expand Down Expand Up @@ -463,10 +468,12 @@ def test_logged_out_wasapi_list_basic_auth(self):
res = self.testapp.get('/api/v1/download/webdata')
self.testapp.authorization = None

assert len(res.json['files']) == 1
assert len(res.json['files']) == 3
assert res.json['files'][0]['checksums']
assert res.json['files'][0]['locations']

assert sum((1 if val['is_active'] else 0 for val in res.json['files']), 0) == 2

wasapi_filename = res.json['files'][0]['locations'][0]

# 404 without basic auth
Expand Down
13 changes: 9 additions & 4 deletions webrecorder/test/test_storage_commit.py
Expand Up @@ -128,14 +128,14 @@ def test_wasapi_list(self):
result = self.redis.hgetall('c:{coll}:warc'.format(coll=COLL_ID))
assert res.json['files'][0].get('filename') == list(result.keys())[0]

self.assert_wasapi_locations(res.json['files'][0].get('locations', []), verify_only=True)
self.assert_wasapi_locations(res.json['files'][0], verify_only=True)

def test_wasapi_download(self):
assert self.redis.hget(REC_INFO, '@index_file') is not None
params = {'user': 'test'}
res = self.testapp.get('/api/v1/download/webdata', params=params)

self.assert_wasapi_locations(res.json['files'][0].get('locations', []), verify_only=False)
self.assert_wasapi_locations(res.json['files'][0], verify_only=False)

def test_create_new_coll(self):
# Collection
Expand Down Expand Up @@ -222,7 +222,9 @@ def assert_warc_key(self, key):
storage_dir = os.environ['STORAGE_ROOT'].replace(os.path.sep, '/')
assert storage_dir in key

def assert_wasapi_locations(self, locations, verify_only=True):
def assert_wasapi_locations(self, file_entry, verify_only=True):
locations = file_entry.get('locations', [])
assert list(file_entry.get('checksums').keys())[0] == 'md5'
assert len(locations) == 1
if verify_only:
return
Expand Down Expand Up @@ -301,7 +303,10 @@ def assert_deleted(self):
def assert_warc_key(self, key):
assert key.startswith(os.environ['S3_ROOT'])

def assert_wasapi_locations(self, locations, verify_only=True):
def assert_wasapi_locations(self, file_entry, verify_only=True):
locations = file_entry.get('locations', [])
hash_type = list(file_entry.get('checksums').keys())[0]
assert hash_type == 's3etag' or hash_type == 'md5'
assert len(locations) == 2
if verify_only:
return
Expand Down
2 changes: 1 addition & 1 deletion webrecorder/webrecorder/apiutils.py
Expand Up @@ -96,7 +96,6 @@ class WRAPISpec(object):
}

opt_bool_params = {
'commit': 'Force all non-committed recording to become committed',
'public': 'Publicly Accessible',
'include_recordings': 'Include Recording Sessions in response',
'include_lists': 'Include all lists in response',
Expand Down Expand Up @@ -139,6 +138,7 @@ class WRAPISpec(object):
'collection': {'type': 'string'},
'checksums': {'type': 'object'},
'locations': {'type': 'array', 'items': {'type': 'string'}},
'is_active': {'type': 'boolean'},
}
}
},
Expand Down
26 changes: 14 additions & 12 deletions webrecorder/webrecorder/downloadcontroller.py
Expand Up @@ -9,6 +9,7 @@
from webrecorder.apiutils import wr_api_spec
from webrecorder.models.stats import Stats
from webrecorder.utils import get_bool
from webrecorder.rec.storage import LocalFileStorage

from bottle import response, request
from six.moves.urllib.parse import quote
Expand Down Expand Up @@ -50,7 +51,7 @@ def logged_in_download_coll_warc(user, coll):

@self.app.get('/api/v1/download/webdata')
@self.api(
query=['?user', '?collection', '?commit'],
query=['?user', '?collection'],
resp='wasapi_list',
description='List all files available for download, their locations and checksums, per WASAPI spec'
)
Expand Down Expand Up @@ -224,7 +225,6 @@ def wasapi_list(self):

# some clients use collection rather than coll_name so we must check for both
coll_name = request.query.getunicode('collection')
commit = get_bool(request.query.getunicode('commit'))

user = self._get_wasapi_user()

Expand All @@ -244,18 +244,15 @@ def wasapi_list(self):

files = []
download_path = self.get_origin() + '/api/v1/download/{user}/{coll}/{filename}'
local_storage = LocalFileStorage(self.redis)

for collection in colls:
if commit:
commit_id = collection.commit_all()
while commit_id:
gevent.sleep(10)
commit_id = collection.commit_all(commit_id)
commit_storage = collection.get_storage()

storage = collection.get_storage()
for recording in collection.get_recordings():
if not recording.is_fully_committed():
continue
is_committed = recording.is_fully_committed()
is_open = not is_committed and recording.get_pending_count() > 0
storage = commit_storage if is_committed else local_storage

for name, path in recording.iter_all_files(include_index=False):
full_warc_path = collection.get_warc_path(name)
Expand All @@ -265,11 +262,15 @@ def wasapi_list(self):

# if remote download url exists (eg. for s3), include that first
# always include local download url as well
if remote_download_url:
if remote_download_url and is_committed:
locations = [remote_download_url, local_download]
else:
locations = [local_download]

# add .open if current pending requests, checksum will likely change
if is_open:
name += '.open'

kind, check_sum, size = storage.get_checksum_and_size(full_warc_path)
files.append({
'content-type': 'application/warc',
Expand All @@ -281,9 +282,10 @@ def wasapi_list(self):
'collection': collection.name,
'checksums': {kind: check_sum},
'locations': locations,
'is_active': not is_committed
})

return {'files': files, 'include-extra': True}
return {'files': files, 'include-extra': len(files) > 0}

def wasapi_download(self, username, coll_name, filename):
user = self._get_wasapi_user(username)
Expand Down
4 changes: 3 additions & 1 deletion webrecorder/webrecorder/rec/storage/s3.py
Expand Up @@ -158,6 +158,8 @@ def get_checksum_and_size(self, filepath_or_url):
res = self.s3.head_object(Bucket=self.bucket_name,
Key=path)
# strip off quotes and return md5
return 'etag', res['ETag'][1:-1], res['ContentLength']
etag = res['ETag'][1:-1]
kind = 's3etag' if '-' in etag else 'md5'
return kind, etag, res['ContentLength']
except Exception:
return None, None, None

0 comments on commit 3100054

Please sign in to comment.