Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
157 changes: 157 additions & 0 deletions src/azul/lib/strings.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import hashlib
import re
from textwrap import (
dedent,
)
Expand All @@ -6,6 +8,7 @@
Sequence,
overload,
)
import unicodedata

from more_itertools import (
minmax,
Expand Down Expand Up @@ -500,3 +503,157 @@ def _redact(secret: str, *, num_show: int = 3, mask='REDACTED'):
back = reveal // 2
front = reveal - back
return secret[:front] + mask + secret[n - back:]


def project_slug(title: str,
*,
words_left: int | None,
words_right: int | None,
word_length: int | None,
hash_length: int
) -> str:
"""
Return a collision-resistant slug derived from a project / dataset title.

>>> import functools
>>> slug = functools.partial(project_slug,
... words_left=5,
... words_right=2,
... word_length=12,
... hash_length=6)

>>> slug('')
''

>>> slug('FooBarBaz')
'foobarbaz--pxidyq'

Words are split on space or underscore

>>> slug('Foo Bar_Baz')
'foo-bar-baz--uft5x2'

Extra spaces / underscore are collapsed, however it is the full original
title that is used to calculate the hash.

>>> slug('Foo Bar__Baz')
'foo-bar-baz--li3m7h'

Long titles are shortened based on `words_left` and `words_right` params

>>> slug("aa bb cc dd ee ff gg hh ii jj kk ll mm")
'aa-bb-cc-dd-ee--ll-mm--6jv2x9'

Each word is truncated to `word_length` letters

>>> slug('abcdefghijklmnopqrstuvwxyz 12345678901234567890')
'abcdefghijkl-123456789012--genvsj'

Non-alphanumeric characters are removed (comma, apostrophe, dash, etc.)

>>> slug("The Hobbit, or There And Back Again A Hobbit's Tale")
'the-hobbit-or-there-and--hobbits-tale--gmfr54'

Accents are removed from accented characters

>>> slug("Le Fantôme de l'Opéra")
'le-fantome-de-lopera--r0ovr9'

Non-latin letters are converted to a latin equivalent, symbols are removed

>>> slug('α-waves β-blockers γ-rays ❤-healthy')
'alphawaves-betablockers-gammarays-healthy--on4w79'

`words_left`, `words_right`, or `hash_length` can be zero to omit that part
of the output

>>> project_slug('one two three four five',
... words_left=0, words_right=3,
... word_length=4, hash_length=0)
'thre-four-five'

`word_length` cannot be zero ...

>>> project_slug('FooBarBaz',
... words_left=2, words_right=3,
... word_length=0, hash_length=6)
Traceback (most recent call last):
...
AssertionError: R('words_left must be 0 when word_length is 0', 2)

... unless `words_left` and `words_right` are also zero

>>> project_slug('FooBarBaz',
... words_left=0, words_right=0,
... word_length=0, hash_length=6)
'pxidyq'

`None` can be used to indicate no limit on the number of words and/or length
of the words

>>> project_slug('aa bb cccccccccccccccccccc dd ee ff gg hh ii jj',
... words_left=None, words_right=None,
... word_length=None, hash_length=4)
'aa-bb-cccccccccccccccccccc-dd-ee-ff-gg-hh-ii-jj--xe1u'
"""
if title == '':
return title

if word_length == 0:
assert words_left == 0, R(
'words_left must be 0 when word_length is 0', words_left)
assert words_right == 0, R(
'words_right must be 0 when word_length is 0', words_right)
assert hash_length > 0, R(
'hash_length must be positive when word_length is 0', hash_length)
else:
assert word_length is None or word_length > 0, R(
'word_length cannot be negative', word_length)
assert hash_length >= 0, R(
'hash_length cannot be negative', hash_length)
if words_left is None or words_right is None:
assert words_left is None and words_right is None, R(
'words_right and words_left must both be None if one is None', words_right)
else:
assert words_left >= 0, R(
'words_left cannot be negative', words_left)
assert words_right >= 0, R(
'words_right cannot be negative', words_right)
assert words_left != 0 or words_right != 0, R(
'words_left and words_right cannot both be 0 when word_length is not 0',
words_left, words_right)

chars = []
for c in unicodedata.normalize('NFKD', title):
if not unicodedata.combining(c):
if unicodedata.category(c).startswith('L'): # L for a letter
name = unicodedata.name(c)
m = re.search(r'LETTER (\S+)', name)
if m:
c = m.group(1)
chars.append(c.lower())

words = re.split(r'[\s_]+', ''.join(chars))
words = [re.sub(r'[^a-zA-Z0-9]', '', w) for w in words]
words = [w[:word_length] for w in words if w]

parts = []
if words_left is None or words_right is None or len(words) <= words_left + words_right:
parts.append('-'.join(words))
else:
if words_left > 0:
parts.append('-'.join(words[0:words_left]))
if words_right > 0:
parts.append('-'.join(words[-words_right:]))

if hash_length > 0:
alphabet = '0123456789abcdefghijklmnopqrstuvwxyz'
n = int.from_bytes(hashlib.sha1(title.encode()).digest())
postfix = []
for _ in range(hash_length):
n, r = divmod(n, len(alphabet))
assert n > 0, R('hash_length exceeds digest entropy', hash_length)
postfix.append(alphabet[r])
parts.append(''.join(postfix))

return '--'.join(filter(None, parts))
8 changes: 8 additions & 0 deletions src/azul/plugins/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -523,6 +523,14 @@ def special_fields(self) -> SpecialFields:
"""
raise NotImplementedError

@abstractmethod
def project_slug(self, document: JSON) -> str:
    """
    Derive a collision-resistant slug from the given document. The slug is
    based on the title of the project (HCA) or dataset (AnVIL) that the
    document refers to. Concrete metadata plugins must override this method.
    """
    raise NotImplementedError

@property
def root_entity_type(self) -> EntityType:
"""
Expand Down
15 changes: 15 additions & 0 deletions src/azul/plugins/metadata/anvil/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,9 @@
FieldPathElement,
IndexName,
)
from azul.lib import (
strings,
)
from azul.lib.digests import (
Digest,
)
Expand Down Expand Up @@ -285,6 +288,18 @@ def _field_mapping(self) -> InverseFieldMapping:
file_name=SpecialField(name='files.file_name', name_in_hit='file_name'),
)

def project_slug(self, document: JSON) -> str:
    """
    Return the slug for the one dataset listed in the contents of the given
    document. Exactly one dataset with exactly one title is expected.
    """
    contents = json_mapping(document['contents'])
    datasets = json_element_mappings(contents['datasets'])
    titles = json_element_strings(one(datasets)['title'])
    # AnVIL dataset titles are short enough that neither the number of
    # words nor the length of each word needs to be limited in the slug.
    no_limit = None
    return strings.project_slug(one(titles),
                                words_left=no_limit,
                                words_right=no_limit,
                                word_length=no_limit,
                                hash_length=6)

@property
def root_entity_type(self) -> str:
return 'datasets'
Expand Down
16 changes: 16 additions & 0 deletions src/azul/plugins/metadata/hca/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,9 @@
from attrs import (
frozen,
)
from more_itertools import (
one,
)

from azul import (
CatalogName,
Expand All @@ -25,6 +28,7 @@
)
from azul.lib import (
R,
strings,
)
from azul.lib.digests import (
Digest,
Expand All @@ -37,6 +41,8 @@
MutableJSON,
json_dict,
json_dict_of_dicts,
json_element_mappings,
json_element_strings,
json_int,
json_list,
json_mapping,
Expand Down Expand Up @@ -318,6 +324,16 @@ def _field_mapping(self) -> InverseFieldMapping:
file_name=SpecialField(name='fileName', name_in_hit='name')
)

def project_slug(self, document: JSON) -> str:
    """
    Return the slug for the one project listed in the contents of the given
    document. Exactly one project with exactly one title is expected. The
    slug keeps at most the first five and last two words of the title,
    each truncated to twelve characters, followed by a six-digit hash.
    """
    contents = json_mapping(document['contents'])
    projects = json_element_mappings(contents['projects'])
    titles = json_element_strings(one(projects)['project_title'])
    return strings.project_slug(one(titles),
                                words_left=5,
                                words_right=2,
                                word_length=12,
                                hash_length=6)

@property
def root_entity_type(self) -> str:
return 'projects'
Expand Down
22 changes: 18 additions & 4 deletions src/azul/plugins/repository/tdr_anvil/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from collections import (
Counter,
defaultdict,
)
from collections.abc import (
Expand Down Expand Up @@ -219,6 +220,17 @@ def add_entity(self,
metadata.update(drs_uri=drs_uri)
target[entity] = metadata

def reject_duplicate_file_names(self) -> None:
    """
    Assert that no two `anvil_file` entities in this bundle share the same
    file name. Both the regular entities and the orphans are considered.

    :raises AssertionError: if any file name occurs more than once. The
                            bundle FQID and the sorted list of offending
                            names are attached to the error.
    """
    name_counts = Counter()
    every_entity = itertools.chain(self.entities.items(),
                                   self.orphans.items())
    for entity, metadata in every_entity:
        # Only files carry a file name, other entity types are skipped
        if entity.entity_type == 'anvil_file':
            name_counts[metadata['file_name']] += 1
    duplicates = sorted(
        name
        for name, count in name_counts.items()
        if count > 1
    )
    assert not duplicates, R(
        'Bundle contains duplicate file names', self.fqid, duplicates)

def add_links(self, links: Iterable[EntityLink]):
self.links.update(links)
# Merge links that share the same (non-null) activity
Expand Down Expand Up @@ -379,17 +391,19 @@ def missing_md5(row: BigQueryRow) -> bool:
def _emulate_bundle(self, bundle_fqid: TDRAnvilBundleFQID) -> TDRAnvilBundle:
    """
    Build the bundle for the given FQID, choosing the kind of bundle based
    on the table name in the FQID: primary, supplementary, DUOS or, for any
    other table name, replica.

    Every branch assigns the bundle instead of returning it right away so
    that the check for duplicate file names is applied to all kinds of
    bundles before the bundle is returned.

    :param bundle_fqid: the FQID of the bundle to build

    :return: the bundle, after it passed the duplicate file name check

    :raises AssertionError: if a DUOS bundle is requested while no DUOS
                            service URL is configured, or if the bundle
                            contains duplicate file names
    """
    if bundle_fqid.table_name == BundleType.primary.value:
        log.info('Bundle %r is a primary bundle', bundle_fqid.uuid)
        bundle = self._primary_bundle(bundle_fqid)
    elif bundle_fqid.table_name == BundleType.supplementary.value:
        log.info('Bundle %r is a supplementary bundle', bundle_fqid.uuid)
        bundle = self._supplementary_bundle(bundle_fqid)
    elif bundle_fqid.table_name == BundleType.duos.value:
        assert config.duos_service_url is not None, bundle_fqid
        log.info('Bundle %r is a DUOS bundle', bundle_fqid.uuid)
        bundle = self._duos_bundle(bundle_fqid)
    else:
        log.info('Bundle %r is a replica bundle', bundle_fqid.uuid)
        bundle = self._replica_bundle(bundle_fqid)
    bundle.reject_duplicate_file_names()
    return bundle

def _batch_tables(self,
source: TDRSourceSpec,
Expand Down
9 changes: 5 additions & 4 deletions src/azul/service/manifest_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -1570,7 +1570,7 @@ def write_page_to(self,
output: IO[str]
) -> ManifestPartition:

def _write(file: JSON, is_related_file: bool = False):
def _write(file: JSON, slug: str, is_related_file: bool = False):
special_fields = self.metadata_plugin.special_fields
file_name_field = special_fields.file_name.name_in_hit
file_uuid_field = special_fields.file_uuid.name_in_hit
Expand Down Expand Up @@ -1600,7 +1600,7 @@ def _write(file: JSON, is_related_file: bool = False):
# the one with the most recent version.
bundle = max(json_element_mappings(doc['bundles']),
key=itemgetter('version', 'uuid'))
output_name = json_str(bundle['uuid']) + '/' + file_name
output_name = slug + '/' + json_str(bundle['uuid']) + '/' + file_name
output_name = self._sanitize_path(output_name)
output.write(f'url={self._option(file_url)}\n'
f'output={self._option(output_name)}\n\n')
Expand Down Expand Up @@ -1628,6 +1628,7 @@ def _write(file: JSON, is_related_file: bool = False):
hit = None
for hit in response.hits:
doc = self._hit_to_doc(hit)
slug = self.metadata_plugin.project_slug(doc)
contents = json_mapping(doc['contents'])
files = json_sequence(contents['files'])
file = json_mapping(one(files))
Expand All @@ -1650,10 +1651,10 @@ def _write(file: JSON, is_related_file: bool = False):
not config.is_anvil_enabled(self.catalog)
or self.mirror_service.will_mirror(source.spec, json_int(file['file_size']))
):
_write(file)
_write(file, slug)
if config.is_hca_enabled(self.catalog):
for related_file in json_element_mappings(file['related_files']):
_write(related_file, is_related_file=True)
_write(related_file, slug, is_related_file=True)
assert hit is not None
return partition.next_page(file_name=None,
search_after=self._search_after(hit))
Expand Down
22 changes: 22 additions & 0 deletions test/indexer/test_anvil.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,9 @@
DocumentType,
EntityReference,
)
from azul.lib import (
R,
)
from azul.lib.types import (
JSONs,
MutableJSONs,
Expand Down Expand Up @@ -212,6 +215,25 @@ def test_absent_duos_id(self):
self.assertEqual({}, bundle.entities)
self.assertEqual(1, len(bundle.orphans))

def test_reject_duplicate_file_names(self):
    """
    Fetching a bundle in which two files share the same name must fail with
    a requirement error that lists the duplicated name.
    """
    source_ref = self.source.ref
    canned_file = self._load_canned_file_version(uuid=source_ref.id,
                                                 version=None,
                                                 extension='tables.tdr')
    tables = canned_file['tables']
    # Give the first two files the same name to provoke the failure
    file_name = 'dup-file-name-test.txt'
    file_rows = tables['anvil_file']['rows']
    for index in (0, 1):
        file_rows[index]['file_name'] = file_name
    for name, table in tables.items():
        self._make_mock_table(source_ref.spec,
                              name,
                              table['rows'],
                              table.get('schema'))
    bundle_fqid = self.primary_bundle()
    with self.assertRaises(AssertionError) as cm:
        self.plugin.fetch_bundle(bundle_fqid)
    error = cm.exception
    self.assertTrue(R.caused(error))
    expected = ('Bundle contains duplicate file names', bundle_fqid, [file_name])
    self.assertEqual(expected, one(error.args).args)


class TestAnvilIndexerWithIndexesSetUp(AnvilIndexerTestCase):
"""
Expand Down
Loading
Loading