diff --git a/src/azul/lib/strings.py b/src/azul/lib/strings.py index dc93a4d91c..3c004bee7e 100644 --- a/src/azul/lib/strings.py +++ b/src/azul/lib/strings.py @@ -1,3 +1,5 @@ +import hashlib +import re from textwrap import ( dedent, ) @@ -6,6 +8,7 @@ Sequence, overload, ) +import unicodedata from more_itertools import ( minmax, @@ -500,3 +503,157 @@ def _redact(secret: str, *, num_show: int = 3, mask='REDACTED'): back = reveal // 2 front = reveal - back return secret[:front] + mask + secret[n - back:] + + +def project_slug(title: str, + *, + words_left: int | None, + words_right: int | None, + word_length: int | None, + hash_length: int + ) -> str: + """ + Return a collision-resistant slug derived from a project / dataset title. + + >>> import functools + >>> slug = functools.partial(project_slug, + ... words_left=5, + ... words_right=2, + ... word_length=12, + ... hash_length=6) + + >>> slug('') + '' + + >>> slug('FooBarBaz') + 'foobarbaz--pxidyq' + + Words are split on space or underscore + + >>> slug('Foo Bar_Baz') + 'foo-bar-baz--uft5x2' + + Extra spaces / underscore are collapsed, however it is the full original + title that is used to calculate the hash. + + >>> slug('Foo Bar__Baz') + 'foo-bar-baz--li3m7h' + + Long titles are shortened based on `words_left` and `words_right` params + + >>> slug("aa bb cc dd ee ff gg hh ii jj kk ll mm") + 'aa-bb-cc-dd-ee--ll-mm--6jv2x9' + + Each word is truncated to `word_length` letters + + >>> slug('abcdefghijklmnopqrstuvwxyz 12345678901234567890') + 'abcdefghijkl-123456789012--genvsj' + + Non-alphanumeric characters are removed (comma, apostrophe, dash, etc.) 
+ + >>> slug("The Hobbit, or There And Back Again A Hobbit's Tale") + 'the-hobbit-or-there-and--hobbits-tale--gmfr54' + + Accents are removed from accented characters + + >>> slug("Le Fantôme de l'Opéra") + 'le-fantome-de-lopera--r0ovr9' + + Non-latin letters are converted to a latin equivalent, symbols are removed + + >>> slug('α-waves β-blockers γ-rays ❤-healthy') + 'alphawaves-betablockers-gammarays-healthy--on4w79' + + `words_left`, `words_right`, or `hash_length` can be zero to omit that part + of the output + + >>> project_slug('one two three four five', + ... words_left=0, words_right=3, + ... word_length=4, hash_length=0) + 'thre-four-five' + + `word_length` cannot be zero ... + + >>> project_slug('FooBarBaz', + ... words_left=2, words_right=3, + ... word_length=0, hash_length=6) + Traceback (most recent call last): + ... + AssertionError: R('words_left must be 0 when word_length is 0', 2) + + ... unless `words_left` and `words_right` are also zero + + >>> project_slug('FooBarBaz', + ... words_left=0, words_right=0, + ... word_length=0, hash_length=6) + 'pxidyq' + + `None` can be used to indicate no limit on the number of words and/or length + of the words + + >>> project_slug('aa bb cccccccccccccccccccc dd ee ff gg hh ii jj', + ... words_left=None, words_right=None, + ... 
word_length=None, hash_length=4) + 'aa-bb-cccccccccccccccccccc-dd-ee-ff-gg-hh-ii-jj--xe1u' + """ + if title == '': + return title + + if word_length == 0: + assert words_left == 0, R( + 'words_left must be 0 when word_length is 0', words_left) + assert words_right == 0, R( + 'words_right must be 0 when word_length is 0', words_right) + assert hash_length > 0, R( + 'hash_length must be positive when word_length is 0', hash_length) + else: + assert word_length is None or word_length > 0, R( + 'word_length cannot be negative', word_length) + assert hash_length >= 0, R( + 'hash_length cannot be negative', hash_length) + if words_left is None or words_right is None: + assert words_left is None and words_right is None, R( + 'words_right and words_left must both be None if one is None', words_right) + else: + assert words_left >= 0, R( + 'words_left cannot be negative', words_left) + assert words_right >= 0, R( + 'words_right cannot be negative', words_right) + assert words_left != 0 or words_right != 0, R( + 'words_left and words_right cannot both be 0 when word_length is not 0', + words_left, words_right) + + chars = [] + for c in unicodedata.normalize('NFKD', title): + if not unicodedata.combining(c): + if unicodedata.category(c).startswith('L'): # L for a letter + name = unicodedata.name(c) + m = re.search(r'LETTER (\S+)', name) + if m: + c = m.group(1) + chars.append(c.lower()) + + words = re.split(r'[\s_]+', ''.join(chars)) + words = [re.sub(r'[^a-zA-Z0-9]', '', w) for w in words] + words = [w[:word_length] for w in words if w] + + parts = [] + if words_left is None or words_right is None or len(words) <= words_left + words_right: + parts.append('-'.join(words)) + else: + if words_left > 0: + parts.append('-'.join(words[0:words_left])) + if words_right > 0: + parts.append('-'.join(words[-words_right:])) + + if hash_length > 0: + alphabet = '0123456789abcdefghijklmnopqrstuvwxyz' + n = int.from_bytes(hashlib.sha1(title.encode()).digest()) + postfix = [] + for _ in 
range(hash_length): + n, r = divmod(n, len(alphabet)) + assert n > 0, R('hash_length exceeds digest entropy', hash_length) + postfix.append(alphabet[r]) + parts.append(''.join(postfix)) + + return '--'.join(filter(None, parts)) diff --git a/src/azul/plugins/__init__.py b/src/azul/plugins/__init__.py index ed3b95238a..48bd1caf2a 100644 --- a/src/azul/plugins/__init__.py +++ b/src/azul/plugins/__init__.py @@ -523,6 +523,14 @@ def special_fields(self) -> SpecialFields: """ raise NotImplementedError + @abstractmethod + def project_slug(self, document: JSON) -> str: + """ + A collision-resistant slug derived from the title of the project (HCA) + or dataset (AnVIL). + """ + raise NotImplementedError + @property def root_entity_type(self) -> EntityType: """ diff --git a/src/azul/plugins/metadata/anvil/__init__.py b/src/azul/plugins/metadata/anvil/__init__.py index e7fc27e12b..c4497d8cb5 100644 --- a/src/azul/plugins/metadata/anvil/__init__.py +++ b/src/azul/plugins/metadata/anvil/__init__.py @@ -31,6 +31,9 @@ FieldPathElement, IndexName, ) +from azul.lib import ( + strings, +) from azul.lib.digests import ( Digest, ) @@ -285,6 +288,18 @@ def _field_mapping(self) -> InverseFieldMapping: file_name=SpecialField(name='files.file_name', name_in_hit='file_name'), ) + def project_slug(self, document: JSON) -> str: + contents = json_mapping(document['contents']) + dataset = one(json_element_mappings(contents['datasets'])) + title = one(json_element_strings(dataset['title'])) + # AnVIL dataset titles are short enough that we don't need to limit the + # number of words or length of the words used in the slug. 
def reject_duplicate_file_names(self) -> None:
    """
    Assert that no two `anvil_file` entities in this bundle share a file
    name. Both the bundle's entities and its orphans are considered. The
    assertion failure carries the bundle's FQID and the sorted list of the
    offending file names.
    """
    occurrences = Counter()
    for table in (self.entities, self.orphans):
        for ref, row in table.items():
            if ref.entity_type == 'anvil_file':
                occurrences[row['file_name']] += 1
    repeated = [name for name in occurrences if occurrences[name] > 1]
    # Sort for a deterministic failure message
    repeated.sort()
    assert not repeated, R(
        'Bundle contains duplicate file names', self.fqid, repeated)
def test_reject_duplicate_file_names(self):
    """
    Fetching a bundle in which two file rows carry the same file name must
    fail with a requirement error that names the bundle and the duplicated
    file name.
    """
    ref = self.source.ref
    canned = self._load_canned_file_version(uuid=ref.id,
                                            version=None,
                                            extension='tables.tdr')
    tables = canned['tables']
    # Give the first two file rows the same name to provoke the rejection
    duplicate = 'dup-file-name-test.txt'
    rows = tables['anvil_file']['rows']
    for index in (0, 1):
        rows[index]['file_name'] = duplicate
    for table_name, table in tables.items():
        self._make_mock_table(ref.spec,
                              table_name,
                              table['rows'],
                              table.get('schema'))
    fqid = self.primary_bundle()
    with self.assertRaises(AssertionError) as cm:
        self.plugin.fetch_bundle(fqid)
    error = cm.exception
    self.assertTrue(R.caused(error))
    requirement = one(error.args)
    self.assertEqual(('Bundle contains duplicate file names', fqid, [duplicate]),
                     requirement.args)
'?catalog=test&version=2018-09-14T12%3A33%3A46.866929Z"', - 'output="f79257a7-dfc6-46d6-ae00-ba4b25313c10/TissueDissociationProtocol.pdf"', + f'output="{slug}/{bundle_fqid.uuid}/TissueDissociationProtocol.pdf"', '' ], [ f'url="{base_url}/5f9b45af-9a26-4b16-a785-7f2d1053dd7c' '?catalog=test&version=2018-09-14T12%3A33%3A47.012715Z"', - 'output="f79257a7-dfc6-46d6-ae00-ba4b25313c10/SmartSeq2_RTPCR_protocol.pdf"', + f'output="{slug}/{bundle_fqid.uuid}/SmartSeq2_RTPCR_protocol.pdf"', '' ], ] @@ -1814,6 +1817,7 @@ def test_curl_manifest(self): file_size_2 = 213021639 file_size_3 = 3306845592 cases = [-1, file_size_1, file_size_2, file_size_3] + slug = 'anvil-cmg-uwash-ds-bdis--m3tsk6' for i, mirror_limit in enumerate(cases, start=1): with self.subTest(mirror_limit=mirror_limit): with self._patch_mirror_limit(self.catalog, mirror_limit): @@ -1826,21 +1830,21 @@ def test_curl_manifest(self): *iif(file_size_2 <= mirror_limit, [[ f'url="{base_url}/15b76f9c-6b46-433f-851d-34e89f1b9ba6' + '?catalog=test&version=2022-06-01T00%3A00%3A00.000000Z"', - 'output="826dea02-e274-affe-aabc-eb3db63ad068/' + + f'output="{slug}/826dea02-e274-affe-aabc-eb3db63ad068/' + '307500.merged.matefixed.sorted.markeddups.recal.g.vcf.gz"', '' ]]), *iif(file_size_3 <= mirror_limit, [[ f'url="{base_url}/3b17377b-16b1-431c-9967-e5d01fc5923f' + '?catalog=test&version=2022-06-01T00%3A00%3A00.000000Z"', - 'output="826dea02-e274-affe-aabc-eb3db63ad068/' + + f'output="{slug}/826dea02-e274-affe-aabc-eb3db63ad068/' + '307500.merged.matefixed.sorted.markeddups.recal.bam"', '' ]]), *iif(file_size_1 <= mirror_limit, [[ f'url="{base_url}/6b0f6c0f-5d80-4242-accb-840921351cd5' + '?catalog=test&version=2022-06-01T00%3A00%3A00.000000Z"', - 'output="595c469e-604d-ab34-af39-f5b9f5d61818/' + + f'output="{slug}/595c469e-604d-ab34-af39-f5b9f5d61818/' + 'CCDG_13607_B01_GRM_WGS_2019-02-19_chr15.recalibrated_variants.annotated.coding.txt"', '' ]])