DataBiosphere · dsotirho-ucsc · May 28, 2026 · May 27, 2026
@@ -1,3 +1,5 @@
+import hashlib
+import re
 from textwrap import (
     dedent,
 )
@@ -6,6 +8,7 @@
     Sequence,
     overload,
 )
+import unicodedata
 
 from more_itertools import (
     minmax,
@@ -500,3 +503,157 @@ def _redact(secret: str, *, num_show: int = 3, mask='REDACTED'):
     back = reveal // 2
     front = reveal - back
     return secret[:front] + mask + secret[n - back:]
+
+
+def project_slug(title: str,
+                 *,
+                 words_left: int | None,
+                 words_right: int | None,
+                 word_length: int | None,
+                 hash_length: int
+                 ) -> str:
+    """
+    Return a collision-resistant slug derived from a project / dataset title.
+
+    >>> import functools
+    >>> slug = functools.partial(project_slug,
+    ...                          words_left=5,
+    ...                          words_right=2,
+    ...                          word_length=12,
+    ...                          hash_length=6)
+
+    >>> slug('')
+    ''
+
+    >>> slug('FooBarBaz')
+    'foobarbaz--pxidyq'
+
+    Words are split on whitespace or underscore
+
+    >>> slug('Foo Bar_Baz')
+    'foo-bar-baz--uft5x2'
+
+    Extra spaces / underscore are collapsed, however it is the full original
+    title that is used to calculate the hash.
+
+    >>> slug('Foo  Bar__Baz')
+    'foo-bar-baz--li3m7h'
+
+    Long titles are shortened based on `words_left` and `words_right` params
+
+    >>> slug("aa bb cc dd ee ff gg hh ii jj kk ll mm")
+    'aa-bb-cc-dd-ee--ll-mm--6jv2x9'
+
+    Each word is truncated to `word_length` letters
+
+    >>> slug('abcdefghijklmnopqrstuvwxyz 12345678901234567890')
+    'abcdefghijkl-123456789012--genvsj'
+
+    Non-alphanumeric characters are removed (comma, apostrophe, dash, etc.)
+
+    >>> slug("The Hobbit, or There And Back Again A Hobbit's Tale")
+    'the-hobbit-or-there-and--hobbits-tale--gmfr54'
+
+    Accents are removed from accented characters
+
+    >>> slug("Le Fantôme de l'Opéra")
+    'le-fantome-de-lopera--r0ovr9'
+
+    Non-latin letters are converted to a latin equivalent, symbols are removed
+
+    >>> slug('α-waves β-blockers γ-rays ❤-healthy')
+    'alphawaves-betablockers-gammarays-healthy--on4w79'
+
+    `words_left`, `words_right`, or `hash_length` can be zero to omit that part
+    of the output
+
+    >>> project_slug('one two three four five',
+    ...              words_left=0, words_right=3,
+    ...              word_length=4, hash_length=0)
+    'thre-four-five'
+
+    `word_length` cannot be zero ...
+
+    >>> project_slug('FooBarBaz',
+    ...              words_left=2, words_right=3,
+    ...              word_length=0, hash_length=6)
+    Traceback (most recent call last):
+    ...
+    AssertionError: R('words_left must be 0 when word_length is 0', 2)
+
+    ... unless `words_left` and `words_right` are also zero
+
+    >>> project_slug('FooBarBaz',
+    ...              words_left=0, words_right=0,
+    ...              word_length=0, hash_length=6)
+    'pxidyq'
+
+    `None` can be used to indicate no limit on the number of words and/or length
+    of the words
+
+    >>> project_slug('aa bb cccccccccccccccccccc dd ee ff gg hh ii jj',
+    ...              words_left=None, words_right=None,
+    ...              word_length=None, hash_length=4)
+    'aa-bb-cccccccccccccccccccc-dd-ee-ff-gg-hh-ii-jj--xe1u'
+    """
+    if title == '':
+        return title
+
+    if word_length == 0:
+        assert words_left == 0, R(
+            'words_left must be 0 when word_length is 0', words_left)
+        assert words_right == 0, R(
+            'words_right must be 0 when word_length is 0', words_right)
+        assert hash_length > 0, R(
+            'hash_length must be positive when word_length is 0', hash_length)
+    else:
+        assert word_length is None or word_length > 0, R(
+            'word_length cannot be negative', word_length)
+        assert hash_length >= 0, R(
+            'hash_length cannot be negative', hash_length)
+        if words_left is None or words_right is None:
+            assert words_left is None and words_right is None, R(
+                'words_right and words_left must both be None if one is None', words_right)
+        else:
+            assert words_left >= 0, R(
+                'words_left cannot be negative', words_left)
+            assert words_right >= 0, R(
+                'words_right cannot be negative', words_right)
+            assert words_left != 0 or words_right != 0, R(
+                'words_left and words_right cannot both be 0 when word_length is not 0',
+                words_left, words_right)
+
+    chars = []
+    for c in unicodedata.normalize('NFKD', title):  # detach accents from their base letter
+        if not unicodedata.combining(c):  # skip the detached combining marks
+            if unicodedata.category(c).startswith('L'):  # L for a letter
+                name = unicodedata.name(c, '')  # '' default: an unnamed letter must not raise
+                m = re.search(r'LETTER (\S+)', name)
+                if m:
+                    c = m.group(1)  # e.g. 'ALPHA' out of 'GREEK SMALL LETTER ALPHA'
+            chars.append(c.lower())
+
+    words = re.split(r'[\s_]+', ''.join(chars))
+    words = [re.sub(r'[^a-zA-Z0-9]', '', w) for w in words]
+    words = [w[:word_length] for w in words if w]  # slicing with None keeps the whole word
+
+    parts = []
+    if words_left is None or words_right is None or len(words) <= words_left + words_right:
+        parts.append('-'.join(words))
+    else:
+        if words_left > 0:
+            parts.append('-'.join(words[0:words_left]))
+        if words_right > 0:
+            parts.append('-'.join(words[-words_right:]))
+
+    if hash_length > 0:
+        alphabet = '0123456789abcdefghijklmnopqrstuvwxyz'
+        n = int.from_bytes(hashlib.sha1(title.encode()).digest())  # digest of the original title
+        postfix = []
+        for _ in range(hash_length):
+            n, r = divmod(n, len(alphabet))  # peel off one base-36 digit per iteration
+            assert n > 0, R('hash_length exceeds digest entropy', hash_length)
+            postfix.append(alphabet[r])
+        parts.append(''.join(postfix))
+
+    return '--'.join(filter(None, parts))  # empty parts are left out
@@ -523,6 +523,14 @@ def special_fields(self) -> SpecialFields:
         """
         raise NotImplementedError
 
+    @abstractmethod
+    def project_slug(self, document: JSON) -> str:
+        """
+        Return a collision-resistant slug derived from the title of the project
+        (HCA) or dataset (AnVIL) found in the contents of the given document.
+        """
+        raise NotImplementedError
+
     @property
     def root_entity_type(self) -> EntityType:
         """

@@ -31,6 +31,9 @@
     FieldPathElement,
     IndexName,
 )
+from azul.lib import (
+    strings,
+)
 from azul.lib.digests import (
     Digest,
 )
@@ -285,6 +288,18 @@ def _field_mapping(self) -> InverseFieldMapping:
         file_name=SpecialField(name='files.file_name', name_in_hit='file_name'),
     )
 
+    def project_slug(self, document: JSON) -> str:
+        datasets = json_mapping(document['contents'])['datasets']
+        only_dataset = one(json_element_mappings(datasets))
+        dataset_title = one(json_element_strings(only_dataset['title']))
+        # The slug comes from the sole dataset's title. AnVIL titles are short
+        # enough that neither word count nor word length needs to be limited.
+        return strings.project_slug(dataset_title,
+                                    hash_length=6,
+                                    word_length=None,
+                                    words_right=None,
+                                    words_left=None)
+
     @property
     def root_entity_type(self) -> str:
         return 'datasets'

@@ -9,6 +9,9 @@
 from attrs import (
     frozen,
 )
+from more_itertools import (
+    one,
+)
 
 from azul import (
     CatalogName,
@@ -25,6 +28,7 @@
 )
 from azul.lib import (
     R,
+    strings,
 )
 from azul.lib.digests import (
     Digest,
@@ -37,6 +41,8 @@
     MutableJSON,
     json_dict,
     json_dict_of_dicts,
+    json_element_mappings,
+    json_element_strings,
     json_int,
     json_list,
     json_mapping,
@@ -318,6 +324,16 @@ def _field_mapping(self) -> InverseFieldMapping:
         file_name=SpecialField(name='fileName', name_in_hit='name')
     )
 
+    def project_slug(self, document: JSON) -> str:
+        projects = json_mapping(document['contents'])['projects']
+        only_project = one(json_element_mappings(projects))
+        project_title = one(json_element_strings(only_project['project_title']))
+        return strings.project_slug(project_title,
+                                    hash_length=6,
+                                    word_length=12,  # characters kept per word
+                                    words_right=2,  # trailing words kept
+                                    words_left=5)  # leading words kept
+
     @property
     def root_entity_type(self) -> str:
         return 'projects'

@@ -1,4 +1,5 @@
 from collections import (
+    Counter,
     defaultdict,
 )
 from collections.abc import (
@@ -219,6 +220,17 @@ def add_entity(self,
             metadata.update(drs_uri=drs_uri)
         target[entity] = metadata
 
+    def reject_duplicate_file_names(self) -> None:
+        # Files are counted across both the regular entities and the orphans
+        all_items = itertools.chain(self.entities.items(), self.orphans.items())
+        tally = Counter(fields['file_name']
+                        for ref, fields in all_items
+                        if ref.entity_type == 'anvil_file')
+        # Sorted so that the reported list has a stable order
+        duplicates = sorted(name for name, n in tally.items() if n > 1)
+        assert not duplicates, R(
+            'Bundle contains duplicate file names', self.fqid, duplicates)
+
     def add_links(self, links: Iterable[EntityLink]):
         self.links.update(links)
         # Merge links that share the same (non-null) activity
@@ -379,17 +391,19 @@ def missing_md5(row: BigQueryRow) -> bool:
     def _emulate_bundle(self, bundle_fqid: TDRAnvilBundleFQID) -> TDRAnvilBundle:
         if bundle_fqid.table_name == BundleType.primary.value:
             log.info('Bundle %r is a primary bundle', bundle_fqid.uuid)
-            return self._primary_bundle(bundle_fqid)
+            bundle = self._primary_bundle(bundle_fqid)
         elif bundle_fqid.table_name == BundleType.supplementary.value:
             log.info('Bundle %r is a supplementary bundle', bundle_fqid.uuid)
-            return self._supplementary_bundle(bundle_fqid)
+            bundle = self._supplementary_bundle(bundle_fqid)
         elif bundle_fqid.table_name == BundleType.duos.value:
             assert config.duos_service_url is not None, bundle_fqid
             log.info('Bundle %r is a DUOS bundle', bundle_fqid.uuid)
-            return self._duos_bundle(bundle_fqid)
+            bundle = self._duos_bundle(bundle_fqid)
         else:
             log.info('Bundle %r is a replica bundle', bundle_fqid.uuid)
-            return self._replica_bundle(bundle_fqid)
+            bundle = self._replica_bundle(bundle_fqid)
+        bundle.reject_duplicate_file_names()  # runs for every bundle type above
+        return bundle
 
     def _batch_tables(self,
                       source: TDRSourceSpec,

@@ -1570,7 +1570,7 @@ def write_page_to(self,
                       output: IO[str]
                       ) -> ManifestPartition:
 
-        def _write(file: JSON, is_related_file: bool = False):
+        def _write(file: JSON, slug: str, is_related_file: bool = False):
             special_fields = self.metadata_plugin.special_fields
             file_name_field = special_fields.file_name.name_in_hit
             file_uuid_field = special_fields.file_uuid.name_in_hit
@@ -1600,7 +1600,7 @@ def _write(file: JSON, is_related_file: bool = False):
                 # the one with the most recent version.
                 bundle = max(json_element_mappings(doc['bundles']),
                              key=itemgetter('version', 'uuid'))
-                output_name = json_str(bundle['uuid']) + '/' + file_name
+                output_name = slug + '/' + json_str(bundle['uuid']) + '/' + file_name
                 output_name = self._sanitize_path(output_name)
                 output.write(f'url={self._option(file_url)}\n'
                              f'output={self._option(output_name)}\n\n')
@@ -1628,6 +1628,7 @@ def _write(file: JSON, is_related_file: bool = False):
             hit = None
             for hit in response.hits:
                 doc = self._hit_to_doc(hit)
+                slug = self.metadata_plugin.project_slug(doc)
                 contents = json_mapping(doc['contents'])
                 files = json_sequence(contents['files'])
                 file = json_mapping(one(files))
@@ -1650,10 +1651,10 @@ def _write(file: JSON, is_related_file: bool = False):
                     not config.is_anvil_enabled(self.catalog)
                     or self.mirror_service.will_mirror(source.spec, json_int(file['file_size']))
                 ):
-                    _write(file)
+                    _write(file, slug)
                     if config.is_hca_enabled(self.catalog):
                         for related_file in json_element_mappings(file['related_files']):
-                            _write(related_file, is_related_file=True)
+                            _write(related_file, slug, is_related_file=True)
             assert hit is not None
             return partition.next_page(file_name=None,
                                        search_after=self._search_after(hit))

@@ -36,6 +36,9 @@
     DocumentType,
     EntityReference,
 )
+from azul.lib import (
+    R,
+)
 from azul.lib.types import (
     JSONs,
     MutableJSONs,
@@ -212,6 +215,25 @@ def test_absent_duos_id(self):
                     self.assertEqual({}, bundle.entities)
                     self.assertEqual(1, len(bundle.orphans))
 
+    def test_reject_duplicate_file_names(self):
+        duplicate = 'dup-file-name-test.txt'
+        ref = self.source.ref
+        canned = self._load_canned_file_version(uuid=ref.id,
+                                                version=None,
+                                                extension='tables.tdr')
+        # Force a name collision between the first two file rows
+        rows = canned['tables']['anvil_file']['rows']
+        for index in (0, 1):
+            rows[index]['file_name'] = duplicate
+        for table_name, table in canned['tables'].items():
+            self._make_mock_table(ref.spec, table_name, table['rows'], table.get('schema'))
+        fqid = self.primary_bundle()
+        with self.assertRaises(AssertionError) as cm:
+            self.plugin.fetch_bundle(fqid)
+        self.assertTrue(R.caused(cm.exception))
+        self.assertEqual(('Bundle contains duplicate file names', fqid, [duplicate]),
+                         one(cm.exception.args).args)
+
 
 class TestAnvilIndexerWithIndexesSetUp(AnvilIndexerTestCase):
     """