diff --git a/src/ensembl/production/metadata/api/models/dataset.py b/src/ensembl/production/metadata/api/models/dataset.py index 7f61b515..86b4e7c1 100644 --- a/src/ensembl/production/metadata/api/models/dataset.py +++ b/src/ensembl/production/metadata/api/models/dataset.py @@ -15,11 +15,12 @@ from sqlalchemy.sql import func import datetime import uuid +import logging from ensembl.production.metadata.api.exceptions import MissingMetaException from ensembl.production.metadata.api.models.base import Base, LoadAble - +logger = logging.getLogger(__name__) class Attribute(LoadAble, Base): __tablename__ = 'attribute' @@ -35,6 +36,7 @@ class Attribute(LoadAble, Base): # many to one relationships # none + class Dataset(LoadAble, Base): __tablename__ = 'dataset' @@ -50,7 +52,8 @@ class Dataset(LoadAble, Base): # One to many relationships # dataset_id to dataset attribute and genome dataset - dataset_attributes = relationship("DatasetAttribute", back_populates='dataset', cascade="all, delete, delete-orphan") + dataset_attributes = relationship("DatasetAttribute", back_populates='dataset', + cascade="all, delete, delete-orphan") genome_datasets = relationship("GenomeDataset", back_populates='dataset', cascade="all, delete, delete-orphan") # many to one relationships # dataset_type_id to dataset_type @@ -66,12 +69,14 @@ def genebuild_version(self): return next( (att.value for att in self.dataset_attributes if att.attribute.name == 'genebuild.last_geneset_update'), - next((att.value for att in self.dataset_attributes if att.attribute.name == 'genebuild.start_date'), None)) + next((att.value for att in self.dataset_attributes if att.attribute.name == 'genebuild.start_date'), + None)) else: # return Related genebuild version logger.debug(F"Related datasets! : {self.genome_datasets.datasets}") genebuild_ds = next( - (dataset for dataset in self.genome_datasets.datasets if dataset.dataset_type.name == 'genebuild'), None) + (dataset for dataset in self.genome_datasets.datasets if dataset.dataset_type.name == 'genebuild'), + None) if genebuild_ds: return genebuild_ds.genebuild_version else: @@ -125,4 +130,3 @@ class DatasetType(LoadAble, Base): datasets = relationship('Dataset', back_populates='dataset_type') # many to one relationships # none - diff --git a/src/ensembl/production/metadata/api/models/genome.py b/src/ensembl/production/metadata/api/models/genome.py index 6660ae07..a59ca784 100644 --- a/src/ensembl/production/metadata/api/models/genome.py +++ b/src/ensembl/production/metadata/api/models/genome.py @@ -56,11 +56,9 @@ def get_public_path(self, dataset_type='all', release=None): da.attribute.name == "genebuild.annotation_source"), 'ensembl') # Genebuild version is either the laste_geneset_update or the start_date if not specified. - genebuild_version = next( - (da.value for da in genebuild_dataset.dataset_attributes if - da.attribute.name == "genebuild.version"), genebuild_dataset.version) try: - genebuild_version = re.sub(r"[^\w\s]", '', re.sub(r"\s+", '_', genebuild_version)) + match = re.match(r'^(\d{4}-\d{2})', genebuild_dataset.genebuild_version) + genebuild_version = match.group(1).replace('-', '_') except TypeError as e: logger.fatal(f"For genome {self.genome_uuid}, can't find genebuild_version directory") raise RuntimeError(e) diff --git a/src/ensembl/production/metadata/api/sample/ensembl_metadata/dataset_attribute.txt b/src/ensembl/production/metadata/api/sample/ensembl_metadata/dataset_attribute.txt index 99ac317f..2a8fc9e8 100644 --- a/src/ensembl/production/metadata/api/sample/ensembl_metadata/dataset_attribute.txt +++ b/src/ensembl/production/metadata/api/sample/ensembl_metadata/dataset_attribute.txt @@ -12587,7 +12587,7 @@ 14999 6600 30 402 14965 1.00 31 402 14984 2018-10 33 402 -14985 2018-10 34 402 +14985 2018-10-Ensembl 34 402 15010 toplevel 35 402 14980 14733 36 402 15008 import 37 402 diff --git a/src/tests/test_api.py b/src/tests/test_api.py index 46d09180..5912fea2 100644 --- a/src/tests/test_api.py +++ b/src/tests/test_api.py @@ -38,13 +38,13 @@ def test_get_public_path(self, multi_dbs): assert len(paths) == 4 # assert all("/genebuild/" in path for path in paths) path = genome.get_public_path(dataset_type='genebuild') - assert path[0]['path'] == 'Saccharomyces_cerevisiae_S288c/GCA_000146045.2/ensembl/genebuild/EXT01' + assert path[0]['path'] == 'Saccharomyces_cerevisiae_S288c/GCA_000146045.2/ensembl/genebuild/2018_10' path = genome.get_public_path(dataset_type='assembly') assert path[0]['path'] == 'Saccharomyces_cerevisiae_S288c/GCA_000146045.2/ensembl/genome' path = genome.get_public_path(dataset_type='variation') - assert path[0]['path'] == 'Saccharomyces_cerevisiae_S288c/GCA_000146045.2/ensembl/variation/EXT01' + assert path[0]['path'] == 'Saccharomyces_cerevisiae_S288c/GCA_000146045.2/ensembl/variation/2018_10' path = genome.get_public_path(dataset_type='homologies') - assert path[0]['path'] == 'Saccharomyces_cerevisiae_S288c/GCA_000146045.2/ensembl/homology/EXT01' + assert path[0]['path'] == 'Saccharomyces_cerevisiae_S288c/GCA_000146045.2/ensembl/homology/2018_10' with pytest.raises(TypeNotFoundException): genome.get_public_path(dataset_type='regulatory_features') # assert path[0]['path'] == 'Saccharomyces_cerevisiae_S288c/GCA_000146045.2/ensembl/regulation' @@ -58,13 +58,13 @@ def test_default_public_path(self, multi_dbs): assert len(paths) == 5 # assert all("/genebuild/" in path for path in paths) path = genome.get_public_path(dataset_type='genebuild') - assert path[0]['path'] == 'Homo_sapiens/GCA_000001405.29/ensembl/genebuild/GENCODE44' + assert path[0]['path'] == 'Homo_sapiens/GCA_000001405.29/ensembl/genebuild/2023_03' path = genome.get_public_path(dataset_type='assembly') assert path[0]['path'] == 'Homo_sapiens/GCA_000001405.29/ensembl/genome' path = genome.get_public_path(dataset_type='variation') - assert path[0]['path'] == 'Homo_sapiens/GCA_000001405.29/ensembl/variation/GENCODE44' + assert path[0]['path'] == 'Homo_sapiens/GCA_000001405.29/ensembl/variation/2023_03' path = genome.get_public_path(dataset_type='homologies') - assert path[0]['path'] == 'Homo_sapiens/GCA_000001405.29/ensembl/homology/GENCODE44' + assert path[0]['path'] == 'Homo_sapiens/GCA_000001405.29/ensembl/homology/2023_03' path = genome.get_public_path(dataset_type='regulatory_features') assert path[0]['path'] == 'Homo_sapiens/GCA_000001405.29/ensembl/regulation'