From b352040cb02861b8baee13868282084a01d11e11 Mon Sep 17 00:00:00 2001 From: danielp Date: Tue, 12 Dec 2023 11:00:40 +0000 Subject: [PATCH 1/4] cleaned exceptions.py --- src/ensembl/production/metadata/api/exceptions.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/ensembl/production/metadata/api/exceptions.py b/src/ensembl/production/metadata/api/exceptions.py index e5b9eb7f..647188be 100644 --- a/src/ensembl/production/metadata/api/exceptions.py +++ b/src/ensembl/production/metadata/api/exceptions.py @@ -9,9 +9,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import sqlalchemy as db -from sqlalchemy import inspect -from sqlalchemy.engine import make_url class MetaException(Exception): From a27be107cd6bf080692a789c145039dc700cca6e Mon Sep 17 00:00:00 2001 From: danielp Date: Tue, 16 Jan 2024 10:45:09 +0000 Subject: [PATCH 2/4] Added two criteria for new genome genebuild.start_date and genebuild.provider_name in addition to genebuild.version --- .../production/metadata/updater/core.py | 26 ++++++++++++++++++- src/tests/databases/core_1/meta.txt | 4 ++- src/tests/databases/core_2/meta.txt | 4 ++- src/tests/databases/core_3/meta.txt | 4 ++- src/tests/databases/core_4/meta.txt | 4 ++- src/tests/databases/core_5/meta.txt | 2 ++ src/tests/databases/core_6/meta.txt | 4 ++- src/tests/databases/core_7/meta.txt | 4 ++- src/tests/databases/core_8/meta.txt | 4 ++- src/tests/databases/core_9/meta.txt | 4 ++- 10 files changed, 51 insertions(+), 9 deletions(-) diff --git a/src/ensembl/production/metadata/updater/core.py b/src/ensembl/production/metadata/updater/core.py index f431038b..2e8657d1 100644 --- a/src/ensembl/production/metadata/updater/core.py +++ b/src/ensembl/production/metadata/updater/core.py @@ -461,7 +461,7 @@ def get_or_new_assembly(self, species_id, meta_session, source=None, existing=No assembly_default=self.get_meta_single_meta_key(species_id, "assembly.default"), tol_id=tol_id, created=func.now(), - ensembl_name=self.get_meta_single_meta_key(species_id, "assembly.name"), + # ensembl_name=self.get_meta_single_meta_key(species_id, "assembly.name"), assembly_uuid=str(uuid.uuid4()), url_name=self.get_meta_single_meta_key(species_id, "assembly.url_name"), is_reference=is_reference @@ -540,7 +540,31 @@ def get_or_new_genebuild(self, species_id, meta_session, source=None, existing=F dataset_source = source dataset_type = meta_session.query(DatasetType).filter(DatasetType.name == "genebuild").first() + + genebuild_start_date = self.get_meta_single_meta_key(species_id, "genebuild.start_date") + genebuild_provider_name = self.get_meta_single_meta_key(species_id, "genebuild.provider_name") + test_status = meta_session.query(Dataset).filter(Dataset.label == genebuild_accession).one_or_none() + if test_status: + # Check for genebuild.provider_name + provider_name_check = meta_session.query(DatasetAttribute).join(Attribute).filter( + DatasetAttribute.dataset_id == test_status.dataset_id, + Attribute.name == "genebuild.provider_name", + DatasetAttribute.value == genebuild_provider_name + ).one_or_none() + + if provider_name_check: + # Check for genebuild.start_date + start_date_check = meta_session.query(DatasetAttribute).join(Attribute).filter( + DatasetAttribute.dataset_id == test_status.dataset_id, + Attribute.name == "genebuild.start_date", + DatasetAttribute.value == genebuild_start_date + ).one_or_none() + + if start_date_check is None: + test_status = None + + if test_status is not None and existing is False: genebuild_dataset = test_status genebuild_dataset_attributes = genebuild_dataset.dataset_attributes diff --git a/src/tests/databases/core_1/meta.txt b/src/tests/databases/core_1/meta.txt index b2afd577..d79088ef 100644 --- a/src/tests/databases/core_1/meta.txt +++ b/src/tests/databases/core_1/meta.txt @@ -19,4 +19,6 @@ 19 1 sample.location_param KB871578.1:9766653-9817473 20 1 strain.type test 21 1 assembly.test_value test -22 1 genebuild.test_value test \ No newline at end of file +22 1 genebuild.test_value test +23 1 genebuild.provider_name test +24 1 genebuild.start_date test \ No newline at end of file diff --git a/src/tests/databases/core_2/meta.txt b/src/tests/databases/core_2/meta.txt index d00958db..deadc76d 100644 --- a/src/tests/databases/core_2/meta.txt +++ b/src/tests/databases/core_2/meta.txt @@ -18,4 +18,6 @@ 18 1 sample.gene_param ENSAMXG00005000318 19 1 sample.location_param KB871578.1:9766653-9817473 20 1 strain.type test -21 1 genome.genome_uuid test \ No newline at end of file +21 1 genome.genome_uuid test +23 1 genebuild.provider_name test +24 1 genebuild.start_date test \ No newline at end of file diff --git a/src/tests/databases/core_3/meta.txt b/src/tests/databases/core_3/meta.txt index fd2c682e..2ddb2633 100644 --- a/src/tests/databases/core_3/meta.txt +++ b/src/tests/databases/core_3/meta.txt @@ -16,4 +16,6 @@ 17 1 genebuild.version 1 18 1 sample.gene_param ENSAMXG00005000318 19 1 sample.location_param KB871578.1:9766653-9817473 -20 1 strain.type test \ No newline at end of file +20 1 strain.type test +23 1 genebuild.provider_name test +24 1 genebuild.start_date test \ No newline at end of file diff --git a/src/tests/databases/core_4/meta.txt b/src/tests/databases/core_4/meta.txt index 990e4b74..75ed9796 100644 --- a/src/tests/databases/core_4/meta.txt +++ b/src/tests/databases/core_4/meta.txt @@ -17,4 +17,6 @@ 17 1 genebuild.version 2 18 1 sample.gene_param ENSAMXG00005000318 19 1 sample.location_param KB871578.1:9766653-9817473 -20 1 strain.type test \ No newline at end of file +20 1 strain.type test +23 1 genebuild.provider_name test +24 1 genebuild.start_date test \ No newline at end of file diff --git a/src/tests/databases/core_5/meta.txt b/src/tests/databases/core_5/meta.txt index 768e54bc..f7f399ce 100644 --- a/src/tests/databases/core_5/meta.txt +++ b/src/tests/databases/core_5/meta.txt @@ -14,3 +14,5 @@ 18 1 sample.gene_param ENSAMXG00005000318 19 1 sample.location_param KB871578.1:9766653-9817473 20 1 strain.type test +23 1 genebuild.provider_name test +24 1 genebuild.start_date test \ No newline at end of file diff --git a/src/tests/databases/core_6/meta.txt b/src/tests/databases/core_6/meta.txt index a8a42b8a..76ac4c94 100644 --- a/src/tests/databases/core_6/meta.txt +++ b/src/tests/databases/core_6/meta.txt @@ -18,4 +18,6 @@ 18 1 sample.gene_param ENSAMXG00005000318 19 1 sample.location_param KB871578.1:9766653-9817473 20 1 strain.type test -21 1 genome.genome_uuid 90720316-006c-470b-a7dd-82d28f952264 \ No newline at end of file +21 1 genome.genome_uuid 90720316-006c-470b-a7dd-82d28f952264 +23 1 genebuild.provider_name test +24 1 genebuild.start_date test \ No newline at end of file diff --git a/src/tests/databases/core_7/meta.txt b/src/tests/databases/core_7/meta.txt index b3733d86..ccbdeddc 100644 --- a/src/tests/databases/core_7/meta.txt +++ b/src/tests/databases/core_7/meta.txt @@ -19,4 +19,6 @@ 19 1 sample.location_param KB871578.1:9766653-9817473 20 1 strain.type test 21 1 assembly.test_value test2 -22 1 genebuild.test_value test2 \ No newline at end of file +22 1 genebuild.test_value test2 +23 1 genebuild.provider_name test +24 1 genebuild.start_date test diff --git a/src/tests/databases/core_8/meta.txt b/src/tests/databases/core_8/meta.txt index 3955b30e..9be1fae6 100644 --- a/src/tests/databases/core_8/meta.txt +++ b/src/tests/databases/core_8/meta.txt @@ -19,4 +19,6 @@ 19 1 sample.location_param KB871578.1:9766653-9817473 20 1 strain.type test 21 1 assembly.test_value test -22 1 genebuild.test_value test \ No newline at end of file +22 1 genebuild.test_value test +23 1 genebuild.provider_name test +24 1 genebuild.start_date test \ No newline at end of file diff --git a/src/tests/databases/core_9/meta.txt b/src/tests/databases/core_9/meta.txt index 10235dab..7d38e021 100644 --- a/src/tests/databases/core_9/meta.txt +++ b/src/tests/databases/core_9/meta.txt @@ -20,4 +20,6 @@ 20 1 strain.type test 21 1 assembly.test_value test3 22 1 genebuild.test_value test3 -23 1 genome.genome_uuid a733550b-93e7-11ec-a39d-005056b38ce3 \ No newline at end of file +23 1 genome.genome_uuid a733550b-93e7-11ec-a39d-005056b38ce3 +24 1 genebuild.provider_name test +24 1 genebuild.start_date test \ No newline at end of file From c1b45b1a361430b4c652dc191f73005c8ced85c0 Mon Sep 17 00:00:00 2001 From: danielp Date: Tue, 16 Jan 2024 12:02:55 +0000 Subject: [PATCH 3/4] Added type checking to get_public_path --- .../production/metadata/api/models/genome.py | 39 +++++++++++++++---- .../api/sample/ensembl_metadata/dataset.txt | 3 +- .../ensembl_metadata/genome_dataset.txt | 1 + src/tests/test_api.py | 4 +- 4 files changed, 36 insertions(+), 11 deletions(-) diff --git a/src/ensembl/production/metadata/api/models/genome.py b/src/ensembl/production/metadata/api/models/genome.py index 4c52cf7e..c7066491 100644 --- a/src/ensembl/production/metadata/api/models/genome.py +++ b/src/ensembl/production/metadata/api/models/genome.py @@ -54,18 +54,41 @@ def get_public_path(self, type='all', release=None): genebuild_source_name = genebuild_annotation_source_attribute.value common_path = f"{self.organism.scientific_name.replace(' ', '_')}/{self.assembly.accession}/{genebuild_source_name}" - - if type in ['genebuild', 'assembly', 'homology', 'regulation', 'variation', 'all']: + unique_dataset_types = {gd.dataset.dataset_type.name for gd in self.genome_datasets} + + standard_types = {'genebuild', 'assembly', 'homologies', 'regulation_build', 'regulatory_features', 'variation'} + types_available = unique_dataset_types.intersection(standard_types) + if 'regulatory_features' in types_available or 'regulation_build' in types_available: + types_available.discard('regulatory_features') + types_available.discard('regulation_build') + types_available.add('regulation') + if 'regulatory_features' == type or 'regulation_build' == type: + type = 'regulation' + + if type in types_available or type == 'all': if type == 'genebuild': paths.append(f"{common_path}/genebuild/{genebuild_dataset.dataset.version}") elif type == 'assembly': paths.append(f"{common_path}/genome") - elif type in ['homology', 'regulation', 'variation']: - paths.append(f"{common_path}/{type}") - elif type == 'all': - # Add paths for all types - for t in ['genebuild', 'assembly', 'homology', 'regulation', 'variation']: - paths.extend(self.get_public_path(type=t)) + elif type == 'homologies': + paths.append(f"{common_path}/homology") + elif type == 'regulation': + paths.append(f"{common_path}/regulation") + elif type == 'variation': + paths.append(f"{common_path}/variation") + + + if type == 'all': + for t in types_available: + if t == 'genebuild': + paths.append(f"{common_path}/genebuild/{genebuild_dataset.dataset.version}") + elif t == 'assembly': + paths.append(f"{common_path}/genome") + elif t == 'homologies': + paths.append(f"{common_path}/homology") + elif t in ['regulation', 'variation']: + paths.append(f"{common_path}/{t}") + return paths diff --git a/src/ensembl/production/metadata/api/sample/ensembl_metadata/dataset.txt b/src/ensembl/production/metadata/api/sample/ensembl_metadata/dataset.txt index e4e6c24e..f4794aee 100644 --- a/src/ensembl/production/metadata/api/sample/ensembl_metadata/dataset.txt +++ b/src/ensembl/production/metadata/api/sample/ensembl_metadata/dataset.txt @@ -28,4 +28,5 @@ 36 1068ba70-0088-4927-98bd-8fabcfb9a384 4 evidence \N 2023-06-02 13:32:52 10 Manual Add Submitted 38 47d54c33-80d6-4174-8620-52b6c8506db2 6 homologies \N 2023-06-02 13:32:52 11 Manual Add Submitted 42 ea044d8e-33f1-4c9f-9b9f-8c0bd1dcf642 6 homologies \N 2023-06-02 13:32:52 11 Manual Add Submitted -46 385f1ec2-bd06-40ce-873a-98e199f10505 1 asssembly \N 2023-08-18 12:22:34 13 GCA_000001735.1 Submitted \ No newline at end of file +46 385f1ec2-bd06-40ce-873a-98e199f10505 1 asssembly \N 2023-08-18 12:22:34 13 GCA_000001735.1 Submitted +47 385f1ec2-bd06-40ce-873a-98e199f10534 5 regulation_build \N 2023-08-18 12:22:34 13 GCA_000001735.1 Submitted \ No newline at end of file diff --git a/src/ensembl/production/metadata/api/sample/ensembl_metadata/genome_dataset.txt b/src/ensembl/production/metadata/api/sample/ensembl_metadata/genome_dataset.txt index 49ae11da..b7bd924e 100644 --- a/src/ensembl/production/metadata/api/sample/ensembl_metadata/genome_dataset.txt +++ b/src/ensembl/production/metadata/api/sample/ensembl_metadata/genome_dataset.txt @@ -31,3 +31,4 @@ 57 44 7 \N 0 58 45 7 \N 0 59 46 9 \N 0 +60 47 6 1 1 diff --git a/src/tests/test_api.py b/src/tests/test_api.py index ce50c1a3..1b3b2404 100644 --- a/src/tests/test_api.py +++ b/src/tests/test_api.py @@ -48,9 +48,9 @@ def test_get_public_path_genebuild(self, multi_dbs): assert path[0] == 'Saccharomyces_cerevisiae_S288c/GCA_000146045.2/test_anno_source/genome' path = genome.get_public_path(type='variation') assert path[0] == 'Saccharomyces_cerevisiae_S288c/GCA_000146045.2/test_anno_source/variation' - path = genome.get_public_path(type='homology') + path = genome.get_public_path(type='homologies') assert path[0] == 'Saccharomyces_cerevisiae_S288c/GCA_000146045.2/test_anno_source/homology' - path = genome.get_public_path(type='regulation') + path = genome.get_public_path(type='regulatory_features') assert path[0] == 'Saccharomyces_cerevisiae_S288c/GCA_000146045.2/test_anno_source/regulation' def test_organism_ensembl_name_compat(self, multi_dbs): From c819f06b4db021a275472924ee63db29ead6ce51 Mon Sep 17 00:00:00 2001 From: danielp Date: Tue, 16 Jan 2024 13:39:19 +0000 Subject: [PATCH 4/4] Cleanup as per PR --- .../production/metadata/api/exceptions.py | 4 +++ .../production/metadata/api/models/genome.py | 28 +++++++++---------- .../production/metadata/updater/core.py | 1 - 3 files changed, 17 insertions(+), 16 deletions(-) diff --git a/src/ensembl/production/metadata/api/exceptions.py b/src/ensembl/production/metadata/api/exceptions.py index 647188be..4cff5b80 100644 --- a/src/ensembl/production/metadata/api/exceptions.py +++ b/src/ensembl/production/metadata/api/exceptions.py @@ -44,3 +44,7 @@ class MissingMetaException(MetaException, RuntimeError): class UpdateBackCoreException(UpdaterException, RuntimeError): """An error occurred while updating back the core database""" pass + +class TypeNotFoundException(UpdaterException, RuntimeError): + """Dataset Type not found""" + pass \ No newline at end of file diff --git a/src/ensembl/production/metadata/api/models/genome.py b/src/ensembl/production/metadata/api/models/genome.py index c7066491..4e839445 100644 --- a/src/ensembl/production/metadata/api/models/genome.py +++ b/src/ensembl/production/metadata/api/models/genome.py @@ -14,7 +14,7 @@ from sqlalchemy import Column, Integer, String, ForeignKey from sqlalchemy.dialects.mysql import DATETIME, TINYINT from sqlalchemy.orm import relationship - +from ensembl.production.metadata.api.exceptions import * from ensembl.production.metadata.api.models.base import Base, LoadAble @@ -56,16 +56,14 @@ def get_public_path(self, type='all', release=None): common_path = f"{self.organism.scientific_name.replace(' ', '_')}/{self.assembly.accession}/{genebuild_source_name}" unique_dataset_types = {gd.dataset.dataset_type.name for gd in self.genome_datasets} - standard_types = {'genebuild', 'assembly', 'homologies', 'regulation_build', 'regulatory_features', 'variation'} - types_available = unique_dataset_types.intersection(standard_types) - if 'regulatory_features' in types_available or 'regulation_build' in types_available: - types_available.discard('regulatory_features') - types_available.discard('regulation_build') - types_available.add('regulation') + if 'regulatory_features' in unique_dataset_types or 'regulation_build' in unique_dataset_types: + unique_dataset_types.discard('regulatory_features') + unique_dataset_types.discard('regulation_build') + unique_dataset_types.add('regulation') if 'regulatory_features' == type or 'regulation_build' == type: type = 'regulation' - if type in types_available or type == 'all': + if type in unique_dataset_types or type == 'all': if type == 'genebuild': paths.append(f"{common_path}/genebuild/{genebuild_dataset.dataset.version}") elif type == 'assembly': @@ -76,10 +74,8 @@ def get_public_path(self, type='all', release=None): paths.append(f"{common_path}/regulation") elif type == 'variation': paths.append(f"{common_path}/variation") - - - if type == 'all': - for t in types_available: + elif type == 'all': + for t in unique_dataset_types: if t == 'genebuild': paths.append(f"{common_path}/genebuild/{genebuild_dataset.dataset.version}") elif t == 'assembly': @@ -88,9 +84,11 @@ def get_public_path(self, type='all', release=None): paths.append(f"{common_path}/homology") elif t in ['regulation', 'variation']: paths.append(f"{common_path}/{t}") - - return paths - + else: + raise TypeNotFoundException(f"Dataset Type : {type} has no associated path. ") + return paths + else: + raise TypeNotFoundException(f"Dataset Type : {type} not found in metadata. ") class GenomeDataset(LoadAble, Base): diff --git a/src/ensembl/production/metadata/updater/core.py b/src/ensembl/production/metadata/updater/core.py index 2e8657d1..6573a348 100644 --- a/src/ensembl/production/metadata/updater/core.py +++ b/src/ensembl/production/metadata/updater/core.py @@ -461,7 +461,6 @@ def get_or_new_assembly(self, species_id, meta_session, source=None, existing=No assembly_default=self.get_meta_single_meta_key(species_id, "assembly.default"), tol_id=tol_id, created=func.now(), - # ensembl_name=self.get_meta_single_meta_key(species_id, "assembly.name"), assembly_uuid=str(uuid.uuid4()), url_name=self.get_meta_single_meta_key(species_id, "assembly.url_name"), is_reference=is_reference