From 1059a1581e3398bc76529fcc6c82cbe307ea4417 Mon Sep 17 00:00:00 2001 From: danielp Date: Wed, 6 Mar 2024 11:01:49 +0000 Subject: [PATCH 1/4] Updated updater to use dataset factory for assembly Fixed assembly sequence reload bug Fixed a few associated tests that had been commented out --- .../metadata/api/factories/datasets.py | 38 +++++--- .../production/metadata/updater/core.py | 91 ++++++------------- src/tests/test_dataset_factory.py | 3 +- src/tests/test_updater.py | 26 ++---- 4 files changed, 67 insertions(+), 91 deletions(-) diff --git a/src/ensembl/production/metadata/api/factories/datasets.py b/src/ensembl/production/metadata/api/factories/datasets.py index 36057299..193f29b7 100644 --- a/src/ensembl/production/metadata/api/factories/datasets.py +++ b/src/ensembl/production/metadata/api/factories/datasets.py @@ -30,8 +30,18 @@ def create_all_child_datasets(self, session, dataset_uuid): top_level_dataset = self.__get_dataset(session, dataset_uuid) self.__create_child_datasets_recursive(session, top_level_dataset) - def create_dataset(self, session, genome_uuid, dataset_source, dataset_type, dataset_attributes, name, label, - version): + def create_dataset(self, session, genome_input, dataset_source, dataset_type, dataset_attributes, name, label, + version, status="Submitted"): + # Check if genome_input is a UUID (string) or a Genome object + if isinstance(genome_input, str): + genome = session.query(Genome).filter(Genome.genome_uuid == genome_input).one() + elif isinstance(genome_input, Genome): + genome = genome_input + elif genome_input is None: + genome = None + else: + raise ValueError("Invalid genome input. Must be either a UUID string or a Genome object.") + new_dataset = Dataset( dataset_uuid=str(uuid.uuid4()), dataset_type=dataset_type, # Must be an object returned from the current session @@ -40,18 +50,21 @@ def create_dataset(self, session, genome_uuid, dataset_source, dataset_type, dat label=label, created=func.now(), dataset_source=dataset_source, # Must - status="Submitted", - ) - genome = session.query(Genome).filter(Genome.genome_uuid == genome_uuid).one() - new_genome_dataset = GenomeDataset( - genome=genome, - dataset=new_dataset, - is_current=False, + status=status, ) new_dataset_attributes = update_attributes(new_dataset, dataset_attributes, session) - session.add(new_genome_dataset) dataset_uuid = new_dataset.dataset_uuid - return dataset_uuid, new_dataset_attributes, new_genome_dataset + + if genome is not None: + new_genome_dataset = GenomeDataset( + genome=genome, + dataset=new_dataset, + is_current=False, + ) + session.add(new_genome_dataset) + return dataset_uuid, new_dataset, new_dataset_attributes, new_genome_dataset + else: + return dataset_uuid, new_dataset, new_dataset_attributes, None def get_parent_datasets(self, dataset_uuid, **kwargs): session = kwargs.get('session') @@ -136,7 +149,8 @@ def __create_child_datasets_recursive(self, session, parent_dataset): version = None # Create the child dataset - child_dataset_uuid, new_dataset_attributes, new_genome_dataset = self.create_dataset(session, genome_uuid, + child_dataset_uuid, new_dataset, new_dataset_attributes, new_genome_dataset = self.create_dataset(session, + genome_uuid, dataset_source, dataset_type, dataset_attributes, diff --git a/src/ensembl/production/metadata/updater/core.py b/src/ensembl/production/metadata/updater/core.py index f6726abf..e4e06d5c 100644 --- a/src/ensembl/production/metadata/updater/core.py +++ b/src/ensembl/production/metadata/updater/core.py @@ -9,25 +9,24 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License.` -import re +import logging from collections import defaultdict + import sqlalchemy as db import sqlalchemy.exc - from ensembl.core.models import Meta, CoordSystem, SeqRegionAttrib, SeqRegion, \ SeqRegionSynonym, AttribType -from sqlalchemy import select, and_, create_engine +from ensembl.ncbi_taxonomy.api.utils import Taxonomy +from ensembl.ncbi_taxonomy.models import NCBITaxaName from sqlalchemy import or_ -from ensembl.database import DBConnection -from sqlalchemy.exc import NoResultFound, SQLAlchemyError -from sqlalchemy.orm import aliased, Session +from sqlalchemy import select, and_ +from sqlalchemy.exc import NoResultFound +from sqlalchemy.orm import aliased +from ensembl.production.metadata.api.exceptions import * +from ensembl.production.metadata.api.factories.datasets import DatasetFactory from ensembl.production.metadata.api.models import * from ensembl.production.metadata.updater.base import BaseMetaUpdater -from ensembl.ncbi_taxonomy.api.utils import Taxonomy -from ensembl.ncbi_taxonomy.models import NCBITaxaName -import logging -from ensembl.production.metadata.api.exceptions import * from ensembl.production.metadata.updater.updater_utils import update_attributes @@ -439,22 +438,25 @@ def get_or_new_assembly(self, species_id, meta_session, source=None, existing=No else: dataset_source = source + # This should return the existing objects if assembly is not None and existing is None: # Get the existing assembly dataset assembly_dataset = meta_session.query(Dataset).filter(Dataset.label == assembly_accession).one_or_none() # I should not need this, but double check on database updating. assembly_dataset_attributes = assembly_dataset.dataset_attributes assembly_sequences = assembly.assembly_sequences - return assembly, assembly_dataset, assembly_dataset_attributes, assembly_sequences, dataset_source else: - is_reference = 1 if self.get_meta_single_meta_key(species_id, "assembly.is_reference") else 0 - with self.db.session_scope() as session: - level = (session.execute(db.select(CoordSystem.name).filter( - CoordSystem.species_id == species_id).order_by(CoordSystem.rank)).all())[0][0] - tol_id = self.get_meta_single_meta_key(species_id, "assembly.tol_id") + + attributes = self.get_meta_list_from_prefix_meta_key(species_id, "assembly") + if existing is None: + is_reference = 1 if self.get_meta_single_meta_key(species_id, "assembly.is_reference") else 0 + with self.db.session_scope() as session: + level = (session.execute(db.select(CoordSystem.name).filter( + CoordSystem.species_id == species_id).order_by(CoordSystem.rank)).all())[0][0] + tol_id = self.get_meta_single_meta_key(species_id, "assembly.tol_id") assembly = Assembly( ucsc_name=self.get_meta_single_meta_key(species_id, "assembly.ucsc_alias"), accession=self.get_meta_single_meta_key(species_id, "assembly.accession"), @@ -469,57 +471,24 @@ def get_or_new_assembly(self, species_id, meta_session, source=None, existing=No url_name=self.get_meta_single_meta_key(species_id, "assembly.url_name"), is_reference=is_reference ) - dataset_type = meta_session.query(DatasetType).filter(DatasetType.name == "assembly").first() - - if existing is None: - assembly_dataset = Dataset( - dataset_uuid=str(uuid.uuid4()), - dataset_type=dataset_type, # extract from dataset_type - name="assembly", - # version=None, Could be changed. - label=assembly.accession, # Required. Makes for a quick lookup - created=func.now(), - dataset_source=dataset_source, # extract from dataset_source - status='Submitted', - ) - else: - assembly_dataset = existing - assembly_dataset.dataset_source = dataset_source - - attributes = self.get_meta_list_from_prefix_meta_key(species_id, "assembly") - assembly_dataset_attributes = [] - # Should be able to delete the attribute creation. - for attribute, value in attributes.items(): - meta_attribute = meta_session.query(Attribute).filter(Attribute.name == attribute).one_or_none() - if meta_attribute is None: - meta_attribute = Attribute( - name=attribute, - label=attribute, - description=attribute, - type="string", - ) - # TODO re-add after 2500 - # raise Exception(f"{attribute} does not exist. Add it to the database and reload.") - dataset_attribute = DatasetAttribute( - value=value, - dataset=assembly_dataset, - attribute=meta_attribute, - ) - assembly_dataset_attributes.append(dataset_attribute) - if existing is None: + dataset_factory = DatasetFactory() + dataset_type = meta_session.query(DatasetType).filter(DatasetType.name == "assembly").first() + (dataset_uuid, assembly_dataset, assembly_dataset_attributes, + new_genome_dataset) = dataset_factory.create_dataset(meta_session, None, dataset_source, + dataset_type, attributes, "assembly", + assembly.accession, None, 'Processed') meta_session.add(assembly) meta_session.add(assembly_dataset) assembly_sequences = self.get_assembly_sequences(species_id, assembly) meta_session.add_all(assembly_sequences) - # Only reload the assembly sequences if the data is not released. - elif assembly.is_released(): + else: + assembly_dataset = existing + assembly_dataset.dataset_source = dataset_source + for dataset_attribute in assembly_dataset.dataset_attributes: + meta_session.delete(dataset_attribute) + assembly_dataset_attributes = update_attributes(assembly_dataset, attributes, meta_session) assembly_sequences = meta_session.query(AssemblySequence).filter( AssemblySequence.assembly_id == assembly.assembly_id) - else: - meta_session.query(AssemblySequence).filter( - AssemblySequence.assembly_id == assembly.assembly_id).delete() - assembly_sequences = self.get_assembly_sequences(species_id, assembly) - meta_session.add_all(assembly_sequences) meta_session.add_all(assembly_dataset_attributes) return assembly, assembly_dataset, assembly_dataset_attributes, assembly_sequences, dataset_source diff --git a/src/tests/test_dataset_factory.py b/src/tests/test_dataset_factory.py index f47d8354..36de601f 100644 --- a/src/tests/test_dataset_factory.py +++ b/src/tests/test_dataset_factory.py @@ -72,7 +72,8 @@ def test_create_dataset(self, multi_dbs): test_label = 'test_label' test_version = 'test_version' dataset_factory = DatasetFactory() - dataset_uuid, new_dataset_attributes, new_genome_dataset = dataset_factory.create_dataset(session, + dataset_uuid, created_dataset, new_dataset_attributes, new_genome_dataset = dataset_factory.create_dataset( + session, test_genome_uuid, test_dataset_source, test_dataset_type, diff --git a/src/tests/test_updater.py b/src/tests/test_updater.py index 371b449b..7c981bf7 100644 --- a/src/tests/test_updater.py +++ b/src/tests/test_updater.py @@ -9,23 +9,16 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import re from pathlib import Path -from unittest import mock -from unittest.mock import Mock, patch import pytest -import re - -import sqlalchemy +from ensembl.core.models import Meta from ensembl.database import UnitTestDB, DBConnection -from ensembl.production.metadata.api.exceptions import UpdateBackCoreException, MetadataUpdateException from ensembl.production.metadata.api.factory import meta_factory from ensembl.production.metadata.api.models import Organism, Assembly, Dataset, AssemblySequence, DatasetAttribute, \ - DatasetSource, DatasetType, Attribute, Genome -from ensembl.core.models import Meta - -from ensembl.production.metadata.updater.core import CoreMetaUpdater + DatasetSource, DatasetType, Attribute db_directory = Path(__file__).parent / 'databases' db_directory = db_directory.resolve() @@ -161,7 +154,7 @@ def test_update_unreleased_no_force(self, multi_dbs): (AssemblySequence.name == 'TEST1_seqA')).first() # TODO Review this test after Proper discussion with GB / Variation / Etc about impact of changing sequences # in existing assembly - # assert old_seq is None + assert old_seq is not None datasets = session.query(Dataset) # Check that the old datasets have been removed count = session.query(Dataset).join(DatasetSource).filter( @@ -222,17 +215,16 @@ def test_update_released_force(self, multi_dbs): test = meta_factory(multi_dbs['core_9'].dbc.url, multi_dbs['ensembl_genome_metadata'].dbc.url, multi_dbs['ncbi_taxonomy'].dbc.url, force=True) # FIXME Should be run - # test.process_core() + test.process_core() metadata_db = DBConnection(multi_dbs['ensembl_genome_metadata'].dbc.url) with metadata_db.session_scope() as session: # Test that assembly seqs have not been updated - # new_seq = session.query(AssemblySequence).where( - # (AssemblySequence.name == 'TEST1_seq_BAD')).first() - # assert new_seq is None + new_seq = session.query(AssemblySequence).where( + (AssemblySequence.name == 'TEST1_seq_BAD')).first() + assert new_seq is None old_seq = session.query(AssemblySequence).where( (AssemblySequence.accession == 'BX284601.5')).first() - # FIXME - # assert old_seq is not None + assert old_seq is not None i = session.query(Dataset).join(DatasetSource).filter( DatasetSource.name == 'caenorhabditis_elegans_core_55_108_282' From c816123318b7771f4c11a911959ce4b26e895a9f Mon Sep 17 00:00:00 2001 From: danielp Date: Tue, 12 Mar 2024 09:43:24 +0000 Subject: [PATCH 2/4] Added checks for genebuild keys Updated source to handle compara Added homology dataset creation Upper case for common name Default species taxonomy id Fixed broken tests Added license for genome_uuid_manager.py --- .../metadata/api/factories/datasets.py | 14 +- .../metadata/scripts/genome_uuid_manager.py | 13 ++ .../production/metadata/updater/base.py | 13 +- .../production/metadata/updater/core.py | 125 +++++++++++------- src/tests/databases/core_1/meta.txt | 1 + src/tests/databases/core_2/meta.txt | 5 +- src/tests/databases/core_3/meta.txt | 3 +- src/tests/databases/core_4/meta.txt | 5 +- src/tests/databases/core_5/meta.txt | 7 +- src/tests/databases/core_6/meta.txt | 3 +- src/tests/databases/core_7/meta.txt | 9 +- src/tests/databases/core_8/meta.txt | 3 +- src/tests/databases/core_9/meta.txt | 7 +- src/tests/test_updater.py | 82 ++++-------- 14 files changed, 163 insertions(+), 127 deletions(-) diff --git a/src/ensembl/production/metadata/api/factories/datasets.py b/src/ensembl/production/metadata/api/factories/datasets.py index 193f29b7..31a61668 100644 --- a/src/ensembl/production/metadata/api/factories/datasets.py +++ b/src/ensembl/production/metadata/api/factories/datasets.py @@ -52,7 +52,10 @@ def create_dataset(self, session, genome_input, dataset_source, dataset_type, da dataset_source=dataset_source, # Must status=status, ) - new_dataset_attributes = update_attributes(new_dataset, dataset_attributes, session) + if dataset_attributes is not None: + new_dataset_attributes = update_attributes(new_dataset, dataset_attributes, session) + else: + new_dataset_attributes = None dataset_uuid = new_dataset.dataset_uuid if genome is not None: @@ -135,7 +138,14 @@ def __create_child_datasets_recursive(self, session, parent_dataset): DatasetType.parent == parent_dataset_type.dataset_type_id).all() for child_type in child_dataset_types: - # Example placeholders for dataset properties + # Check if a dataset with the same type and genome exists + existing_datasets = session.query(Dataset).join(GenomeDataset).filter( + Dataset.dataset_type_id == child_type.dataset_type_id, + GenomeDataset.genome_id.in_([gd.genome_id for gd in parent_dataset.genome_datasets]) + ).all() + if any(d.status in ['Submitted', 'Processing'] for d in existing_datasets): + continue # Skip creation if any dataset is already Processed or Released + if len(parent_dataset.genome_datasets) > 1: raise ValueError("More than one genome linked to a genome_dataset") diff --git a/src/ensembl/production/metadata/scripts/genome_uuid_manager.py b/src/ensembl/production/metadata/scripts/genome_uuid_manager.py index bc89b675..e645ff1b 100644 --- a/src/ensembl/production/metadata/scripts/genome_uuid_manager.py +++ b/src/ensembl/production/metadata/scripts/genome_uuid_manager.py @@ -1,5 +1,18 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import argparse import logging + import mysql.connector logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') diff --git a/src/ensembl/production/metadata/updater/base.py b/src/ensembl/production/metadata/updater/base.py index 400cdc15..5eeb78c8 100644 --- a/src/ensembl/production/metadata/updater/base.py +++ b/src/ensembl/production/metadata/updater/base.py @@ -9,14 +9,11 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import sqlalchemy as db +from ensembl.database import DBConnection from sqlalchemy import inspect from sqlalchemy.engine import make_url -from ensembl.core.models import Meta -from ensembl.production.metadata.api.exceptions import UpdaterException -from ensembl.production.metadata.api.models import DatasetSource, Attribute, DatasetAttribute, Dataset -from ensembl.database import DBConnection +from ensembl.production.metadata.api.models import DatasetSource from ensembl.production.metadata.api.models import EnsemblRelease @@ -44,9 +41,11 @@ def is_object_new(self, obj): insp = inspect(obj) return insp.transient or insp.pending - def get_or_new_source(self, meta_session, db_type): + def get_or_new_source(self, meta_session, db_type, name=None): db_uri = self.db_uri - name = make_url(db_uri).database + if name is None: + # For core databases + name = make_url(db_uri).database dataset_source = meta_session.query(DatasetSource).filter(DatasetSource.name == name).one_or_none() if dataset_source is None: dataset_source = DatasetSource( diff --git a/src/ensembl/production/metadata/updater/core.py b/src/ensembl/production/metadata/updater/core.py index e4e06d5c..68d70fbc 100644 --- a/src/ensembl/production/metadata/updater/core.py +++ b/src/ensembl/production/metadata/updater/core.py @@ -9,7 +9,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License.` -import logging from collections import defaultdict import sqlalchemy as db @@ -23,7 +22,6 @@ from sqlalchemy.exc import NoResultFound from sqlalchemy.orm import aliased -from ensembl.production.metadata.api.exceptions import * from ensembl.production.metadata.api.factories.datasets import DatasetFactory from ensembl.production.metadata.api.models import * from ensembl.production.metadata.updater.base import BaseMetaUpdater @@ -90,7 +88,7 @@ def process_species(self, species_id): This method contains the logic for updating the metadata """ - with self.metadata_db.session_scope() as meta_session: + with (self.metadata_db.session_scope() as meta_session): organism, division, organism_group_member = self.get_or_new_organism(species_id, meta_session) assembly, assembly_dataset, assembly_dataset_attributes, assembly_sequences, dataset_source = self.get_or_new_assembly( species_id, meta_session) @@ -152,6 +150,35 @@ def process_species(self, species_id): # Create genome and populate the database with assembly and dataset elif self.is_object_new(genebuild_dataset): + # Check that genest update or provider name has changed from last time. + + dataset_attr_alias1 = aliased(DatasetAttribute) + attribute_alias1 = aliased(Attribute) + dataset_attr_alias2 = aliased(DatasetAttribute) + attribute_alias2 = aliased(Attribute) + provider_name = self.get_meta_single_meta_key(species_id, "genebuild.provider_name") + geneset_update = self.get_meta_single_meta_key(species_id, "genebuild.last_geneset_update") + query = meta_session.query(Assembly).join( + Genome, Assembly.genomes + ).join(GenomeDataset, Genome.genome_datasets + ).join(Dataset, GenomeDataset.dataset + ).join(dataset_attr_alias1, Dataset.dataset_attributes + ).join(attribute_alias1, dataset_attr_alias1.attribute + ).join(dataset_attr_alias2, Dataset.dataset_attributes + ).join(attribute_alias2, dataset_attr_alias2.attribute + ).filter(Assembly.accession == assembly.accession, + Dataset.dataset_type.has(name="genebuild"), + and_( + attribute_alias1.name == "genebuild.provider_name", + dataset_attr_alias1.value == provider_name, + attribute_alias2.name == "genebuild.last_geneset_update", + dataset_attr_alias2.value == geneset_update + ) + ) + if meta_session.query(query.exists()).scalar(): + raise MetadataUpdateException( + "genebuild.provider_name or genebuild.last_geneset_update must be updated.") + logging.info('New genebuild') # Create genome and populate the database with genebuild dataset new_genome, assembly_genome_dataset, genebuild_genome_dataset = self.new_genome(meta_session, @@ -238,6 +265,17 @@ def new_genome(self, meta_session, species_id, organism, assembly, assembly_data is_current=True, ) meta_session.add(genebuild_genome_dataset) + # Homology dataset creation + homology_uuid, homology_dataset, homology_dataset_attributes, homology_genome_dataset = self.new_homology( + meta_session, species_id, genome=new_genome) + meta_session.add(homology_genome_dataset) + + # Create children datasets here! + meta_session.commit() + dataset_factory = DatasetFactory() + dataset_factory.create_all_child_datasets(meta_session, genebuild_dataset.dataset_uuid) + dataset_factory.create_all_child_datasets(meta_session, homology_dataset.dataset_uuid) + return new_genome, assembly_genome_dataset, genebuild_genome_dataset def get_or_new_organism(self, species_id, meta_session): @@ -261,9 +299,14 @@ def get_or_new_organism(self, species_id, meta_session): NCBITaxaName.name_class == "genbank common name" ).one_or_none() common_name = common_name.name if common_name is not None else '-' + # Ensure that the first character is upper case. + common_name = common_name[0].upper() + common_name[1:] + species_taxonomy_id = self.get_meta_single_meta_key(species_id, "species.species_taxonomy_id") + if species_taxonomy_id is None: + species_taxonomy_id = taxid # Instantiate a new Organism object using data fetched from metadata. new_organism = Organism( - species_taxonomy_id=self.get_meta_single_meta_key(species_id, "species.species_taxonomy_id"), + species_taxonomy_id=species_taxonomy_id, taxonomy_id=self.get_meta_single_meta_key(species_id, "species.taxonomy_id"), common_name=common_name, scientific_name=self.get_meta_single_meta_key(species_id, "species.scientific_name"), @@ -498,14 +541,14 @@ def get_or_new_genebuild(self, species_id, meta_session, source=None, existing=F This method contains the logic for updating the metadata This is not a get, as we don't update the metadata for genebuild, only replace it if it is not released. """ - # The assembly accession and genebuild version are extracted from the metadata of the species assembly_accession = self.get_meta_single_meta_key(species_id, "assembly.accession") genebuild_version = self.get_meta_single_meta_key(species_id, "genebuild.version") - genebuild_start_date = self.get_meta_single_meta_key(species_id, "genebuild.start_date") - if None in (genebuild_version, genebuild_start_date, assembly_accession): - raise MissingMetaException("genebuild.version/genebuild.start_date/assembly.accession are all " - "required in the core database") - + if self.get_meta_single_meta_key(species_id, + "genebuild.provider_name") is None or self.get_meta_single_meta_key(species_id, + "genebuild.last_geneset_update") is None or self.get_meta_single_meta_key( + species_id, "genebuild.start_date") is None: + MissingMetaException( + "genebuild.provider_name, genebuild.last_geneset_update, genebuild.start_date are required keys") # The genebuild accession is formed by combining the assembly accession and the genebuild version genebuild_accession = assembly_accession + "_" + genebuild_version if source is None: @@ -514,52 +557,44 @@ def get_or_new_genebuild(self, species_id, meta_session, source=None, existing=F dataset_source = source dataset_type = meta_session.query(DatasetType).filter(DatasetType.name == "genebuild").first() - - genebuild_start_date = self.get_meta_single_meta_key(species_id, "genebuild.start_date") - genebuild_provider_name = self.get_meta_single_meta_key(species_id, "genebuild.provider_name") test_status = meta_session.query(Dataset).filter(Dataset.label == genebuild_accession).one_or_none() - if test_status: - # Check for genebuild.provider_name - provider_name_check = meta_session.query(DatasetAttribute).join(Attribute).filter( - DatasetAttribute.dataset_id == test_status.dataset_id, - Attribute.name == "genebuild.provider_name", - DatasetAttribute.value == genebuild_provider_name - ).one_or_none() - - if provider_name_check: - # Check for genebuild.start_date - start_date_check = meta_session.query(DatasetAttribute).join(Attribute).filter( - DatasetAttribute.dataset_id == test_status.dataset_id, - Attribute.name == "genebuild.start_date", - DatasetAttribute.value == genebuild_start_date - ).one_or_none() - - if start_date_check is None: - test_status = None - + # Return existing data if no update is required if test_status is not None and existing is False: genebuild_dataset = test_status genebuild_dataset_attributes = genebuild_dataset.dataset_attributes return genebuild_dataset, genebuild_dataset_attributes - + attributes = self.get_meta_list_from_prefix_meta_key(species_id, "genebuild.") if existing is False: - genebuild_dataset = Dataset( - dataset_uuid=str(uuid.uuid4()), - dataset_type=dataset_type, - name="genebuild", - version=genebuild_version, - label=genebuild_accession, - created=func.now(), - dataset_source=dataset_source, - status='Submitted', - ) + dataset_factory = DatasetFactory() + (dataset_uuid, genebuild_dataset, genebuild_dataset_attributes, + new_genome_dataset) = dataset_factory.create_dataset(meta_session, None, dataset_source, + dataset_type, attributes, "genebuild", + genebuild_accession, genebuild_version) else: genebuild_dataset = existing genebuild_dataset.label = genebuild_accession genebuild_dataset.dataset_source = dataset_source genebuild_dataset.version = genebuild_version + for dataset_attribute in genebuild_dataset.dataset_attributes: + meta_session.delete(dataset_attribute) + genebuild_dataset_attributes = update_attributes(genebuild_dataset, attributes, meta_session) - attributes = self.get_meta_list_from_prefix_meta_key(species_id, "genebuild.") - genebuild_dataset_attributes = update_attributes(genebuild_dataset, attributes, meta_session) return genebuild_dataset, genebuild_dataset_attributes + + def new_homology(self, meta_session, species_id, genome=None, source=None, dataset_attributes=None, version="1.0"): + if source is None: + production_name = self.get_meta_single_meta_key(species_id, "species.production_name") + db_version = self.get_meta_single_meta_key(species_id, "schema_version") + compara_name = production_name + "_compara_" + db_version + dataset_source = self.get_or_new_source(meta_session, "compara", name="compara_name") + else: + dataset_source = source + dataset_type = meta_session.query(DatasetType).filter(DatasetType.name == "homologies").first() + dataset_factory = DatasetFactory() + (dataset_uuid, homology_dataset, homology_dataset_attributes, + homology_genome_dataset) = dataset_factory.create_dataset(meta_session, genome, dataset_source, + dataset_type, dataset_attributes, + "compara_homologies", + "Compara homologies", version) + return dataset_uuid, homology_dataset, homology_dataset_attributes, homology_genome_dataset diff --git a/src/tests/databases/core_1/meta.txt b/src/tests/databases/core_1/meta.txt index 4a5bd90c..ec4a6de6 100644 --- a/src/tests/databases/core_1/meta.txt +++ b/src/tests/databases/core_1/meta.txt @@ -21,3 +21,4 @@ 23 1 genebuild.provider_name test 24 1 genebuild.start_date test 25 1 assembly.alt_accession GCA_0000012345.3 +26 1 schema_version 110 \ No newline at end of file diff --git a/src/tests/databases/core_2/meta.txt b/src/tests/databases/core_2/meta.txt index 1cb67073..7596aa8a 100644 --- a/src/tests/databases/core_2/meta.txt +++ b/src/tests/databases/core_2/meta.txt @@ -19,5 +19,6 @@ 19 1 genebuild.sample_location KB871578.1:9766653-9817473 20 1 strain.type test 21 1 genome.genome_uuid test -23 1 genebuild.provider_name removed_for_test -24 1 genebuild.start_date test \ No newline at end of file +23 1 genebuild.provider_name test2 +24 1 genebuild.start_date test +25 1 schema_version 110 \ No newline at end of file diff --git a/src/tests/databases/core_3/meta.txt b/src/tests/databases/core_3/meta.txt index cfb9971b..22ee1a53 100644 --- a/src/tests/databases/core_3/meta.txt +++ b/src/tests/databases/core_3/meta.txt @@ -18,4 +18,5 @@ 19 1 genebuild.sample_location KB871578.1:9766653-9817473 20 1 strain.type test 23 1 genebuild.provider_name test -24 1 genebuild.start_date test \ No newline at end of file +24 1 genebuild.start_date test +25 1 schema_version 110 \ No newline at end of file diff --git a/src/tests/databases/core_4/meta.txt b/src/tests/databases/core_4/meta.txt index 07638144..373246a1 100644 --- a/src/tests/databases/core_4/meta.txt +++ b/src/tests/databases/core_4/meta.txt @@ -3,7 +3,7 @@ 13 1 assembly.name jaber01 11 1 assembly.ucsc_alias SCARYIER 15 1 gencode.version 999 -16 1 genebuild.last_geneset_update 02 +16 1 genebuild.last_geneset_update 04 3 1 species.common_name jabberwocky 7 1 species.division Ensembl_TEST 6 1 species.production_name Jabberwocky @@ -19,4 +19,5 @@ 19 1 genebuild.sample_location KB871578.1:9766653-9817473 20 1 strain.type test 23 1 genebuild.provider_name test -24 1 genebuild.start_date test \ No newline at end of file +24 1 genebuild.start_date test +25 1 schema_version 110 \ No newline at end of file diff --git a/src/tests/databases/core_5/meta.txt b/src/tests/databases/core_5/meta.txt index 44934b0e..d8bace62 100644 --- a/src/tests/databases/core_5/meta.txt +++ b/src/tests/databases/core_5/meta.txt @@ -1,5 +1,5 @@ 12 1 assembly.accession test1 -14 1 assembly.default test1 +14 1 assembly.default test846 13 1 assembly.name test1 11 1 assembly.ucsc_alias test1 7 1 species.division Ensembl_TEST @@ -14,5 +14,6 @@ 18 1 genebuild.sample_gene ENSAMXG00005000318 19 1 genebuild.sample_location KB871578.1:9766653-9817473 20 1 strain.type test -23 1 genebuild.provider_name test -24 1 genebuild.start_date test \ No newline at end of file +23 1 genebuild.provider_name removed_for_test +24 1 genebuild.start_date test +25 1 schema_version 110 \ No newline at end of file diff --git a/src/tests/databases/core_6/meta.txt b/src/tests/databases/core_6/meta.txt index b8ec8c16..7cc6d5d5 100644 --- a/src/tests/databases/core_6/meta.txt +++ b/src/tests/databases/core_6/meta.txt @@ -20,4 +20,5 @@ 20 1 strain.type test 21 1 genome.genome_uuid 90720316-006c-470b-a7dd-82d28f952264 23 1 genebuild.provider_name test -24 1 genebuild.start_date test \ No newline at end of file +24 1 genebuild.start_date test +25 1 schema_version 110 \ No newline at end of file diff --git a/src/tests/databases/core_7/meta.txt b/src/tests/databases/core_7/meta.txt index 4a49cf4d..6c1cee60 100644 --- a/src/tests/databases/core_7/meta.txt +++ b/src/tests/databases/core_7/meta.txt @@ -1,4 +1,4 @@ -12 1 assembly.accession weird01 +12 1 assembly.accession test1 14 1 assembly.default NewTest 13 1 assembly.name jaber01 11 1 assembly.ucsc_alias test_alias @@ -18,7 +18,8 @@ 18 1 genebuild.sample_gene ENSAMXG00005000318 19 1 genebuild.sample_location KB871578.1:9766653-9817473 20 1 strain.type test -23 1 genebuild.provider_name testforname -24 1 genebuild.start_date test +23 1 genebuild.provider_name testfornamenew +24 1 genebuild.start_date new 25 1 genebuild.havana_datafreeze_date test2 - +26 1 schema_version 110 +27 1 assembly.total_coding_sequence_length 8989 \ No newline at end of file diff --git a/src/tests/databases/core_8/meta.txt b/src/tests/databases/core_8/meta.txt index 0317becd..11f60c12 100644 --- a/src/tests/databases/core_8/meta.txt +++ b/src/tests/databases/core_8/meta.txt @@ -19,4 +19,5 @@ 19 1 genebuild.sample_location KB871578.1:9766653-9817473 20 1 strain.type test 23 1 genebuild.provider_name test -24 1 genebuild.start_date test \ No newline at end of file +24 1 genebuild.start_date test +25 1 schema_version 110 \ No newline at end of file diff --git a/src/tests/databases/core_9/meta.txt b/src/tests/databases/core_9/meta.txt index c374fa94..7764bd5b 100644 --- a/src/tests/databases/core_9/meta.txt +++ b/src/tests/databases/core_9/meta.txt @@ -1,4 +1,4 @@ -12 1 assembly.accession GCA_000002985.3 +12 1 assembly.accession test1 14 1 assembly.default jaber01 13 1 assembly.name jaber01 11 1 assembly.ucsc_alias SCARY @@ -14,11 +14,12 @@ 2 1 species.taxonomy_id 6239 10 1 species.type monsters 5 1 species.url Jabbe -17 1 genebuild.version EXT01 +17 1 genebuild.version ENS01 18 1 genebuild.sample_gene ENSAMXG00005000318 19 1 genebuild.sample_location KB871578.1:9766653-9817473 20 1 strain.type test 23 1 genome.genome_uuid a733550b-93e7-11ec-a39d-005056b38ce3 24 1 genebuild.havana_datafreeze_date test2 25 1 assembly.total_genome_length 546 -26 1 genebuild.start_date test \ No newline at end of file +26 1 genebuild.start_date test +27 1 schema_version 110 diff --git a/src/tests/test_updater.py b/src/tests/test_updater.py index 7c981bf7..0e8fb481 100644 --- a/src/tests/test_updater.py +++ b/src/tests/test_updater.py @@ -88,7 +88,11 @@ def test_new_organism(self, multi_dbs): (AssemblySequence.is_circular == 0) & (AssemblySequence.name == 'TEST3_seqC') ).first() assert sequence3 is not None - + count = session.query(Dataset).join(DatasetSource).join(DatasetType).filter( + DatasetSource.name.like('%compara%'), + DatasetType.name == 'compara_dumps' + ).count() + assert count == 1 def test_fail_existing_genome_uuid_no_data(self, multi_dbs): test = meta_factory(multi_dbs['core_2'].dbc.url, multi_dbs['ensembl_genome_metadata'].dbc.url, multi_dbs['ncbi_taxonomy'].dbc.url) @@ -130,7 +134,7 @@ def test_taxonomy_common_name(self, multi_dbs): metadata_db = DBConnection(multi_dbs['ensembl_genome_metadata'].dbc.url) with metadata_db.session_scope() as session: organism = session.query(Organism).where(Organism.biosample_id == 'test_case_5').first() - assert organism.common_name == 'sheep' + assert organism.common_name == 'Sheep' def test_fail_existing_genome_uuid_data_not_match(self, multi_dbs): test = meta_factory(multi_dbs['core_6'].dbc.url, multi_dbs['ensembl_genome_metadata'].dbc.url, @@ -147,35 +151,29 @@ def test_update_unreleased_no_force(self, multi_dbs): metadata_db = DBConnection(multi_dbs['ensembl_genome_metadata'].dbc.url) with metadata_db.session_scope() as session: # Test that assembly seqs have been updated - new_seq = session.query(AssemblySequence).where( - (AssemblySequence.name == 'TEST1_seq_update')).first() - assert new_seq is not None + new_seq = session.query(AssemblySequence).filter( + AssemblySequence.name == 'TEST1_seq_update').one_or_none() + assert new_seq is None old_seq = session.query(AssemblySequence).where( (AssemblySequence.name == 'TEST1_seqA')).first() - # TODO Review this test after Proper discussion with GB / Variation / Etc about impact of changing sequences - # in existing assembly assert old_seq is not None - datasets = session.query(Dataset) # Check that the old datasets have been removed - count = session.query(Dataset).join(DatasetSource).filter( - DatasetSource.name.like('%core_1'), - ).count() - # FIXME it looks like the count is actually 2 ==> there is a bug in there and the dataset has been - # duplicated !! assert count == 0 + genebuild_test = session.query(Dataset).join(DatasetSource).join(DatasetType).filter( + DatasetSource.name.like('%core_5'), + ).filter(DatasetType.name == "genebuild").one_or_none() + assert genebuild_test is None # Check that the old attributes are gone count = session.query(DatasetAttribute).join(Attribute).filter( Attribute.name == 'assembly.default', - DatasetAttribute.value == 'jaber01' + DatasetAttribute.value == 'NewTest' ).count() - # FIXME it looks like the count is actually 2 ==> there is a bug in there and the dataset has been - # duplicated !! assert count == 0 + assert count == 1 count = session.query(DatasetAttribute).join(Attribute).filter( Attribute.name == 'genebuild.provider_name', DatasetAttribute.value == 'removed_for_test' ).count() - # FIXME it looks like the count is actually 2 ==> there is a bug in there and the dataset has been - # duplicated !! assert count == 0 + assert count == 0 # Check that the new dataset are present and not duplicated count = session.query(Dataset).join(DatasetSource).join(DatasetType).filter( @@ -225,55 +223,27 @@ def test_update_released_force(self, multi_dbs): old_seq = session.query(AssemblySequence).where( (AssemblySequence.accession == 'BX284601.5')).first() assert old_seq is not None - - i = session.query(Dataset).join(DatasetSource).filter( - DatasetSource.name == 'caenorhabditis_elegans_core_55_108_282' - ) - # Check that the old datasets have been removed - count = session.query(Dataset).join(DatasetSource).filter( - DatasetSource.name == 'caenorhabditis_elegans_core_55_108_282' - ).count() - # FIXME - # assert count == 0 - # Check that the old attributes are gone - count = session.query(DatasetAttribute).join(Attribute).filter( - Attribute.name == 'total_coding_sequence_length', - DatasetAttribute.value == '24569601' - ).count() - # FIXME - # assert count == 0 - count = session.query(DatasetAttribute).join(Attribute).filter( - Attribute.name == 'ps_average_intron_length', - DatasetAttribute.value == '196.66' - ).count() - # FIXME - # assert count == 0 - # Check that the new dataset are present and not duplicated count = session.query(Dataset).join(DatasetSource).join(DatasetType).filter( - DatasetSource.name.like('%core_9'), + DatasetSource.name.like('%core_7'), DatasetType.name == 'assembly' ).count() - # FIXME - # assert count == 1 + assert count == 0 + # Check that the new datasets exist count = session.query(Dataset).join(DatasetSource).join(DatasetType).filter( DatasetSource.name.like('%core_9'), - DatasetType.name == 'genebuild' + DatasetType.name == 'assembly' ).count() - # FIXME - # assert count == 1 - # Check that the new attribute values are present + assert count == 1 + # Check that the old attributes are gone count = session.query(DatasetAttribute).join(Attribute).filter( - Attribute.name == 'assembly.total_genome_length', - DatasetAttribute.value == '546' + Attribute.name == 'assembly.total_coding_sequence_length', + DatasetAttribute.value == '8989' ).count() - # FIXME - # assert count > 0 - + assert count == 0 count = session.query(DatasetAttribute).join(Attribute).filter( Attribute.name == 'genebuild.havana_datafreeze_date', DatasetAttribute.value == 'test2' ).count() - # FIXME - # assert count > 0 + assert count == 1 From a799988fe87ab1d87f398501b68c5f9446c7ecc4 Mon Sep 17 00:00:00 2001 From: danielp Date: Tue, 12 Mar 2024 09:44:57 +0000 Subject: [PATCH 3/4] Updated VERSION --- VERSION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/VERSION b/VERSION index 10bf840e..f93ea0ca 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -2.0.1 \ No newline at end of file +2.0.2 \ No newline at end of file From 7216e5c8e969716af75b9768f054ff0b4ba1f651 Mon Sep 17 00:00:00 2001 From: danielp Date: Tue, 12 Mar 2024 10:30:48 +0000 Subject: [PATCH 4/4] Minor Fix for example date and parenthesis --- VERSION | 2 +- src/ensembl/production/metadata/updater/core.py | 2 +- src/tests/databases/core_1/meta.txt | 2 +- src/tests/databases/core_2/meta.txt | 2 +- src/tests/databases/core_3/meta.txt | 2 +- src/tests/databases/core_4/meta.txt | 2 +- src/tests/databases/core_5/meta.txt | 2 +- src/tests/databases/core_6/meta.txt | 2 +- src/tests/databases/core_7/meta.txt | 2 +- src/tests/databases/core_8/meta.txt | 2 +- src/tests/databases/core_9/meta.txt | 2 +- 11 files changed, 11 insertions(+), 11 deletions(-) diff --git a/VERSION b/VERSION index f93ea0ca..10bf840e 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -2.0.2 \ No newline at end of file +2.0.1 \ No newline at end of file diff --git a/src/ensembl/production/metadata/updater/core.py b/src/ensembl/production/metadata/updater/core.py index 68d70fbc..39023688 100644 --- a/src/ensembl/production/metadata/updater/core.py +++ b/src/ensembl/production/metadata/updater/core.py @@ -88,7 +88,7 @@ def process_species(self, species_id): This method contains the logic for updating the metadata """ - with (self.metadata_db.session_scope() as meta_session): + with self.metadata_db.session_scope() as meta_session: organism, division, organism_group_member = self.get_or_new_organism(species_id, meta_session) assembly, assembly_dataset, assembly_dataset_attributes, assembly_sequences, dataset_source = self.get_or_new_assembly( species_id, meta_session) diff --git a/src/tests/databases/core_1/meta.txt b/src/tests/databases/core_1/meta.txt index ec4a6de6..bcf484ff 100644 --- a/src/tests/databases/core_1/meta.txt +++ b/src/tests/databases/core_1/meta.txt @@ -19,6 +19,6 @@ 19 1 genebuild.sample_location KB871578.1:9766653-9817473 20 1 strain.type test 23 1 genebuild.provider_name test -24 1 genebuild.start_date test +24 1 genebuild.start_date 2023-07-Ensembl 25 1 assembly.alt_accession GCA_0000012345.3 26 1 schema_version 110 \ No newline at end of file diff --git a/src/tests/databases/core_2/meta.txt b/src/tests/databases/core_2/meta.txt index 7596aa8a..0743014a 100644 --- a/src/tests/databases/core_2/meta.txt +++ b/src/tests/databases/core_2/meta.txt @@ -20,5 +20,5 @@ 20 1 strain.type test 21 1 genome.genome_uuid test 23 1 genebuild.provider_name test2 -24 1 genebuild.start_date test +24 1 genebuild.start_date 2023-07-Ensembl 25 1 schema_version 110 \ No newline at end of file diff --git a/src/tests/databases/core_3/meta.txt b/src/tests/databases/core_3/meta.txt index 22ee1a53..daca8ee4 100644 --- a/src/tests/databases/core_3/meta.txt +++ b/src/tests/databases/core_3/meta.txt @@ -18,5 +18,5 @@ 19 1 genebuild.sample_location KB871578.1:9766653-9817473 20 1 strain.type test 23 1 genebuild.provider_name test -24 1 genebuild.start_date test +24 1 genebuild.start_date 2023-07-Ensembl 25 1 schema_version 110 \ No newline at end of file diff --git a/src/tests/databases/core_4/meta.txt b/src/tests/databases/core_4/meta.txt index 373246a1..cf7d2852 100644 --- a/src/tests/databases/core_4/meta.txt +++ b/src/tests/databases/core_4/meta.txt @@ -19,5 +19,5 @@ 19 1 genebuild.sample_location KB871578.1:9766653-9817473 20 1 strain.type test 23 1 genebuild.provider_name test -24 1 genebuild.start_date test +24 1 genebuild.start_date 2023-07-Ensembl 25 1 schema_version 110 \ No newline at end of file diff --git a/src/tests/databases/core_5/meta.txt b/src/tests/databases/core_5/meta.txt index d8bace62..0576cf78 100644 --- a/src/tests/databases/core_5/meta.txt +++ b/src/tests/databases/core_5/meta.txt @@ -15,5 +15,5 @@ 19 1 genebuild.sample_location KB871578.1:9766653-9817473 20 1 strain.type test 23 1 genebuild.provider_name removed_for_test -24 1 genebuild.start_date test +24 1 genebuild.start_date 2023-07-Ensembl 25 1 schema_version 110 \ No newline at end of file diff --git a/src/tests/databases/core_6/meta.txt b/src/tests/databases/core_6/meta.txt index 7cc6d5d5..3a6a78f0 100644 --- a/src/tests/databases/core_6/meta.txt +++ b/src/tests/databases/core_6/meta.txt @@ -20,5 +20,5 @@ 20 1 strain.type test 21 1 genome.genome_uuid 90720316-006c-470b-a7dd-82d28f952264 23 1 genebuild.provider_name test -24 1 genebuild.start_date test +24 1 genebuild.start_date 2023-07-Ensembl 25 1 schema_version 110 \ No newline at end of file diff --git a/src/tests/databases/core_7/meta.txt b/src/tests/databases/core_7/meta.txt index 6c1cee60..7c285648 100644 --- a/src/tests/databases/core_7/meta.txt +++ b/src/tests/databases/core_7/meta.txt @@ -19,7 +19,7 @@ 19 1 genebuild.sample_location KB871578.1:9766653-9817473 20 1 strain.type test 23 1 genebuild.provider_name testfornamenew -24 1 genebuild.start_date new +24 1 genebuild.start_date 2023-08-Ensembl 25 1 genebuild.havana_datafreeze_date test2 26 1 schema_version 110 27 1 assembly.total_coding_sequence_length 8989 \ No newline at end of file diff --git a/src/tests/databases/core_8/meta.txt b/src/tests/databases/core_8/meta.txt index 11f60c12..bfe90127 100644 --- a/src/tests/databases/core_8/meta.txt +++ b/src/tests/databases/core_8/meta.txt @@ -19,5 +19,5 @@ 19 1 genebuild.sample_location KB871578.1:9766653-9817473 20 1 strain.type test 23 1 genebuild.provider_name test -24 1 genebuild.start_date test +24 1 genebuild.start_date 2023-07-Ensembl 25 1 schema_version 110 \ No newline at end of file diff --git a/src/tests/databases/core_9/meta.txt b/src/tests/databases/core_9/meta.txt index 7764bd5b..11f69da5 100644 --- a/src/tests/databases/core_9/meta.txt +++ b/src/tests/databases/core_9/meta.txt @@ -21,5 +21,5 @@ 23 1 genome.genome_uuid a733550b-93e7-11ec-a39d-005056b38ce3 24 1 genebuild.havana_datafreeze_date test2 25 1 assembly.total_genome_length 546 -26 1 genebuild.start_date test +26 1 genebuild.start_date 2023-07-Ensembl 27 1 schema_version 110