diff --git a/VERSION b/VERSION index 8cfbc905..867e5243 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1.1.1 \ No newline at end of file +1.2.0 \ No newline at end of file diff --git a/requirements.in b/requirements.in index e10418da..7ccc4fef 100644 --- a/requirements.in +++ b/requirements.in @@ -1 +1 @@ -ensembl-py@git+https://github.com/Ensembl/ensembl-py.git@1.1.0.dev3 +ensembl-py@git+https://github.com/Ensembl/ensembl-py.git@1.1.1 diff --git a/requirements.txt b/requirements.txt index edc5b8e7..5d2d454a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,7 +10,7 @@ charset-normalizer==3.1.0 # via requests ensembl-hive @ git+https://github.com/Ensembl/ensembl-hive.git # via ensembl-py -ensembl-py @ git+https://github.com/Ensembl/ensembl-py.git@1.1.0.dev3 +ensembl-py @ git+https://github.com/Ensembl/ensembl-py.git@1.1.1 # via -r requirements.in exceptiongroup==1.1.1 # via pytest diff --git a/src/ensembl/production/metadata/api/base.py b/src/ensembl/production/metadata/api/base.py deleted file mode 100644 index d5289b5e..00000000 --- a/src/ensembl/production/metadata/api/base.py +++ /dev/null @@ -1,30 +0,0 @@ -# See the NOTICE file distributed with this work for additional information -# regarding copyright ownership. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -from ensembl.database import DBConnection - - -##Todo: Add in OrganismAdapator. Subfunction fetches all organism in popular group. and # of genomes from distinct assemblies. 
-#Add in best genome (see doc) -#More functions for related genomes - - -class BaseAdaptor: - def __init__(self, metadata_uri): - self.metadata_db = DBConnection(metadata_uri) - - -def check_parameter(param): - if isinstance(param, tuple): - param = param[0] - if param is not None and not isinstance(param, list): - param = [param] - return param diff --git a/src/ensembl/production/metadata/api/dataset.py b/src/ensembl/production/metadata/api/dataset.py deleted file mode 100644 index 2893547f..00000000 --- a/src/ensembl/production/metadata/api/dataset.py +++ /dev/null @@ -1,38 +0,0 @@ -# See the NOTICE file distributed with this work for additional information -# regarding copyright ownership. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-import sqlalchemy as db -from sqlalchemy.engine import make_url - -from ensembl.production.metadata.api.base import BaseAdaptor -from ensembl.production.metadata.api.models import GenomeDataset, Dataset -import logging - -logger = logging.getLogger(__name__) - - -class DatasetAdaptor(BaseAdaptor): - def __init__(self, metadata_uri): - super().__init__(metadata_uri) - - def check_release_status(self, dataset_uuid): - with self.metadata_db.session_scope() as session: - # Query to check if a release_id exists for the given genome_uuid - dataset_id = session.query(Dataset.dataset_id).filter(Dataset.dataset_uuid == dataset_uuid).scalar() - if dataset_id is None: - return "UUID not found" - - # Now we check if there exists a genome dataset with the corresponding dataset_id and a non-null release_id - result = session.query( - session.query(GenomeDataset).filter(GenomeDataset.dataset_id == dataset_id, - GenomeDataset.ensembl_release is not None).exists() - ).scalar() - return result diff --git a/src/ensembl/production/metadata/api/genome.py b/src/ensembl/production/metadata/api/genome.py deleted file mode 100644 index d403386e..00000000 --- a/src/ensembl/production/metadata/api/genome.py +++ /dev/null @@ -1,650 +0,0 @@ -# See the NOTICE file distributed with this work for additional information -# regarding copyright ownership. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-import logging - -import sqlalchemy as db -from sqlalchemy.orm import aliased -from ensembl.database import DBConnection -from ensembl.ncbi_taxonomy.models import NCBITaxaName -from ensembl.production.metadata.api.base import BaseAdaptor, check_parameter -from ensembl.production.metadata.api.models import Genome, Organism, Assembly, OrganismGroup, OrganismGroupMember, \ - GenomeRelease, EnsemblRelease, EnsemblSite, AssemblySequence, GenomeDataset, Dataset, DatasetType, DatasetSource, \ - Attribute, DatasetAttribute -import logging - - -logger = logging.getLogger(__name__) - - -class GenomeAdaptor(BaseAdaptor): - def __init__(self, metadata_uri, taxonomy_uri=None): - super().__init__(metadata_uri) - self.taxonomy_db = DBConnection(taxonomy_uri) - - def fetch_taxonomy_names(self, taxonomy_ids, synonyms=[]): - - taxonomy_ids = check_parameter(taxonomy_ids) - synonyms = [ - "common name", - "equivalent name", - "genbank synonym", - "synonym", - ] if len(check_parameter(synonyms)) == 0 else synonyms - required_class_name = ["genbank common name", "scientific name"] - taxons = {} - with self.taxonomy_db.session_scope() as session: - for tid in taxonomy_ids: - taxons[tid] = {"scientific_name": None, "genbank_common_name": None, "synonym": []} - - taxonomyname_query = db.select( - NCBITaxaName.name, - NCBITaxaName.name_class, - ).filter( - NCBITaxaName.taxon_id == tid, - NCBITaxaName.name_class.in_(required_class_name + synonyms), - ) - - for taxon_name in session.execute(taxonomyname_query).all(): - if taxon_name[1] in synonyms: - taxons[tid]['synonym'].append(taxon_name[0]) - if taxon_name[1] in required_class_name: - taxon_format_name = "_".join(taxon_name[1].split(' ')) - taxons[tid][taxon_format_name] = taxon_name[0] - return taxons - - def fetch_taxonomy_ids(self, taxonomy_names): - taxids = [] - taxonomy_names = check_parameter(taxonomy_names) - for taxon in taxonomy_names: - taxa_name_select = db.select( - NCBITaxaName.taxon_id - ).filter( - NCBITaxaName.name == 
taxon - ) - with self.taxonomy_db.session_scope() as session: - logger.debug(taxa_name_select) - taxid = session.execute(taxa_name_select).one() - taxids.append(taxid[0]) - return taxids - - def fetch_genomes(self, genome_id=None, genome_uuid=None, genome_tag=None, organism_uuid=None, assembly_uuid=None, - assembly_accession=None, assembly_name=None, use_default_assembly=False, ensembl_name=None, - taxonomy_id=None, group=None, group_type=None, allow_unreleased=False, unreleased_only=False, - site_name=None, release_type=None, release_version=None, current_only=True): - """ - Fetches genome information based on the specified parameters. - - Args: - genome_id (Union[int, List[int]]): The ID(s) of the genome(s) to fetch. - genome_uuid (Union[str, List[str]]): The UUID(s) of the genome(s) to fetch. - genome_tag (Union[str, List[str]]): genome_tag value is either in Assembly.url_name or told_id. - organism_uuid (Union[str, List[str]]): The UUID(s) of the organism(s) to fetch. - assembly_uuid (Union[str, List[str]]): The UUID(s) of the assembly(s) to fetch. - assembly_accession (Union[str, List[str]]): The assenbly accession of the assembly(s) to fetch. - assembly_name (Union[str, List[str]]): The name(s) of the assembly(s) to fetch. - use_default_assembly (bool): Whether to use default assembly name or not. - ensembl_name (Union[str, List[str]]): The Ensembl name(s) of the organism(s) to fetch. - taxonomy_id (Union[int, List[int]]): The taxonomy ID(s) of the organism(s) to fetch. - group (Union[str, List[str]]): The name(s) of the organism group(s) to filter by. - group_type (Union[str, List[str]]): The type(s) of the organism group(s) to filter by. - allow_unreleased (bool): Whether to fetch unreleased genomes too or not (default: False). - unreleased_only (bool): Fetch only unreleased genomes (default: False). 
allow_unreleased is used by gRPC - to fetch both released and unreleased genomes, while unreleased_only - is used in production pipelines (fetches only unreleased genomes) - site_name (str): The name of the Ensembl site to filter by. - release_type (str): The type of the Ensembl release to filter by. - release_version (int): The maximum version of the Ensembl release to filter by. - current_only (bool): Whether to fetch only current genomes. - - Returns: - List[Tuple[Genome, Organism, Assembly, EnsemblRelease]]: A list of tuples containing the fetched genome information. - Each tuple contains the following elements: - - Genome: An instance of the Genome class. - - Organism: An instance of the Organism class. - - Assembly: An instance of the Assembly class. - - EnsemblRelease: An instance of the EnsemblRelease class. - - Notes: - - The parameters are not mutually exclusive, meaning more than one of them can be provided at a time. - - The function uses a database session to execute the query and returns the results as a list of tuples. - - The results are ordered by the Ensembl name. 
- - Example usage: - genome_info = fetch_genomes(genome_id=12345) - """ - # Parameter validation - genome_id = check_parameter(genome_id) - genome_uuid = check_parameter(genome_uuid) - genome_tag = check_parameter(genome_tag) - organism_uuid = check_parameter(organism_uuid) - assembly_uuid = check_parameter(assembly_uuid) - assembly_accession = check_parameter(assembly_accession) - assembly_name = check_parameter(assembly_name) - ensembl_name = check_parameter(ensembl_name) - taxonomy_id = check_parameter(taxonomy_id) - group = check_parameter(group) - group_type = check_parameter(group_type) - - # Construct the initial database query - genome_select = db.select( - Genome, Organism, Assembly - ).select_from(Genome) \ - .join(Organism, Organism.organism_id == Genome.organism_id) \ - .join(Assembly, Assembly.assembly_id == Genome.assembly_id) \ - - # Apply group filtering if group parameter is provided - if group: - group_type = group_type if group_type else ['Division'] - genome_select = db.select( - Genome, Organism, Assembly, OrganismGroup, OrganismGroupMember - ).join(Genome.assembly).join(Genome.organism) \ - .join(Organism.organism_group_members) \ - .join(OrganismGroupMember.organism_group) \ - .filter(OrganismGroup.type.in_(group_type)).filter(OrganismGroup.name.in_(group)) - - # Apply additional filters based on the provided parameters - if genome_id is not None: - genome_select = genome_select.filter(Genome.genome_id.in_(genome_id)) - - if genome_uuid is not None: - genome_select = genome_select.filter(Genome.genome_uuid.in_(genome_uuid)) - - if genome_tag is not None: - genome_select = genome_select.filter( - db.or_( - Assembly.url_name.in_(genome_tag), - Assembly.tol_id.in_(genome_tag) - ) - ) - - if organism_uuid is not None: - genome_select = genome_select.filter(Organism.organism_uuid.in_(organism_uuid)) - - if assembly_uuid is not None: - genome_select = genome_select.filter(Assembly.assembly_uuid.in_(assembly_uuid)) - - if assembly_accession is not 
None: - genome_select = genome_select.filter(Assembly.accession.in_(assembly_accession)) - - if assembly_name is not None: - # case() function is used to conditionally select between columns, sql equivalent is: - # CASE - # WHEN :use_default_assembly = 1 THEN assembly.assembly_default - # ELSE assembly.name - # END - conditional_column = db.case( - # literal is used to prevent evaluating use_default_assembly to a boolean (True or False) - [(db.literal(use_default_assembly) == 1, Assembly.assembly_default)], - else_=Assembly.name - ) - lowered_assemblies = [name.lower() for name in assembly_name] - genome_select = genome_select.filter(db.func.lower(conditional_column).in_(lowered_assemblies)) - - if ensembl_name is not None: - genome_select = genome_select.filter(Organism.ensembl_name.in_(ensembl_name)) - - if taxonomy_id is not None: - genome_select = genome_select.filter(Organism.taxonomy_id.in_(taxonomy_id)) - - if allow_unreleased: - # fetch everything (released + unreleased) - pass - elif unreleased_only: - # fetch unreleased only - # this filter will get all Genome entries where there's no associated GenomeRelease - # the tilde (~) symbol is used for negation. - genome_select = genome_select.filter(~Genome.genome_releases.any()) - else: - # fetch released only - # Check if genome is released - # TODO: why did I add this check?! 
-> removing this breaks the test_update tests - with self.metadata_db.session_scope() as session: - session.expire_on_commit = False - # copy genome_select as we don't want to include GenomeDataset - # because it results in multiple row for a given genome (genome can have many datasets) - check_query = genome_select - prep_query = check_query.add_columns(GenomeDataset) \ - .join(GenomeDataset, Genome.genome_id == GenomeDataset.genome_id) \ - .filter(GenomeDataset.release_id.isnot(None)) - is_genome_released = session.execute(prep_query).first() - - if is_genome_released: - # Include release related info if released_only is True - genome_select = genome_select.add_columns(GenomeRelease, EnsemblRelease, EnsemblSite) \ - .join(GenomeRelease, Genome.genome_id == GenomeRelease.genome_id) \ - .join(EnsemblRelease, GenomeRelease.release_id == EnsemblRelease.release_id) \ - .join(EnsemblSite, EnsemblSite.site_id == EnsemblRelease.site_id) - - if release_version is not None and release_version > 0: - # if release is specified - genome_select = genome_select.filter(EnsemblRelease.version <= release_version) - current_only = False - - if current_only: - genome_select = genome_select.filter(GenomeRelease.is_current == 1) - - if site_name is not None: - genome_select = genome_select.add_columns(EnsemblSite).filter(EnsemblSite.name == site_name) - - if release_type is not None: - genome_select = genome_select.filter(EnsemblRelease.release_type == release_type) - - # print(f"genome_select query ====> {str(genome_select)}") - with self.metadata_db.session_scope() as session: - session.expire_on_commit = False - return session.execute(genome_select.order_by("ensembl_name")).all() - - def fetch_genomes_by_genome_uuid(self, genome_uuid, allow_unreleased=False, site_name=None, release_type=None, - release_version=None, current_only=True): - return self.fetch_genomes( - genome_uuid=genome_uuid, - allow_unreleased=allow_unreleased, - site_name=site_name, - release_type=release_type, - 
release_version=release_version, - current_only=current_only, - ) - - def fetch_genomes_by_assembly_accession(self, assembly_accession, allow_unreleased=False, site_name=None, - release_type=None, release_version=None, current_only=True): - return self.fetch_genomes( - assembly_accession=assembly_accession, - allow_unreleased=allow_unreleased, - site_name=site_name, - release_type=release_type, - release_version=release_version, - current_only=current_only, - ) - - def fetch_genomes_by_ensembl_name(self, ensembl_name, allow_unreleased=False, site_name=None, release_type=None, - release_version=None, current_only=True): - return self.fetch_genomes( - ensembl_name=ensembl_name, - allow_unreleased=allow_unreleased, - site_name=site_name, - release_type=release_type, - release_version=release_version, - current_only=current_only, - ) - - def fetch_genomes_by_taxonomy_id(self, taxonomy_id, allow_unreleased=False, site_name=None, release_type=None, - release_version=None, current_only=True): - return self.fetch_genomes( - taxonomy_id=taxonomy_id, - allow_unreleased=allow_unreleased, - site_name=site_name, - release_type=release_type, - release_version=release_version, - current_only=current_only, - ) - - def fetch_genomes_by_scientific_name( - self, - scientific_name, - allow_unreleased=False, - site_name=None, - release_type=None, - release_version=None, - current_only=True, - ): - taxonomy_ids = self.fetch_taxonomy_ids(scientific_name) - - return self.fetch_genomes_by_taxonomy_id( - taxonomy_ids, - allow_unreleased=allow_unreleased, - site_name=site_name, - release_type=release_type, - release_version=release_version, - current_only=current_only, - ) - - def fetch_genome_by_keyword(self, keyword=None, release_version=None): - """ - Fetches genomes based on a keyword and release version. - - Args: - keyword (str or None): Keyword to search for in various attributes of genomes, assemblies, and organisms. - release_version (int or None): Release version to filter by. 
If set to 0 or None, fetches only current genomes. - - Returns: - list: A list of fetched genomes matching the keyword and release version. - """ - genome_query = db.select( - Genome, GenomeRelease, EnsemblRelease, Assembly, Organism, EnsemblSite - ).select_from(Genome) \ - .outerjoin(Organism, Organism.organism_id == Genome.organism_id) \ - .outerjoin(Assembly, Assembly.assembly_id == Genome.assembly_id) \ - .outerjoin(GenomeRelease, Genome.genome_id == GenomeRelease.genome_id) \ - .outerjoin(EnsemblRelease, GenomeRelease.release_id == EnsemblRelease.release_id) \ - .outerjoin(EnsemblSite, EnsemblSite.site_id == EnsemblRelease.site_id) \ - - if keyword is not None: - genome_query = genome_query.where(db.or_(db.func.lower(Assembly.tol_id) == keyword.lower(), - db.func.lower(Assembly.accession) == keyword.lower(), - db.func.lower(Assembly.name) == keyword.lower(), - db.func.lower(Assembly.ensembl_name) == keyword.lower(), - db.func.lower(Organism.common_name) == keyword.lower(), - db.func.lower(Organism.scientific_name) == keyword.lower(), - db.func.lower( - Organism.scientific_parlance_name) == keyword.lower(), - db.func.lower(Organism.species_taxonomy_id) == keyword.lower())) - - if release_version == 0 or release_version is None: - genome_query = genome_query.where(EnsemblRelease.is_current == 1) - else: - genome_query = genome_query.where(EnsemblRelease.version <= release_version) - - with self.metadata_db.session_scope() as session: - session.expire_on_commit = False - return session.execute(genome_query).all() - - def fetch_sequences(self, genome_id=None, genome_uuid=None, assembly_uuid=None, assembly_accession=None, - assembly_sequence_accession=None, assembly_sequence_name=None, chromosomal_only=False): - """ - Fetches sequences based on the provided parameters. - - Args: - genome_id (int or None): Genome ID to filter by. - genome_uuid (str or None): Genome UUID to filter by. 
- assembly_uuid (Union[str, List[str]]): The assembly_uuid of the assembly(s) to fetch. - assembly_accession (str or None): Assembly accession to filter by. - assembly_sequence_accession (str or None): Assembly Sequence accession to filter by. - assembly_sequence_name (str or None): Assembly Sequence name to filter by. - chromosomal_only (bool): Flag indicating whether to fetch only chromosomal sequences. - - Returns: - list: A list of fetched sequences. - """ - genome_id = check_parameter(genome_id) - genome_uuid = check_parameter(genome_uuid) - assembly_uuid = check_parameter(assembly_uuid) - assembly_accession = check_parameter(assembly_accession) - assembly_sequence_accession = check_parameter(assembly_sequence_accession) - assembly_sequence_name = check_parameter(assembly_sequence_name) - - seq_select = db.select( - Genome, Assembly, AssemblySequence - ).select_from(Genome) \ - .join(Assembly, Assembly.assembly_id == Genome.assembly_id) \ - .join(AssemblySequence, AssemblySequence.assembly_id == Assembly.assembly_id) - - if chromosomal_only: - seq_select = seq_select.filter(AssemblySequence.chromosomal == 1) - - # These options are in order of decreasing specificity, - # and thus the ones later in the list can be redundant. 
- if genome_id is not None: - seq_select = seq_select.filter(Genome.genome_id == genome_id) - - if genome_uuid is not None: - seq_select = seq_select.filter(Genome.genome_uuid == genome_uuid) - - if assembly_accession is not None: - seq_select = seq_select.filter(Assembly.accession == assembly_accession) - - if assembly_uuid is not None: - seq_select = seq_select.filter(Assembly.assembly_uuid.in_(assembly_uuid)) - - if assembly_sequence_accession is not None: - seq_select = seq_select.filter(AssemblySequence.accession == assembly_sequence_accession) - - if assembly_sequence_name is not None: - seq_select = seq_select.filter(AssemblySequence.name == assembly_sequence_name) - - with self.metadata_db.session_scope() as session: - session.expire_on_commit = False - return session.execute(seq_select).all() - - def fetch_sequences_by_genome_uuid(self, genome_uuid, chromosomal_only=False): - return self.fetch_sequences( - genome_uuid=genome_uuid, chromosomal_only=chromosomal_only - ) - - def fetch_sequences_by_assembly_accession( - self, assembly_accession, chromosomal_only=False - ): - return self.fetch_sequences( - assembly_accession=assembly_accession, chromosomal_only=chromosomal_only - ) - - def fetch_genome_datasets(self, genome_id=None, genome_uuid=None, organism_uuid=None, allow_unreleased=False, - unreleased_only=False, dataset_uuid=None, dataset_name=None, dataset_source=None, - dataset_type=None, release_version=None, dataset_attributes=None): - """ - Fetches genome datasets based on the provided parameters. - - Args: - genome_id (int or list or None): Genome ID(s) to filter by. - genome_uuid (str or list or None): Genome UUID(s) to filter by. - organism_uuid (str or list or None): Organism UUID(s) to filter by. - allow_unreleased (bool): Flag indicating whether to allowing fetching unreleased datasets too or not. - unreleased_only (bool): Fetch only unreleased datasets (default: False). 
allow_unreleased is used by gRPC - to fetch both released and unreleased datasets, while unreleased_only - is used in production pipelines (fetches only unreleased datasets) - dataset_uuid (str or list or None): Dataset UUID(s) to filter by. - dataset_name (str or None): Dataset name to filter by, default is 'assembly'. - dataset_source (str or None): Dataset source to filter by. - dataset_type (str or None): Dataset type to filter by. - release_version (float or None): EnsemblRelease version to filter by. - dataset_attributes (bool): Flag to include dataset attributes - - Returns: - List[Tuple[ - Genome, GenomeDataset, Dataset, DatasetType, - DatasetSource, EnsemblRelease, DatasetAttribute, Attribute - ]]: A list of tuples containing the fetched genome information. - Each tuple contains the following elements: - - Genome: An instance of the Genome class. - - Organism: An instance of the Organism class. - - GenomeDataset: An instance of the GenomeDataset class. - - Dataset: An instance of the Dataset class. - - DatasetType: An instance of the DatasetType class. - - DatasetSource: An instance of the DatasetSource class. - - EnsemblRelease: An instance of the EnsemblRelease class. - - DatasetAttribute: An instance of the DatasetAttribute class. - - Attribute: An instance of the Attribute class. - - Raises: - ValueError: If an exception occurs during the fetch process. 
- - """ - try: - genome_select = db.select( - Genome, - GenomeDataset, - Dataset, - DatasetType, - DatasetSource, - ).select_from(Genome) \ - .join(GenomeDataset, Genome.genome_id == GenomeDataset.genome_id) \ - .join(Dataset, GenomeDataset.dataset_id == Dataset.dataset_id) \ - .join(DatasetType, Dataset.dataset_type_id == DatasetType.dataset_type_id) \ - .join(DatasetSource, Dataset.dataset_source_id == DatasetSource.dataset_source_id) - - # set default group topic as 'assembly' to fetch unique datasource - if not dataset_name: - dataset_name = "assembly" - - genome_id = check_parameter(genome_id) - genome_uuid = check_parameter(genome_uuid) - organism_uuid = check_parameter(organism_uuid) - dataset_uuid = check_parameter(dataset_uuid) - dataset_name = check_parameter(dataset_name) - dataset_source = check_parameter(dataset_source) - dataset_type = check_parameter(dataset_type) - - if genome_id is not None: - genome_select = genome_select.filter(Genome.genome_id.in_(genome_id)) - - if genome_uuid is not None: - genome_select = genome_select.filter(Genome.genome_uuid.in_(genome_uuid)) - - if organism_uuid is not None: - genome_select = genome_select.join(Organism, Organism.organism_id == Genome.organism_id) \ - .filter(Organism.organism_uuid.in_(organism_uuid)) - - if dataset_uuid is not None: - genome_select = genome_select.filter(Dataset.dataset_uuid.in_(dataset_uuid)) - - if "all" in dataset_name: - # TODO: fetch the list dynamically from the DB - dataset_type_names = [ - 'assembly', 'genebuild', 'variation', 'evidence', - 'regulation_build', 'homologies', 'regulatory_features' - ] - genome_select = genome_select.filter(DatasetType.name.in_(dataset_type_names)) - else: - genome_select = genome_select.filter(DatasetType.name.in_(dataset_name)) - - if dataset_source is not None: - genome_select = genome_select.filter(DatasetSource.name.in_(dataset_source)) - - if dataset_type is not None: - genome_select = genome_select.filter(DatasetType.name.in_(dataset_type)) - 
- if dataset_attributes: - genome_select = genome_select.add_columns(DatasetAttribute, Attribute)\ - .join(DatasetAttribute, DatasetAttribute.dataset_id == Dataset.dataset_id) \ - .join(Attribute, Attribute.attribute_id == DatasetAttribute.attribute_id) - - if allow_unreleased: - # Get everything - pass - elif unreleased_only: - # Get only unreleased datasets - # this filter will get all Datasets entries where there's no associated GenomeDataset - # the tilde (~) symbol is used for negation. - genome_select = genome_select.filter(~GenomeDataset.ensembl_release.has()) - else: - # Get released datasets only - # Check if dataset is released - with self.metadata_db.session_scope() as session: - # This is needed in order to ovoid tests throwing: - # sqlalchemy.orm.exc.DetachedInstanceError: Instance - # is not bound to a Session; attribute refresh operation cannot proceed - # (Background on this error at: https://sqlalche.me/e/14/bhk3) - session.expire_on_commit = False - # Check if GenomeDataset HAS an ensembl_release - prep_query = genome_select.filter(GenomeDataset.ensembl_release.has()) - is_dataset_released = session.execute(prep_query).first() - - if is_dataset_released: - # Include release related info - genome_select = genome_select.add_columns(EnsemblRelease) \ - .join(EnsemblRelease, GenomeDataset.release_id == EnsemblRelease.release_id) - - if release_version: - genome_select = genome_select.filter(EnsemblRelease.version <= release_version) - - # print(f"genome_select str ====> {str(genome_select)}") - logger.debug(genome_select) - with self.metadata_db.session_scope() as session: - session.expire_on_commit = False - return session.execute(genome_select).all() - - except Exception as e: - raise ValueError(str(e)) - - def fetch_genomes_info( - self, - genome_id=None, - genome_uuid=None, - allow_unreleased_genomes=False, - ensembl_name=None, - group=None, - group_type=None, - allow_unreleased_datasets=False, - dataset_name=None, - dataset_source=None, - 
dataset_attributes=True, - - ): - try: - genome_id = check_parameter(genome_id) - genome_uuid = check_parameter(genome_uuid) - ensembl_name = check_parameter(ensembl_name) - group = check_parameter(group) - group_type = check_parameter(group_type) - dataset_name = check_parameter(dataset_name) - dataset_source = check_parameter(dataset_source) - - if group is None: - group_type = group_type if group_type else ['Division'] - with self.metadata_db.session_scope() as session: - session.expire_on_commit = False - group = [org_type[0] for org_type in session.execute( - db.select(OrganismGroup.name).filter(OrganismGroup.type.in_(group_type))).all()] - - # get genome, assembly and organism information - genomes = self.fetch_genomes( - genome_id=genome_id, - genome_uuid=genome_uuid, - allow_unreleased=allow_unreleased_genomes, - ensembl_name=ensembl_name, - group=group, - group_type=group_type, - ) - - for genome in genomes: - dataset = self.fetch_genome_datasets( - genome_uuid=genome[0].genome_uuid, - allow_unreleased=allow_unreleased_datasets, - dataset_name=dataset_name, - dataset_source=dataset_source, - dataset_attributes=dataset_attributes - ) - res = [{'genome': genome, 'datasets': dataset}] - yield res - except Exception as e: - raise ValueError(str(e)) - - def fetch_organisms_group_counts(self, species_taxonomy_id=None, release_version=None, group_code='popular'): - o_species = aliased(Organism) - o = aliased(Organism) - if not release_version: - # Get latest released organisms - query = db.select( - o_species.species_taxonomy_id, - o_species.ensembl_name, - o_species.common_name, - o_species.scientific_name, - OrganismGroupMember.order.label('order'), - db.func.count().label('count') - ) - - query = query.join(o, o_species.species_taxonomy_id == o.species_taxonomy_id) - query = query.join(Genome, o.organism_id == Genome.organism_id) - query = query.join(Assembly, Genome.assembly_id == Assembly.assembly_id) - query = query.join(OrganismGroupMember, 
o_species.organism_id == OrganismGroupMember.organism_id) - query = query.join(OrganismGroup, - OrganismGroupMember.organism_group_id == OrganismGroup.organism_group_id) - query = query.filter(OrganismGroup.code == group_code) - - if species_taxonomy_id is not None: - query = query.filter(o_species.species_taxonomy_id == species_taxonomy_id) - - query = query.group_by( - o_species.species_taxonomy_id, - o_species.ensembl_name, - o_species.common_name, - o_species.scientific_name, - OrganismGroupMember.order - ) - query = query.order_by(OrganismGroupMember.order) - else: - # change group to release_version_state and related genomes - raise NotImplementedError('Not implemented yet') - pass - - with self.metadata_db.session_scope() as session: - # TODO check if we should return a dictionary instead - return session.execute(query).all() diff --git a/src/ensembl/production/metadata/api/models/__init__.py b/src/ensembl/production/metadata/api/models/__init__.py index 892065fa..840f3caa 100644 --- a/src/ensembl/production/metadata/api/models/__init__.py +++ b/src/ensembl/production/metadata/api/models/__init__.py @@ -4,3 +4,4 @@ from .genome import * from .organism import * from .release import * +from .utils import check_release_status diff --git a/src/ensembl/production/metadata/api/models/base.py b/src/ensembl/production/metadata/api/models/base.py index e64f354f..0a263fdc 100644 --- a/src/ensembl/production/metadata/api/models/base.py +++ b/src/ensembl/production/metadata/api/models/base.py @@ -13,3 +13,11 @@ Base = declarative_base() metadata = Base.metadata + + +class LoadAble(object): + def __repr__(self): + class_name = self.__class__.__name__ + attributes = {name: getattr(self, name) for name in dir(self) if + isinstance(getattr(self, name), (type(None), str, int, float, bool))} + return '<{}({})>'.format(class_name, attributes) \ No newline at end of file diff --git a/src/ensembl/production/metadata/api/models/dataset.py 
b/src/ensembl/production/metadata/api/models/dataset.py index 1bf6edaa..17c91a29 100644 --- a/src/ensembl/production/metadata/api/models/dataset.py +++ b/src/ensembl/production/metadata/api/models/dataset.py @@ -12,27 +12,28 @@ from sqlalchemy import Column, Integer, String, Enum, text, ForeignKey, Index from sqlalchemy.dialects.mysql import DATETIME from sqlalchemy.orm import relationship +from sqlalchemy.sql import func +import datetime import uuid -from ensembl.production.metadata.api.models.base import Base +from ensembl.production.metadata.api.models.base import Base, LoadAble -class Attribute(Base): +class Attribute(LoadAble, Base): __tablename__ = 'attribute' attribute_id = Column(Integer, primary_key=True) name = Column(String(128), nullable=False) label = Column(String(128), nullable=False) description = Column(String(255)) - type = Column(Enum('string', 'percent', 'float', 'integer', 'bp'), server_default=text("'string'")) + type = Column(Enum('string', 'percent', 'float', 'integer', 'bp', 'number'), server_default=text("'string'")) # One to many relationships # attribute_id within dataset attribute dataset_attributes = relationship("DatasetAttribute", back_populates='attribute') # many to one relationships # none - -class Dataset(Base): +class Dataset(LoadAble, Base): __tablename__ = 'dataset' dataset_id = Column(Integer, primary_key=True) @@ -40,7 +41,7 @@ class Dataset(Base): dataset_type_id = Column(ForeignKey('dataset_type.dataset_type_id'), nullable=False, index=True) name = Column(String(128), nullable=False) version = Column(String(128)) - created = Column(DATETIME(fsp=6), nullable=False) + created = Column(DATETIME(fsp=6), server_default=func.now(), default=datetime.datetime.utcnow) dataset_source_id = Column(ForeignKey('dataset_source.dataset_source_id'), nullable=False, index=True) label = Column(String(128), nullable=False) status = Column(Enum('Submitted', 'Progressing', 'Processed'), server_default=text("'Submitted'")) @@ -56,7 +57,7 @@ 
class Dataset(Base): dataset_source = relationship('DatasetSource', back_populates="datasets") -class DatasetAttribute(Base): +class DatasetAttribute(LoadAble, Base): __tablename__ = 'dataset_attribute' __table_args__ = ( Index('dataset_attribute_dataset_id_attribute_id__d3b34d8c_uniq', 'dataset_id', 'attribute_id', 'value', @@ -76,7 +77,7 @@ class DatasetAttribute(Base): dataset = relationship('Dataset', back_populates="dataset_attributes") -class DatasetSource(Base): +class DatasetSource(LoadAble, Base): __tablename__ = 'dataset_source' dataset_source_id = Column(Integer, primary_key=True) @@ -89,7 +90,7 @@ class DatasetSource(Base): # none -class DatasetType(Base): +class DatasetType(LoadAble, Base): __tablename__ = 'dataset_type' dataset_type_id = Column(Integer, primary_key=True) diff --git a/src/ensembl/production/metadata/api/models/utils.py b/src/ensembl/production/metadata/api/models/utils.py index fa39d3fb..6a5bd48b 100644 --- a/src/ensembl/production/metadata/api/models/utils.py +++ b/src/ensembl/production/metadata/api/models/utils.py @@ -8,4 +8,20 @@ # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and -# limitations under the License. \ No newline at end of file +# limitations under the License. 
from .dataset import Dataset
from .genome import GenomeDataset


def check_release_status(meta_dbc, dataset_uuid):
    """Check whether the dataset identified by ``dataset_uuid`` is attached to a release.

    Args:
        meta_dbc: ``ensembl.database.DBConnection`` to the metadata database.
        dataset_uuid (str): UUID of the dataset to look up.

    Returns:
        bool: True when a ``genome_dataset`` row links this dataset to a
            release (``release_id`` is non-NULL), False otherwise.
        str: The literal ``"UUID not found"`` when no dataset with that UUID
            exists (kept for backward compatibility with existing callers).
    """
    with meta_dbc.session_scope() as session:
        # Resolve the dataset UUID to its primary key; scalar() yields None
        # when no matching row exists.
        dataset_id = session.query(Dataset.dataset_id).filter(
            Dataset.dataset_uuid == dataset_uuid
        ).scalar()
        if dataset_id is None:
            return "UUID not found"

        # EXISTS query: is there a genome_dataset row for this dataset with a
        # non-NULL release?  BUG FIX: the original condition
        # `GenomeDataset.ensembl_release is not None` is a Python identity
        # test on the instrumented attribute object — it evaluates to the
        # constant True and emits no SQL, silently dropping the release
        # filter.  Use the SQL-generating `isnot(None)` operator on the
        # foreign-key column instead.
        result = session.query(
            session.query(GenomeDataset).filter(
                GenomeDataset.dataset_id == dataset_id,
                GenomeDataset.release_id.isnot(None),
            ).exists()
        ).scalar()
        return result
-import logging - -import sqlalchemy as db - -from ensembl.production.metadata.api.base import check_parameter, BaseAdaptor -from ensembl.production.metadata.api.models import EnsemblRelease, EnsemblSite, GenomeRelease, Genome, GenomeDataset, \ - Dataset - -logger = logging.getLogger(__name__) - - -class ReleaseAdaptor(BaseAdaptor): - - def fetch_releases( - self, - release_id=None, - release_version=None, - current_only=True, - release_type=None, - site_name=None, - ): - """ - Fetches releases based on the provided parameters. - - Args: - release_id (int or list or None): Release ID(s) to filter by. - release_version (str or list or None): Release version(s) to filter by. - current_only (bool): Flag indicating whether to fetch only current releases. - release_type (str or list or None): Release type(s) to filter by. - site_name (str or list or None): Name(s) of the Ensembl site to filter by. - - Returns: - list: A list of fetched releases. - """ - release_id = check_parameter(release_id) - release_version = check_parameter(release_version) - release_type = check_parameter(release_type) - site_name = check_parameter(site_name) - - release_select = db.select( - EnsemblRelease, EnsemblSite - ).join(EnsemblRelease.ensembl_site) - - # WHERE ensembl_release.release_id = :release_id_1 - if release_id is not None: - release_select = release_select.filter( - EnsemblRelease.release_id.in_(release_id) - ) - # WHERE ensembl_release.version = :version_1 - elif release_version is not None: - release_select = release_select.filter( - EnsemblRelease.version.in_(release_version) - ) - # WHERE ensembl_release.is_current =:is_current_1 - elif current_only: - release_select = release_select.filter( - EnsemblRelease.is_current == 1 - ) - - # WHERE ensembl_release.release_type = :release_type_1 - if release_type is not None: - release_select = release_select.filter( - EnsemblRelease.release_type.in_(release_type) - ) - - # WHERE ensembl_site.name = :name_1 - if site_name is not None: - 
release_select = release_select.filter( - EnsemblSite.name.in_(site_name) - ) - logger.debug(f"Query: {release_select}") - with self.metadata_db.session_scope() as session: - session.expire_on_commit = False - return session.execute(release_select).all() - - def fetch_releases_for_genome(self, genome_uuid, site_name=None): - - # SELECT genome_release.release_id - # FROM genome_release - # JOIN genome ON genome.genome_id = genome_release.genome_id - # WHERE genome.genome_uuid =:genome_uuid_1 - release_id_select = db.select( - GenomeRelease.release_id - ).filter( - Genome.genome_uuid == genome_uuid - ).join( - GenomeRelease.genome - ) - - release_ids = [] - with self.metadata_db.session_scope() as session: - release_objects = session.execute(release_id_select).all() - for rid in release_objects: - release_ids.append(rid[0]) - release_ids = list(dict.fromkeys(release_ids)) - return self.fetch_releases(release_id=release_ids, site_name=site_name) - - def fetch_releases_for_dataset(self, dataset_uuid, site_name=None): - - # SELECT genome_release.release_id - # FROM genome_dataset - # JOIN dataset ON dataset.dataset_id = genome_dataset.dataset_id - # WHERE dataset.dataset_uuid = :dataset_uuid_1 - release_id_select = db.select( - GenomeDataset.release_id - ).filter( - Dataset.dataset_uuid == dataset_uuid - ).join( - GenomeDataset.dataset - ) - - release_ids = [] - with self.metadata_db.session_scope() as session: - release_objects = session.execute(release_id_select).all() - for rid in release_objects: - release_ids.append(rid[0]) - release_ids = list(dict.fromkeys(release_ids)) - return self.fetch_releases(release_id=release_ids, site_name=site_name) - - -class NewReleaseAdaptor(BaseAdaptor): - - def __init__(self, metadata_uri=None): - super().__init__(metadata_uri) - # Get current release ID from ensembl_release - with self.metadata_db.session_scope() as session: - self.current_release_id = ( - 
session.execute(db.select(EnsemblRelease.release_id).filter(EnsemblRelease.is_current == 1)).one()[0]) - if self.current_release_id == "": - raise Exception("Current release not found") - logger.debug(f'Release ID: {self.current_release_id}') - - # Get last release ID from ensembl_release - with self.metadata_db.session_scope() as session: - ############### Refactor this once done. It is messy. - current_version = int(session.execute( - db.select(EnsemblRelease.version).filter(EnsemblRelease.release_id == self.current_release_id)).one()[ - 0]) - past_versions = session.execute( - db.select(EnsemblRelease.version).filter(EnsemblRelease.version < current_version)).all() - sorted_versions = [] - # Do I have to account for 1.12 and 1.2 - for version in past_versions: - sorted_versions.append(float(version[0])) - sorted_versions.sort() - self.previous_release_id = (session.execute( - db.select(EnsemblRelease.release_id).filter(EnsemblRelease.version == sorted_versions[-1])).one()[0]) - if self.previous_release_id == "": - raise Exception("Previous release not found") - - # new_genomes (list of new genomes in the new release) - def fetch_new_genomes(self): - # TODO: this code must be never called yet, because it would never work!!!! 
- with self.metadata_db.session_scope() as session: - genome_selector = db.select( - EnsemblRelease, EnsemblSite - ).join(EnsemblRelease.ensembl_site) - old_genomes = session.execute( - db.select(EnsemblRelease.version).filter(EnsemblRelease.version < current_version)).all() - new_genomes = [] - novel_old_genomes = [] - novel_new_genomes = [] - return session.execute(release_select).all() diff --git a/src/ensembl/production/metadata/api/sample/ensembl_metadata/assembly.txt b/src/ensembl/production/metadata/api/sample/ensembl_metadata/assembly.txt index 13778f64..a983dcf8 100644 --- a/src/ensembl/production/metadata/api/sample/ensembl_metadata/assembly.txt +++ b/src/ensembl/production/metadata/api/sample/ensembl_metadata/assembly.txt @@ -1,5 +1,5 @@ -1 eeaaa2bf-151c-4848-8b85-a05a9993101e hg38 GCA_000001405.28 chromosome GRCh38.p13 \N GRCh38 \N 2023-05-12 13:30:58 GRCh38.p13 \N 1 grch38 -2 633034c3-2268-40a2-866a-9f492cac84bf hg19 GCA_000001405.14 chromosome GRCh37.p13 \N GRCh37 mHomSap1 2023-05-12 13:32:06 GRCh37.p13 \N 0 \N +1 eeaaa2bf-151c-4848-8b85-a05a9993101e hg38 GCA_000001405.28 chromosome GRCh38.p13 \N GRCh38 \N 2023-05-12 13:30:58 GRCh38.p13 \N 1 GRCh38 +2 633034c3-2268-40a2-866a-9f492cac84bf hg19 GCA_000001405.14 chromosome GRCh37.p13 \N GRCh37 \N 2023-05-12 13:32:06 GRCh37.p13 \N 0 GRCh37 3 f78618ef-1075-47ee-a496-be26cad47912 \N GCA_000005845.2 chromosome ASM584v2 \N ASM584v2 \N 2023-05-12 13:32:14 ASM584v2 \N 0 \N 4 224d836f-36a7-4c4e-b917-ecff740e404f \N GCA_000002765.2 chromosome ASM276v2 \N ASM276v2 \N 2023-05-12 13:32:25 ASM276v2 \N 0 \N 5 ec1c4b53-c2ef-431c-ad0e-b2aef19b44f1 \N GCA_900519105.1 chromosome IWGSC \N IWGSC \N 2023-05-12 13:32:36 IWGSC \N 0 \N diff --git a/src/ensembl/production/metadata/api/sample/ensembl_metadata/table.sql b/src/ensembl/production/metadata/api/sample/ensembl_metadata/table.sql index 979f65f3..355b4bfb 100644 --- a/src/ensembl/production/metadata/api/sample/ensembl_metadata/table.sql +++ 
b/src/ensembl/production/metadata/api/sample/ensembl_metadata/table.sql @@ -112,8 +112,8 @@ CREATE TABLE dataset_attribute dataset_attribute_id int auto_increment primary key, value varchar(128) null, - attribute_id int not null, - dataset_id int not null, + attribute_id int not null, + dataset_id int not null, constraint dataset_attribute_dataset_id_attribute_id__d3b34d8c_uniq unique (dataset_id, attribute_id), constraint dataset_attribute_attribute_id_55c51407_fk_attribute @@ -176,8 +176,8 @@ CREATE TABLE genome genome_uuid varchar(128) not null, assembly_id int not null, organism_id int not null, - created datetime(6) not null, - is_best tinyint(1) not null default 0, + created datetime(6) not null, + is_best tinyint(1) not null default 0, constraint genome_uuid unique (genome_uuid), @@ -194,13 +194,13 @@ CREATE TABLE genome_dataset dataset_id int not null, genome_id int not null, release_id int null, - is_current tinyint(1) not null default 0, + is_current tinyint(1) not null default 0, constraint ensembl_metadata_gen_dataset_id_26d7bac7_fk_dataset_d - foreign key (dataset_id) references dataset (dataset_id), + foreign key (dataset_id) references dataset (dataset_id) on DELETE CASCADE, constraint ensembl_metadata_gen_genome_id_7670a2c5_fk_genome_ge - foreign key (genome_id) references genome (genome_id), + foreign key (genome_id) references genome (genome_id) ON DELETE CASCADE, constraint ensembl_metadata_gen_release_id_c5440b9a_fk_ensembl_r - foreign key (release_id) references ensembl_release (release_id) + foreign key (release_id) references ensembl_release (release_id) ON DELETE CASCADE ); CREATE TABLE genome_release diff --git a/src/ensembl/production/metadata/api/sample/ncbi_taxonomy/ncbi_taxa_node.txt b/src/ensembl/production/metadata/api/sample/ncbi_taxonomy/ncbi_taxa_node.txt index c1e7baab..9dcbd60d 100644 --- a/src/ensembl/production/metadata/api/sample/ncbi_taxonomy/ncbi_taxa_node.txt +++ 
b/src/ensembl/production/metadata/api/sample/ncbi_taxonomy/ncbi_taxa_node.txt @@ -93,3 +93,4 @@ 2301116 6236 suborder 0 2481947 2484234 1 2301119 2301116 infraorder 0 2483018 2484233 1 2698737 2759 clade 0 4728124 4803255 1 +666668 2759 clade 0 0 0 1 \ No newline at end of file diff --git a/src/ensembl/production/metadata/updater/core.py b/src/ensembl/production/metadata/updater/core.py index 399a52b7..1c2f9f40 100644 --- a/src/ensembl/production/metadata/updater/core.py +++ b/src/ensembl/production/metadata/updater/core.py @@ -13,16 +13,15 @@ from collections import defaultdict import sqlalchemy as db from ensembl.core.models import Meta, CoordSystem, SeqRegionAttrib, SeqRegion, \ - SeqRegionSynonym, AttribType, ExternalDb -from sqlalchemy import select, func + SeqRegionSynonym, AttribType +from sqlalchemy import select from sqlalchemy import or_ -from sqlalchemy.orm import aliased from ensembl.database import DBConnection from sqlalchemy.exc import NoResultFound -from ensembl.production.metadata.api.genome import GenomeAdaptor -from ensembl.production.metadata.api.dataset import DatasetAdaptor from ensembl.production.metadata.api.models import * from ensembl.production.metadata.updater.base import BaseMetaUpdater +from ensembl.ncbi_taxonomy.api.utils import Taxonomy +from ensembl.ncbi_taxonomy.models import NCBITaxaName import logging class CoreMetaUpdater(BaseMetaUpdater): @@ -86,8 +85,7 @@ def process_species(self, species_id, metadata_uri, taxonomy_uri, db_uri): self.genebuild_dataset, self.genebuild_dataset_attributes, \ genebuild_status = self.new_genebuild(species_id, meta_session, db_uri, self.dataset_source) - conn = DatasetAdaptor(metadata_uri=metadata_uri) - genebuild_release_status = conn.check_release_status(self.genebuild_dataset.dataset_uuid) + genebuild_release_status = check_release_status(DBConnection(metadata_uri), self.genebuild_dataset.dataset_uuid) if organism_status == "New": logging.info('New organism') @@ -177,7 +175,7 @@ def 
get_or_new_organism(self, species_id, meta_session, metadata_uri, taxonomy_u """ Get an existing Organism instance or create a new one, depending on the information from the metadata database. """ - + tdbc = DBConnection(taxonomy_uri) # Fetch the Ensembl name of the organism from metadata using either 'species.ensembl_name' # or 'species.production_name' as the key. ensembl_name = self.get_meta_single_meta_key(species_id, "organism.ensembl_name") @@ -188,9 +186,13 @@ def get_or_new_organism(self, species_id, meta_session, metadata_uri, taxonomy_u common_name = self.get_meta_single_meta_key(species_id, "species.common_name") if common_name is None: taxid = self.get_meta_single_meta_key(species_id, "species.taxonomy_id") - temp_adapt = GenomeAdaptor(metadata_uri, taxonomy_uri) - names = temp_adapt.fetch_taxonomy_names(taxid) - common_name = names[taxid]["genbank_common_name"] + + with tdbc.session_scope() as session: + common_name = session.query(NCBITaxaName).filter( + NCBITaxaName.taxon_id == taxid, + NCBITaxaName.name_class == "genbank common name" + ).one_or_none().name + common_name = common_name if common_name is not None else '-' # Instantiate a new Organism object using data fetched from metadata. new_organism = Organism( species_taxonomy_id=self.get_meta_single_meta_key(species_id, "species.species_taxonomy_id"), @@ -220,11 +222,11 @@ def get_or_new_organism(self, species_id, meta_session, metadata_uri, taxonomy_u # If no existing Organism is found, conduct additional checks before creating a new one. # Check if the new organism's taxonomy ID exists in the taxonomy database. 
- conn = GenomeAdaptor(metadata_uri=metadata_uri, taxonomy_uri=taxonomy_uri) - try: - conn.fetch_taxonomy_names(taxonomy_ids=new_organism.taxonomy_id) - except NoResultFound: - raise Exception("taxid not found in taxonomy database for scientific name") + with tdbc.session_scope() as session: + try: + Taxonomy.fetch_node_by_id(session, new_organism.taxonomy_id) + except NoResultFound: + raise RuntimeError(f"taxon id {new_organism.taxonomy_id} not found in taxonomy database for scientific name") # Check if an Assembly with the same accession already exists in the metadata database. accession = self.get_meta_single_meta_key(species_id, "assembly.accession") diff --git a/src/tests/databases/core_4/meta.txt b/src/tests/databases/core_4/meta.txt index eb922ff2..990e4b74 100644 --- a/src/tests/databases/core_4/meta.txt +++ b/src/tests/databases/core_4/meta.txt @@ -14,7 +14,7 @@ 2 1 species.taxonomy_id 666668 10 1 species.type monsters 5 1 species.url Jabbe -17 1 genebuild.version 1 +17 1 genebuild.version 2 18 1 sample.gene_param ENSAMXG00005000318 19 1 sample.location_param KB871578.1:9766653-9817473 20 1 strain.type test \ No newline at end of file diff --git a/src/tests/databases/core_5/attrib_type.txt b/src/tests/databases/core_5/attrib_type.txt new file mode 100644 index 00000000..de5f1880 --- /dev/null +++ b/src/tests/databases/core_5/attrib_type.txt @@ -0,0 +1,2 @@ +6 toplevel Top Level Top Level Non-Redundant Sequence Region +547 sequence_location sequence_location To identify sequence locations / cellular compartments that DNA sequence comes from.Values are supposed to be SO compliant (children of the plastid_sequence SO:0000740 and nuclear_sequence SO:0000738 ): "apicoplast_chromosome", "chloroplast_chromosome", "chromoplast_chromosome", "cyanelle_chromosome", "leucoplast_chromosome", "macronuclear_chromosome", "micronuclear_chromosome", "mitochondrial_chromosome", "nuclear_chromosome". 
diff --git a/src/tests/databases/core_5/coord_system.txt b/src/tests/databases/core_5/coord_system.txt new file mode 100644 index 00000000..51314bf1 --- /dev/null +++ b/src/tests/databases/core_5/coord_system.txt @@ -0,0 +1 @@ +1 1 primary_assembly test 1 default_version,sequence_level diff --git a/src/tests/databases/core_5/meta.txt b/src/tests/databases/core_5/meta.txt new file mode 100644 index 00000000..768e54bc --- /dev/null +++ b/src/tests/databases/core_5/meta.txt @@ -0,0 +1,16 @@ +12 1 assembly.accession test1 +14 1 assembly.default test1 +13 1 assembly.name test1 +11 1 assembly.ucsc_alias test1 +7 1 species.division Ensembl_TEST +6 1 species.production_name Hominoide +4 1 species.scientific_name Hominoide +8 1 species.strain reference +9 1 species.strain_group Hominoide +2 1 species.taxonomy_id 314295 +10 1 species.type monsters +5 1 species.url Hominoide +17 1 genebuild.version 1 +18 1 sample.gene_param ENSAMXG00005000318 +19 1 sample.location_param KB871578.1:9766653-9817473 +20 1 strain.type test diff --git a/src/tests/databases/core_5/seq_region.txt b/src/tests/databases/core_5/seq_region.txt new file mode 100644 index 00000000..a2216feb --- /dev/null +++ b/src/tests/databases/core_5/seq_region.txt @@ -0,0 +1,3 @@ +1 TEST1_seq 1 666666 +2 TEST2_seq 1 666 +3 TEST3_seq 1 1666666 diff --git a/src/tests/databases/core_5/seq_region_attrib.txt b/src/tests/databases/core_5/seq_region_attrib.txt new file mode 100644 index 00000000..d8dcda33 --- /dev/null +++ b/src/tests/databases/core_5/seq_region_attrib.txt @@ -0,0 +1,6 @@ +1 6 1 +2 6 1 +3 6 1 +1 547 nuclear_chromosome +2 547 nuclear_chromosome +3 547 mitochondrial_chromosome diff --git a/src/tests/databases/core_5/seq_region_synonym.txt b/src/tests/databases/core_5/seq_region_synonym.txt new file mode 100644 index 00000000..de43d915 --- /dev/null +++ b/src/tests/databases/core_5/seq_region_synonym.txt @@ -0,0 +1,3 @@ +1 1 TEST1_seq 50710 +2 2 TEST2_seq 50710 +3 3 TEST3_seq 50710 diff --git 
a/src/tests/databases/core_5/table.sql b/src/tests/databases/core_5/table.sql new file mode 100644 index 00000000..22e5c915 --- /dev/null +++ b/src/tests/databases/core_5/table.sql @@ -0,0 +1,87 @@ +CREATE TABLE coord_system +( + coord_system_id int unsigned auto_increment + primary key, + species_id int unsigned default 1 not null, + name varchar(40) not null, + version varchar(255) null, + `rank` int not null, + attrib set ('default_version', 'sequence_level') null, + constraint name_idx + unique (name, version, species_id), + constraint rank_idx + unique (`rank`, species_id) +); + +CREATE INDEX species_idx + on coord_system (species_id); + +CREATE TABLE meta +( + meta_id int auto_increment + primary key, + species_id int unsigned default 1 null, + meta_key varchar(40) not null, + meta_value varchar(255) not null, + constraint species_key_value_idx + unique (species_id, meta_key, meta_value) +); + +CREATE INDEX species_value_idx + on meta (species_id, meta_value); + +CREATE TABLE seq_region +( + seq_region_id int unsigned auto_increment + primary key, + name varchar(255) not null, + coord_system_id int unsigned not null, + length int unsigned not null, + constraint name_cs_idx + unique (name, coord_system_id) +); + +CREATE INDEX cs_idx + on seq_region (coord_system_id); + +CREATE TABLE seq_region_attrib +( + seq_region_id int unsigned default 0 not null, + attrib_type_id smallint unsigned default 0 not null, + value text not null, + constraint region_attribx + unique (seq_region_id, attrib_type_id, value(500)) +); + +CREATE INDEX seq_region_idx + on seq_region_attrib (seq_region_id); + +CREATE INDEX type_val_idx + on seq_region_attrib (attrib_type_id, value(40)); + +CREATE INDEX val_only_idx + on seq_region_attrib (value(40)); + +CREATE TABLE seq_region_synonym +( + seq_region_synonym_id int unsigned auto_increment + primary key, + seq_region_id int unsigned not null, + synonym varchar(250) not null, + external_db_id int unsigned null, + constraint syn_idx + 
unique (synonym, seq_region_id) +); + +CREATE INDEX seq_region_idx + on seq_region_synonym (seq_region_id); + +CREATE TABLE `attrib_type` ( + `attrib_type_id` smallint(5) unsigned NOT NULL AUTO_INCREMENT, + `code` varchar(20) NOT NULL DEFAULT '', + `name` varchar(255) NOT NULL DEFAULT '', + `description` text, + PRIMARY KEY (`attrib_type_id`), + UNIQUE KEY `code_idx` (`code`) +); + diff --git a/src/tests/test_api.py b/src/tests/test_api.py deleted file mode 100644 index 00533a7a..00000000 --- a/src/tests/test_api.py +++ /dev/null @@ -1,427 +0,0 @@ -# See the NOTICE file distributed with this work for additional information -# regarding copyright ownership. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-""" -Unit tests for api module -""" - -import pytest - -from ensembl.database import UnitTestDB -from ensembl.production.metadata.api.genome import GenomeAdaptor -from ensembl.production.metadata.api.release import ReleaseAdaptor - - -@pytest.mark.parametrize("multi_dbs", [[{'src': 'ensembl_metadata'}, {'src': 'ncbi_taxonomy'}]], - indirect=True) -class TestMetadataDB: - dbc = None # type: UnitTestDB - - def test_load_database(self, multi_dbs): - db_test = ReleaseAdaptor(multi_dbs['ensembl_metadata'].dbc.url) - assert db_test, "DB should not be empty" - - @pytest.mark.parametrize( - "allow_unreleased, unreleased_only, current_only, output_count", - [ - # fetches everything (7 released + 2 unreleased) - (True, False, True, 9), - # fetches all released genomes (with current_only=0) - (False, False, False, 7), - # fetches released genomes with current_only=1 (default) - (False, False, True, 6), - # fetches all unreleased genomes - (False, True, True, 2), - ] - ) - def test_fetch_all_genomes(self, multi_dbs, allow_unreleased, unreleased_only, current_only, output_count): - conn = GenomeAdaptor(metadata_uri=multi_dbs['ensembl_metadata'].dbc.url, - taxonomy_uri=multi_dbs['ncbi_taxonomy'].dbc.url) - test = conn.fetch_genomes( - allow_unreleased=allow_unreleased, - unreleased_only=unreleased_only, - current_only=current_only - ) - assert len(test) == output_count - - def test_fetch_with_all_args_no_conflict(self, multi_dbs): - conn = GenomeAdaptor(metadata_uri=multi_dbs['ensembl_metadata'].dbc.url, - taxonomy_uri=multi_dbs['ncbi_taxonomy'].dbc.url) - test = conn.fetch_genomes( - genome_uuid="a733550b-93e7-11ec-a39d-005056b38ce3", - assembly_accession="GCA_000002985.3", - assembly_name="WBcel235", - ensembl_name="caenorhabditis_elegans", - taxonomy_id="6239", - group="EnsemblMetazoa", - allow_unreleased=False, - site_name="Ensembl", - release_type="integrated", - release_version="108.0", - current_only=True - ) - assert len(test) == 0 - - def 
test_fetch_with_all_args_conflict(self, multi_dbs): - conn = GenomeAdaptor(metadata_uri=multi_dbs['ensembl_metadata'].dbc.url, - taxonomy_uri=multi_dbs['ncbi_taxonomy'].dbc.url) - test = conn.fetch_genomes( - genome_uuid="a733550b-93e7-11ec-a39d-005056b38ce3", - assembly_accession="GCA_000002985.3", - assembly_name="WBcel235", - ensembl_name="caenorhabditis_elegans", - taxonomy_id="9606", # Conflicting taxonomy_id - group="EnsemblBacteria", # Conflicting group - allow_unreleased=False, - site_name="Ensembl", - release_type="integrated", - release_version="108.0", - current_only=True - ) - assert len(test) == 0 - - def test_fetch_releases(self, multi_dbs): - conn = ReleaseAdaptor(multi_dbs['ensembl_metadata'].dbc.url) - test = conn.fetch_releases(release_id=2) - # test the one to many connection - assert test[0].EnsemblSite.name == 'Ensembl' - assert test[0].EnsemblSite.label == 'Ensembl Genome Browser' - # test the direct access. - assert test[0].EnsemblRelease.label == 'Scaling Phase 1' - - # currently only have one release, so the testing is not comprehensive - def test_fetch_releases_for_genome(self, multi_dbs): - conn = ReleaseAdaptor(multi_dbs['ensembl_metadata'].dbc.url) - test = conn.fetch_releases_for_genome('a73351f7-93e7-11ec-a39d-005056b38ce3') - assert test[0].EnsemblSite.name == 'Ensembl' - - def test_fetch_releases_for_dataset(self, multi_dbs): - conn = ReleaseAdaptor(multi_dbs['ensembl_metadata'].dbc.url) - test = conn.fetch_releases_for_dataset('3316fe1a-83e7-46da-8a56-cf2b693d8060') - assert test[0].EnsemblSite.name == 'Ensembl' - - def test_fetch_taxonomy_names(self, multi_dbs): - conn = GenomeAdaptor(metadata_uri=multi_dbs['ensembl_metadata'].dbc.url, - taxonomy_uri=multi_dbs['ncbi_taxonomy'].dbc.url) - test = conn.fetch_taxonomy_names(taxonomy_ids=[6239, 511145]) - assert test[511145]['scientific_name'] == 'Escherichia coli str. K-12 substr. 
MG1655' - - def test_fetch_taxonomy_ids(self, multi_dbs): - conn = GenomeAdaptor(metadata_uri=multi_dbs['ensembl_metadata'].dbc.url, - taxonomy_uri=multi_dbs['ncbi_taxonomy'].dbc.url) - test = conn.fetch_taxonomy_ids(taxonomy_names='Caenorhabditis elegans') - assert test[0] == 6239 - - def test_fetch_genomes(self, multi_dbs): - conn = GenomeAdaptor(metadata_uri=multi_dbs['ensembl_metadata'].dbc.url, - taxonomy_uri=multi_dbs['ncbi_taxonomy'].dbc.url) - test = conn.fetch_genomes(genome_uuid='a7335667-93e7-11ec-a39d-005056b38ce3') - assert test[0].Organism.scientific_name == 'Homo sapiens' - - # def test_fetch_genomes_by_group_division(self, multi_dbs): - # conn = GenomeAdaptor(metadata_uri=multi_dbs['ensembl_metadata'].dbc.url, - # taxonomy_uri=multi_dbs['ncbi_taxonomy'].dbc.url) - # division_filter = 'EnsemblVertebrates' - # test = conn.fetch_genomes(group=division_filter) - # assert len(test) == 1 -# Other PR will likely change this drastically, so the effort is not really necessary. Their are 7 groups. 
-# assert division_filter in division_results - - def test_fetch_genomes_by_genome_uuid(self, multi_dbs): - conn = GenomeAdaptor(metadata_uri=multi_dbs['ensembl_metadata'].dbc.url, - taxonomy_uri=multi_dbs['ncbi_taxonomy'].dbc.url) - test = conn.fetch_genomes_by_genome_uuid('a733550b-93e7-11ec-a39d-005056b38ce3') - assert test[0].Organism.scientific_name == 'Caenorhabditis elegans' - - def test_fetch_genome_by_ensembl_and_assembly_name(self, multi_dbs): - conn = GenomeAdaptor(metadata_uri=multi_dbs['ensembl_metadata'].dbc.url, - taxonomy_uri=multi_dbs['ncbi_taxonomy'].dbc.url) - test = conn.fetch_genomes(assembly_name='WBcel235', ensembl_name='caenorhabditis_elegans') - assert test[0].Organism.scientific_name == 'Caenorhabditis elegans' - - def test_fetch_genomes_by_assembly_accession(self, multi_dbs): - conn = GenomeAdaptor(metadata_uri=multi_dbs['ensembl_metadata'].dbc.url, - taxonomy_uri=multi_dbs['ncbi_taxonomy'].dbc.url) - test = conn.fetch_genomes_by_assembly_accession('GCA_000005845.2') - assert test[0].Organism.scientific_name == 'Escherichia coli str. K-12 substr. MG1655 str. 
K12 (GCA_000005845)' - - def test_fetch_genomes_by_assembly_sequence_accession(self, multi_dbs): - conn = GenomeAdaptor(metadata_uri=multi_dbs['ensembl_metadata'].dbc.url, - taxonomy_uri=multi_dbs['ncbi_taxonomy'].dbc.url) - test = conn.fetch_sequences( - genome_uuid='a7335667-93e7-11ec-a39d-005056b38ce3', - assembly_accession='GCA_000001405.28', - assembly_sequence_accession='CM000686.2' - ) - assert test[0].AssemblySequence.name == 'Y' - - def test_fetch_genomes_by_assembly_sequence_accession_empty(self, multi_dbs): - conn = GenomeAdaptor(metadata_uri=multi_dbs['ensembl_metadata'].dbc.url, - taxonomy_uri=multi_dbs['ncbi_taxonomy'].dbc.url) - test = conn.fetch_sequences( - genome_uuid='s0m3-r4nd0m-g3n3-uu1d-v4lu3', - assembly_accession='GCA_000001405.28', - assembly_sequence_accession='CM000686.2' - ) - assert len(test) == 0 - - def test_fetch_genomes_by_ensembl_name(self, multi_dbs): - conn = GenomeAdaptor(metadata_uri=multi_dbs['ensembl_metadata'].dbc.url, - taxonomy_uri=multi_dbs['ncbi_taxonomy'].dbc.url) - test = conn.fetch_genomes_by_ensembl_name('caenorhabditis_elegans') - assert test[0].Organism.scientific_name == 'Caenorhabditis elegans' - - def test_fetch_genomes_by_taxonomy_id(self, multi_dbs): - conn = GenomeAdaptor(metadata_uri=multi_dbs['ensembl_metadata'].dbc.url, - taxonomy_uri=multi_dbs['ncbi_taxonomy'].dbc.url) - test = conn.fetch_genomes_by_taxonomy_id(6239) - assert test[0].Organism.scientific_name == 'Caenorhabditis elegans' - - def test_fetch_genomes_by_scientific_name(self, multi_dbs): - conn = GenomeAdaptor(metadata_uri=multi_dbs['ensembl_metadata'].dbc.url, - taxonomy_uri=multi_dbs['ncbi_taxonomy'].dbc.url) - test = conn.fetch_genomes_by_scientific_name( - scientific_name='Caenorhabditis elegans', - site_name='Ensembl' - ) - assert test[0].Organism.scientific_name == 'Caenorhabditis elegans' - - def test_fetch_sequences(self, multi_dbs): - conn = GenomeAdaptor(metadata_uri=multi_dbs['ensembl_metadata'].dbc.url, - 
taxonomy_uri=multi_dbs['ncbi_taxonomy'].dbc.url) - test = conn.fetch_sequences(assembly_uuid='eeaaa2bf-151c-4848-8b85-a05a9993101e') - # this test is going to drive me nuts - # Locally and on GitLab CI/CD: AssemblySequence.accession == 'CHR_HG107_PATCH' - # in Travis, its: AssemblySequence.accession == 'KI270757.1' - # to please bothI'm using 'sequence_location' for now - assert test[0].AssemblySequence.sequence_location == 'SO:0000738' - - @pytest.mark.parametrize( - "genome_uuid, assembly_accession, chromosomal_only, expected_output", - [ - # Chromosomal and non-chromosomal - ("a7335667-93e7-11ec-a39d-005056b38ce3", "GCA_000001405.28", False, 0), - # Chromosomal only - ("a7335667-93e7-11ec-a39d-005056b38ce3", "GCA_000001405.28", True, 1), - ] - ) - def test_fetch_sequences_chromosomal(self, multi_dbs, genome_uuid, assembly_accession, chromosomal_only, expected_output): - conn = GenomeAdaptor(metadata_uri=multi_dbs['ensembl_metadata'].dbc.url, - taxonomy_uri=multi_dbs['ncbi_taxonomy'].dbc.url) - test = conn.fetch_sequences( - genome_uuid=genome_uuid, - assembly_accession=assembly_accession, - chromosomal_only=chromosomal_only - ) - assert test[-1].AssemblySequence.chromosomal == expected_output - - @pytest.mark.parametrize( - "genome_uuid, assembly_sequence_name, chromosomal_only, expected_output", - [ - ("a7335667-93e7-11ec-a39d-005056b38ce3", "MT", False, "J01415.2"), - ("a7335667-93e7-11ec-a39d-005056b38ce3", "LRG_778", False, "LRG_778"), - ("a7335667-93e7-11ec-a39d-005056b38ce3", "LRG_778", True, None), - ("some-random-genome-uuid", "LRG_778", False, None), - ("a7335667-93e7-11ec-a39d-005056b38ce3", "fake_assembly_name", False, None), - ("some-random-genome-uuid", "fake_assembly_name", False, None), - ] - ) - def test_fetch_sequences_by_assembly_seq_name(self, multi_dbs, genome_uuid, assembly_sequence_name, chromosomal_only, expected_output): - conn = GenomeAdaptor(metadata_uri=multi_dbs['ensembl_metadata'].dbc.url, - 
taxonomy_uri=multi_dbs['ncbi_taxonomy'].dbc.url) - test = conn.fetch_sequences( - genome_uuid=genome_uuid, - assembly_sequence_name=assembly_sequence_name, - chromosomal_only=chromosomal_only - ) - for result in test: - assert result.AssemblySequence.accession == expected_output - - @pytest.mark.parametrize( - "genome_uuid, dataset_uuid, allow_unreleased, unreleased_only, expected_dataset_uuid, expected_count", - [ - # nothing specified + allow_unreleased -> fetches everything - (None, None, True, False, "559d7660-d92d-47e1-924e-e741151c2cef", 33), - # specifying genome_uuid - ("a73357ab-93e7-11ec-a39d-005056b38ce3", None, False, False, "b4ff55e3-d06a-4772-bb13-81c3207669e3", 5), - # specifying dataset_uuid - (None, "0dc05c6e-2910-4dbd-879a-719ba97d5824", False, False, "0dc05c6e-2910-4dbd-879a-719ba97d5824", 1), - # fetch unreleased datasets only - (None, None, False, True, "feaa37ea-4217-4d9d-afca-600bdae11b36", 3), - ] - ) - def test_fetch_genome_dataset_all( - self, multi_dbs, genome_uuid, - dataset_uuid, allow_unreleased, - unreleased_only, expected_dataset_uuid, - expected_count - ): - conn = GenomeAdaptor(metadata_uri=multi_dbs['ensembl_metadata'].dbc.url, - taxonomy_uri=multi_dbs['ncbi_taxonomy'].dbc.url) - test = conn.fetch_genome_datasets( - genome_uuid=genome_uuid, - dataset_uuid=dataset_uuid, - unreleased_only=unreleased_only, - allow_unreleased=allow_unreleased, - # fetch all datasets (default: dataset_name="assembly") - dataset_name="all" - ) - assert test[0].Dataset.dataset_uuid == expected_dataset_uuid - assert len(test) == expected_count - - @pytest.mark.parametrize( - "organism_uuid, expected_count", - [ - # homo_sapien - ("db2a5f09-2db8-429b-a407-c15a4ca2876d", 11), - # e-coli - ("21279e3e-e651-43e1-a6fc-79e390b9e8a8", 3), - # non-existing organism - ("organism-yet-to-be-discovered", 0), - ] - ) - def test_fetch_genome_dataset_by_organism_uuid(self, multi_dbs, organism_uuid, expected_count): - conn = 
GenomeAdaptor(metadata_uri=multi_dbs['ensembl_metadata'].dbc.url, - taxonomy_uri=multi_dbs['ncbi_taxonomy'].dbc.url) - test = conn.fetch_genome_datasets( - organism_uuid=organism_uuid, - # fetch all datasets (default: dataset_name="assembly") - dataset_name="all" - ) - assert len(test) == expected_count - - @pytest.mark.parametrize( - "ensembl_name, assembly_name, use_default_assembly, expected_output", - [ - ("homo_sapiens", "GRCh37.p13", False, "3704ceb1-948d-11ec-a39d-005056b38ce3"), - ("homo_sapiens", "GRCh37", True, "3704ceb1-948d-11ec-a39d-005056b38ce3"), - ] - ) - def test_fetch_genome_uuid(self, multi_dbs, ensembl_name, assembly_name, use_default_assembly, expected_output): - conn = GenomeAdaptor(metadata_uri=multi_dbs['ensembl_metadata'].dbc.url, - taxonomy_uri=multi_dbs['ncbi_taxonomy'].dbc.url) - test = conn.fetch_genomes( - ensembl_name=ensembl_name, - assembly_name=assembly_name, - use_default_assembly=use_default_assembly, - allow_unreleased=False, - current_only=False - ) - assert len(test) == 1 - assert test[0].Genome.genome_uuid == expected_output - - @pytest.mark.parametrize( - "ensembl_name, assembly_name, use_default_assembly, expected_output", - [ - ("homo_sapiens", "GRCh38.p13", False, "a7335667-93e7-11ec-a39d-005056b38ce3"), - ("homo_sapiens", "GRCh38", True, "a7335667-93e7-11ec-a39d-005056b38ce3"), - ] - ) - def test_fetch_genome_uuid_is_current(self, multi_dbs, ensembl_name, assembly_name, use_default_assembly, expected_output): - conn = GenomeAdaptor(metadata_uri=multi_dbs['ensembl_metadata'].dbc.url, - taxonomy_uri=multi_dbs['ncbi_taxonomy'].dbc.url) - test = conn.fetch_genomes( - ensembl_name=ensembl_name, - assembly_name=assembly_name, - use_default_assembly=use_default_assembly, - allow_unreleased=False - ) - assert len(test) == 1 - assert test[0].Genome.genome_uuid == expected_output - - @pytest.mark.parametrize( - "ensembl_name, assembly_name, use_default_assembly", - [ - ("homo_sapiens", "GRCh37", False), - ("homo_sapiens", 
"GRCh37.p13", True), - ] - ) - def test_fetch_genome_uuid_empty(self, multi_dbs, ensembl_name, assembly_name, use_default_assembly): - conn = GenomeAdaptor(metadata_uri=multi_dbs['ensembl_metadata'].dbc.url, - taxonomy_uri=multi_dbs['ncbi_taxonomy'].dbc.url) - test = conn.fetch_genomes( - ensembl_name=ensembl_name, - assembly_name=assembly_name, - use_default_assembly=use_default_assembly - ) - assert len(test) == 0 - - @pytest.mark.parametrize( - "species_taxonomy_id, expected_organism, expected_assemblies_count", - [ - # fetch everything - (None, "Human", 3), - # fetch Triticum aestivum only - (4565, "Triticum aestivum", 1), - ] - ) - def test_fetch_organisms_group_counts(self, multi_dbs, species_taxonomy_id, expected_organism, expected_assemblies_count): - conn = GenomeAdaptor(metadata_uri=multi_dbs['ensembl_metadata'].dbc.url, - taxonomy_uri=multi_dbs['ncbi_taxonomy'].dbc.url) - test = conn.fetch_organisms_group_counts(species_taxonomy_id=species_taxonomy_id) - # When fetching everything: - # First result should be Human - assert test[0][2] == expected_organism - # We should have three assemblies associated with Human (Two for grch37.38 organism + one t2t) - assert test[0][5] == expected_assemblies_count - for data in test[1:]: - # All others have only one genome in test DB - assert data[5] == 1 - - @pytest.mark.parametrize( - "allow_unreleased, output_count, expected_genome_uuid", - [ - # fetches everything - (True, 9, "90720316-006c-470b-a7dd-82d28f952264"), - # fetches released datasets and genomes with current_only=1 (default) - (False, 6, "a733550b-93e7-11ec-a39d-005056b38ce3"), - ] - ) - def test_fetch_genomes_info(self, multi_dbs, allow_unreleased, output_count, expected_genome_uuid): - conn = GenomeAdaptor(metadata_uri=multi_dbs['ensembl_metadata'].dbc.url, - taxonomy_uri=multi_dbs['ncbi_taxonomy'].dbc.url) - test = conn.fetch_genomes_info( - allow_unreleased_genomes=allow_unreleased, - allow_unreleased_datasets=allow_unreleased, - 
group_type=['division', 'internal'] - ) - output_to_list = list(test) - assert len(output_to_list) == output_count - assert output_to_list[0][0]['genome'].Genome.genome_uuid == expected_genome_uuid - - @pytest.mark.parametrize( - "genome_tag, current_only, expected_output", - [ - # url_name = GRCh38 => homo_sapien 38 - ("GRCh38", True, "a7335667-93e7-11ec-a39d-005056b38ce3"), - # tol_id = mHomSap1 => homo_sapien 37 - # I randomly picked up this tol_id, probably wrong (biologically speaking) - ("mHomSap1", False, "3704ceb1-948d-11ec-a39d-005056b38ce3"), - ] - ) - def test_fetch_genome_uuid_by_tag(self, multi_dbs, genome_tag, current_only, expected_output): - conn = GenomeAdaptor(metadata_uri=multi_dbs['ensembl_metadata'].dbc.url, - taxonomy_uri=multi_dbs['ncbi_taxonomy'].dbc.url) - test = conn.fetch_genomes( - genome_tag=genome_tag, - current_only=current_only, - allow_unreleased=False, - ) - assert len(test) == 1 - assert test[0].Genome.genome_uuid == expected_output - - def test_fetch_genome_uuid_by_tag_empty(self, multi_dbs): - conn = GenomeAdaptor(metadata_uri=multi_dbs['ensembl_metadata'].dbc.url, - taxonomy_uri=multi_dbs['ncbi_taxonomy'].dbc.url) - test = conn.fetch_genomes( - genome_tag="iDontExist" - ) - assert len(test) == 0 diff --git a/src/tests/test_updater.py b/src/tests/test_updater.py index ad33a7f8..80404801 100644 --- a/src/tests/test_updater.py +++ b/src/tests/test_updater.py @@ -12,11 +12,11 @@ from pathlib import Path import pytest -from sqlalchemy import create_engine, MetaData, Table, select +import re -from ensembl.database import UnitTestDB +from ensembl.database import UnitTestDB, DBConnection from ensembl.production.metadata.api.factory import meta_factory -from ensembl.production.metadata.api.genome import GenomeAdaptor +from ensembl.production.metadata.api.models import Organism, Assembly, Dataset db_directory = Path(__file__).parent / 'databases' db_directory = db_directory.resolve() @@ -24,7 +24,8 @@ 
@pytest.mark.parametrize("multi_dbs", [[{'src': 'ensembl_metadata'}, {'src': 'ncbi_taxonomy'}, {'src': db_directory / 'core_1'}, {'src': db_directory / 'core_2'}, - {'src': db_directory / 'core_3'}, {'src': db_directory / 'core_4'}]], + {'src': db_directory / 'core_3'}, {'src': db_directory / 'core_4'}, + {'src': db_directory / 'core_5'}]], indirect=True) class TestUpdater: dbc = None # type: UnitTestDB @@ -34,57 +35,65 @@ def test_new_organism(self, multi_dbs): multi_dbs['ncbi_taxonomy'].dbc.url) test.process_core() # Look for organism, assembly and geneset - conn = GenomeAdaptor(metadata_uri=multi_dbs['ensembl_metadata'].dbc.url, - taxonomy_uri=multi_dbs['ncbi_taxonomy'].dbc.url) + metadata_db = DBConnection(multi_dbs['ensembl_metadata'].dbc.url) # Test the species - test_collect = conn.fetch_genomes_by_ensembl_name('Jabberwocky') - assert test_collect[0].Organism.scientific_name == 'carol_jabberwocky' - # Test the Assembly - assert test_collect[0].Assembly.accession == 'weird01' - # select * from genebuild where version = 999 and name = 'genebuild and label =01 - engine = create_engine(multi_dbs['ensembl_metadata'].dbc.url) - metadata = MetaData() - dataset = Table('dataset', metadata, autoload=True, autoload_with=engine) - query = select([dataset]).where( - (dataset.c.version == 1) & (dataset.c.name == 'genebuild') - ) - row = engine.execute(query).fetchone() - assert row is not None - if row is not None: - assert row[4] is not None + with metadata_db.session_scope() as session: + organism = session.query(Organism).where(Organism.ensembl_name == 'Jabberwocky').first() + assembly = session.query(Assembly).where(Assembly.name == 'jaber01').first() + assert organism.scientific_name == 'carol_jabberwocky' + # Test the Assembly + assert assembly.accession == 'weird01' + # select * from dataset where version = 1 and name = 'genebuild' + dataset = session.query(Dataset).where( + (Dataset.version == 1) & (Dataset.name == 'genebuild') + ).first() + 
assert dataset is not None + assert re.match(".*_core_1", dataset.dataset_source.name) + assert dataset.dataset_source.type == "core" + assert dataset.dataset_type.name == "genebuild" # def test_update_organism(self, multi_dbs): test = meta_factory(multi_dbs['core_2'].dbc.url, multi_dbs['ensembl_metadata'].dbc.url, multi_dbs['ncbi_taxonomy'].dbc.url) test.process_core() - conn = GenomeAdaptor(metadata_uri=multi_dbs['ensembl_metadata'].dbc.url, - taxonomy_uri=multi_dbs['ncbi_taxonomy'].dbc.url) - test_collect = conn.fetch_genomes_by_ensembl_name('Jabberwocky') - assert test_collect[0].Organism.scientific_name == 'carol_jabberwocky' + metadata_db = DBConnection(multi_dbs['ensembl_metadata'].dbc.url) + with metadata_db.session_scope() as session: + organism = session.query(Organism).where(Organism.ensembl_name == 'Jabberwocky').first() + assert organism.scientific_name == 'carol_jabberwocky' + assert len(organism.genomes) == 1 def test_update_assembly(self, multi_dbs): test = meta_factory(multi_dbs['core_3'].dbc.url, multi_dbs['ensembl_metadata'].dbc.url, multi_dbs['ncbi_taxonomy'].dbc.url) test.process_core() - conn = GenomeAdaptor(metadata_uri=multi_dbs['ensembl_metadata'].dbc.url, - taxonomy_uri=multi_dbs['ncbi_taxonomy'].dbc.url) - test_collect = conn.fetch_genomes_by_ensembl_name('Jabberwocky') - assert test_collect[1].Organism.scientific_name == 'carol_jabberwocky' - assert test_collect[1].Assembly.accession == 'weird02' + metadata_db = DBConnection(multi_dbs['ensembl_metadata'].dbc.url) + with metadata_db.session_scope() as session: + organism = session.query(Organism).where(Organism.ensembl_name == 'Jabberwocky').first() + assert organism.scientific_name == 'carol_jabberwocky' + assert organism.genomes[1].assembly.accession == 'weird02' # def test_update_geneset(self, multi_dbs): test = meta_factory(multi_dbs['core_4'].dbc.url, multi_dbs['ensembl_metadata'].dbc.url, multi_dbs['ncbi_taxonomy'].dbc.url) test.process_core() - engine = 
create_engine(multi_dbs['ensembl_metadata'].dbc.url) - metadata = MetaData() - dataset = Table('dataset', metadata, autoload=True, autoload_with=engine) - query = select([dataset]).where( - (dataset.c.version == 1) & (dataset.c.name == 'genebuild') - ) - row = engine.execute(query).fetchone() - assert row is not None - if row is not None: - assert row[4] is not None + metadata_db = DBConnection(multi_dbs['ensembl_metadata'].dbc.url) + with metadata_db.session_scope() as session: + dataset = session.query(Dataset).where( + (Dataset.version == 2) & (Dataset.name == 'genebuild') + ).first() + assert dataset is not None + assert re.match(".*_core_4", dataset.dataset_source.name) + assert dataset.dataset_source.type == "core" + assert dataset.dataset_type.name == "genebuild" + + + def test_taxonomy_common_name(self, multi_dbs): + test = meta_factory(multi_dbs['core_5'].dbc.url, multi_dbs['ensembl_metadata'].dbc.url, + multi_dbs['ncbi_taxonomy'].dbc.url) + test.process_core() + metadata_db = DBConnection(multi_dbs['ensembl_metadata'].dbc.url) + with metadata_db.session_scope() as session: + organism = session.query(Organism).where(Organism.ensembl_name == 'Hominoide').first() + assert organism.common_name == 'apes'