diff --git a/src/ensembl/production/metadata/grpc/adaptors/genome.py b/src/ensembl/production/metadata/grpc/adaptors/genome.py index dca934e9..e341406d 100644 --- a/src/ensembl/production/metadata/grpc/adaptors/genome.py +++ b/src/ensembl/production/metadata/grpc/adaptors/genome.py @@ -13,6 +13,7 @@ from typing import List, Tuple import sqlalchemy as db +from sqlalchemy import desc from sqlalchemy.orm import aliased from ensembl.database import DBConnection from ensembl.ncbi_taxonomy.models import NCBITaxaName @@ -382,9 +383,9 @@ def fetch_sequences(self, genome_id=None, genome_uuid=None, assembly_uuid=None, list: A list of fetched sequences. """ genome_id = check_parameter(genome_id) - genome_uuid = check_parameter(genome_uuid) + # genome_uuid = check_parameter(genome_uuid) assembly_uuid = check_parameter(assembly_uuid) - assembly_accession = check_parameter(assembly_accession) + # assembly_accession = check_parameter(assembly_accession) assembly_sequence_accession = check_parameter(assembly_sequence_accession) assembly_sequence_name = check_parameter(assembly_sequence_name) @@ -416,10 +417,10 @@ def fetch_sequences(self, genome_id=None, genome_uuid=None, assembly_uuid=None, if assembly_sequence_name is not None: seq_select = seq_select.filter(AssemblySequence.name == assembly_sequence_name) - + logger.debug(f'Query {seq_select}') with self.metadata_db.session_scope() as session: session.expire_on_commit = False - return session.execute(seq_select).all() + return session.execute(seq_select.order_by(AssemblySequence.accession)).all() def fetch_sequences_by_genome_uuid(self, genome_uuid, chromosomal_only=False): return self.fetch_sequences( @@ -493,7 +494,7 @@ def fetch_genome_datasets(self, genome_id: (int | List[int]) = None, .join(Dataset, GenomeDataset.dataset_id == Dataset.dataset_id) \ .join(DatasetType, Dataset.dataset_type_id == DatasetType.dataset_type_id) \ .join(DatasetSource, Dataset.dataset_source_id == DatasetSource.dataset_source_id).order_by( - Genome.genome_uuid, Dataset.dataset_uuid) + Genome.genome_uuid, Dataset.dataset_uuid).distinct() if genome_id is not None: logger.debug(f"Filter on genome_id {genome_id}") @@ -598,13 +599,13 @@ def fetch_genomes_info( ): try: - genome_id = check_parameter(genome_id) - genome_uuid = check_parameter(genome_uuid) + # genome_id = check_parameter(genome_id) + # genome_uuid = check_parameter(genome_uuid) ensembl_name = check_parameter(ensembl_name) group = check_parameter(group) group_type = check_parameter(group_type) - dataset_type_name = check_parameter(dataset_type_name) - dataset_source = check_parameter(dataset_source) + # dataset_type_name = check_parameter(dataset_type_name) + # dataset_source = check_parameter(dataset_source) if group is None: group_type = group_type if group_type else ['Division'] @@ -622,18 +623,24 @@ def fetch_genomes_info( group=group, group_type=group_type, ) - - for genome in genomes: - # FIXME, looping over each genome could be costly, like very costly. - # Better approach could be to pre-fetch related objects - dataset = self.fetch_genome_datasets( - genome_uuid=genome[0].genome_uuid, + genomes_uuids = [genome[0].genome_uuid for genome in genomes] + genomes_datasets = self.fetch_genome_datasets( + genome_uuid=genomes_uuids, allow_unreleased=allow_unreleased_datasets, dataset_type_name=dataset_type_name, dataset_source=dataset_source, dataset_attributes=dataset_attributes ) - res = [{'genome': genome, 'datasets': dataset}] + agglo = {} + logger.debug(f'genome datasets {genomes_datasets[0]}') + for genome_infos in genomes_datasets: + if genome_infos[0].genome_uuid not in agglo.keys(): + agglo[genome_infos[0].genome_uuid] = {'genome': genome_infos[0], 'datasets': []} + if genome_infos[2] not in agglo[genome_infos[0].genome_uuid]['datasets']: + agglo[genome_infos[0].genome_uuid]['datasets'].append(genome_infos[2]) + for genome_uuid, data in agglo.items(): + logger.debug(f'genome_uuid: {genome_uuid}, datasets {data["datasets"]}') + res = [{'genome': data['genome'], 'datasets': data['datasets']}] yield res except Exception as e: raise ValueError(str(e)) diff --git a/src/tests/test_grpc.py b/src/tests/test_grpc.py index 3660b6cd..b54bb9fd 100644 --- a/src/tests/test_grpc.py +++ b/src/tests/test_grpc.py @@ -228,7 +228,7 @@ def test_fetch_sequences_chromosomal(self, multi_dbs, genome_uuid, assembly_acce assembly_accession=assembly_accession, chromosomal_only=chromosomal_only ) - logger.debug(f"Retrieved {test[0:2]}") + logger.debug(f"Retrieved {test[0]}") assert test[-1].AssemblySequence.chromosomal == expected_output @pytest.mark.parametrize( @@ -408,14 +408,12 @@ def test_fetch_related_assemblies_count(self, multi_dbs, organism_uuid, expected "allow_unreleased, output_count, expected_genome_uuid", [ # fetches everything - (True, 283, "0e1a1b4a-efe8-43cc-9220-b5d93015cba6"), + (True, 241, "041b8327-222c-4bfe-ae27-1d93c6025428"), # fetches released datasets and genomes with current_only=1 (default) - (False, 114, "750e67f5-4811-441d-be46-a436786dfb27"), + (False, 100, "08b99cae-d007-4284-b20b-9f222827edb6"), ] ) def test_fetch_genomes_info(self, multi_dbs, allow_unreleased, output_count, expected_genome_uuid): - # FIXME This test takes ages, and generate a lot of unitary queries. SqlAlchemy results needs review before - # moving to 2000 conn = GenomeAdaptor(metadata_uri=multi_dbs['ensembl_genome_metadata'].dbc.url, taxonomy_uri=multi_dbs['ncbi_taxonomy'].dbc.url) test = conn.fetch_genomes_info( @@ -425,4 +423,4 @@ def test_fetch_genomes_info(self, multi_dbs, allow_unreleased, output_count, exp ) output_to_list = list(test) assert len(output_to_list) == output_count - assert output_to_list[0][0]['genome'].Genome.genome_uuid == expected_genome_uuid + assert output_to_list[0][0]['genome'].genome_uuid == expected_genome_uuid