src/ensembl/production/metadata/grpc/adaptors/genome.py (23 additions, 16 deletions)
@@ -13,6 +13,7 @@
 from typing import List, Tuple
 
 import sqlalchemy as db
+from sqlalchemy import desc
 from sqlalchemy.orm import aliased
 from ensembl.database import DBConnection
 from ensembl.ncbi_taxonomy.models import NCBITaxaName
@@ -382,9 +383,9 @@ def fetch_sequences(self, genome_id=None, genome_uuid=None, assembly_uuid=None,
             list: A list of fetched sequences.
         """
         genome_id = check_parameter(genome_id)
-        genome_uuid = check_parameter(genome_uuid)
+        # genome_uuid = check_parameter(genome_uuid)
         assembly_uuid = check_parameter(assembly_uuid)
-        assembly_accession = check_parameter(assembly_accession)
+        # assembly_accession = check_parameter(assembly_accession)
         assembly_sequence_accession = check_parameter(assembly_sequence_accession)
         assembly_sequence_name = check_parameter(assembly_sequence_name)
 
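Note on the lines commented out above: check_parameter is defined elsewhere in the package, so its behaviour is assumed here. A minimal sketch of what it presumably does (scalar-to-list normalisation, so query filters can use IN semantics):

# Assumed behaviour of check_parameter (the helper is not shown in this diff):
# wrap a bare scalar in a list so downstream filters can rely on .in_().
def check_parameter(param):
    if param is not None and not isinstance(param, list):
        return [param]
    return param

assert check_parameter("GCA_000001405.29") == ["GCA_000001405.29"]
assert check_parameter(None) is None
assert check_parameter(["a", "b"]) == ["a", "b"]

If that reading is right, genome_uuid and assembly_accession now pass through unmodified, which suits plain equality filters rather than IN lists.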
@@ -416,10 +417,10 @@ def fetch_sequences(self, genome_id=None, genome_uuid=None, assembly_uuid=None,
 
         if assembly_sequence_name is not None:
             seq_select = seq_select.filter(AssemblySequence.name == assembly_sequence_name)
-
+        logger.debug(f'Query {seq_select}')
         with self.metadata_db.session_scope() as session:
             session.expire_on_commit = False
-            return session.execute(seq_select).all()
+            return session.execute(seq_select.order_by(AssemblySequence.accession)).all()
 
     def fetch_sequences_by_genome_uuid(self, genome_uuid, chromosomal_only=False):
         return self.fetch_sequences(
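The added order_by makes fetch_sequences deterministic: without an explicit ORDER BY, the engine may return rows in any order, which is exactly what makes assertions such as test[0] or test[-1] in the test-suite flaky. A self-contained sketch of the guarantee (SQLAlchemy 2.0 style, toy model rather than the real Ensembl schema):

import sqlalchemy as db
from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column, Session

class Base(DeclarativeBase):
    pass

class AssemblySequence(Base):  # toy stand-in, not the real model
    __tablename__ = "assembly_sequence"
    assembly_sequence_id: Mapped[int] = mapped_column(primary_key=True)
    accession: Mapped[str]

engine = db.create_engine("sqlite://")
Base.metadata.create_all(engine)

with Session(engine) as session:
    session.add_all([AssemblySequence(accession="CM000686.2"),
                     AssemblySequence(accession="CM000663.2")])
    session.commit()
    stmt = db.select(AssemblySequence).order_by(AssemblySequence.accession)
    rows = session.execute(stmt).scalars().all()
    # ORDER BY pins the result order regardless of insertion or storage order.
    assert [r.accession for r in rows] == ["CM000663.2", "CM000686.2"]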
@@ -493,7 +494,7 @@ def fetch_genome_datasets(self, genome_id: (int | List[int]) = None,
             .join(Dataset, GenomeDataset.dataset_id == Dataset.dataset_id) \
             .join(DatasetType, Dataset.dataset_type_id == DatasetType.dataset_type_id) \
             .join(DatasetSource, Dataset.dataset_source_id == DatasetSource.dataset_source_id).order_by(
-            Genome.genome_uuid, Dataset.dataset_uuid)
+            Genome.genome_uuid, Dataset.dataset_uuid).distinct()
 
         if genome_id is not None:
             logger.debug(f"Filter on genome_id {genome_id}")
@@ -598,13 +599,13 @@ def fetch_genomes_info(
 
     ):
         try:
-            genome_id = check_parameter(genome_id)
-            genome_uuid = check_parameter(genome_uuid)
+            # genome_id = check_parameter(genome_id)
+            # genome_uuid = check_parameter(genome_uuid)
             ensembl_name = check_parameter(ensembl_name)
             group = check_parameter(group)
             group_type = check_parameter(group_type)
-            dataset_type_name = check_parameter(dataset_type_name)
-            dataset_source = check_parameter(dataset_source)
+            # dataset_type_name = check_parameter(dataset_type_name)
+            # dataset_source = check_parameter(dataset_source)
 
             if group is None:
                 group_type = group_type if group_type else ['Division']
@@ -622,18 +623,24 @@
                 group=group,
                 group_type=group_type,
             )
-
-            for genome in genomes:
-                # FIXME, looping over each genome could be costly, like very costly.
-                # Better approach could be to pre-fetch related objects
-                dataset = self.fetch_genome_datasets(
-                    genome_uuid=genome[0].genome_uuid,
+            genomes_uuids = [genome[0].genome_uuid for genome in genomes]
+            genomes_datasets = self.fetch_genome_datasets(
+                genome_uuid=genomes_uuids,
                 allow_unreleased=allow_unreleased_datasets,
                 dataset_type_name=dataset_type_name,
                 dataset_source=dataset_source,
                 dataset_attributes=dataset_attributes
             )
-                res = [{'genome': genome, 'datasets': dataset}]
+            agglo = {}
+            logger.debug(f'genome datasets {genomes_datasets[0]}')
+            for genome_infos in genomes_datasets:
+                if genome_infos[0].genome_uuid not in agglo.keys():
+                    agglo[genome_infos[0].genome_uuid] = {'genome': genome_infos[0], 'datasets': []}
+                if genome_infos[2] not in agglo[genome_infos[0].genome_uuid]['datasets']:
+                    agglo[genome_infos[0].genome_uuid]['datasets'].append(genome_infos[2])
+            for genome_uuid, data in agglo.items():
+                logger.debug(f'genome_uuid: {genome_uuid}, datasets {data["datasets"]}')
+                res = [{'genome': data['genome'], 'datasets': data['datasets']}]
                 yield res
         except Exception as e:
             raise ValueError(str(e))
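This rewrite removes the per-genome N+1 query loop flagged by the old FIXME: one bulk fetch_genome_datasets call covers every genome_uuid, and its rows are then grouped back per genome in Python. A standalone sketch of the grouping step, with plain tuples standing in for the real (Genome, GenomeDataset, Dataset) result rows:

from collections import defaultdict

# Stand-ins for result rows; in the real query each genome_uuid repeats
# once per attached dataset, and duplicates can survive the joins.
rows = [("uuid-1", "assembly"), ("uuid-1", "genebuild"),
        ("uuid-1", "assembly"),               # duplicate from join fan-out
        ("uuid-2", "assembly")]

agglo: dict[str, list[str]] = defaultdict(list)
for genome_uuid, dataset in rows:
    if dataset not in agglo[genome_uuid]:     # mirrors the membership check above
        agglo[genome_uuid].append(dataset)

assert dict(agglo) == {"uuid-1": ["assembly", "genebuild"],
                       "uuid-2": ["assembly"]}

One design note: the `not in` membership test scans a list for every row, so with many datasets per genome a per-genome set alongside the list would keep the dedup O(1).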
src/tests/test_grpc.py (4 additions, 6 deletions)
@@ -228,7 +228,7 @@ def test_fetch_sequences_chromosomal(self, multi_dbs, genome_uuid, assembly_acce
             assembly_accession=assembly_accession,
             chromosomal_only=chromosomal_only
         )
-        logger.debug(f"Retrieved {test[0:2]}")
+        logger.debug(f"Retrieved {test[0]}")
         assert test[-1].AssemblySequence.chromosomal == expected_output
 
     @pytest.mark.parametrize(
@@ -408,14 +408,12 @@ def test_fetch_related_assemblies_count(self, multi_dbs, organism_uuid, expected
         "allow_unreleased, output_count, expected_genome_uuid",
         [
             # fetches everything
-            (True, 283, "0e1a1b4a-efe8-43cc-9220-b5d93015cba6"),
+            (True, 241, "041b8327-222c-4bfe-ae27-1d93c6025428"),
             # fetches released datasets and genomes with current_only=1 (default)
-            (False, 114, "750e67f5-4811-441d-be46-a436786dfb27"),
+            (False, 100, "08b99cae-d007-4284-b20b-9f222827edb6"),
         ]
     )
     def test_fetch_genomes_info(self, multi_dbs, allow_unreleased, output_count, expected_genome_uuid):
-        # FIXME This test takes ages, and generate a lot of unitary queries. SqlAlchemy results needs review before
-        # moving to 2000
         conn = GenomeAdaptor(metadata_uri=multi_dbs['ensembl_genome_metadata'].dbc.url,
                              taxonomy_uri=multi_dbs['ncbi_taxonomy'].dbc.url)
         test = conn.fetch_genomes_info(
@@ -425,4 +423,4 @@ def test_fetch_genomes_info(self, multi_dbs, allow_unreleased, output_count, exp
         )
         output_to_list = list(test)
         assert len(output_to_list) == output_count
-        assert output_to_list[0][0]['genome'].Genome.genome_uuid == expected_genome_uuid
+        assert output_to_list[0][0]['genome'].genome_uuid == expected_genome_uuid
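For orientation, driving the generator outside the test-suite would look roughly as follows. This is a sketch, not a verified snippet: the import path is inferred from the file location, the keyword names from the adaptor code above, and the URIs are placeholders for real, populated databases.

from ensembl.production.metadata.grpc.adaptors.genome import GenomeAdaptor  # assumed path

conn = GenomeAdaptor(
    metadata_uri="mysql://user:pass@host/ensembl_genome_metadata",  # placeholder
    taxonomy_uri="mysql://user:pass@host/ncbi_taxonomy",            # placeholder
)
# fetch_genomes_info is a generator; after this change each yielded item is a
# one-element list of {'genome': <result row>, 'datasets': [<Dataset>, ...]}.
for batch in conn.fetch_genomes_info(allow_unreleased_genomes=False,
                                     allow_unreleased_datasets=False):
    info = batch[0]
    print(info['genome'].genome_uuid, len(info['datasets']))

The last test assertion above reflects the same shape change: the aggregated 'genome' entry now exposes genome_uuid directly instead of through a nested .Genome attribute.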