Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 41 additions & 3 deletions src/ensembl/production/metadata/api/genome.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,15 +9,15 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sqlalchemy as db
import logging

import sqlalchemy as db
from sqlalchemy.orm import aliased
from ensembl.database import DBConnection
from ensembl.ncbi_taxonomy.models import NCBITaxaName

from ensembl.production.metadata.api.base import BaseAdaptor, check_parameter
from ensembl.production.metadata.api.models import Genome, Organism, Assembly, OrganismGroup, OrganismGroupMember, \
GenomeRelease, EnsemblRelease, EnsemblSite, AssemblySequence, GenomeDataset, Dataset, DatasetType, DatasetSource
import logging

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -340,3 +340,41 @@ def fetch_genomes_info(
yield [{'genome': genome, 'datasets': dataset}]
except Exception as e:
raise ValueError(str(e))

def fetch_organisms_group_counts(self, release_version=None, group_code='popular'):
    """
    Count joined genome/assembly rows per species for the members of an organism group.

    The group members (``OrganismGroupMember`` rows whose group has
    ``OrganismGroup.code == group_code``) are matched against every organism
    sharing the same ``species_taxonomy_id``; each of those organisms is then
    joined to its genomes and their assemblies, and the joined rows are counted.

    Args:
        release_version: Only a falsy value (the default ``None``) is supported
            for now; any truthy value raises ``NotImplementedError``.
        group_code: Value compared against ``OrganismGroup.code``.

    Returns:
        A list of result rows, ordered by ``OrganismGroupMember.order``, with
        columns in this order: ``species_taxonomy_id``, ``ensembl_name``,
        ``common_name``, ``scientific_name``, ``order``, ``count``.

    Raises:
        NotImplementedError: If ``release_version`` is truthy.
    """
    # Fail fast on the unsupported path, before any query objects are built.
    if release_version:
        # TODO: change group to release_version_state and related genomes
        raise NotImplementedError('Not implemented yet')

    # Two aliases of the same table: ``o_species`` is the organism registered in
    # the group, ``o`` is any organism sharing its species taxonomy id.
    o_species = aliased(Organism)
    o = aliased(Organism)

    # Columns that are both selected and grouped on; kept in one place so the
    # SELECT list and the GROUP BY clause cannot drift apart.
    species_columns = (
        o_species.species_taxonomy_id,
        o_species.ensembl_name,
        o_species.common_name,
        o_species.scientific_name,
    )

    # Get latest released organisms
    query = db.select(
        *species_columns,
        OrganismGroupMember.order.label('order'),
        db.func.count().label('count')
    )
    query = query.join(o, o_species.species_taxonomy_id == o.species_taxonomy_id)
    query = query.join(Genome, o.organism_id == Genome.organism_id)
    query = query.join(Assembly, Genome.assembly_id == Assembly.assembly_id)
    query = query.join(OrganismGroupMember, o_species.organism_id == OrganismGroupMember.organism_id)
    query = query.join(OrganismGroup,
                       OrganismGroupMember.organism_group_id == OrganismGroup.organism_group_id)
    query = query.filter(OrganismGroup.code == group_code)
    query = query.group_by(*species_columns, OrganismGroupMember.order)
    query = query.order_by(OrganismGroupMember.order)

    with self.metadata_db.session_scope() as session:
        # TODO check if we should return a dictionary instead
        return session.execute(query).all()
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,4 @@
5 ec1c4b53-c2ef-431c-ad0e-b2aef19b44f1 \N GCA_900519105.1 chromosome \N IWGSC \N IWGSC \N 2023-05-12 13:32:36 IWGSC \N 0 \N
6 7e8ed3a8-d724-4cba-92e1-e968719b7a18 \N GCA_000146045.2 chromosome \N R64-1-1 \N R64-1-1 \N 2023-05-12 13:32:46 R64-1-1 \N 0 \N
7 f7de35c9-e0e8-4e81-b186-2962098d6361 \N GCA_000002985.3 chromosome \N WBcel235 \N WBcel235 \N 2023-05-12 13:32:52 WBcel235 \N 0 \N
8 eeaaa233-151c-4848-8b85-a05a9993101e \N GCA_000001499.28 chromosome 1 GRCh38 t2t \N GRCh38 t2t \N 2023-09-07 14:30:58 GRCh38_t2t \N 1 \N
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,5 @@
5 a73357ab-93e7-11ec-a39d-005056b38ce3 5 4 2023-05-12 13:32:36
6 a733574a-93e7-11ec-a39d-005056b38ce3 6 5 2023-05-12 13:32:46
7 a733550b-93e7-11ec-a39d-005056b38ce3 7 6 2023-05-12 13:32:52
8 a7335667-93e7-11ec-a39d-00aasab38ce3 8 1 2023-09-07 16:30:58

Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,4 @@
4 d64c34ca-b37a-476b-83b5-f21d07a3ae67 4565 4565 Triticum aestivum reference (Chinese spring) Triticum aestivum Triticum_aestivum triticum_aestivum \N
5 0dc46f87-0b61-403a-8cd3-86b7e0cce8f0 559292 4932 Saccharomyces cerevisiae S288C Saccharomyces cerevisiae S288c Saccharomyces_cerevisiae saccharomyces_cerevisiae \N
6 0f4aad7b-db15-4a72-af1e-82bbae54226 6239 6239 Caenorhabditis elegans (PRJNA13758) N2 Caenorhabditis elegans Caenorhabditis_elegans caenorhabditis_elegans \N
7 dbbsaf09-2db8-429b-a407-c15a4ca2876d 9606 9606 Human T2T \N Homo sapiens Homo_sapiens homo_sapiens_t2t \N
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,5 @@
3 Division EnsemblProtists protists
4 Division EnsemblPlants plants
5 Division EnsemblFungi fungi
6 Division EnsemblMetazoa metazoa
6 Division EnsemblMetazoa metazoa
7 Internal Popular Species popular
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,10 @@
3 0 \N 3 3
4 0 \N 4 4
5 0 \N 5 5
6 0 \N 6 6
6 0 \N 6 6
7 0 1 7 1
8 0 2 7 2
9 0 3 7 3
10 0 4 7 4
11 0 5 7 5
12 0 6 7 6
12 changes: 12 additions & 0 deletions src/tests/test_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,3 +162,15 @@ def test_fetch_genome_info(self, multi_dbs):
# test = conn.fetch_genomes_info(genome_uuid=uuid)
# assert test['genome'][0].genome_uuid == uuid
# assert test['datasets'][0][0].genome_uuid == uuid

def test_popular_species(self, multi_dbs):
    """Check the per-species counts returned for the default ('popular') group."""
    # Positions of the columns inspected below within each returned row.
    common_name_idx = 2
    count_idx = 5

    adaptor = GenomeAdaptor(
        metadata_uri=multi_dbs['ensembl_metadata'].dbc.url,
        taxonomy_uri=multi_dbs['ncbi_taxonomy'].dbc.url,
    )
    rows = adaptor.fetch_organisms_group_counts()

    # Human is expected to come first in the group ordering.
    assert rows[0][common_name_idx] == 'Human'
    # Three assemblies are expected for Human in the test DB
    # (two for the grch37/38 organism plus one for t2t).
    assert rows[0][count_idx] == 3

    # Every other species has a single genome in the test DB.
    for row in rows[1:]:
        assert row[count_idx] == 1