Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 1 addition & 8 deletions .gitlab-ci.yml
Original file line number Diff line number Diff line change
@@ -1,18 +1,11 @@
# .gitlab-ci.yml
image: python:3.11

variables:
MYSQL_ROOT_PASSWORD: ""
MYSQL_ALLOW_EMPTY_PASSWORD: "yes"

services:
- mysql:8.0

stages:
- test

before_script:
- mysql -h mysql -u root -e "SET GLOBAL local_infile=1;"
- python -m pip install --upgrade pip
- pip install .[test]

Expand All @@ -24,7 +17,7 @@ test:
image: python:${PYTHON_VERSION}
script:
- echo "DB_HOST $METADATA_URI $TAXONOMY_URI"
- coverage run -m pytest -c pyproject.toml --server mysql://root@mysql:3306
- coverage run -m pytest -c pyproject.toml
coverage: '/TOTAL.*\s+(\d+%)$/'
artifacts:
reports:
Expand Down
8 changes: 1 addition & 7 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,8 @@ dist: focal
python:
- '3.10'
- '3.11'
services:
- mysql
before_script:
# In MySQL 8, local_infile is disabled by default for security reasons.
# By adding SET GLOBAL local_infile=1;, we enable this feature at runtime.
- mysql -e "SET GLOBAL local_infile=1;"
- pip install .
- pip install .[test]
script:
- echo "DB_HOST $METADATA_URI $TAXONOMY_URI"
- coverage run -m pytest -c pyproject.toml --server mysql://travis@127.0.0.1:3306
- coverage run -m pytest -c pyproject.toml
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ dependencies = [
"duckdb-engine >= 0.17.0",
"pymysql",
"mysqlclient",
"pydantic"
]

[project.urls]
Expand Down
5 changes: 0 additions & 5 deletions src/ensembl/production/metadata/api/adaptors/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,6 @@
from ensembl.production.metadata.grpc.config import cfg


##Todo: Add in OrganismAdapator. Subfunction fetches all organism in popular group. and # of genomes from distinct assemblies.
# Add in best genome (see doc)
# More functions for related genomes


class BaseAdaptor:
def __init__(self, metadata_uri):
self.metadata_db = DBConnection(metadata_uri, pool_size=cfg.pool_size, pool_recycle=cfg.pool_recycle)
Expand Down
165 changes: 149 additions & 16 deletions src/ensembl/production/metadata/api/adaptors/genome.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,17 +17,14 @@
from typing import List, Tuple, NamedTuple

import sqlalchemy as db
from ensembl.ncbi_taxonomy.models import NCBITaxaName
from ensembl.utils.database import DBConnection
from sqlalchemy import select, func, desc, or_, distinct, case
from sqlalchemy.exc import NoResultFound
from sqlalchemy.orm import aliased

from ensembl.production.metadata.api.adaptors.base import BaseAdaptor, check_parameter, cfg
from ensembl.production.metadata.api.exceptions import TypeNotFoundException
from ensembl.production.metadata.api.models import Genome, Organism, Assembly, OrganismGroup, OrganismGroupMember, \
GenomeRelease, EnsemblRelease, EnsemblSite, AssemblySequence, GenomeDataset, Dataset, DatasetType, DatasetSource, \
ReleaseStatus, DatasetStatus, utils, DatasetAttribute, Attribute
from ensembl.production.metadata.api.models import *

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -149,17 +146,37 @@ def fetch_genomes_by_assembly_name_genebuild(self,
session.expire_on_commit = False
return session.execute(genome_select).all()

def fetch_genomes(self, genome_id=None, genome_uuid=None, genome_tag=None, organism_uuid=None, assembly_uuid=None,
assembly_accession=None, assembly_name=None, use_default_assembly=False, biosample_id=None,
production_name=None, taxonomy_id=None, group=None, unreleased_only=False, site_name=None,
release_type=None, release_version=None, current_only=False):
def fetch_genomes(
self,
genome_id=None,
genome_uuid=None,
genome_tag=None,
organism_uuid=None,
assembly_uuid=None,
assembly_accession=None,
assembly_name=None,
use_default_assembly=False,
biosample_id=None,
production_name=None,
taxonomy_id=None,
group=None,
genome_group_id=None,
genome_group_name=None,
genome_group_type=None,
genome_group_reference_only=False,
unreleased_only=False,
site_name=None,
release_type=None,
release_version=None,
current_only=False,
):
"""
Fetches genome information based on the specified parameters.

Args:
genome_id (Union[int, List[int]]): The ID(s) of the genome(s) to fetch.
genome_uuid str|None: The UUID of the genome to fetch.
genome_tag (Union[str, List[str]]): genome_tag value is either in Assembly.url_name or told_id.
genome_tag (Union[str, List[str]]): genome_tag value is genome.url_name
organism_uuid (Union[str, List[str]]): The UUID(s) of the organism(s) to fetch.
assembly_uuid (Union[str, List[str]]): The UUID(s) of the assembly(s) to fetch.
assembly_accession (Union[str, List[str]]): The assembly accession of the assembly(s) to fetch.
Expand Down Expand Up @@ -221,6 +238,32 @@ def fetch_genomes(self, genome_id=None, genome_uuid=None, genome_tag=None, organ
.join(OrganismGroupMember.organism_group) \
.filter(OrganismGroup.name.in_(group) | OrganismGroup.code.in_(group))

# genome group logic
if genome_group_id or genome_group_name or genome_group_type or genome_group_reference_only:
genome_select = genome_select.join(
GenomeGroupMember, Genome.genome_id == GenomeGroupMember.genome_id
).join(
GenomeGroup, GenomeGroup.genome_group_id == GenomeGroupMember.genome_group_id
)

if genome_group_id:
genome_group_id = check_parameter(genome_group_id)
genome_select = genome_select.where(GenomeGroup.genome_group_id.in_(genome_group_id))

if genome_group_name:
genome_group_name = check_parameter(genome_group_name)
genome_select = genome_select.where(GenomeGroup.name.in_(genome_group_name))

if genome_group_type:
genome_group_type = check_parameter(genome_group_type)
genome_select = genome_select.where(GenomeGroup.type.in_(genome_group_type))

if genome_group_reference_only:
genome_select = genome_select.where(GenomeGroupMember.is_reference == 1)

if current_only:
genome_select = genome_select.where(GenomeGroupMember.is_current == 1)

# Apply additional filters based on the provided parameters
if genome_id is not None:
genome_select = genome_select.filter(Genome.genome_id.in_(genome_id))
Expand All @@ -229,12 +272,7 @@ def fetch_genomes(self, genome_id=None, genome_uuid=None, genome_tag=None, organ
genome_select = genome_select.filter(Genome.genome_uuid == genome_uuid)

if genome_tag is not None:
genome_select = genome_select.filter(
db.or_(
Assembly.url_name.in_(genome_tag),
Assembly.tol_id.in_(genome_tag)
)
)
genome_select = genome_select.filter(Genome.url_name.in_(genome_tag))

if organism_uuid is not None:
genome_select = genome_select.filter(Organism.organism_uuid.in_(organism_uuid))
Expand Down Expand Up @@ -875,7 +913,102 @@ def fetch_assemblies_count(self, species_taxonomy_id: int, release_version: floa
with self.metadata_db.session_scope() as session:
return session.execute(query).scalar()

def get_public_path(self, genome_uuid, dataset_type='all', release=None):
def fetch_genome_groups(
    self, genome_id=None, genome_uuid=None, group_type=None, is_current=True, release_version=None
):
    """
    Return the genome groups that the given genome(s) are members of.

    This is the reverse lookup of the genome_group filters in fetch_genomes().

    Args:
        genome_id: Genome ID value(s); normalised with check_parameter and used
            in an IN filter on Genome.genome_id. When truthy, genome_uuid is ignored.
        genome_uuid: Genome UUID value(s); only used when genome_id is falsy.
        group_type: Genome group type value(s) to restrict GenomeGroup.type to.
        is_current: When truthy, only memberships with is_current == 1 are kept.
        release_version: When not None, memberships are joined to EnsemblRelease
            and only those with EnsemblRelease.version <= release_version are kept.

    Returns:
        The GenomeGroup objects produced by the query (``scalars().all()``).
    """
    # Join conditions are named up front so the select statement reads linearly.
    group_to_member = GenomeGroup.genome_group_id == GenomeGroupMember.genome_group_id
    member_to_genome = Genome.genome_id == GenomeGroupMember.genome_id
    group_select = select(GenomeGroup).join(GenomeGroupMember, group_to_member).join(Genome, member_to_genome)

    # genome_id wins over genome_uuid when both are supplied.
    if genome_id:
        group_select = group_select.where(Genome.genome_id.in_(check_parameter(genome_id)))
    elif genome_uuid:
        group_select = group_select.where(Genome.genome_uuid.in_(check_parameter(genome_uuid)))

    if group_type:
        group_select = group_select.where(GenomeGroup.type.in_(check_parameter(group_type)))

    if is_current:
        group_select = group_select.where(GenomeGroupMember.is_current == 1)

    if release_version is not None:
        member_to_release = EnsemblRelease.release_id == GenomeGroupMember.release_id
        group_select = group_select.join(EnsemblRelease, member_to_release)
        group_select = group_select.where(EnsemblRelease.version <= release_version)

    logger.debug(group_select)
    with self.metadata_db.session_scope() as session:
        # Keep loaded attributes readable after the session scope closes.
        session.expire_on_commit = False
        return session.execute(group_select).scalars().all()

def fetch_genome_group_members_detailed(
    self, genome_group_id=None, group_name=None, is_current=True, release_version=None
):
    """
    Fetch genomes and their membership details for a genome group.

    This returns both the genome objects and their membership information (is_reference, etc.)
    Rows are ordered by GenomeGroupMember.is_reference descending.

    Args:
        genome_group_id (Union[int, List[int]]): The ID(s) of the genome group(s).
        group_name (Union[str, List[str]]): The name(s) of the genome group(s).
        is_current (bool): If True, return only current genome group memberships.
        release_version (float): Return memberships up to this release version.
            When given, memberships are joined to EnsemblRelease on release_id.

    Returns:
        List of tuples (Genome, GenomeGroupMember) with full membership details.
    """
    member_select = select(Genome, GenomeGroupMember).join(
        GenomeGroupMember, Genome.genome_id == GenomeGroupMember.genome_id
    ).join(
        GenomeGroup, GenomeGroup.genome_group_id == GenomeGroupMember.genome_group_id
    )

    # Apply filters
    if genome_group_id:
        genome_group_id = check_parameter(genome_group_id)
        member_select = member_select.where(GenomeGroup.genome_group_id.in_(genome_group_id))

    if group_name:
        group_name = check_parameter(group_name)
        member_select = member_select.where(GenomeGroup.name.in_(group_name))

    if is_current:
        member_select = member_select.where(GenomeGroupMember.is_current == 1)

    # Handle release filtering
    if release_version is not None:
        member_select = member_select.join(
            EnsemblRelease,
            EnsemblRelease.release_id == GenomeGroupMember.release_id
        ).where(EnsemblRelease.version <= release_version)

        # NOTE(review): the nesting of this check is reconstructed from a view with
        # flattened indentation. EnsemblRelease is only joined inside this branch, so
        # the status filter is kept here; applying it outside the branch would
        # reference EnsemblRelease without a join. Confirm against the real file.
        logger.debug(f"Allow Unreleased {cfg.allow_unreleased}")
        if not cfg.allow_unreleased:
            member_select = member_select.where(EnsemblRelease.status == ReleaseStatus.RELEASED)

    # Order by is_reference descending so reference genomes appear first
    member_select = member_select.order_by(desc(GenomeGroupMember.is_reference))

    logger.debug(member_select)
    with self.metadata_db.session_scope() as session:
        # Keep loaded attributes readable after the session scope closes.
        session.expire_on_commit = False
        return session.execute(member_select).all()

def get_public_path(self, genome_uuid, dataset_type='all'):
paths = []
scientific_name = None
accession = None
Expand Down
Loading