diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 4d59171d..a703b4ee 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -1,18 +1,11 @@ # .gitlab-ci.yml image: python:3.11 -variables: - MYSQL_ROOT_PASSWORD: "" - MYSQL_ALLOW_EMPTY_PASSWORD: "yes" - -services: - - mysql:8.0 stages: - test before_script: - - mysql -h mysql -u root -e "SET GLOBAL local_infile=1;" - python -m pip install --upgrade pip - pip install .[test] @@ -24,7 +17,7 @@ test: image: python:${PYTHON_VERSION} script: - echo "DB_HOST $METADATA_URI $TAXONOMY_URI" - - coverage run -m pytest -c pyproject.toml --server mysql://root@mysql:3306 + - coverage run -m pytest -c pyproject.toml coverage: '/TOTAL.*\s+(\d+%)$/' artifacts: reports: diff --git a/.travis.yml b/.travis.yml index e9b54ac1..4a13baad 100644 --- a/.travis.yml +++ b/.travis.yml @@ -4,14 +4,8 @@ dist: focal python: - '3.10' - '3.11' -services: - - mysql before_script: - # In MySQL 8, local_infile is disabled by default for security reasons. - # By adding SET GLOBAL local_infile=1;, we enable this feature at runtime. - - mysql -e "SET GLOBAL local_infile=1;" - pip install . - pip install .[test] script: - - echo "DB_HOST $METADATA_URI $TAXONOMY_URI" - - coverage run -m pytest -c pyproject.toml --server mysql://travis@127.0.0.1:3306 \ No newline at end of file + - coverage run -m pytest -c pyproject.toml \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index f7aedb38..709d109f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -61,6 +61,7 @@ dependencies = [ "duckdb-engine >= 0.17.0", "pymysql", "mysqlclient", + "pydantic" ] [project.urls] diff --git a/src/ensembl/production/metadata/api/adaptors/base.py b/src/ensembl/production/metadata/api/adaptors/base.py index 745360ee..34831b8d 100644 --- a/src/ensembl/production/metadata/api/adaptors/base.py +++ b/src/ensembl/production/metadata/api/adaptors/base.py @@ -14,11 +14,6 @@ from ensembl.production.metadata.grpc.config import cfg -##Todo: Add in OrganismAdapator. 
Subfunction fetches all organism in popular group. and # of genomes from distinct assemblies. -# Add in best genome (see doc) -# More functions for related genomes - - class BaseAdaptor: def __init__(self, metadata_uri): self.metadata_db = DBConnection(metadata_uri, pool_size=cfg.pool_size, pool_recycle=cfg.pool_recycle) diff --git a/src/ensembl/production/metadata/api/adaptors/genome.py b/src/ensembl/production/metadata/api/adaptors/genome.py index a4d85d03..8e2ee330 100644 --- a/src/ensembl/production/metadata/api/adaptors/genome.py +++ b/src/ensembl/production/metadata/api/adaptors/genome.py @@ -17,7 +17,6 @@ from typing import List, Tuple, NamedTuple import sqlalchemy as db -from ensembl.ncbi_taxonomy.models import NCBITaxaName from ensembl.utils.database import DBConnection from sqlalchemy import select, func, desc, or_, distinct, case from sqlalchemy.exc import NoResultFound @@ -25,9 +24,7 @@ from ensembl.production.metadata.api.adaptors.base import BaseAdaptor, check_parameter, cfg from ensembl.production.metadata.api.exceptions import TypeNotFoundException -from ensembl.production.metadata.api.models import Genome, Organism, Assembly, OrganismGroup, OrganismGroupMember, \ - GenomeRelease, EnsemblRelease, EnsemblSite, AssemblySequence, GenomeDataset, Dataset, DatasetType, DatasetSource, \ - ReleaseStatus, DatasetStatus, utils, DatasetAttribute, Attribute +from ensembl.production.metadata.api.models import * logger = logging.getLogger(__name__) @@ -149,17 +146,37 @@ def fetch_genomes_by_assembly_name_genebuild(self, session.expire_on_commit = False return session.execute(genome_select).all() - def fetch_genomes(self, genome_id=None, genome_uuid=None, genome_tag=None, organism_uuid=None, assembly_uuid=None, - assembly_accession=None, assembly_name=None, use_default_assembly=False, biosample_id=None, - production_name=None, taxonomy_id=None, group=None, unreleased_only=False, site_name=None, - release_type=None, release_version=None, current_only=False): + def 
fetch_genomes( + self, + genome_id=None, + genome_uuid=None, + genome_tag=None, + organism_uuid=None, + assembly_uuid=None, + assembly_accession=None, + assembly_name=None, + use_default_assembly=False, + biosample_id=None, + production_name=None, + taxonomy_id=None, + group=None, + genome_group_id=None, + genome_group_name=None, + genome_group_type=None, + genome_group_reference_only=False, + unreleased_only=False, + site_name=None, + release_type=None, + release_version=None, + current_only=False, + ): """ Fetches genome information based on the specified parameters. Args: genome_id (Union[int, List[int]]): The ID(s) of the genome(s) to fetch. genome_uuid str|None: The UUID of the genome to fetch. - genome_tag (Union[str, List[str]]): genome_tag value is either in Assembly.url_name or told_id. + genome_tag (Union[str, List[str]]): genome_tag value is genome.url_name organism_uuid (Union[str, List[str]]): The UUID(s) of the organism(s) to fetch. assembly_uuid (Union[str, List[str]]): The UUID(s) of the assembly(s) to fetch. assembly_accession (Union[str, List[str]]): The assenbly accession of the assembly(s) to fetch. 
@@ -221,6 +238,32 @@ def fetch_genomes(self, genome_id=None, genome_uuid=None, genome_tag=None, organ .join(OrganismGroupMember.organism_group) \ .filter(OrganismGroup.name.in_(group) | OrganismGroup.code.in_(group)) + # genome group logic + if genome_group_id or genome_group_name or genome_group_type or genome_group_reference_only: + genome_select = genome_select.join( + GenomeGroupMember, Genome.genome_id == GenomeGroupMember.genome_id + ).join( + GenomeGroup, GenomeGroup.genome_group_id == GenomeGroupMember.genome_group_id + ) + + if genome_group_id: + genome_group_id = check_parameter(genome_group_id) + genome_select = genome_select.where(GenomeGroup.genome_group_id.in_(genome_group_id)) + + if genome_group_name: + genome_group_name = check_parameter(genome_group_name) + genome_select = genome_select.where(GenomeGroup.name.in_(genome_group_name)) + + if genome_group_type: + genome_group_type = check_parameter(genome_group_type) + genome_select = genome_select.where(GenomeGroup.type.in_(genome_group_type)) + + if genome_group_reference_only: + genome_select = genome_select.where(GenomeGroupMember.is_reference == 1) + + if current_only: + genome_select = genome_select.where(GenomeGroupMember.is_current == 1) + # Apply additional filters based on the provided parameters if genome_id is not None: genome_select = genome_select.filter(Genome.genome_id.in_(genome_id)) @@ -229,12 +272,7 @@ def fetch_genomes(self, genome_id=None, genome_uuid=None, genome_tag=None, organ genome_select = genome_select.filter(Genome.genome_uuid == genome_uuid) if genome_tag is not None: - genome_select = genome_select.filter( - db.or_( - Assembly.url_name.in_(genome_tag), - Assembly.tol_id.in_(genome_tag) - ) - ) + genome_select = genome_select.filter(Genome.url_name.in_(genome_tag)) if organism_uuid is not None: genome_select = genome_select.filter(Organism.organism_uuid.in_(organism_uuid)) @@ -875,7 +913,102 @@ def fetch_assemblies_count(self, species_taxonomy_id: int, release_version: 
def fetch_genome_group_members_detailed(
    self, genome_group_id=None, group_name=None, is_current=True, release_version=None
):
    """
    Fetch genomes and their membership details for a genome group.

    Every result row pairs a genome with its membership record, so callers can read
    membership fields (``is_reference``, ``is_current``, ...) next to the genome.

    Args:
        genome_group_id (Union[int, List[int]]): The ID(s) of the genome group(s).
        group_name (Union[str, List[str]]): The name(s) of the genome group(s).
        is_current (bool): If True, return only current genome group memberships.
        release_version (float): Return memberships up to this release version.

    Returns:
        List of rows (Genome, GenomeGroupMember), reference genomes first.
        When ``cfg.allow_unreleased`` is false, only memberships attached to a
        release whose status is RELEASED are returned.
    """
    member_select = (
        select(Genome, GenomeGroupMember)
        .join(GenomeGroupMember, Genome.genome_id == GenomeGroupMember.genome_id)
        .join(GenomeGroup, GenomeGroup.genome_group_id == GenomeGroupMember.genome_group_id)
    )

    # Group filters
    if genome_group_id:
        genome_group_id = check_parameter(genome_group_id)
        member_select = member_select.where(GenomeGroup.genome_group_id.in_(genome_group_id))

    if group_name:
        group_name = check_parameter(group_name)
        member_select = member_select.where(GenomeGroup.name.in_(group_name))

    if is_current:
        member_select = member_select.where(GenomeGroupMember.is_current == 1)

    # Release filters
    logger.debug(f"Allow Unreleased {cfg.allow_unreleased}")
    released_only = not cfg.allow_unreleased

    # Both predicates below read EnsemblRelease columns, so the table has to be
    # joined exactly once whenever either of them is applied. Filtering on it
    # without the join would add it to the FROM clause as a cartesian product.
    if release_version is not None or released_only:
        member_select = member_select.join(
            EnsemblRelease,
            EnsemblRelease.release_id == GenomeGroupMember.release_id,
        )

    if release_version is not None:
        member_select = member_select.where(EnsemblRelease.version <= release_version)

    if released_only:
        member_select = member_select.where(EnsemblRelease.status == ReleaseStatus.RELEASED)

    # Order by is_reference descending so reference genomes appear first
    member_select = member_select.order_by(desc(GenomeGroupMember.is_reference))

    logger.debug(member_select)
    with self.metadata_db.session_scope() as session:
        session.expire_on_commit = False
        return session.execute(member_select).all()
sqlalchemy import and_ -from ensembl.production.metadata.api.models import EnsemblRelease, EnsemblSite, GenomeRelease, Genome, GenomeDataset, \ - Dataset, ReleaseStatus from ensembl.production.metadata.api.adaptors.base import check_parameter, BaseAdaptor, cfg +from ensembl.production.metadata.api.models import ( + EnsemblRelease, + EnsemblSite, + GenomeRelease, + Genome, + GenomeDataset, + Dataset, + ReleaseStatus, +) logger = logging.getLogger(__name__) -def filter_release_status(query, - release_status: str | ReleaseStatus = None): +def filter_release_status(query, release_status: str | ReleaseStatus = None): + """ + Adds EnsemblSite join and filters based on release status and configuration. + + Args: + query: The SQLAlchemy query to filter + release_status: Optional release status to filter by + + Returns: + Modified query with site join and status filters applied + """ logger.debug(f"Allowed unreleased {cfg.allow_unreleased}") query = query.add_columns(EnsemblSite) + if not cfg.allow_unreleased: - query = query.join(EnsemblSite, - EnsemblSite.site_id == EnsemblRelease.site_id & - EnsemblSite.site_id == cfg.ensembl_site_id) \ - .filter(EnsemblRelease.status == ReleaseStatus.RELEASED) + # For released only: use inner join and filter + query = query.join( + EnsemblSite, + and_(EnsemblSite.site_id == EnsemblRelease.site_id, EnsemblSite.site_id == cfg.ensembl_site_id), + ).filter(EnsemblRelease.status == ReleaseStatus.RELEASED) else: - query = query.outerjoin(EnsemblSite, - EnsemblSite.site_id == EnsemblRelease.site_id & - EnsemblSite.site_id == cfg.ensembl_site_id) - # Release status filter only work when unreleased are allowed + # For unreleased allowed: use outer join + query = query.outerjoin( + EnsemblSite, + and_(EnsemblSite.site_id == EnsemblRelease.site_id, EnsemblSite.site_id == cfg.ensembl_site_id), + ) + # Release status filter only works when unreleased are allowed if release_status: if isinstance(release_status, str): release_status = 
ReleaseStatus(release_status) query = query.filter(EnsemblRelease.status == release_status) + return query +def _ensure_scalar(value): + """ + Ensures a parameter is a scalar value, unwrapping single-element lists. + Handles pytest parametrization edge cases. + + Args: + value: The value to check + + Returns: + Scalar value or None + """ + + if isinstance(value, (list, tuple)) and len(value) == 1: + value = value[0] + + return value + + class ReleaseAdaptor(BaseAdaptor): - def fetch_releases(self, - release_id: int | List[int] = None, - release_version: float | List[float] = None, - current_only: bool = False, - site_name: str = None, - release_type: str = None, - release_label: str = None, - release_status: str | ReleaseStatus = None): + def fetch_releases( + self, + release_id: int | List[int] = None, + release_version: float | List[float] = None, + current_only: bool = False, + site_name: str = None, + release_type: str = None, + release_label: str = None, + release_status: str | ReleaseStatus = None, + ): """ Fetches releases based on the provided parameters. @@ -61,7 +103,7 @@ def fetch_releases(self, release_id: release internal id (int or list[int]) release_version (float or list or None): Release version(s) to filter by. current_only (bool): Flag indicating whether to fetch only current releases. - site_name (str): SIte name to filter by. + site_name (str): Site name to filter by. release_type (str): Release type to filter by. release_label (str): Release label to filter by. 
release_status: whether to filter particular release status @@ -73,71 +115,96 @@ def fetch_releases(self, releases_id = check_parameter(release_id) if releases_id is not None: - release_select = release_select.filter( - EnsemblRelease.release_id.in_(releases_id) - ) + release_select = release_select.filter(EnsemblRelease.release_id.in_(releases_id)) - release_version = check_parameter(release_version) - # WHERE ensembl_release.version < version + # Handle release_version parameter + # Ensure it's a scalar for <= comparison, or list for IN clause + release_version = _ensure_scalar(check_parameter(release_version)) if release_version is not None: - release_select = release_select.filter( - EnsemblRelease.version <= release_version - ) - # WHERE ensembl_release.is_current =:is_current_1 + if isinstance(release_version, (list, tuple)): + # Multiple versions: use IN clause + release_select = release_select.filter(EnsemblRelease.version.in_(release_version)) + else: + # Single version: use <= comparison + # Convert to float to ensure type compatibility with SQLite + release_version = float(release_version) + release_select = release_select.filter(EnsemblRelease.version <= release_version) + if current_only: - release_select = release_select.filter( - EnsemblRelease.is_current == 1 - ) + release_select = release_select.filter(EnsemblRelease.is_current == 1) - # WHERE ensembl_release.release_type = :release_type_1 if release_type is not None: - release_select = release_select.filter( - EnsemblRelease.release_type.in_(release_type) - ) + release_type = check_parameter(release_type) + release_select = release_select.filter(EnsemblRelease.release_type.in_(release_type)) if release_label is not None: - release_select = release_select.filter( - EnsemblRelease.label.in_(release_label) - ) + release_label = check_parameter(release_label) + release_select = release_select.filter(EnsemblRelease.label.in_(release_label)) + # Filter by site name (requires site join, so must come 
def fetch_releases_for_genome(self, genome_uuid):
    """
    Return the releases a given genome is attached to.

    The statement selects EnsemblRelease through GenomeRelease and Genome, then
    goes through filter_release_status() for the site join and status handling.

    Args:
        genome_uuid: The UUID of the genome

    Returns:
        list: Result rows for the releases of the genome
    """
    release_stmt = db.select(EnsemblRelease).join(GenomeRelease)

    # Only released data is visible unless the configuration allows otherwise
    if not cfg.allow_unreleased:
        release_stmt = release_stmt.filter(EnsemblRelease.status == ReleaseStatus.RELEASED)

    release_stmt = filter_release_status(
        release_stmt.join(Genome).where(Genome.genome_uuid == genome_uuid)
    )
    logger.debug("Query: %s ", release_stmt)

    with self.metadata_db.session_scope() as session:
        session.expire_on_commit = False
        return session.execute(release_stmt).all()
+ + Args: + dataset_uuid: The UUID of the dataset + + Returns: + list: A list of releases for the dataset + """ + select_released = ( + db.select(EnsemblRelease) + .select_from(Dataset) + .join(GenomeDataset) + .join(EnsemblRelease) .where(Dataset.dataset_uuid == dataset_uuid) + ) if not cfg.allow_unreleased: select_released = select_released.filter(EnsemblRelease.status == ReleaseStatus.RELEASED) + select_released = filter_release_status(select_released) logger.debug("Query: %s ", select_released) + with self.metadata_db.session_scope() as session: session.expire_on_commit = False releases = session.execute(select_released).all() diff --git a/src/ensembl/production/metadata/api/factories/datasets.py b/src/ensembl/production/metadata/api/factories/datasets.py index aa5c75c0..66624b01 100644 --- a/src/ensembl/production/metadata/api/factories/datasets.py +++ b/src/ensembl/production/metadata/api/factories/datasets.py @@ -391,21 +391,20 @@ def attach_misc_datasets(self, release_id, session=None, force=False): for child_uuid in child_uuids: dataset_obj = session.query(Dataset).filter(Dataset.dataset_uuid == child_uuid).one() - # Skip if dataset is FAULTY or RELEASED if dataset_obj.status in (DatasetStatus.FAULTY, DatasetStatus.RELEASED): - continue # ✅ Skip updating or inserting for this dataset + continue - # Check if GenomeDataset exists for this dataset & genome - genome_dataset = session.query(GenomeDataset).filter( + genome_dataset = session.query(GenomeDataset).outerjoin( + EnsemblRelease, GenomeDataset.release_id == EnsemblRelease.release_id + ).filter( GenomeDataset.dataset_id == dataset_obj.dataset_id, - GenomeDataset.genome_id == genome_id + GenomeDataset.genome_id == genome_id, + (EnsemblRelease.release_type != "integrated") | (GenomeDataset.release_id.is_(None)) ).one_or_none() if genome_dataset: - # ✅ Update release_id even if it was attached to a previous release genome_dataset.release_id = release_id else: - # ✅ If it doesn’t exist, create a new one 
new_gd = GenomeDataset( genome_id=genome_id, dataset=dataset_obj, @@ -686,15 +685,6 @@ def query_all_child_datasets(self, parent_dataset_uuid, session=None): all_child_datasets.extend(sub_children) return all_child_datasets - def __query_depends_on(self, session, dataset_uuid): - dataset = session.query(Dataset).filter(Dataset.dataset_uuid == dataset_uuid).one_or_none() - dataset_type = dataset.dataset_type - dependent_types = dataset_type.depends_on.split(',') if dataset_type.depends_on else [] - dependent_datasets_info = [] - for dtype in dependent_types: - new_uuid, new_status = self.__query_related_genome_by_type(session, dataset_uuid, dtype) - dependent_datasets_info.append((new_uuid, new_status)) - return dependent_datasets_info def __update_status(self, session, dataset_uuid, status): # Processed to Released. Only accept top level. Check that all assembly and genebuild datsets (all the way down) are processed. @@ -720,10 +710,6 @@ def __update_status(self, session, dataset_uuid, status): if current_dataset.status == DatasetStatus.RELEASED: # "Released": # and it is not top level. return updated_datasets # Check the dependents - dependents = self.__query_depends_on(session, dataset_uuid) - for uuid, dep_status in dependents: - if dep_status not in (DatasetStatus.PROCESSED, DatasetStatus.RELEASED): # ("Processed", "Released"): - return updated_datasets current_dataset.status = DatasetStatus.PROCESSING # "Processing" parent_uuid, parent_status = self.__query_parent_datasets(session, dataset_uuid) if parent_uuid is not None: diff --git a/src/ensembl/production/metadata/api/factories/genomes.py b/src/ensembl/production/metadata/api/factories/genomes.py index 9f7f4cf6..ffeaf42f 100644 --- a/src/ensembl/production/metadata/api/factories/genomes.py +++ b/src/ensembl/production/metadata/api/factories/genomes.py @@ -11,9 +11,9 @@ # See the License for the specific language governing permissions and # limitations under the License. 
@staticmethod
def _normalize_status_to_enum(status_list):
    """
    Convert status strings to DatasetStatus enum values.
    This ensures compatibility between SQLite and MySQL.

    Args:
        status_list: A list of status strings or enums. A single status string
            or enum is also accepted and treated as a one-element list.

    Returns:
        List of DatasetStatus enum values. Entries that are not valid statuses
        are logged as warnings and left out.
    """
    if not status_list:
        return []

    # A bare string is itself iterable: looping over it would check every
    # character, reject them all and return an empty list. Wrap single values.
    if isinstance(status_list, (str, DatasetStatus)):
        status_list = [status_list]

    normalized = []
    for status in status_list:
        if isinstance(status, DatasetStatus):
            # Already an enum
            normalized.append(status)
        elif isinstance(status, str):
            # Convert string to enum
            try:
                normalized.append(DatasetStatus(status))
            except ValueError:
                logger.warning(f"Invalid status value: {status}")
        else:
            logger.warning(f"Unexpected status type: {type(status)} for value {status}")

    return normalized
def get_genomes(self, **filters: GenomeInputFilters):
    """
    Yield one info dict per (genome, dataset) row matching the given filters.

    The keyword arguments are used to build a GenomeInputFilters instance. Each
    yielded dict holds the selected columns, with ``dataset_status`` converted to
    its string value. When ``update_dataset_status`` is set, the dataset status is
    updated through DatasetFactory and the dict gains ``updated_dataset_status``:
    the new status value on success, None when the update was not applied or the
    requested status is not a valid DatasetStatus.

    Rows without a dataset uuid are skipped with a warning.
    """
    filters = GenomeInputFilters(**filters)
    logger.info(f"Get Genomes with filters {filters}")

    # The requested target status is the same for every row: resolve it once.
    requested_status = filters.update_dataset_status
    target_status = None
    if requested_status:
        if isinstance(requested_status, str):
            try:
                target_status = DatasetStatus(requested_status)
            except ValueError:
                # Reported per row below
                target_status = None
        else:
            target_status = requested_status

    with DBConnection(filters.metadata_db_uri).session_scope() as session:
        query = self._build_query(filters)
        logger.info(f"Executing SQL query: {query}")

        results = session.execute(query).fetchall()
        logger.debug(f"Query returned {len(results)} results")

        for genome in results:
            genome_info = genome._asdict()
            dataset_uuid = genome_info.get("dataset_uuid", None)

            # Convert the status enum object to its string value
            dataset_status = genome_info.get("dataset_status", None)
            if dataset_status and isinstance(dataset_status, DatasetStatus):
                genome_info["dataset_status"] = dataset_status.value

            if not dataset_uuid:
                logger.warning(f"No dataset uuid found for genome {genome_info} skipping this genome ")
                continue

            if requested_status:
                if target_status is None:
                    logger.error(f"Invalid update_dataset_status: {requested_status}")
                    genome_info["updated_dataset_status"] = None
                    yield genome_info
                    continue

                _, status = DatasetFactory(filters.metadata_db_uri).update_dataset_status(
                    dataset_uuid, target_status.value, session=session
                )

                if target_status == status:
                    logger.info(
                        f"Updated Dataset status for dataset uuid: {dataset_uuid} from "
                        f"{genome_info.get('dataset_status')} to {status.value} "
                        f"for genome {genome_info['genome_uuid']}"
                    )
                    genome_info["updated_dataset_status"] = status.value
                else:
                    # Report the status that was asked for. `status` is what the
                    # dataset ended up with, which differs from the request here.
                    logger.warning(
                        f"Cannot update status for dataset uuid: {dataset_uuid} "
                        f"from {genome_info.get('dataset_status')} to {target_status.value} "
                        f"(status is {status.value}) "
                        f"for genome {genome_info['genome_uuid']}"
                    )
                    genome_info["updated_dataset_status"] = None

            session.flush()
            yield genome_info
Default is an empty list.", + ) + parser.add_argument( + "--release_id", + type=int, + default=0, + required=False, + help="Genome_dataset release_id to filter the query. Default is 0 (no filter).", + ) + parser.add_argument( + "--dataset_status", + nargs="*", + default=["Submitted"], + choices=["Submitted", "Processing", "Processed", "Released"], + required=False, + help="List of dataset statuses to filter the query. Default is an empty list.", + ) + parser.add_argument( + "--update_dataset_status", + type=str, + default="", + required=False, + choices=["Submitted", "Processing", "Processed", "Released", ""], + help="Update the status of the selected datasets to the specified value. ", + ) + parser.add_argument( + "--batch_size", + type=int, + default=50, + required=False, + help="Number of results to retrieve per batch. Default is 50.", + ) + parser.add_argument( + "--page", + default=1, + required=False, + type=lambda x: int(x) if int(x) > 0 else argparse.ArgumentTypeError("{x} is not a positive integer"), + help="The page number for pagination. Default is 1.", + ) + parser.add_argument( + "--metadata_db_uri", + type=str, + required=True, + help="metadata db mysql uri, ex: mysql://ensro@localhost:3366/ensembl_genome_metadata", ) - parser.add_argument('--genome_uuid', type=str, nargs='*', default=[], required=False, - help='List of genome UUIDs to filter the query. Default is an empty list.') - parser.add_argument('--dataset_uuid', type=str, nargs='*', default=[], required=False, - help='List of dataset UUIDs to filter the query. Default is an empty list.') - parser.add_argument('--organism_group_type', type=str, default='DIVISION', required=False, - help='Organism group type to filter the query. Default is "DIVISION"') - parser.add_argument('--division', type=str, nargs='*', default=[], required=False, - help='List of organism group names to filter the query. 
Default is an empty list.') - parser.add_argument('--dataset_type', type=str, default="assembly", required=False, - help='List of dataset types to filter the query. Default is an empty list.') - parser.add_argument('--species', type=str, nargs='*', default=[], required=False, - help='List of Species Production names to filter the query. Default is an empty list.') - parser.add_argument('--antispecies', type=str, nargs='*', default=[], required=False, - help='List of Species Production names to exclude from the query. Default is an empty list.') - parser.add_argument('--release_id', type=int, default=0, required=False, - help='Genome_dataset release_id to filter the query. Default is 0 (no filter).') - parser.add_argument('--dataset_status', nargs='*', default=["Submitted"], - choices=['Submitted', 'Processing', 'Processed', 'Released'], required=False, - help='List of dataset statuses to filter the query. Default is an empty list.') - parser.add_argument('--update_dataset_status', type=str, default="", required=False, - choices=['Submitted', 'Processing', 'Processed', 'Released', ''], - help='Update the status of the selected datasets to the specified value. ') - parser.add_argument('--batch_size', type=int, default=50, required=False, - help='Number of results to retrieve per batch. Default is 50.') - parser.add_argument('--page', default=1, required=False, - type=lambda x: int(x) if int(x) > 0 else argparse.ArgumentTypeError( - "{x} is not a positive integer"), - help='The page number for pagination. 
Default is 1.') - parser.add_argument('--metadata_db_uri', type=str, required=True, - help='metadata db mysql uri, ex: mysql://ensro@localhost:3366/ensembl_genome_metadata') - parser.add_argument('--output', type=str, required=True, help='output file ex: genome_info.json') + parser.add_argument("--output", type=str, required=True, help="output file ex: genome_info.json") args = parser.parse_args() meta_details = re.match(r"mysql:\/\/.*:?(.*?)@(.*?):\d+\/(.*)", args.metadata_db_uri) - with open(args.output, 'w') as json_output: - logger.info(f'Connecting Metadata Database with host:{meta_details.group(2)} & dbname:{meta_details.group(3)}') + with open(args.output, "w") as json_output: + logger.info( + f"Connecting Metadata Database with host:{meta_details.group(2)} & dbname:{meta_details.group(3)}" + ) genome_fetcher = GenomeFactory() - logger.info(f'Writing Results to {args.output}') - for genome in genome_fetcher.get_genomes( + logger.info(f"Writing Results to {args.output}") + for genome in ( + genome_fetcher.get_genomes( metadata_db_uri=args.metadata_db_uri, update_dataset_status=args.update_dataset_status, genome_uuid=args.genome_uuid, @@ -241,13 +369,15 @@ def main(): batch_size=args.batch_size, release_id=args.release_id, dataset_status=args.dataset_status, - ) or []: + ) + or [] + ): json.dump(genome, json_output) json_output.write("\n") - logger.info(f'Completed !') + logger.info(f"Completed !") if __name__ == "__main__": - logger.info('Fetching Genome Information From New Metadata Database') + logger.info("Fetching Genome Information From New Metadata Database") main() diff --git a/src/ensembl/production/metadata/api/factories/release.py b/src/ensembl/production/metadata/api/factories/release.py index 9d4683f2..56793c71 100644 --- a/src/ensembl/production/metadata/api/factories/release.py +++ b/src/ensembl/production/metadata/api/factories/release.py @@ -86,11 +86,21 @@ def init_release( version = round(version, 1) # Validate release date only if provided 
+ release_date_obj = None if release_date: try: - datetime.strptime(release_date, "%Y-%m-%d").date() + release_date_obj = datetime.strptime(release_date, "%Y-%m-%d").date() except ValueError: raise ValueError("Invalid release_date format. Expected YYYY-MM-DD.") + else: + if label: + try: + release_date_obj = datetime.strptime(label, "%Y-%m-%d").date() + release_date = label # Store the string for later label assignment + except ValueError: + raise ValueError("Invalid label format. Expected YYYY-MM-DD when used as date.") + else: + raise ValueError("Either release_date or label must be specified.") # Create a name if not provided. It should be one higher than any existing partial release. if not name and release_type == "partial": @@ -100,8 +110,6 @@ def init_release( # Ensure label is defined if label is None: - if release_date is None: - raise ValueError("Either release_date or label must be specified.") label = release_date # Validate release type @@ -115,7 +123,7 @@ def init_release( # Create and store the new release release = EnsemblRelease( version=version, - release_date=release_date, # Will be stored as NULL if None + release_date=release_date_obj, label=label, ensembl_site=site_obj, release_type=release_type, diff --git a/src/ensembl/production/metadata/api/factories/utils.py b/src/ensembl/production/metadata/api/factories/utils.py index 3ec58aeb..005e3d21 100644 --- a/src/ensembl/production/metadata/api/factories/utils.py +++ b/src/ensembl/production/metadata/api/factories/utils.py @@ -10,10 +10,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from sqlalchemy.orm import aliased - -from ensembl.production.metadata.api.models import Dataset, Genome, GenomeDataset, DatasetAttribute, Attribute, Assembly - +from ensembl.production.metadata.api.models import Genome, Assembly def get_genome_sets_by_assembly_and_provider(session): """ @@ -26,47 +23,24 @@ def get_genome_sets_by_assembly_and_provider(session): DOES NOT HAVE A TEST. NOR DO WE HAVE UPDATES IN OUR TEST DB. BIG WORK TO UPDATE THIS. """ - # Aliases for clarity - genome_alias = aliased(Genome) - dataset_alias = aliased(Dataset) - dataset_attr_provider = aliased(DatasetAttribute) # Attribute for genebuild.provider - dataset_attr_geneset = aliased(DatasetAttribute) # Attribute for genebuild.last_geneset_update - attribute_provider = aliased(Attribute) - attribute_geneset = aliased(Attribute) - assembly_alias = aliased(Assembly) - - # Query to retrieve genome_uuid, assembly_uuid, provider, and last_geneset_update + # Query to retrieve genome_uuid, assembly_uuid, provider_name, and genebuild_date query = ( session.query( - genome_alias.genome_uuid, - assembly_alias.assembly_uuid, - dataset_attr_provider.value.label("provider_name"), - dataset_attr_geneset.value.label("last_geneset_update") - ) - .join(assembly_alias, genome_alias.assembly_id == assembly_alias.assembly_id) - .join(GenomeDataset, GenomeDataset.genome_id == genome_alias.genome_id) - .join(dataset_alias, GenomeDataset.dataset_id == dataset_alias.dataset_id) - # Join for provider attribute - .join(dataset_attr_provider, dataset_attr_provider.dataset_id == dataset_alias.dataset_id) - .join(attribute_provider, dataset_attr_provider.attribute_id == attribute_provider.attribute_id) - # Join for last_geneset_update attribute - .join(dataset_attr_geneset, dataset_attr_geneset.dataset_id == dataset_alias.dataset_id) - .join(attribute_geneset, dataset_attr_geneset.attribute_id == attribute_geneset.attribute_id) - .filter( - dataset_alias.dataset_type.has(name="genebuild"), # Ensure dataset is of type 
genebuild - attribute_provider.name == "genebuild.provider_name", # Ensure attribute is genebuild.provider_name - attribute_geneset.name == "genebuild.last_geneset_update" - # Ensure attribute is genebuild.last_geneset_update + Genome.genome_uuid, + Assembly.assembly_uuid, + Genome.provider_name, + Genome.genebuild_date ) + .join(Assembly, Genome.assembly_id == Assembly.assembly_id) ) # Organize results into a dictionary grouping genome_uuids by (assembly_uuid, provider) genome_sets = {} - for genome_uuid, assembly_uuid, provider, last_geneset_update in query.all(): - key = (assembly_uuid, provider) + for genome_uuid, assembly_uuid, provider_name, genebuild_date in query.all(): + key = (assembly_uuid, provider_name) if key not in genome_sets: genome_sets[key] = [] - genome_sets[key].append((genome_uuid, last_geneset_update)) # Keep last_geneset_update with each genome + genome_sets[key].append((genome_uuid, genebuild_date)) # Create a filtered dictionary where only groups with more than one genome are kept genome_sets_with_multiple = {key: genomes for key, genomes in genome_sets.items() if len(genomes) > 1} diff --git a/src/ensembl/production/metadata/api/factory.py b/src/ensembl/production/metadata/api/factory.py index 98191434..da6cb786 100644 --- a/src/ensembl/production/metadata/api/factory.py +++ b/src/ensembl/production/metadata/api/factory.py @@ -16,7 +16,7 @@ from ensembl.production.metadata.updater.core import CoreMetaUpdater -def meta_factory(db_uri, metadata_uri, force=False): +def meta_factory(db_uri, metadata_uri, taxonomy_uri): db_url = make_url(db_uri) if '_compara_' in db_url.database: raise Exception("compara not implemented yet") @@ -30,7 +30,7 @@ def meta_factory(db_uri, metadata_uri, force=False): elif '_funcgen_' in db_url.database: raise Exception("funcgen not implemented yet") elif '_core_' in db_url.database: - return CoreMetaUpdater(db_uri, metadata_uri) + return CoreMetaUpdater(db_uri, metadata_uri, taxonomy_uri) elif '_otherfeatures_' in 
db_url.database: raise Exception("otherfeatures not implemented yet") elif '_rnaseq_' in db_url.database: diff --git a/src/ensembl/production/metadata/api/models/assembly.py b/src/ensembl/production/metadata/api/models/assembly.py index 99929c37..aaac492a 100644 --- a/src/ensembl/production/metadata/api/models/assembly.py +++ b/src/ensembl/production/metadata/api/models/assembly.py @@ -17,30 +17,28 @@ from ensembl.production.metadata.api.models.base import Base, LoadAble -__all__ = ['Assembly', 'AssemblySequence'] +__all__ = ["Assembly", "AssemblySequence", "SequenceAlias"] class Assembly(LoadAble, Base): - __tablename__ = 'assembly' + __tablename__ = "assembly" assembly_id = Column(Integer, primary_key=True) - assembly_uuid = Column(String(32), unique=True, nullable=False, default=uuid.uuid4) + assembly_uuid = Column(String(40), unique=True, nullable=False, default=lambda: str(uuid.uuid4())) ucsc_name = Column(String(16)) - accession = Column(String(16), nullable=False, unique=True) + accession = Column(String(16), nullable=False) level = Column(String(32), nullable=False) name = Column(String(128), nullable=False) accession_body = Column(String(32)) assembly_default = Column(String(128)) - tol_id = Column(String(32), unique=True) created = Column(DateTime) ensembl_name = Column(String(255), unique=True) - alt_accession = Column(String(16), nullable=True) is_reference = Column(TINYINT(1), nullable=False, default=0) - url_name = Column(String(128), nullable=False) # One to many relationships # assembly_id within assembly_sequence - assembly_sequences = relationship("AssemblySequence", back_populates="assembly", - cascade="all, delete, delete-orphan") + assembly_sequences = relationship( + "AssemblySequence", back_populates="assembly", cascade="all, delete, delete-orphan" + ) # assembly_id within genome genomes = relationship("Genome", back_populates="assembly", cascade="all, delete, delete-orphan") @@ -52,32 +50,52 @@ def is_released(self): class 
AssemblySequence(LoadAble, Base): - __tablename__ = 'assembly_sequence' + __tablename__ = "assembly_sequence" __table_args__ = ( - Index('assembly_sequence_assembly_id_accession_5f3e5119_uniq', 'assembly_id', 'accession', unique=True), + Index( + "assembly_sequence_assembly_id_accession_5f3e5119_uniq", "assembly_id", "accession", unique=True + ), ) assembly_sequence_id = Column(Integer, primary_key=True) - name = Column(String(128), unique=True) - assembly_id = Column(ForeignKey('assembly.assembly_id'), nullable=False, index=True) + name = Column(String(128)) + assembly_id = Column(ForeignKey("assembly.assembly_id"), nullable=False, index=True) accession = Column(String(128), nullable=False) chromosomal = Column(TINYINT(1), nullable=False, default=0) chromosome_rank = Column(Integer) length = Column(Integer, nullable=False) sequence_location = Column(String(10)) md5 = Column(String(32)) - # column need renaming as well sha512t24u = Column(String(128)) - type = Column(Enum('chromosome_group', 'plasmid', 'primary_assembly', 'contig', 'chromosome', 'scaffold', 'lrg', - 'supercontig', 'supscaffold'), server_default=text("'primary_assembly'")) + type = Column( + Enum( + "chromosome_group", + "plasmid", + "primary_assembly", + "contig", + "chromosome", + "scaffold", + "lrg", + "supercontig", + "supscaffold", + "non_ref_scaffold", + ), + server_default=text("'primary_assembly'"), + nullable=False, + ) is_circular = Column(TINYINT(1), nullable=False, default=0) - assembly = relationship('Assembly', back_populates="assembly_sequences") + additional = Column(TINYINT(1), nullable=False, default=0) + source = Column(String(128)) + assembly = relationship("Assembly", back_populates="assembly_sequences") + sequence_aliases = relationship("SequenceAlias", back_populates="assembly_sequence", + cascade="all, delete, delete-orphan") + - # backward compatibility with old column name sha512t2u - @property - def sha512t4u(self): - return self.sha512t24u +class SequenceAlias(LoadAble, 
Base): + __tablename__ = "sequence_alias" - @sha512t4u.setter - def sha512t4u(self, checksum): - self.sha512t24u = checksum + sequence_alias_id = Column(Integer, primary_key=True) + assembly_sequence_id = Column(ForeignKey("assembly_sequence.assembly_sequence_id"), nullable=False) + alias = Column(String(128), nullable=False) + source = Column(String(128)) + assembly_sequence = relationship("AssemblySequence", back_populates="sequence_aliases") diff --git a/src/ensembl/production/metadata/api/models/base.py b/src/ensembl/production/metadata/api/models/base.py index 788d6bc3..d9931eb2 100644 --- a/src/ensembl/production/metadata/api/models/base.py +++ b/src/ensembl/production/metadata/api/models/base.py @@ -9,7 +9,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from sqlalchemy.ext.declarative import declarative_base +from sqlalchemy.orm import declarative_base Base = declarative_base() diff --git a/src/ensembl/production/metadata/api/models/dataset.py b/src/ensembl/production/metadata/api/models/dataset.py index 5303092e..c71f3b7e 100644 --- a/src/ensembl/production/metadata/api/models/dataset.py +++ b/src/ensembl/production/metadata/api/models/dataset.py @@ -15,7 +15,7 @@ import uuid import sqlalchemy -from sqlalchemy import Column, Integer, String, text, ForeignKey, Index, JSON +from sqlalchemy import Column, Integer, String, text, ForeignKey, Index from sqlalchemy.dialects.mysql import DATETIME, TINYINT from sqlalchemy.orm import relationship, backref from sqlalchemy.sql import func @@ -34,6 +34,7 @@ class DatasetStatus(enum.Enum): PROCESSED = "Processed" RELEASED = "Released" FAULTY = "Faulty" + SUPPRESSED = "Suppressed" DatasetStatusType = sqlalchemy.types.Enum( @@ -47,22 +48,20 @@ class Attribute(LoadAble, Base): __tablename__ = 'attribute' attribute_id = Column(Integer, primary_key=True) - name = Column(String(128), 
nullable=False) + name = Column(String(128), nullable=False, unique=True) label = Column(String(128), nullable=False) description = Column(String(255)) required = Column(TINYINT(1), nullable=False, default=0) type = Column(Enum('string', 'percent', 'float', 'integer', 'bp', 'number'), server_default=text("'string'")) # One to many relationships - # attribute_id within dataset attribute dataset_attributes = relationship("DatasetAttribute", back_populates='attribute') - # many to one relationships class Dataset(LoadAble, Base): __tablename__ = 'dataset' dataset_id = Column(Integer, primary_key=True) - dataset_uuid = Column(String(32), nullable=False, unique=True, default=str(uuid.uuid4)) + dataset_uuid = Column(String(40), nullable=False, unique=True, default=lambda: str(uuid.uuid4())) dataset_type_id = Column(ForeignKey('dataset_type.dataset_type_id'), nullable=False, index=True) name = Column(String(128), nullable=False) version = Column(String(128)) @@ -70,19 +69,15 @@ class Dataset(LoadAble, Base): dataset_source_id = Column(ForeignKey('dataset_source.dataset_source_id'), nullable=False, index=True) label = Column(String(128), nullable=False) status = Column(DatasetStatusType, server_default=text('Submitted')) + parent_id = Column(Integer, ForeignKey('dataset.dataset_id'), nullable=True, index=True) # One to many relationships - # dataset_id to dataset attribute and genome dataset dataset_attributes = relationship("DatasetAttribute", back_populates='dataset', cascade="all, delete, delete-orphan") genome_datasets = relationship("GenomeDataset", back_populates='dataset', cascade="all, delete, delete-orphan") # many to one relationships - # dataset_type_id to dataset_type dataset_type = relationship('DatasetType', back_populates="datasets") - # dataset_source_id to dataset source dataset_source = relationship('DatasetSource', back_populates="datasets") - # parent dataset when created - parent_id = Column(Integer, ForeignKey('dataset.dataset_id'), nullable=True, 
index=True) children = relationship('Dataset', backref=backref("parent", remote_side=[dataset_id])) @property @@ -118,12 +113,8 @@ class DatasetAttribute(LoadAble, Base): value = Column(String(255), nullable=False) attribute_id = Column(ForeignKey('attribute.attribute_id'), nullable=False, index=True) dataset_id = Column(ForeignKey('dataset.dataset_id'), nullable=False, index=True) - # One to many relationships - # none # many to one relationships - # dataset_attribute_id to dataset attribute = relationship('Attribute', back_populates="dataset_attributes") - # attribute_id to attribute dataset = relationship('Dataset', back_populates="dataset_attributes") @@ -133,27 +124,18 @@ class DatasetSource(LoadAble, Base): dataset_source_id = Column(Integer, primary_key=True) type = Column(String(32), nullable=False) name = Column(String(255), nullable=False, unique=True) + location = Column(String(120)) # One to many relationships - # dataset_source_id to dataset datasets = relationship('Dataset', back_populates='dataset_source') - # many to one relationships - # none - class DatasetType(LoadAble, Base): __tablename__ = 'dataset_type' dataset_type_id = Column(Integer, primary_key=True) - name = Column(String(32), nullable=False) + name = Column(String(32), nullable=False, unique=True) label = Column(String(128), nullable=False) topic = Column(String(32), nullable=False) description = Column(String(255)) - details_uri = Column(String(255)) parent = Column(ForeignKey('dataset_type.dataset_type_id'), name='parent_id', nullable=True, index=True) - depends_on = Column(String(128), default=None) - filter_on = Column(JSON, default=None) # One to many relationships - # dataset_type_id to dataset datasets = relationship('Dataset', back_populates='dataset_type') - # many to one relationships - # none diff --git a/src/ensembl/production/metadata/api/models/genome.py b/src/ensembl/production/metadata/api/models/genome.py index 57f55a93..64bdc7f4 100644 --- 
a/src/ensembl/production/metadata/api/models/genome.py +++ b/src/ensembl/production/metadata/api/models/genome.py @@ -12,13 +12,13 @@ import logging import uuid -from sqlalchemy import Column, Integer, String, ForeignKey, UniqueConstraint +from sqlalchemy import Column, Integer, String, ForeignKey, UniqueConstraint, Enum from sqlalchemy.dialects.mysql import DATETIME, TINYINT from sqlalchemy.orm import relationship from ensembl.production.metadata.api.models.base import Base, LoadAble -__all__ = ['Genome', 'GenomeDataset', 'GenomeRelease'] +__all__ = ["Genome", "GenomeDataset", "GenomeRelease", "GenomeGroup", "GenomeGroupMember"] logger = logging.getLogger(__name__) @@ -27,18 +27,28 @@ class Genome(LoadAble, Base): __tablename__ = "genome" genome_id = Column(Integer, primary_key=True) - genome_uuid = Column(String(32), nullable=False, unique=True, default=str(uuid.uuid4)) + genome_uuid = Column(String(32), nullable=False, unique=True, default=lambda: str(uuid.uuid4())) assembly_id = Column(ForeignKey("assembly.assembly_id"), nullable=False, index=True) organism_id = Column(ForeignKey("organism.organism_id"), nullable=False, index=True) created = Column(DATETIME(fsp=6), nullable=False) - is_best = Column(TINYINT(1), nullable=False, default=0) - production_name = Column(String(255), nullable=False, unique=False) - genebuild_version = Column(String(64), nullable=False, unique=False) + production_name = Column(String(120), nullable=False, unique=False) + annotation_source = Column(String(120), nullable=False, unique=False) + provider_name = Column(String(120), nullable=False, unique=False) genebuild_date = Column(String(20), nullable=False, unique=False) + suppressed = Column(TINYINT(1), nullable=False, default=0) + suppression_details = Column(String(255), nullable=True, unique=False) + url_name = Column(String(128), nullable=True, unique=False) # One to many relationships # genome_id to genome_dataset and genome release - genome_datasets = 
relationship("GenomeDataset", back_populates="genome", cascade="all, delete, delete-orphan") - genome_releases = relationship("GenomeRelease", back_populates="genome", cascade="all, delete, delete-orphan") + genome_datasets = relationship( + "GenomeDataset", back_populates="genome", cascade="all, delete, delete-orphan" + ) + genome_releases = relationship( + "GenomeRelease", back_populates="genome", cascade="all, delete, delete-orphan" + ) + genome_group_members = relationship( + "GenomeGroupMember", back_populates="genome", cascade="all, delete, delete-orphan" + ) # many to one relationships # assembly_id to assembly assembly = relationship("Assembly", back_populates="genomes") @@ -46,7 +56,6 @@ class Genome(LoadAble, Base): organism = relationship("Organism", back_populates="genomes") - class GenomeDataset(LoadAble, Base): __tablename__ = "genome_dataset" @@ -58,12 +67,17 @@ class GenomeDataset(LoadAble, Base): UniqueConstraint("genome_id", "dataset_id", "release_id", name="genome_dataset_release_uidx"), # genome_dataset_id to genome - dataset = relationship("Dataset", back_populates="genome_datasets", order_by='Dataset.name, desc(Dataset.created)') + dataset = relationship( + "Dataset", back_populates="genome_datasets", order_by="Dataset.name, desc(Dataset.created)" + ) # genome_id to genome - genome = relationship("Genome", back_populates="genome_datasets", order_by='Dataset.name, desc(Genome.created)') + genome = relationship( + "Genome", back_populates="genome_datasets", order_by="Dataset.name, desc(Genome.created)" + ) # release_id to release - ensembl_release = relationship("EnsemblRelease", back_populates="genome_datasets", - order_by='desc(EnsemblRelease.version)') + ensembl_release = relationship( + "EnsemblRelease", back_populates="genome_datasets", order_by="desc(EnsemblRelease.version)" + ) class GenomeRelease(LoadAble, Base): @@ -74,10 +88,39 @@ class GenomeRelease(LoadAble, Base): genome_id = Column(ForeignKey("genome.genome_id"), nullable=False, 
index=True) release_id = Column(ForeignKey("ensembl_release.release_id"), nullable=False, index=True) is_current = Column(TINYINT(1), nullable=False, default=0) - # One to many relationships - # none # many to one relationships # genome_release_id to genome_release genome = relationship("Genome", back_populates="genome_releases") # release_id to ensembl release ensembl_release = relationship("EnsemblRelease", back_populates="genome_releases") + + +class GenomeGroup(LoadAble, Base): + __tablename__ = "genome_group" + + genome_group_id = Column(Integer, primary_key=True) + type = Column(Enum("compara_reference", "structural_variant", "project"), nullable=False) + name = Column(String(128), nullable=False, unique=True) + label = Column(String(128)) + searchable = Column(TINYINT(1), nullable=False, default=0) + description = Column(String(255)) + + # One to many relationships + # genome_group_id to organism_group_member + genome_group_members = relationship("GenomeGroupMember", back_populates="genome_group") + + +class GenomeGroupMember(LoadAble, Base): + __tablename__ = "genome_group_member" + + genome_group_member_id = Column(Integer, primary_key=True) + is_reference = Column(TINYINT(1), nullable=False, default=0) + genome_id = Column(ForeignKey("genome.genome_id"), nullable=False) + genome_group_id = Column(ForeignKey("genome_group.genome_group_id"), nullable=False) + release_id = Column(ForeignKey("ensembl_release.release_id")) + is_current = Column(TINYINT(1), nullable=False, default=0) + + # many to one relationships + genome_group = relationship("GenomeGroup", back_populates="genome_group_members") + genome = relationship("Genome", back_populates="genome_group_members") + ensembl_release = relationship("EnsemblRelease", back_populates="genome_group_members") diff --git a/src/ensembl/production/metadata/api/models/organism.py b/src/ensembl/production/metadata/api/models/organism.py index 92bdb110..99156d66 100644 --- 
a/src/ensembl/production/metadata/api/models/organism.py +++ b/src/ensembl/production/metadata/api/models/organism.py @@ -17,14 +17,14 @@ from ensembl.production.metadata.api.models.base import Base, LoadAble -__all__ = ['Organism', 'OrganismGroup', 'OrganismGroupMember'] +__all__ = ["Organism", "OrganismGroup", "OrganismGroupMember"] class Organism(LoadAble, Base): __tablename__ = "organism" organism_id = Column(Integer, primary_key=True) - organism_uuid = Column(String(32), unique=True, nullable=False, default=uuid.uuid4) + organism_uuid = Column(String(40), unique=True, nullable=False, default=lambda: str(uuid.uuid4())) taxonomy_id = Column(Integer, nullable=False) species_taxonomy_id = Column(Integer) common_name = Column(String(128), nullable=True) @@ -32,37 +32,37 @@ class Organism(LoadAble, Base): scientific_name = Column(String(128)) biosample_id = Column(String(128), nullable=False, unique=True) scientific_parlance_name = Column(String(255)) + rank = Column(Integer, default=0) + strain_type = Column(String(128), nullable=True, unique=False) + tol_id = Column(String(32)) # One to many relationships # Organism_id to organism_group_member and genome genomes = relationship("Genome", back_populates="organism", cascade="all, delete, delete-orphan") organism_group_members = relationship("OrganismGroupMember", back_populates="organism") - strain_type = Column(String(128), nullable=True, unique=False) - class OrganismGroup(LoadAble, Base): __tablename__ = "organism_group" - __table_args__ = ( - Index("group_type_name_63c2f6ac_uniq", "type", "name", unique=True), - ) + __table_args__ = (Index("group_type_name_63c2f6ac_uniq", "type", "name", unique=True),) organism_group_id = Column(Integer, primary_key=True) - type = Column(String(32), nullable=False) + type = Column(String(32)) name = Column(String(255), nullable=False) code = Column(String(48), unique=True) # One to many relationships # Organism_group_id to organism_group_member organism_group_members = 
relationship("OrganismGroupMember", back_populates="organism_group") - # many to one relationships - # none - class OrganismGroupMember(LoadAble, Base): __tablename__ = "organism_group_member" __table_args__ = ( - Index("organism_group_member_organism_id_organism_gro_fe8f49ac_uniq", "organism_id", "organism_group_id", - unique=True), + Index( + "organism_group_member_organism_id_organism_gro_fe8f49ac_uniq", + "organism_id", + "organism_group_id", + unique=True, + ), ) organism_group_member_id = Column(Integer, primary_key=True) @@ -70,8 +70,6 @@ class OrganismGroupMember(LoadAble, Base): order = Column(Integer, nullable=True) organism_id = Column(ForeignKey("organism.organism_id"), nullable=False) organism_group_id = Column(ForeignKey("organism_group.organism_group_id"), nullable=False, index=True) - # One to many relationships - # none # many to one relationships # Organism_group_id to organism_group_member # organism_id to organism diff --git a/src/ensembl/production/metadata/api/models/release.py b/src/ensembl/production/metadata/api/models/release.py index 449bc2fa..dfdda00d 100644 --- a/src/ensembl/production/metadata/api/models/release.py +++ b/src/ensembl/production/metadata/api/models/release.py @@ -12,7 +12,7 @@ import enum import sqlalchemy -from sqlalchemy import Column, Integer, String, Index, DECIMAL, Date, ForeignKey +from sqlalchemy import Column, Integer, String, Index, DECIMAL, Date, ForeignKey, Enum from sqlalchemy.dialects.mysql import TINYINT from sqlalchemy.orm import relationship @@ -52,19 +52,21 @@ class EnsemblRelease(LoadAble, Base): Index('ensembl_release_version_site_id_b743399a_uniq', 'version', 'site_id', unique=True), ) - release_id = Column(Integer, primary_key=True, nullable=True) + release_id = Column(Integer, primary_key=True) version = Column(DECIMAL(10, 1), nullable=False) - release_date = Column(Date, nullable=True) - label = Column(String(64)) + release_date = Column(Date, nullable=False) + label = Column(String(64), 
nullable=False) is_current = Column(TINYINT(1), nullable=False, default=0) site_id = Column(ForeignKey('ensembl_site.site_id'), index=True) - release_type = Column(String(16), nullable=False) + release_type = Column(Enum('partial', 'integrated'), nullable=False) status = Column(ReleaseStatusType, nullable=False, default=ReleaseStatus.PLANNED) - name = Column(String(3), nullable=False) + name = Column(String(3)) # One to many relationships # release_id to genome dataset and genome release genome_datasets = relationship('GenomeDataset', back_populates='ensembl_release') genome_releases = relationship('GenomeRelease', back_populates='ensembl_release') + genome_group_members = relationship('GenomeGroupMember', back_populates='ensembl_release') + # many to one relationships # Added fileter condition on every join to EnsemblSite for code clarity # No other than configure site data should be returned diff --git a/src/ensembl/production/metadata/api/models/taxonomy.py b/src/ensembl/production/metadata/api/models/taxonomy.py index fe5d066c..ca326032 100644 --- a/src/ensembl/production/metadata/api/models/taxonomy.py +++ b/src/ensembl/production/metadata/api/models/taxonomy.py @@ -13,6 +13,7 @@ from ensembl.ncbi_taxonomy.models import NCBITaxaName from ensembl.ncbi_taxonomy.models import NCBITaxaNode +__all__ = ['EnsemblTaxaNode', 'EnsemblTaxaName', 'NCBITaxaName', 'NCBITaxaNode'] class EnsemblTaxaNode(NCBITaxaNode): __tablename__ = 'ncbi_taxa_node' diff --git a/src/ensembl/production/metadata/api/search/search.py b/src/ensembl/production/metadata/api/search/search.py new file mode 100644 index 00000000..2d4dcda5 --- /dev/null +++ b/src/ensembl/production/metadata/api/search/search.py @@ -0,0 +1,344 @@ +# regarding copyright ownership. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#     http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Optional, List
+
+from ensembl.utils.database import DBConnection
+from pydantic import BaseModel, ConfigDict
+from sqlalchemy.orm import Session, joinedload
+
+from ensembl.production.metadata.api.models import (
+    Genome, Dataset, DatasetAttribute,
+    EnsemblRelease, GenomeRelease, GenomeDataset, Attribute,
+    ReleaseStatus
+)
+
+
+# ============================================================================
+# PYDANTIC SCHEMAS
+# ============================================================================
+
+class GenomeSearchDocument(BaseModel):
+    """Schema of one search-index entry: a genome as seen in one release.
+
+    Instances are built from the dict returned by
+    ``GenomeSearchQueryHelper.extract_genome_data``.
+    """
+
+    model_config = ConfigDict(from_attributes=True)
+
+    # Direct fields from Genome/Organism/Assembly
+    genome_uuid: str
+    common_name: Optional[str] = None
+    scientific_name: str
+    strain_type: Optional[str] = None
+    strain: Optional[str] = None
+    assembly_name: str
+    accession: str
+    url_name: Optional[str] = None
+    tol_id: Optional[str] = None
+    is_reference: bool
+    species_taxonomy_id: int
+    taxonomy_id: int
+    scientific_parlance_name: Optional[str] = None
+    organism_id: int
+    rank: int = 0
+
+    # Fields derived from dataset attributes. The lookups return None when the
+    # attribute is not stored for the release, so these must accept None rather
+    # than abort the whole indexing run with a validation error.
+    contig_n50: Optional[int] = None
+    coding_genes: Optional[int] = None
+    has_variation: bool = False
+    has_regulation: bool = False
+    genebuild_provider: Optional[str] = None
+    genebuild_method_display: Optional[str] = None
+
+    # Release information
+    release_type: str
+    release_label: str
+    release_id: int
+
+
+# ============================================================================
+# QUERY HELPER CLASS
+# ============================================================================
+
+class GenomeSearchQueryHelper:
+    """Runs the per-genome dataset queries needed to build a search document."""
+
+    def __init__(self, session: Session):
+        """Keep the SQLAlchemy session every query of this helper runs on."""
+        self.session = session
+
+    @staticmethod
+    def _apply_release_filter(query, release_id: int, release_type: str):
+        """Restrict a query over GenomeDataset to the datasets of one release.
+
+        An 'integrated' release selects datasets attached to ``release_id``;
+        any other release type (i.e. partial) selects the datasets flagged
+        ``is_current``.
+        """
+        if release_type == 'integrated':
+            return query.filter(GenomeDataset.release_id == release_id)
+        return query.filter(GenomeDataset.is_current == 1)
+
+    def _get_dataset_attribute_value(
+            self,
+            genome_id: int,
+            release_id: int,
+            release_type: str,
+            dataset_type_name: str,
+            attribute_name: str
+    ) -> Optional[str]:
+        """
+        Return one attribute value of a genome's dataset for the given release.
+
+        Args:
+            genome_id: Genome whose datasets are searched.
+            release_id: Release used to pick the dataset when integrated.
+            release_type: 'integrated' or 'partial'.
+            dataset_type_name: Name of the dataset type (e.g. 'genebuild').
+            attribute_name: Name of the attribute to read.
+
+        Returns:
+            The value of the first matching row, or None when nothing matches.
+        """
+        query = (
+            self.session.query(DatasetAttribute.value)
+            .join(Attribute, DatasetAttribute.attribute_id == Attribute.attribute_id)
+            .join(Dataset, DatasetAttribute.dataset_id == Dataset.dataset_id)
+            .join(Dataset.dataset_type)
+            .join(GenomeDataset, Dataset.dataset_id == GenomeDataset.dataset_id)
+            .filter(
+                GenomeDataset.genome_id == genome_id,
+                Dataset.dataset_type.has(name=dataset_type_name),
+                Attribute.name == attribute_name
+            )
+        )
+        query = self._apply_release_filter(query, release_id, release_type)
+
+        result = query.first()
+        return result[0] if result else None
+
+    def _has_dataset_type(
+            self,
+            genome_id: int,
+            release_id: int,
+            release_type: str,
+            dataset_type_name: str
+    ) -> bool:
+        """Return True when the genome has a dataset of that type for the release."""
+        query = (
+            self.session.query(GenomeDataset)
+            .join(Dataset, GenomeDataset.dataset_id == Dataset.dataset_id)
+            .join(Dataset.dataset_type)
+            .filter(
+                GenomeDataset.genome_id == genome_id,
+                Dataset.dataset_type.has(name=dataset_type_name)
+            )
+        )
+        query = self._apply_release_filter(query, release_id, release_type)
+
+        return self.session.query(query.exists()).scalar()
+
+    def _get_genebuild_provider(
+            self,
+            genome: Genome,
+            release_id: int,
+            release_type: str
+    ) -> Optional[str]:
+        """Return the genebuild display provider, else ``genome.provider_name``."""
+        provider = self._get_dataset_attribute_value(
+            genome.genome_id,
+            release_id,
+            release_type,
+            'genebuild',
+            'genebuild.provider_name_display'
+        )
+
+        # The dataset attribute wins; the genome column is only the fallback.
+        return provider if provider else genome.provider_name
+
+    def extract_genome_data(
+            self,
+            genome: Genome,
+            release: EnsemblRelease
+    ) -> dict:
+        """
+        Collect every GenomeSearchDocument field for one genome in one release.
+
+        Args:
+            genome: Genome to describe; its organism and assembly are read.
+            release: Release whose datasets provide the derived fields.
+
+        Returns:
+            A dict keyed by GenomeSearchDocument field name.
+        """
+        organism = genome.organism
+        assembly = genome.assembly
+        # Leading arguments shared by all the dataset lookups below.
+        scope = (genome.genome_id, release.release_id, release.release_type)
+
+        return {
+            # Direct fields
+            'genome_uuid': genome.genome_uuid,
+            'common_name': organism.common_name,
+            'scientific_name': organism.scientific_name,
+            'strain_type': organism.strain_type,
+            'strain': organism.strain,
+            'assembly_name': assembly.name,
+            'accession': assembly.accession,
+            'url_name': genome.url_name,
+            'tol_id': organism.tol_id,
+            'is_reference': bool(assembly.is_reference),
+            'species_taxonomy_id': organism.species_taxonomy_id,
+            # Required by the schema; it was missing, so validation always failed.
+            'taxonomy_id': organism.taxonomy_id,
+            'scientific_parlance_name': organism.scientific_parlance_name,
+            'organism_id': genome.organism_id,
+            'rank': organism.rank or 0,
+
+            # Complex dataset fields
+            'contig_n50': self._get_dataset_attribute_value(
+                *scope, 'assembly', 'assembly.stats.contig_n50'
+            ),
+            'coding_genes': self._get_dataset_attribute_value(
+                *scope, 'genebuild', 'genebuild.stats.coding_genes'
+            ),
+            'has_variation': self._has_dataset_type(*scope, 'variation'),
+            'has_regulation': self._has_dataset_type(*scope, 'regulatory_features'),
+            'genebuild_provider': self._get_genebuild_provider(
+                genome, release.release_id, release.release_type
+            ),
+            'genebuild_method_display': self._get_dataset_attribute_value(
+                *scope, 'genebuild', 'genebuild.method_display'
+            ),
+
+            # Release fields
+            'release_type': release.release_type,
+            'release_label': release.label,
+            'release_id': release.release_id,
+        }
+
+
+# ============================================================================
+# MAIN SERVICE CLASS
+# ============================================================================
+
+class GenomeSearchIndexer:
+    """Service for generating genome search documents"""
+
+    def __init__(self, session: Session):
+        """
+        Args:
+            session: Open SQLAlchemy session on the metadata database. The
+                caller owns it; this class never commits or closes it.
+        """
+        # The previous body read an undefined `session` and an un-imported
+        # `cfg`, so construction always raised NameError. Both entry points
+        # of this module already pass a Session, which is what is used here.
+        self.session = session
+        self.query_helper = GenomeSearchQueryHelper(session)
+
+    def _get_relevant_release(self, genome: Genome) -> List[EnsemblRelease]:
+        """
+        Return the released releases a genome must be indexed for.
+
+        Partial releases take precedence: when the genome is in at least one
+        released partial release only those are returned, otherwise all its
+        released integrated releases. The list is empty when there is none.
+        """
+        releases = (
+            self.session.query(EnsemblRelease)
+            .join(GenomeRelease)
+            .filter(
+                GenomeRelease.genome_id == genome.genome_id,
+                EnsemblRelease.status == ReleaseStatus.RELEASED
+            )
+            .all()
+        )
+
+        partial_releases = [r for r in releases if r.release_type == 'partial']
+        if partial_releases:
+            return partial_releases
+
+        return [r for r in releases if r.release_type == 'integrated']
+
+    def get_released_genomes(self) -> List[Genome]:
+        """Return every non-suppressed genome attached to a released release.
+
+        Organism, assembly and release links are eager-loaded with the genomes.
+        """
+        return (
+            self.session.query(Genome)
+            .join(GenomeRelease)
+            .join(EnsemblRelease)
+            .filter(
+                EnsemblRelease.status == ReleaseStatus.RELEASED,
+                Genome.suppressed == 0
+            )
+            .options(
+                joinedload(Genome.organism),
+                joinedload(Genome.assembly),
+                joinedload(Genome.genome_releases).joinedload(GenomeRelease.ensembl_release)
+            )
+            .distinct()
+            .all()
+        )
+
+    def create_search_documents(
+            self,
+            genome: Genome
+    ) -> List[GenomeSearchDocument]:
+        """
+        Create search documents for a genome.
+
+        Returns one document per relevant release (a genome can be in several
+        integrated releases); the list is empty when it is in no released one.
+        """
+        return [
+            GenomeSearchDocument(**self.query_helper.extract_genome_data(genome, release))
+            for release in self._get_relevant_release(genome)
+        ]
+
+    def generate_all_search_documents(self) -> List[GenomeSearchDocument]:
+        """Generate search documents for all released genomes"""
+        all_documents = []
+        for genome in self.get_released_genomes():
+            all_documents.extend(self.create_search_documents(genome))
+
+        return all_documents
+
+    def generate_search_documents_as_dicts(self) -> List[dict]:
+        """Generate search documents as dictionaries for indexing"""
+        return [doc.model_dump() for doc in self.generate_all_search_documents()]
+
+
+# ============================================================================
+# USAGE EXAMPLES
+# ============================================================================
+
+def index_genomes_for_search(session: Session):
+    """Return the search documents of all released genomes as dicts.
+
+    Args:
+        session: Open SQLAlchemy session on the metadata database.
+    """
+    indexer = GenomeSearchIndexer(session)
+
+    # Get all documents as dicts ready for search indexing
+    search_documents = indexer.generate_search_documents_as_dicts()
+
+    # Send to your search service (Elasticsearch, Solr, etc.)
+    # send_to_search_index(search_documents)
+
+    return search_documents
+
+
+def index_single_genome(session: Session, genome_uuid: str):
+    """Return the search documents of one genome as dicts.
+
+    Args:
+        session: Open SQLAlchemy session on the metadata database.
+        genome_uuid: UUID of the genome to index.
+
+    Raises:
+        ValueError: If no genome has that UUID.
+    """
+    indexer = GenomeSearchIndexer(session)
+
+    genome = (
+        session.query(Genome)
+        .filter(Genome.genome_uuid == genome_uuid)
+        .options(
+            joinedload(Genome.organism),
+            joinedload(Genome.assembly)
+        )
+        .first()
+    )
+
+    if not genome:
+        raise ValueError(f"Genome {genome_uuid} not found")
+
+    documents = indexer.create_search_documents(genome)
+    return [doc.model_dump() for doc in documents]
diff --git a/src/ensembl/production/metadata/grpc/protobuf_msg_factory.py b/src/ensembl/production/metadata/grpc/protobuf_msg_factory.py
index 8531c5a6..13d03703 100644
--- a/src/ensembl/production/metadata/grpc/protobuf_msg_factory.py
+++ b/src/ensembl/production/metadata/grpc/protobuf_msg_factory.py
@@ -9,10 +9,10 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import logging from datetime import datetime from ensembl.production.metadata.grpc import ensembl_metadata_pb2 -import logging logger = logging.getLogger(__name__) @@ -116,8 +116,6 @@ def create_assembly(data=None): ucsc_name=data.Assembly.ucsc_name, ensembl_name=data.Assembly.ensembl_name, is_reference=data.Assembly.is_reference, - url_name=data.Assembly.url_name, - tol_id=data.Assembly.tol_id, ) return assembly diff --git a/src/ensembl/production/metadata/grpc/utils.py b/src/ensembl/production/metadata/grpc/utils.py index f4893de1..0e4acef7 100644 --- a/src/ensembl/production/metadata/grpc/utils.py +++ b/src/ensembl/production/metadata/grpc/utils.py @@ -160,7 +160,7 @@ def create_genome_with_attributes_and_count(db_conn, genome, release_version): ) -def get_genomes_from_assembly_accession_iterator(db_conn, assembly_accession, release_version): +def get_genomes_from_assembly_accession_iterator(db_conn, assembly_accession): if not assembly_accession: logger.warning("Missing or Empty Assembly accession field.") return msg_factory.create_genome() diff --git a/src/ensembl/production/metadata/scripts/organism_to_organismgroup.py b/src/ensembl/production/metadata/scripts/organism_to_organismgroup.py index c85b1203..ffaf6592 100644 --- a/src/ensembl/production/metadata/scripts/organism_to_organismgroup.py +++ b/src/ensembl/production/metadata/scripts/organism_to_organismgroup.py @@ -1,12 +1,25 @@ -import os +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + import argparse import logging +import os from ensembl.core.models import Meta from ensembl.utils.database import DBConnection -from ensembl.production.metadata.api.models.organism import OrganismGroup, OrganismGroupMember, Organism -from ensembl.production.metadata.api.models.genome import Genome, GenomeDataset, GenomeRelease + from ensembl.production.metadata.api.models.dataset import Dataset, DatasetSource +from ensembl.production.metadata.api.models.genome import Genome, GenomeDataset, GenomeRelease +from ensembl.production.metadata.api.models.organism import OrganismGroup, OrganismGroupMember, Organism # Set up the logging configuration logging.basicConfig( diff --git a/src/ensembl/production/metadata/updater/base.py b/src/ensembl/production/metadata/updater/base.py index 35c69f5c..cda3cdf9 100644 --- a/src/ensembl/production/metadata/updater/base.py +++ b/src/ensembl/production/metadata/updater/base.py @@ -18,11 +18,13 @@ class BaseMetaUpdater: - def __init__(self, db_uri, metadata_uri, release=None): + def __init__(self, db_uri, metadata_uri, taxonomy_uri, release=None): self.db_uri = db_uri self.metadata_uri = metadata_uri + self.taxonomy_uri = taxonomy_uri self.db = DBConnection(self.db_uri) self.metadata_db = DBConnection(metadata_uri) + self.taxonomy_db = DBConnection(taxonomy_uri) # We will add a release later. For now, the release must be specified for it to be used. 
if release is None: self.listed_release = None diff --git a/src/ensembl/production/metadata/updater/core.py b/src/ensembl/production/metadata/updater/core.py index 92d7b030..4195d0f0 100644 --- a/src/ensembl/production/metadata/updater/core.py +++ b/src/ensembl/production/metadata/updater/core.py @@ -10,6 +10,7 @@ # See the License for the specific language governing permissions and # limitations under the License.` import logging +import re import uuid from collections import defaultdict @@ -27,7 +28,6 @@ from ensembl.production.metadata.api.factories.datasets import DatasetFactory from ensembl.production.metadata.api.models import * from ensembl.production.metadata.updater.base import BaseMetaUpdater -from ensembl.production.metadata.updater.updater_utils import update_attributes logging.basicConfig(level=logging.INFO) @@ -35,16 +35,17 @@ class CoreMetaUpdater(BaseMetaUpdater): - def __init__(self, db_uri, metadata_uri, release=None): - super().__init__(db_uri, metadata_uri, release) + def __init__(self, db_uri, metadata_uri, taxonomy_uri, release=None): + super().__init__(db_uri, metadata_uri, taxonomy_uri, release) self.db_type = 'core' - # Single query to get all of the metadata information. self.meta_dict = {} self._load_meta_dict() self._validate_required_attributes() def _load_meta_dict(self): - """Load metadata into meta_dict from the database.""" + """Load metadata into meta_dict from the database. + Stores all values for each meta_key as a list to handle potential duplicates. + """ with self.db.session_scope() as session: results = session.query(Meta).filter(Meta.meta_value.isnot(None), Meta.meta_value.notin_(['', 'Null', 'NULL'])).all() @@ -54,40 +55,82 @@ def _load_meta_dict(self): meta_value = result.meta_value if species_id not in self.meta_dict: self.meta_dict[species_id] = {} - # WARNING! Duplicated meta_keys for a species_id will not error out!. A datacheck is necessary for key values. 
- self.meta_dict[species_id][meta_key] = meta_value + if meta_key not in self.meta_dict[species_id]: + self.meta_dict[species_id][meta_key] = [] + self.meta_dict[species_id][meta_key].append(meta_value) def _validate_required_attributes(self): """Check if all required attributes are present in the meta_dict for each species.""" - required_attribute_names = [] + # TODO: Move to datacheck with self.metadata_db.session_scope() as session: - # Query the attribute table to get all required attributes required_attributes = session.query(Attribute.name).filter(Attribute.required == 1).all() required_attribute_names = {attr.name for attr in required_attributes} - with self.db.session_scope() as session: - # Check each species_id in meta_dict - missing_attributes = {} - for species_id, meta in self.meta_dict.items(): - missing = required_attribute_names - set(meta.keys()) - if missing: - missing_attributes[species_id] = missing - - if missing_attributes: - exceptions.MissingMetaException( - "Species ID {species_id} is missing required attributes: {missing_attributes}") - - # Basic API for the meta table in the submission database. + missing_attributes = {} + for species_id, meta in self.meta_dict.items(): + if species_id is None: + continue + missing = required_attribute_names - set(meta.keys()) + if missing: + missing_attributes[species_id] = missing + + if missing_attributes: + error_msg = "\n".join([ + f"Species ID {species_id} is missing required attributes: {', '.join(sorted(missing))}" + for species_id, missing in missing_attributes.items() + ]) + raise exceptions.MissingMetaException(error_msg) + def get_meta_single_meta_key(self, species_id, parameter): + """ + Get a single value for a meta_key. + Raises an exception if multiple values exist for the same key. 
+ + Returns: + str or None: The meta value, or None if not found + + Raises: + DuplicateMetaKeyException: If multiple values exist for the key + """ species_meta = self.meta_dict.get(species_id) if species_meta is None: return None - return species_meta.get(parameter) + + values = species_meta.get(parameter, [None]) + + if len(values) > 1: + raise exceptions.MetaException( + f"Species {species_id} has {len(values)} values for meta_key '{parameter}': {values}. " + f"A single key is currently required to successfully hand over." + ) + + return values[0] + + def get_meta_all_values(self, species_id, parameter): + """ + Get all values for a meta_key, handling cases with 0, 1, or multiple values. + + Returns: + list: List of all values for the key (empty list if none exist) + """ + species_meta = self.meta_dict.get(species_id) + if species_meta is None: + return [] + + return species_meta.get(parameter, []) def get_meta_list_from_prefix_meta_key(self, species_id, prefix): + """ + Get all meta_keys with a given prefix, including all values. 
+ + Returns: + dict or None: Dictionary of {key: [values]} where values is always a list, + or None if species not found + """ species_meta = self.meta_dict.get(species_id) if species_meta is None: return None + result_dict = {k: v for k, v in species_meta.items() if k.startswith(prefix)} return result_dict @@ -134,7 +177,6 @@ def process_core(self, **kwargs): # Process each species in its own transaction with self.metadata_db.session_scope() as meta_session: self.process_species(species_id, meta_session) - # If we get here without exception, the species was successful successful_species.append((species_id, production_name)) if len(multi_species) > 1: logger.info(f"Successfully processed species {species_id}: {production_name}") @@ -192,8 +234,9 @@ def process_species(self, species_id, meta_session): organism = self.get_or_new_organism(species_id, meta_session) assembly, assembly_dataset, assembly_dataset_attributes, assembly_sequences, dataset_source = self.get_or_new_assembly( species_id, meta_session) - genebuild_dataset, genebuild_dataset_attributes = self.get_or_new_genebuild(species_id, meta_session, - dataset_source) + genebuild_dataset, genebuild_dataset_attributes = self._create_genebuild( + species_id, meta_session, dataset_source + ) # Checking for an existing genome uuid: old_genome_uuid = self.get_meta_single_meta_key(species_id, "genome.genome_uuid") @@ -203,15 +246,18 @@ def process_species(self, species_id, meta_session): if old_genome is not None: raise exceptions.MetadataUpdateException( f"Species {species_id}: Core database contains a genome.genome_uuid which matches an entry in the meta table.") + # TODO: Move to datacheck else: raise exceptions.MetadataUpdateException( f"Species {species_id}: Database contains a Genome.genome_uuid, but corresponding data is not in meta table.") + # TODO: Move to datacheck if self.is_object_new(organism): logger.info(f'Species {species_id}: New organism') - if not self.is_object_new(genebuild_dataset): + if not 
self.is_object_new(assembly): raise exceptions.MetadataUpdateException( - f"Species {species_id}: New organism, but existing assembly accession and/or genebuild version") + f"Species {species_id}: New organism, but existing assembly accession") + # TODO: Move to datacheck , but leave here to be sure new_genome, assembly_genome_dataset, genebuild_genome_dataset = self.new_genome(meta_session, species_id, organism, @@ -220,13 +266,8 @@ def process_species(self, species_id, meta_session): genebuild_dataset) self.concurrent_commit_genome_uuid(meta_session, species_id, new_genome.genome_uuid) - - elif self.is_object_new(assembly): logger.info(f'Species {species_id}: New assembly') - if not self.is_object_new(genebuild_dataset): - raise exceptions.MetadataUpdateException( - f"Species {species_id}: New assembly, but existing genebuild version") new_genome, assembly_genome_dataset, genebuild_genome_dataset = self.new_genome(meta_session, species_id, organism, @@ -237,32 +278,18 @@ def process_species(self, species_id, meta_session): # Create genome and populate the database with assembly and dataset - elif self.is_object_new(genebuild_dataset): - # Check that genest update or provider name has changed from last time. 
- - dataset_attr_alias1 = aliased(DatasetAttribute) - attribute_alias1 = aliased(Attribute) - dataset_attr_alias2 = aliased(DatasetAttribute) - attribute_alias2 = aliased(Attribute) + else: provider_name = self.get_meta_single_meta_key(species_id, "genebuild.provider_name") geneset_update = self.get_meta_single_meta_key(species_id, "genebuild.last_geneset_update") - query = meta_session.query(Assembly).join( - Genome, Assembly.genomes - ).join(GenomeDataset, Genome.genome_datasets - ).join(Dataset, GenomeDataset.dataset - ).join(dataset_attr_alias1, Dataset.dataset_attributes - ).join(attribute_alias1, dataset_attr_alias1.attribute - ).join(dataset_attr_alias2, Dataset.dataset_attributes - ).join(attribute_alias2, dataset_attr_alias2.attribute - ).filter(Assembly.accession == assembly.accession, - Dataset.dataset_type.has(name="genebuild"), - and_( - attribute_alias1.name == "genebuild.provider_name", - dataset_attr_alias1.value == provider_name, - attribute_alias2.name == "genebuild.last_geneset_update", - dataset_attr_alias2.value == geneset_update - ) - ) + + query = meta_session.query(Genome).join( + Assembly, Genome.assembly + ).filter( + Assembly.accession == assembly.accession, + Genome.provider_name == provider_name, + Genome.genebuild_date == geneset_update + ) + if meta_session.query(query.exists()).scalar(): raise exceptions.MetadataUpdateException( "genebuild.provider_name or genebuild.last_geneset_update must be updated.") @@ -277,15 +304,7 @@ def process_species(self, species_id, meta_session): self.concurrent_commit_genome_uuid(meta_session, species_id, new_genome.genome_uuid) - else: - # Check if the data has been released - if check_release_status(self.metadata_db, genebuild_dataset.dataset_uuid): - raise exceptions.WrongReleaseException( - f"Species {species_id}: Existing Organism, Assembly, and Datasets within a release.") - else: - logger.info(f'Species {species_id}: Rewrite of existing datasets attempted') - raise 
exceptions.MetadataUpdateException( - f"Species {species_id}: This looks like a reload of data that hasn't been released.") + def concurrent_commit_genome_uuid(self, meta_session, species_id, genome_uuid): # Currently impossible with myisam without two phase commit (requires full refactor) @@ -316,10 +335,12 @@ def concurrent_commit_genome_uuid(self, meta_session, species_id, genome_uuid): def new_genome(self, meta_session, species_id, organism, assembly, assembly_dataset, genebuild_dataset): production_name = self.get_meta_single_meta_key(species_id, "organism.production_name") - genebuild_version = self.get_meta_single_meta_key(species_id, "genebuild.version") genebuild_date = self.get_meta_single_meta_key(species_id, "genebuild.last_geneset_update") - if genebuild_date is None: - raise exceptions.MetadataUpdateException(f"Unable to parse genebuild.last_geneset_update from meta") + url_name = self.get_meta_single_meta_key(species_id, "assembly.url_name") + provider_name = self.get_meta_single_meta_key(species_id, "genebuild.provider_name") + annotation_source = self.get_meta_single_meta_key(species_id, "genebuild.annotation_source") + if None in (production_name, genebuild_date, annotation_source, provider_name): + raise exceptions.MetadataUpdateException(f"Unable to find required keys from meta") # get next release inline to attach the genome to planned_release = get_or_new_release(self.metadata_uri) new_genome = Genome( @@ -327,10 +348,11 @@ def new_genome(self, meta_session, species_id, organism, assembly, assembly_data assembly=assembly, organism=organism, genebuild_date=genebuild_date, - genebuild_version=genebuild_version, created=func.now(), - is_best=0, production_name=production_name, + url_name=url_name, + annotation_source=annotation_source, + provider_name=provider_name, ) logger.debug(f"Assigning genome {new_genome.genome_uuid} to {planned_release.version}") meta_session.add(new_genome) @@ -351,6 +373,10 @@ def new_genome(self, meta_session, 
species_id, organism, assembly, assembly_data is_current=True, ) meta_session.add(genebuild_genome_dataset) + + self._create_genome_group_members(meta_session, species_id, new_genome, planned_release) + + # Homology dataset creation homology_uuid, homology_dataset, homology_dataset_attributes, homology_genome_dataset = self.new_homology( meta_session, species_id, genome=new_genome) @@ -364,6 +390,46 @@ def new_genome(self, meta_session, species_id, organism, assembly, assembly_data return new_genome, assembly_genome_dataset, genebuild_genome_dataset + def _create_genome_group_members(self, meta_session, species_id, new_genome, planned_release): + """ + Add genome to genome groups specified in meta keys. + + Args: + meta_session: The metadata database session + species_id: The species ID from the core database + new_genome: The newly created Genome object + planned_release: The EnsemblRelease object + + Raises: + MetadataUpdateException: If a specified genome group doesn't exist + """ + genome_group_names = self.get_meta_all_values(species_id, "genome.genome_group") + + if not genome_group_names: + return None + + for group_name in genome_group_names: + # Check if the genome group exists + genome_group = meta_session.query(GenomeGroup).filter( + GenomeGroup.name == group_name + ).one_or_none() + + if genome_group is None: + raise exceptions.MetadataUpdateException( + f"Genome group '{group_name}' specified in meta key 'genome.genome_group' does not exist in the database" + ) + + # Create GenomeGroupMember + genome_group_member = GenomeGroupMember( + genome=new_genome, + genome_group=genome_group, + ensembl_release=planned_release, + is_current=1, + is_reference=0, + ) + meta_session.add(genome_group_member) + logger.info(f"Added genome {new_genome.genome_uuid} to genome group '{group_name}'") + def get_or_new_organism(self, species_id, meta_session): """ Get an existing Organism instance or create a new one, depending on the information from the metadata database. 
@@ -373,6 +439,8 @@ def get_or_new_organism(self, species_id, meta_session): biosample_id = self.get_meta_single_meta_key(species_id, "organism.biosample_id") if biosample_id is None: biosample_id = self.get_meta_single_meta_key(species_id, "organism.production_name") + tol_id = self.get_meta_single_meta_key(species_id, "assembly.tol_id") # This one should be deleted eventually. + tol_id = self.get_meta_single_meta_key(species_id, "organism.tol_id") # Getting the common name from the meta table, otherwise we grab it from ncbi. common_name = self.get_meta_single_meta_key(species_id, "organism.common_name") @@ -380,7 +448,7 @@ def get_or_new_organism(self, species_id, meta_session): if taxid is None: raise exceptions.MissingMetaException("organism.taxid is required") if common_name is None: - with self.metadata_db.session_scope() as session: + with self.taxonomy_db.session_scope() as session: common_name = session.query(NCBITaxaName).filter( NCBITaxaName.taxon_id == taxid, NCBITaxaName.name_class == "genbank common name" @@ -401,8 +469,9 @@ def get_or_new_organism(self, species_id, meta_session): scientific_name=self.get_meta_single_meta_key(species_id, "organism.scientific_name"), biosample_id=biosample_id, strain=self.get_meta_single_meta_key(species_id, "organism.strain"), - strain_type=self.get_meta_single_meta_key(species_id, "organism.type"), - scientific_parlance_name=self.get_meta_single_meta_key(species_id, "organism.scientific_parlance_name") + strain_type=self.get_meta_single_meta_key(species_id, "organism.strain_type"), + scientific_parlance_name=self.get_meta_single_meta_key(species_id, "organism.scientific_parlance_name"), + tol_id=tol_id, ) # Query the metadata database to find if an Organism with the same Ensembl name already exists. @@ -416,7 +485,7 @@ def get_or_new_organism(self, species_id, meta_session): # If no existing Organism is found, conduct additional checks before creating a new one. 
# Check if the new organism's taxonomy ID exists in the taxonomy database. - with self.metadata_db.session_scope() as session: + with self.taxonomy_db.session_scope() as session: try: Taxonomy.fetch_node_by_id(session, new_organism.taxonomy_id) except NoResultFound: @@ -435,16 +504,19 @@ def get_or_new_organism(self, species_id, meta_session): def get_assembly_sequences(self, species_id, assembly): """ - Get the assembly sequences and the values that correspond to the metadata table + Get the assembly sequences and aliases from the core DB. + Returns both AssemblySequence and SequenceAlias objects. """ assembly_sequences = [] + sequence_aliases = [] + with self.db.session_scope() as session: circular_seq_attrib = aliased(SeqRegionAttrib) results = (session.query(SeqRegion.name, SeqRegion.length, CoordSystem.name.label("coord_system_name"), SeqRegionSynonym.synonym, circular_seq_attrib.value.label("is_circular")) .outerjoin(SeqRegion.coord_system) .outerjoin(SeqRegionSynonym, SeqRegionSynonym.seq_region_id == SeqRegion.seq_region_id) - .join(SeqRegion.seq_region_attrib) # For other attributes + .join(SeqRegion.seq_region_attrib) .outerjoin(circular_seq_attrib, and_(circular_seq_attrib.seq_region_id == SeqRegion.seq_region_id, circular_seq_attrib.attrib_type.has(code="circular_seq"))) @@ -453,73 +525,64 @@ def get_assembly_sequences(self, species_id, assembly): .filter(AttribType.code == "toplevel") .filter(CoordSystem.name != "lrg") .all()) + attributes = (session.query(SeqRegion.name, AttribType.code, SeqRegionAttrib.value) .select_from(SeqRegion) .join(SeqRegionAttrib) .join(AttribType) .filter(or_(AttribType.code == "sequence_location", AttribType.code == "karyotype_rank")).all()) + attribute_dict = {} for name, code, value in attributes: if name not in attribute_dict: attribute_dict[name] = {} attribute_dict[name][code] = value + # Single pass: collect synonyms AND process sequence info + synonym_dict = defaultdict(list) accession_info = defaultdict( - # The 
None's here are improper, but they break far too much for this update if they are changed. - # When accession is decided I will fix them. - # TODO: Just delete the comment. No one cares about the assembly sequence table. lambda: { - "names": set(), "accession": None, "length": None, "location": None, "chromosomal": None, - "karyotype_rank": None + "length": None, + "location": None, + "chromosomal": None, + "karyotype_rank": None, + "type": None, + "is_circular": 0, }) + location_mapping = { + "nuclear_chromosome": "SO:0000738", + "mitochondrial_chromosome": "SO:0000737", + "chloroplast_chromosome": "SO:0000745", + "apicoplast_chromosome": "SO:0001259", + None: "SO:0000738", + } + for seq_region_name, seq_region_length, coord_system_name, synonym, is_circular in results: - accession_info[seq_region_name]["names"].add(seq_region_name) if synonym: - accession_info[seq_region_name]["names"].add(synonym) - - # Save the sequence location, length, and chromosomal flag. - location_mapping = { - 'nuclear_chromosome': 'SO:0000738', - 'mitochondrial_chromosome': 'SO:0000737', - 'chloroplast_chromosome': 'SO:0000745', - 'apicoplast_chromosome': 'SO:0001259', - None: 'SO:0000738', - } - # Try to get the sequence location - location = attribute_dict.get(seq_region_name, {}).get("sequence_location", None) - - # Using the retrieved location to get the sequence location - sequence_location = location_mapping[location] - - # Try to get the karyotype rank - karyotype_rank = attribute_dict.get(seq_region_name, {}).get("karyotype_rank", None) - - # Test if chromosomal: - if karyotype_rank is not None: - chromosomal = 1 - else: - chromosomal = 1 if coord_system_name == "chromosome" else 0 - - # Assign the values to the dictionary - if not accession_info[seq_region_name]["length"]: - accession_info[seq_region_name]["length"] = seq_region_length + synonym_dict[seq_region_name].append(synonym) - if not accession_info[seq_region_name]["location"]: - 
accession_info[seq_region_name]["location"] = sequence_location + if accession_info[seq_region_name]["length"] is None: + location = attribute_dict.get(seq_region_name, {}).get("sequence_location", None) + sequence_location = location_mapping[location] + karyotype_rank = attribute_dict.get(seq_region_name, {}).get("karyotype_rank", None) - if accession_info[seq_region_name]["chromosomal"] is None: # Assuming default is None - accession_info[seq_region_name]["chromosomal"] = chromosomal + chromosomal = 1 if karyotype_rank is not None else (1 if coord_system_name == "chromosome" else 0) - if not accession_info[seq_region_name]["karyotype_rank"]: - accession_info[seq_region_name]["karyotype_rank"] = karyotype_rank + accession_info[seq_region_name].update({ + "length": seq_region_length, + "location": sequence_location, + "chromosomal": chromosomal, + "karyotype_rank": karyotype_rank, + "type": coord_system_name, + "is_circular": 1 if is_circular == "1" else 0 + }) - accession_info[seq_region_name]["type"] = coord_system_name - accession_info[seq_region_name]["is_circular"] = 1 if is_circular == "1" else 0 + for seq_region_name, info in accession_info.items(): + # Determine the proper accession + accession = self._get_valid_accession(seq_region_name, synonym_dict.get(seq_region_name, [])) - for accession, info in accession_info.items(): - seq_region_name = accession assembly_sequence = AssemblySequence( name=seq_region_name, assembly=assembly, @@ -528,138 +591,333 @@ def get_assembly_sequences(self, species_id, assembly): length=info["length"], sequence_location=info["location"], chromosome_rank=info["karyotype_rank"], - # md5="", Populated after checksums are ran. - # sha512t4u="", Populated after checksums are ran. 
type=info["type"], is_circular=info["is_circular"] ) - assembly_sequences.append(assembly_sequence) - return assembly_sequences + + # Create SequenceAlias objects for each synonym + for synonym in synonym_dict.get(seq_region_name, []): + sequence_alias = SequenceAlias( + assembly_sequence=assembly_sequence, + alias=synonym, + source="core" + ) + sequence_aliases.append(sequence_alias) + + return assembly_sequences, sequence_aliases + + + def _is_valid_ena_accession(self, identifier): + """ + Check if an identifier matches ENA sequence identifier rules for annotated sequences. + + Valid patterns: + - [A-Z]{1}[0-9]{5}.[0-9]+ + - [A-Z]{2}[0-9]{6}.[0-9]+ + - [A-Z]{2}[0-9]{8} + - [A-Z]{4}[0-9]{2}S?[0-9]{6,8} + - [A-Z]{6}[0-9]{2}S?[0-9]{7,9} + + Returns: + bool: True if identifier matches any pattern + """ + ENA_ACCESSION_PATTERNS = [ + re.compile(r'^[A-Z]{1}[0-9]{5}\.[0-9]+$'), + re.compile(r'^[A-Z]{2}[0-9]{6}\.[0-9]+$'), + re.compile(r'^[A-Z]{2}[0-9]{8}$'), + re.compile(r'^[A-Z]{4}[0-9]{2}S?[0-9]{6,8}$'), + re.compile(r'^[A-Z]{6}[0-9]{2}S?[0-9]{7,9}$'), + ] + return any(pattern.match(identifier) for pattern in ENA_ACCESSION_PATTERNS) + + def _get_valid_accession(self, seq_region_name, synonyms): + """ + Get a valid ENA accession for a sequence region. + + First checks if the seq_region_name matches ENA rules. + If not, searches through synonyms for the first match. 
+ + Args: + seq_region_name: The sequence region name from core DB + synonyms: List of synonyms for this sequence region + + Returns: + str: Valid ENA accession + + Raises: + MetadataUpdateException: If no valid accession found + """ + if self._is_valid_ena_accession(seq_region_name): + return seq_region_name + + # Search through synonyms for the first valid accession + # TODO: Make this match the assembly report instead of taking first match + for synonym in synonyms: + if self._is_valid_ena_accession(synonym): + return synonym + + raise exceptions.MetadataUpdateException( + f"No sequence accession found that matches ENA identifier rules for sequence '{seq_region_name}'. " + f"Checked name and {len(synonyms)} synonym(s): {synonyms}" + ) def get_or_new_assembly(self, species_id, meta_session, source=None): - # Get the new assembly accession from the core handed over + """ + Queries the existing metadata to see if the assembly exists and determines + whether to attach to existing, create new, or return an error. + + Handles multiple assemblies with same accession by comparing sequences. + Excludes assemblies with FAULTY dataset status. 
+ """ + assembly_accession = self.get_meta_single_meta_key(species_id, "assembly.accession") - assembly = meta_session.query(Assembly).filter(Assembly.accession == assembly_accession).one_or_none() + # Query assemblies but exclude those with faulty assembly datasets + assemblies = (meta_session.query(Assembly) + .outerjoin(Genome, Genome.assembly_id == Assembly.assembly_id) + .outerjoin(GenomeDataset, GenomeDataset.genome_id == Genome.genome_id) + .outerjoin(Dataset, Dataset.dataset_id == GenomeDataset.dataset_id) + .outerjoin(DatasetType, Dataset.dataset_type_id == DatasetType.dataset_type_id) + .filter(Assembly.accession == assembly_accession) + .filter(or_( + DatasetType.name != "assembly", + Dataset.status != DatasetStatus.FAULTY + )).distinct().all()) if source is None: dataset_source = self.get_or_new_source(meta_session, "core") else: dataset_source = source - # This should return the existing objects - if assembly is not None: - # Get the existing assembly dataset - assembly_dataset = meta_session.query(Dataset).filter(Dataset.label == assembly_accession).one_or_none() - # I should not need this, but double check on database updating. 
- assembly_dataset_attributes = assembly_dataset.dataset_attributes - assembly_sequences = assembly.assembly_sequences - return assembly, assembly_dataset, assembly_dataset_attributes, assembly_sequences, dataset_source - else: - attributes = self.get_meta_list_from_prefix_meta_key(species_id, "assembly") - is_reference = 1 if self.get_meta_single_meta_key(species_id, "assembly.is_reference") else 0 - with self.db.session_scope() as session: - level = (session.execute(db.select(CoordSystem.name).filter( - CoordSystem.species_id == species_id).order_by(CoordSystem.rank)).all())[0][0] - tol_id = self.get_meta_single_meta_key(species_id, "assembly.tol_id") - accession_body = self.get_meta_single_meta_key(species_id, - "assembly.accession_body") if self.get_meta_single_meta_key( - species_id, "assembly.accession_body") else "INSDC" - assembly = Assembly( - ucsc_name=self.get_meta_single_meta_key(species_id, "assembly.ucsc_alias"), - accession=self.get_meta_single_meta_key(species_id, "assembly.accession"), - level=level, - name=self.get_meta_single_meta_key(species_id, "assembly.name"), - accession_body=accession_body, - assembly_default=self.get_meta_single_meta_key(species_id, "assembly.default"), - tol_id=tol_id, - alt_accession=self.get_meta_single_meta_key(species_id, "assembly.alt_accession"), - created=func.now(), - assembly_uuid=str(uuid.uuid4()), - url_name=self.get_meta_single_meta_key(species_id, "assembly.url_name"), - is_reference=is_reference + # Query core DB once upfront - get names and count together + incoming_seq_names = self._get_incoming_sequence_names(species_id) + incoming_count = len(incoming_seq_names) + + # Case 1: New assembly accession - Fresh load + if not assemblies: + return self._create_new_assembly(species_id, meta_session, dataset_source, assembly_accession) + + # Check for force new UUID flag + force_new_uuid = self.get_meta_single_meta_key(species_id, "assembly.create_new_uuid") + + # Find assemblies that match on sequence count and 
names + matching_assembly = self._find_matching_assembly(assemblies, incoming_seq_names, incoming_count) + + # Case 2: Found exact match - Attach to existing + if matching_assembly is not None: + return self._attach_to_existing_assembly(matching_assembly, meta_session, assembly_accession, + dataset_source) + + # No exact match found - either error or force new + if int(force_new_uuid) == 1: + return self._create_new_assembly(species_id, meta_session, dataset_source, assembly_accession) + + # Return error describing discrepancies + error_details = self._generate_discrepancy_error(assemblies, incoming_seq_names, incoming_count) + raise exceptions.MetadataUpdateException(f"Assembly mismatch: {error_details}") + + def _find_matching_assembly(self, assemblies, incoming_names, incoming_count): + """ + Find an assembly that matches both sequence count and names. + Uses pre-fetched incoming data to avoid redundant queries. + + Returns: + Assembly or None: The matching assembly if found, None otherwise + """ + # Filter to assemblies with matching count + count_matches = [a for a in assemblies if len(a.assembly_sequences) == incoming_count] + + # From those, find one with matching names + for assembly in count_matches: + existing_names = {seq.name for seq in assembly.assembly_sequences} + if existing_names == incoming_names: + return assembly + return None + + def _get_incoming_sequence_names(self, species_id): + """ + Get the names of top-level sequences from the core DB. + Single query to avoid redundancy. 
+ + Returns: + set: Set of sequence names + """ + with self.db.session_scope() as session: + results = (session.query(SeqRegion.name) + .join(SeqRegion.coord_system) + .join(SeqRegion.seq_region_attrib) + .join(SeqRegionAttrib.attrib_type) + .filter(CoordSystem.species_id == species_id) + .filter(AttribType.code == "toplevel") + .filter(CoordSystem.name != "lrg") + .all()) + return {name for (name,) in results} + + def _generate_discrepancy_error(self, assemblies, incoming_names, incoming_count): + """ + Generate a detailed error message describing why no match was found. + """ + count_matching_assemblies = [a for a in assemblies if len(a.assembly_sequences) == incoming_count] + + if not count_matching_assemblies: + # No count matches + assembly_info = [(a.assembly_uuid, len(a.assembly_sequences)) for a in assemblies] + counts_str = ", ".join([f"UUID {uuid}: {count} sequences" for uuid, count in assembly_info]) + return (f"Assembly accession found {len(assemblies)} time(s) in database, " + f"but none match incoming sequence count of {incoming_count}. 
" + f"Existing counts: {counts_str}") + + # Count matches but names don't + error_lines = [ + f"Assembly accession found with matching sequence count ({incoming_count}), " + f"but sequence names do not match.", + f"Incoming names: {sorted(incoming_names)}" + ] + + for assembly in count_matching_assemblies: + existing_names = {seq.name for seq in assembly.assembly_sequences} + missing = incoming_names - existing_names + extra = existing_names - incoming_names + + error_lines.append(f"\nUUID {assembly.assembly_uuid}: {sorted(existing_names)}") + if missing: + error_lines.append(f" Missing in existing: {sorted(missing)}") + if extra: + error_lines.append(f" Extra in existing: {sorted(extra)}") + + return "\n".join(error_lines) + + def _attach_to_existing_assembly(self, assembly, meta_session, assembly_accession, dataset_source): + """Attach to existing assembly when sequences match.""" + # Find the assembly dataset through the relationship path + # Assembly -> Genome -> GenomeDataset -> Dataset + assembly_dataset = (meta_session.query(Dataset) + .join(GenomeDataset, GenomeDataset.dataset_id == Dataset.dataset_id) + .join(Genome, Genome.genome_id == GenomeDataset.genome_id) + .join(DatasetType, Dataset.dataset_type_id == DatasetType.dataset_type_id) + .filter(Genome.assembly_id == assembly.assembly_id) + .filter(DatasetType.name == "assembly") + .filter(Dataset.status != DatasetStatus.FAULTY) + .first()) + + if assembly_dataset is None: + raise exceptions.MetadataUpdateException( + f"Assembly {assembly_accession} exists but no valid (non-faulty) assembly dataset found" ) - dataset_factory = DatasetFactory(self.metadata_uri) - dataset_type = meta_session.query(DatasetType).filter(DatasetType.name == "assembly").first() - (dataset_uuid, assembly_dataset, assembly_dataset_attributes, - new_genome_dataset) = dataset_factory.create_dataset(meta_session, None, dataset_source, - dataset_type, attributes, "assembly", - assembly.accession, None, - DatasetStatus.PROCESSED) - 
meta_session.add(assembly) - meta_session.add(assembly_dataset) - assembly_sequences = self.get_assembly_sequences(species_id, assembly) - meta_session.add_all(assembly_sequences) - - meta_session.add_all(assembly_dataset_attributes) - return assembly, assembly_dataset, assembly_dataset_attributes, assembly_sequences, dataset_source - - def get_or_new_genebuild(self, species_id, meta_session, source=None, existing=False): + + assembly_dataset_attributes = assembly_dataset.dataset_attributes + assembly_sequences = assembly.assembly_sequences + return assembly, assembly_dataset, assembly_dataset_attributes, assembly_sequences, dataset_source + + def _create_new_assembly(self, species_id, meta_session, dataset_source, assembly_accession): + """Create a new assembly with unique UUID.""" + attributes = self.get_meta_list_from_prefix_meta_key(species_id, "assembly") + is_reference = 1 if self.get_meta_single_meta_key(species_id, "assembly.is_reference") else 0 + + with self.db.session_scope() as session: + level = (session.execute(db.select(CoordSystem.name).filter( + CoordSystem.species_id == species_id).order_by(CoordSystem.rank)).all())[0][0] + accession_body = self.get_meta_single_meta_key(species_id, "assembly.accession_body") + if not accession_body: + accession_body = "INSDC" + + assembly = Assembly( + ucsc_name=self.get_meta_single_meta_key(species_id, "assembly.ucsc_alias"), + accession=assembly_accession, + level=level, + name=self.get_meta_single_meta_key(species_id, "assembly.name"), + accession_body=accession_body, + assembly_default=self.get_meta_single_meta_key(species_id, "assembly.default"), + created=func.now(), + assembly_uuid=str(uuid.uuid4()), + is_reference=is_reference, + ) + + dataset_factory = DatasetFactory(self.metadata_uri) + dataset_type = meta_session.query(DatasetType).filter(DatasetType.name == "assembly").first() + (dataset_uuid, assembly_dataset, assembly_dataset_attributes, + new_genome_dataset) = 
dataset_factory.create_dataset(meta_session, None, dataset_source, + dataset_type, attributes, "assembly", + assembly.accession, None, + DatasetStatus.PROCESSED) + + meta_session.add(assembly) + meta_session.add(assembly_dataset) + + # Get assembly sequences AND aliases + assembly_sequences, sequence_aliases = self.get_assembly_sequences(species_id, assembly) + + meta_session.add_all(assembly_sequences) + meta_session.add_all(sequence_aliases) + meta_session.add_all(assembly_dataset_attributes) + + return assembly, assembly_dataset, assembly_dataset_attributes, assembly_sequences, dataset_source + + def _create_genebuild(self, species_id, meta_session, source=None): """ - Process an individual species from a core database to update the metadata db. - This method contains the logic for updating the metadata - This is not a get, as we don't update the metadata for genebuild, only replace it if it is not released. + Create a new genebuild dataset for a species from a core database. + This method always creates a new dataset - if a matching genome already exists, it throws an exception. + The uniqueness is enforced at the Genome level (assembly + provider + genebuild_date). 
""" assembly_accession = self.get_meta_single_meta_key(species_id, "assembly.accession") - genebuild_version = self.get_meta_single_meta_key(species_id, "genebuild.version") provider_name = self.get_meta_single_meta_key(species_id, "genebuild.provider_name") last_geneset_update = self.get_meta_single_meta_key(species_id, "genebuild.last_geneset_update") - - provider_name_attr = aliased(DatasetAttribute, name="provider_name_attr") - last_geneset_update_attr = aliased(DatasetAttribute, name="last_geneset_update_attr") - - # Query for an existing combination + annotation_source = self.get_meta_single_meta_key(species_id, "genebuild.annotation_source") + # Query for an existing combination - this is our uniqueness check + # If this exists, we should NOT create a new one existing_combination = ( meta_session.query(Genome.genome_id) - .join(GenomeDataset, Genome.genome_id == GenomeDataset.genome_id) - .join(Dataset, GenomeDataset.dataset_id == Dataset.dataset_id) .join(Assembly, Genome.assembly_id == Assembly.assembly_id) - .join(provider_name_attr, Dataset.dataset_id == provider_name_attr.dataset_id) - .join(last_geneset_update_attr, Dataset.dataset_id == last_geneset_update_attr.dataset_id) .filter( - Dataset.name == "genebuild", Assembly.accession == assembly_accession, - provider_name_attr.value == provider_name, - last_geneset_update_attr.value == last_geneset_update, - provider_name_attr.attribute.has(Attribute.name == "genebuild.provider_name"), - last_geneset_update_attr.attribute.has(Attribute.name == "genebuild.last_geneset_update"), + Genome.provider_name == provider_name, + Genome.genebuild_date == last_geneset_update, ) ) test_for_existing = meta_session.query(existing_combination.exists()).scalar() - # Check if the combination exists if test_for_existing: raise exceptions.MetaException( - "genebuild.provider_name, genebuild.last_geneset_update, and assembly.accession cannot match existing records." 
+ f"Genebuild already exists for assembly {assembly_accession} " + f"with provider '{provider_name}' and date '{last_geneset_update}'. " + "Cannot create duplicate genebuild." ) - # The genebuild accession is formed by combining the assembly accession and the genebuild version - genebuild_accession = assembly_accession + "_" + genebuild_version + # Check for conflicting annotation source + # This isn't per se a strict requirement, but it will make the FTP confusing as hell if we allow it. + conflicting_combination = ( + meta_session.query(Genome.genome_id) + .join(Assembly, Genome.assembly_id == Assembly.assembly_id) + .filter( + Assembly.accession == assembly_accession, + Genome.provider_name != provider_name, + Genome.annotation_source == annotation_source, + ) + ) + + test_for_conflicting = meta_session.query(conflicting_combination.exists()).scalar() + if test_for_conflicting: + raise exceptions.MetaException( + f"Genebuild already exists for assembly {assembly_accession} " + f"existing genebuild with different provider uses an annotation source of '{annotation_source}'. " + "Please use a different one." 
+ ) + genebuild_label = f"{assembly_accession}_{provider_name}_{last_geneset_update}" + if source is None: dataset_source = self.get_or_new_source(meta_session, "core") else: dataset_source = source dataset_type = meta_session.query(DatasetType).filter(DatasetType.name == "genebuild").first() - test_status = meta_session.query(Dataset).filter(Dataset.label == genebuild_accession).one_or_none() - - # Return existing data if no update is required - if test_status is not None and existing is False: - genebuild_dataset = test_status - genebuild_dataset_attributes = genebuild_dataset.dataset_attributes - return genebuild_dataset, genebuild_dataset_attributes attributes = self.get_meta_list_from_prefix_meta_key(species_id, "genebuild.") - if existing is False: - dataset_factory = DatasetFactory(self.metadata_uri) - (dataset_uuid, genebuild_dataset, genebuild_dataset_attributes, - new_genome_dataset) = dataset_factory.create_dataset(meta_session, None, dataset_source, - dataset_type, attributes, "genebuild", - genebuild_accession, genebuild_version) - else: - genebuild_dataset = existing - genebuild_dataset.label = genebuild_accession - genebuild_dataset.dataset_source = dataset_source - genebuild_dataset.version = genebuild_version - genebuild_dataset_attributes = update_attributes(genebuild_dataset, attributes, meta_session, replace=True) + dataset_version = last_geneset_update + dataset_factory = DatasetFactory(self.metadata_uri) + (dataset_uuid, genebuild_dataset, genebuild_dataset_attributes, + new_genome_dataset) = dataset_factory.create_dataset( + meta_session, None, dataset_source, + dataset_type, attributes, "genebuild", + genebuild_label, dataset_version + ) return genebuild_dataset, genebuild_dataset_attributes diff --git a/src/ensembl/production/metadata/updater/updater_utils.py b/src/ensembl/production/metadata/updater/updater_utils.py index 1dd5145c..e5e59202 100644 --- a/src/ensembl/production/metadata/updater/updater_utils.py +++ 
b/src/ensembl/production/metadata/updater/updater_utils.py @@ -14,21 +14,43 @@ def update_attributes(dataset, attributes, session, replace=False): - # TODO If attributes already exist, update them. Add option to replace all. + """ + Update or create dataset attributes. + + Args: + dataset: The dataset object to attach attributes to + attributes: Dictionary of {attribute_name: value} where value can be: + - A single value: "GRCh38" + - A list of values: ["hg38", "Human"] + session: Database session + replace: If True, delete existing attributes before adding new ones + + Returns: + list: List of created DatasetAttribute objects + """ dataset_attributes = [] + if replace: for dataset_attribute in dataset.dataset_attributes: session.delete(dataset_attribute) - session.flush() - for attribute, value in attributes.items(): - meta_attribute = session.query(Attribute).filter(Attribute.name == attribute).one_or_none() + session.flush() + + for attribute_name, attribute_value in attributes.items(): + meta_attribute = session.query(Attribute).filter(Attribute.name == attribute_name).one_or_none() if meta_attribute is None: - raise UpdaterException(f"{attribute} does not exist. Add it to the database and reload.") - new_dataset_attribute = DatasetAttribute( - value=value, - dataset=dataset, - attribute=meta_attribute, - ) - session.add(new_dataset_attribute) - dataset_attributes.append(new_dataset_attribute) + raise UpdaterException(f"{attribute_name} does not exist. 
Add it to the database and reload.") + + # Normalize to list format + values = attribute_value if isinstance(attribute_value, list) else [attribute_value] + + # Create a DatasetAttribute for each value + for value in values: + new_dataset_attribute = DatasetAttribute( + value=value, + dataset=dataset, + attribute=meta_attribute, + ) + session.add(new_dataset_attribute) + dataset_attributes.append(new_dataset_attribute) + return dataset_attributes \ No newline at end of file diff --git a/src/scripts/update_test_set.py b/src/scripts/update_test_set.py deleted file mode 100644 index 386b4eff..00000000 --- a/src/scripts/update_test_set.py +++ /dev/null @@ -1,152 +0,0 @@ -#!/usr/bin/env python -# See the NOTICE file distributed with this work for additional information -# regarding copyright ownership. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import argparse -import logging -import os -import random -from datetime import datetime, timedelta -from typing import List - -from ensembl.utils.database import DBConnection -from sqlalchemy.engine import make_url - -from ensembl.production.metadata.api.factories.datasets import DatasetFactory -from ensembl.production.metadata.api.factories.genomes import GenomeFactory -from ensembl.production.metadata.api.factories.release import ReleaseFactory -from ensembl.production.metadata.api.models import * - -logger = logging.getLogger(__name__) - -gen_factory = GenomeFactory() - - -class MetadataUpdater: - - def __init__(self, metadata_uri): - super().__init__() - self.metadata_uri = metadata_uri - - def check(self): - metadata_db = DBConnection(self.metadata_uri) - with metadata_db.session_scope() as session: - releases = session.query(EnsemblRelease).order_by(EnsemblRelease.version).all() - factory = ReleaseFactory(self.metadata_uri) - [factory.check_release(rel) for rel in releases] - - def wipe(self): - metadata_db = DBConnection(self.metadata_uri) - with metadata_db.session_scope() as session: - dataset_types = session.query(DatasetType.dataset_type_id).filter( - DatasetType.topic.in_(['production_process', 'production_preparation', 'production_publication'])) - delete = session.query(Dataset).filter(Dataset.dataset_type_id.in_(dataset_types)).delete() - session.execute(delete) - - def create_release_ds(self): - metadata_db = DBConnection(self.metadata_uri) - with metadata_db.session_scope() as session: - ds_factory = DatasetFactory(self.metadata_uri) - releases: List[EnsemblRelease] = session.query(EnsemblRelease).all() - for release in releases: - if release.status == ReleaseStatus.RELEASED: - dataset_status = DatasetStatus.RELEASED - topic = None - elif release.status == ReleaseStatus.PREPARING: - dataset_status = DatasetStatus.PROCESSED - topic = ['production_process'] - elif release.status == ReleaseStatus.PREPARED: - dataset_status = 
DatasetStatus.PROCESSED - topic = ['production_process', 'production_preparation'] - for genome_dataset in release.genome_datasets: - if topic is not None: - for top in topic: - ds_factory.create_all_child_datasets(dataset_uuid=genome_dataset.dataset.dataset_uuid, - topic=top, - session=session, - status=dataset_status, - release=release) - else: - ds_factory.create_all_child_datasets(dataset_uuid=genome_dataset.dataset.dataset_uuid, - topic=None, - session=session, - status=dataset_status, - release=release) - # Randomly assign dates for production datasets expected attributes - datasets = session.query(Dataset, EnsemblRelease).select_from(Dataset).join( - DatasetType.datasets).join(GenomeDataset, GenomeDataset.dataset_id == Dataset.dataset_id).outerjoin( - EnsemblRelease, EnsemblRelease.release_id == GenomeDataset.release_id).filter( - DatasetType.topic.in_(('production_process', 'production_preparation'))).order_by(Dataset.dataset_uuid) - # attribute_id IN(183, 182) - for dataset in datasets.all(): - end = None - start = None - if dataset.Dataset.status == DatasetStatus.RELEASED and dataset.EnsemblRelease.release_date is not None: - logger.info(f"Dataset {dataset.Dataset.dataset_uuid} is released") - start = dataset.EnsemblRelease.release_date - timedelta(weeks=3) - end = dataset.EnsemblRelease.release_date - elif dataset.Dataset.status == DatasetStatus.PROCESSED: - logger.info(f"Dataset {dataset.Dataset.dataset_uuid} is processed") - start = datetime.now() - timedelta(weeks=1) - end = datetime.now() - timedelta(days=1) - elif dataset.Dataset.status == DatasetStatus.PROCESSING: - logger.info(f"Dataset {dataset.Dataset.dataset_uuid} is processing") - start = datetime.now() - timedelta(weeks=1) - end = None - if end: - start_build = start + (end - start) * random.random() - end_build = start_build + timedelta(days=1) - - session.add(DatasetAttribute(dataset_id=dataset.Dataset.dataset_id, - attribute_id=183, - value=datetime.strftime(end_build, "%y/%m/%d"))) - if 
start: - if not end: - end = datetime.now() - start_build = start + (end - start) * random.random() - session.add(DatasetAttribute(dataset_id=dataset.Dataset.dataset_id, - attribute_id=182, - value=datetime.strftime(start_build, "%y/%m/%d"))) - - def create_submitted_ds(self): - metadata_db = DBConnection(self.metadata_uri) - with metadata_db.session_scope() as session: - ds_factory = DatasetFactory(self.metadata_uri) - datasets = session.query(Dataset).join(GenomeDataset.dataset).filter(GenomeDataset.release_id == None).all() - for dataset in datasets: - ds_factory.create_all_child_datasets(dataset_uuid=dataset.dataset_uuid, - topic='production_process', - session=session, - status=dataset.status) - - -def main(): - parser = argparse.ArgumentParser( - prog='update_test_set.py', - description='Some potential useful methods to update the test set on host' - ) - parser.add_argument('-m', '--metadata_db_uri', type=str, - default="mysql://ensembl@localhost:3306/marco_ensembl_genome_metadata", - required=False, help='Target metadata uri') - - parser.add_argument('--action', type=str, help="Action method to call (check|wipe|create)", - required=False, default='create_submitted_ds') - args = parser.parse_args() - meta_details = make_url(args.metadata_db_uri) - logger.info(f'Connecting Metadata Database with host:{meta_details.host} & dbname:{meta_details.database}') - meta_updater = MetadataUpdater(args.metadata_db_uri) - getattr(meta_updater, args.action)() - - -if __name__ == "__main__": - logger.info('Updating metadata content') - main() diff --git a/src/scripts/updates_metadata.sql b/src/scripts/updates_metadata.sql deleted file mode 100644 index ddd7ef13..00000000 --- a/src/scripts/updates_metadata.sql +++ /dev/null @@ -1,130 +0,0 @@ -#### DB updates to reset datasets -# DELETE non root datasets -delete dataset -from dataset where dataset_type_id > 7; -# Reinsert new dataset_types - -select * from dataset where dataset_type_id > 7; - -delete from dataset_type where 
dataset_type_id > 7; -INSERT INTO dataset_type (dataset_type_id, name, label, topic, description, details_uri, parent_id, depends_on, filter_on) VALUES (8, 'genebuild_compute', 'External References', 'production_process', 'Xref genome annotation for Genebuild', null, 2, null, null); -INSERT INTO dataset_type (dataset_type_id, name, label, topic, description, details_uri, parent_id, depends_on, filter_on) VALUES (9, 'genebuild_files', 'Files dumps', 'production_process', 'File Dumps, either internal or for public consumption', null, 2, '8', null); -INSERT INTO dataset_type (dataset_type_id, name, label, topic, description, details_uri, parent_id, depends_on, filter_on) VALUES (11, 'genebuild_web', 'Web Geneset content', 'production_process', 'Web Geneset related content', null, 2, null, null); -INSERT INTO dataset_type (dataset_type_id, name, label, topic, description, details_uri, parent_id, depends_on, filter_on) VALUES (12, 'genebuild_prep', 'Genebuild preparation', 'production_preparation', 'Web Content for Geneset publication', null, 2, '8,9,11,12', null); -INSERT INTO dataset_type (dataset_type_id, name, label, topic, description, details_uri, parent_id, depends_on, filter_on) VALUES (13, 'xrefs', 'External References', 'production_process', 'External annotations linking', null, 8, null, null); -INSERT INTO dataset_type (dataset_type_id, name, label, topic, description, details_uri, parent_id, depends_on, filter_on) VALUES (14, 'protein_features', 'Protein Features annotations', 'production_process', 'Proteins annotation', null, 8, '13', null); -INSERT INTO dataset_type (dataset_type_id, name, label, topic, description, details_uri, parent_id, depends_on, filter_on) VALUES (15, 'alpha_fold', 'AlphaFold computation', 'production_process', 'Compute Protein structure with Alphafold', null, 8, '13', null); -INSERT INTO dataset_type (dataset_type_id, name, label, topic, description, details_uri, parent_id, depends_on, filter_on) VALUES (16, 'blast', 'Blast tools', 
'production_process', 'Blast Indexes files', null, 9, '8', null); -INSERT INTO dataset_type (dataset_type_id, name, label, topic, description, details_uri, parent_id, depends_on, filter_on) VALUES (17, 'ftp_dumps', 'Public FTP files', 'production_process', 'Public FTP flat files geneset dumps', null, 9, '8', null); -INSERT INTO dataset_type (dataset_type_id, name, label, topic, description, details_uri, parent_id, depends_on, filter_on) VALUES (18, 'thoas_dumps', 'Thoas load flat files', 'production_process', 'Dump flat file to load onto THOAS', null, 11, null, null); -INSERT INTO dataset_type (dataset_type_id, name, label, topic, description, details_uri, parent_id, depends_on, filter_on) VALUES (19, 'thoas_load', 'Thoas MongoDB Load', 'production_preparation', 'Load dumped files onto THOAS', null, 12, '18,23', null); -INSERT INTO dataset_type (dataset_type_id, name, label, topic, description, details_uri, parent_id, depends_on, filter_on) VALUES (20, 'genebuild_browser_files', 'Genome Browser BB Geneset files', 'production_process', 'Production BigBed for Genome Browser', null, 11, null, null); -INSERT INTO dataset_type (dataset_type_id, name, label, topic, description, details_uri, parent_id, depends_on, filter_on) VALUES (21, 'genebuild_track', 'Geneset Tracks API', 'production_preparation', 'Register Geneset Track API BigBed files', null, 12, '20', null); -INSERT INTO dataset_type (dataset_type_id, name, label, topic, description, details_uri, parent_id, depends_on, filter_on) VALUES (23, 'checksums', 'Sequences Checksums', 'production_process', 'Compute core sequence checksums and update metadata', null, 11, null, null); -INSERT INTO dataset_type (dataset_type_id, name, label, topic, description, details_uri, parent_id, depends_on, filter_on) VALUES (24, 'refget_load', 'Refget Loading', 'production_preparation', 'Load sequences and their checksum onto Refget app', null, 12, '22', null); -INSERT INTO dataset_type (dataset_type_id, name, label, topic, 
description, details_uri, parent_id, depends_on, filter_on) VALUES (25, 'homology_compute', 'Homology annotation', 'production_process', 'Compute Genome homology analysis', null, 6, null, null); -INSERT INTO dataset_type (dataset_type_id, name, label, topic, description, details_uri, parent_id, depends_on, filter_on) VALUES (26, 'homology_load', 'Homology dataload', 'production_preparation', 'Load homology data onto Compara Service (MongoDB)', null, 6, '25', null); -INSERT INTO dataset_type (dataset_type_id, name, label, topic, description, details_uri, parent_id, depends_on, filter_on) VALUES (27, 'homology_ftp', 'Homology tsv public files', 'production_preparation', 'Dump and sync public TSV homology files', null, 6, '25', null); -INSERT INTO dataset_type (dataset_type_id, name, label, topic, description, details_uri, parent_id, depends_on, filter_on) VALUES (28, 'vep', 'VEP filesets', 'variation_annotation', 'VCF annotation file for geneset', null, null, null, null); -INSERT INTO dataset_type (dataset_type_id, name, label, topic, description, details_uri, parent_id, depends_on, filter_on) VALUES (29, 'variation_ftp', 'Public Variation files (vcf)', 'production_preparation', 'VCF files for public FTP', null, 3, null, null); -INSERT INTO dataset_type (dataset_type_id, name, label, topic, description, details_uri, parent_id, depends_on, filter_on) VALUES (31, 'variation_browser_files', 'Variation Browser files', 'production_process', 'Variation track browser file', null, 3, null, null); -INSERT INTO dataset_type (dataset_type_id, name, label, topic, description, details_uri, parent_id, depends_on, filter_on) VALUES (32, 'variation_track', 'Variation Track', 'production_preparation', 'Variation Track API', null, 3, '31', null); -INSERT INTO dataset_type (dataset_type_id, name, label, topic, description, details_uri, parent_id, depends_on, filter_on) VALUES (33, 'regulation_browser_files', 'Regulation Browser files', 'production_process', 'Regulation track browser 
file', null, 7, null, null); -INSERT INTO dataset_type (dataset_type_id, name, label, topic, description, details_uri, parent_id, depends_on, filter_on) VALUES (34, 'regulation_track', 'Regulation Track', 'production_preparation', 'Regulation Track API', null, 7, '33', null); -INSERT INTO dataset_type (dataset_type_id, name, label, topic, description, details_uri, parent_id, depends_on, filter_on) VALUES (35, 'regulation_ftp', 'Regulation Public files', 'production_preparation', 'Regulation public files', null, 7, '33', null); -INSERT INTO dataset_type (dataset_type_id, name, label, topic, description, details_uri, parent_id, depends_on, filter_on) VALUES (37, 'web_genesearch', 'GeneSearch Index', 'production_publication', 'Gene search indexes provisioning', null, null, '36', null); -INSERT INTO dataset_type (dataset_type_id, name, label, topic, description, details_uri, parent_id, depends_on, filter_on) VALUES (38, 'web_genomediscovery', 'Genome Search indexes loading to EBI search', 'production_publication', 'Load dumped data from genebuild_web onto EBI Search engine (SpecieSelector)', null, null, '37', null); - -# DELETE Bombus_terristris unlinked dataset -delete -from dataset -where dataset_uuid = '428d2741-2699-48a4-8830-4f808994f512'; - -# RELEASE datasets and genomes for 110.2 -update dataset - join genome_dataset using (dataset_id) -set status = 'Processed' # Change to 'Released' when releasing -where release_id = (select release_id from ensembl_release where version = 110.2); - -# RESET dataset which are not attached to a release -# Mark all Unreleased assembly dataset as 'Processed' -update dataset - join genome_dataset using (dataset_id) -set status = 'Processed' -where release_id is null - and dataset_type_id = (select dataset_type_id - from dataset_type - where dataset.name = 'assembly' - and dataset_type.dataset_type_id = dataset.dataset_type_id); - -# Mark all others as 'Submitted' -update dataset - join genome_dataset using (dataset_id) -set status 
= 'Submitted' -where release_id is null - and dataset_type_id = (select dataset_type_id - from dataset_type - where dataset.name <> 'assembly' - and dataset_type.dataset_type_id = dataset.dataset_type_id); - -# INSERT compara_homologies from 241 to 110.2 -BEGIN; -INSERT INTO dataset (dataset_uuid, name, version, created, label, dataset_source_id, dataset_type_id, status) -SELECT UUID(), - name, - '2.0', - NOW(), - dataset.label, - dataset.dataset_source_id, - 6, - 'Released' -from dataset - join genome_dataset using (dataset_id) -where dataset_type_id = 6 - and genome_dataset.release_id = 1; - -# INSERT compara_homologies as supplementary dataset for the 241 in next release. -INSERT INTO genome_dataset (is_current, dataset_id, genome_id, release_id) -select 0, - dataset_id, - (select genome.genome_id - from genome - join genome_dataset gd using (genome_id) - join dataset d1 using (dataset_id) - where gd.genome_id in (SELECT genome.genome_id - from genome - join genome_dataset using (genome_id) - join dataset using (dataset_id) - where dataset_type_id = 6 - and genome_dataset.release_id = 1) - and d1.dataset_source_id = d.dataset_source_id) as genebuild_genome_id, - 2 -from dataset d -where version = '2.0'; - -# UPDATE homologies for 110.1 is_current to 0 -UPDATE genome_dataset - join dataset using (dataset_id) -set is_current = 0 -where dataset_type_id = 6 - and release_id = 1; - -# UPDATE homologies for 110.2 is_current to 1 -UPDATE genome_dataset - join dataset using (dataset_id) -set is_current = 1 -where dataset_type_id = 6 - and release_id = 2; - -# UPDATE 110.2 as released -update ensembl_release -set is_current = 0 -where version = 110.1; -update ensembl_release -set status = 'Released', - is_current = 1 -where version = 110.2; -COMMIT; - - - diff --git a/src/tests/README.md b/src/tests/README.md deleted file mode 100644 index e67add79..00000000 --- a/src/tests/README.md +++ /dev/null @@ -1,64 +0,0 @@ -Ensembl Genome Metadata Test Dataset 
-==================================== - -5 Releases spanning all status ------------------------------- - -| release\_id | version | release\_date | label | is\_current | release\_type | site\_id | status | -| :--- | :--- | :--- | :--- | :--- | :--- | :--- | :--- | -| 1 | 110.1 | 2023-10-18 | MVP Beta-1 | 1 | partial | 1 | Released | -| 2 | 110.2 | null | MVP Beta-2 | 0 | partial | 1 | Prepared | -| 3 | 110.3 | null | MVP Beta-3 | 0 | partial | 1 | Preparing | -| 4 | 112.0 | null | MVP Rel-1 | 0 | integrated | 1 | Planned | -| 5 | 108.0 | 2023-06-15 | First Beta | 0 | partial | 1 | Released | - - -First Beta - Released ---------------------- - -7 initial species present on the first beta public release (mid-2023) - `Released` - -Datasets: -- Datasets all `Released` -- A supplementary compara_homologies `Processed` - Attached to `Beta-2` (see below) - -Beta-1 - Released ------------------ - -3 more humans `Released` - -Datasets: -- Datasets all `Released` (assembly - genebuild - evidence - variation) -- Some with regulatory_features -- A supplementary compara_homologies `Processed` - Attached to `Beta-2` (see below) - -Beta-2 - Prepared ------------------ - -4 more human genomes attached to release - -Datasets: -- All datasets `Processed` (assembly - genebuild - variation - compara_homologies) -- No regulatory_features - -Beta-3 - Preparing ------------------- - -2 more humans attached to release - -Datasets - - - Assembly - Processed - - Genebuild - One Processed / One Processing - - Homologies - Submitted - -Beta-4 - Planed ---------------- - -3 more humans - not attached to any released - -Datasets - -- Assembly - Processed -- Genebuild - Submitted -- Homologies - Submitted \ No newline at end of file diff --git a/src/tests/conftest.py b/src/tests/conftest.py index e01c9bf2..45fb43ac 100644 --- a/src/tests/conftest.py +++ b/src/tests/conftest.py @@ -1,25 +1,12 @@ -# See the NOTICE file distributed with this work for additional information -# regarding copyright 
ownership. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" Necessary fixtures for our GRPC API tests """ import os +import shutil +import tempfile from pathlib import Path import pytest import sqlalchemy as db from _pytest.config import Config +from ensembl.utils.database import DBConnection from grpc_reflection.v1alpha import reflection from ensembl.production.metadata.api.adaptors import GenomeAdaptor @@ -31,7 +18,64 @@ def pytest_configure(config: Config) -> None: - pytest.dbs_dir = Path(__file__).parent / 'databases' + pytest.dbs_dir = Path(__file__).parent / "databases" + + +@pytest.fixture(scope="module") +def test_dbs(request): + """ + Test database fixture using SQLite databases. + + Uses pre-converted .db files and creates temporary copies for test isolation. + Changes made during tests won't affect the original .db files. 
+ """ + db_configs = request.param if hasattr(request, "param") else [] + test_databases = {} + temp_resources = [] # Track resources for cleanup + + for db_config in db_configs: + src_path = db_config["src"] + db_name = src_path.name + + sqlite_file = src_path.parent / f"{db_name}.db" + + temp_dir = tempfile.mkdtemp(prefix=f"pytest_{db_name}_") + temp_db_file = Path(temp_dir) / f"{db_name}_test.db" + + print(f"\n>>> Using SQLite database: {sqlite_file}") + print(f" (temporary copy: {temp_db_file})") + + try: + shutil.copy2(sqlite_file, temp_db_file) + except FileNotFoundError as exc: + raise FileNotFoundError( + f"SQLite database not found: {sqlite_file}\n" + f"Please convert it first using your conversion script." + ) from exc + + db_url = f"sqlite:///{temp_db_file}" + test_databases[db_name] = type("TestDB", (object,), { + "dbc": DBConnection(db_url), + "drop": lambda: None, # No-op drop method for consistency + })() + + temp_resources.append((temp_db_file, temp_dir)) + + yield test_databases + + for db_name, test_db in test_databases.items(): + if hasattr(test_db.dbc, 'dispose'): + test_db.dbc.dispose() + + for temp_file, temp_dir in temp_resources: + try: + if temp_file.exists(): + temp_file.unlink() + if Path(temp_dir).exists(): + shutil.rmtree(temp_dir) + print(f">>> Cleaned up temporary SQLite copy: {temp_dir}") + except Exception as e: + print(f"Warning: Failed to cleanup {temp_dir}: {e}") @pytest.fixture(scope="module", autouse=True) @@ -45,16 +89,14 @@ def engine(test_dbs): def genome_conn(test_dbs): genome_conn = GenomeAdaptor( metadata_uri=test_dbs["ensembl_genome_metadata"].dbc.url, - taxonomy_uri=test_dbs["ncbi_taxonomy"].dbc.url + taxonomy_uri=test_dbs["ncbi_taxonomy"].dbc.url, ) yield genome_conn + @pytest.fixture(scope="function") def vep_conn(test_dbs): - vep_conn = VepAdaptor( - metadata_uri=test_dbs["ensembl_genome_metadata"].dbc.url, - file="all" - ) + vep_conn = VepAdaptor(metadata_uri=test_dbs["ensembl_genome_metadata"].dbc.url, file="all") 
yield vep_conn @@ -62,15 +104,14 @@ def vep_conn(test_dbs): def allow_unreleased(request): """Set ALLOWED_UNRELEASED environment variable, this fixture must be used with `parametrize`""" from ensembl.production.metadata.grpc.config import cfg + cfg.allow_unreleased = request.param yield cfg @pytest.fixture(scope="class") def release_conn(test_dbs): - release_conn = ReleaseAdaptor( - metadata_uri=test_dbs["ensembl_genome_metadata"].dbc.url - ) + release_conn = ReleaseAdaptor(metadata_uri=test_dbs["ensembl_genome_metadata"].dbc.url) yield release_conn @@ -84,25 +125,28 @@ def dataset_factory(test_dbs): yield DatasetFactory(test_dbs["ensembl_genome_metadata"].dbc.url) -@pytest.fixture(scope='module') +@pytest.fixture(scope="module") def grpc_add_to_server(): - from ensembl.production.metadata.grpc.ensembl_metadata_pb2_grpc import add_EnsemblMetadataServicer_to_server + from ensembl.production.metadata.grpc.ensembl_metadata_pb2_grpc import ( + add_EnsemblMetadataServicer_to_server, + ) return add_EnsemblMetadataServicer_to_server -@pytest.fixture(scope='module') +@pytest.fixture(scope="module") def grpc_servicer(test_dbs, engine): from ensembl.production.metadata.grpc.servicer import EnsemblMetadataServicer + return EnsemblMetadataServicer() -@pytest.fixture(scope='module') +@pytest.fixture(scope="module") def grpc_server(_grpc_server, grpc_addr, grpc_add_to_server, grpc_servicer): grpc_add_to_server(grpc_servicer, _grpc_server) SERVICE_NAMES = ( - ensembl_metadata_pb2.DESCRIPTOR.services_by_name['EnsemblMetadata'].full_name, - reflection.SERVICE_NAME + ensembl_metadata_pb2.DESCRIPTOR.services_by_name["EnsemblMetadata"].full_name, + reflection.SERVICE_NAME, ) reflection.enable_server_reflection(SERVICE_NAMES, _grpc_server) _grpc_server.add_insecure_port(grpc_addr) diff --git a/src/tests/databases/compara_db.db b/src/tests/databases/compara_db.db new file mode 100644 index 00000000..9e05e178 Binary files /dev/null and b/src/tests/databases/compara_db.db differ diff 
--git a/src/tests/databases/compara_db/genome_db.txt b/src/tests/databases/compara_db/genome_db.txt deleted file mode 100644 index a5e212c7..00000000 --- a/src/tests/databases/compara_db/genome_db.txt +++ /dev/null @@ -1,18 +0,0 @@ -47 6239 caenorhabditis_elegans WBcel235 2014-10 \N strain N2 Caenorhabditis elegans (Nematode, N2) \N 110 \N -30 511145 escherichia_coli_str_k_12_substr_mg1655 ASM584v2 2018-09 \N strain K-12 substr. MG1655 Escherichia coli str. K-12 substr. MG1655 str. K12 (GCA_000005845) \N 110 \N -12 9606 homo_sapiens GRCh38 2023-03 \N \N Human \N 110 \N -11 9606 homo_sapiens_37 GRCh37 2013-09 \N \N Human \N 110 \N -23 9606 homo_sapiens_gca018469415v1 HG03516.alt.pat.f1_v2 2022-07 \N population Esan in Nigeria Homo sapiens (Human) - GCA_018469415.1 \N 110 \N -24 9606 homo_sapiens_gca018469425v1 HG03516.pri.mat.f1_v2 2022-07 \N population Esan in Nigeria Homo sapiens (Human) - GCA_018469425.1 \N 110 \N -31 9606 homo_sapiens_gca018469875v1 HG02622.pri.mat.f1_v2 2022-07 \N population Gambian in Western Division Homo sapiens (Human) - GCA_018469875.1 \N 110 \N -32 9606 homo_sapiens_gca018469925v1 HG02622.alt.pat.f1_v2 2022-07 \N population Gambian in Western Division Homo sapiens (Human) - GCA_018469925.1 \N 110 \N -19 9606 homo_sapiens_gca018473295v1 HG03540.pri.mat.f1_v2 2022-08 \N population Gambian in Western Division Homo sapiens (Human) - GCA_018473295.1 \N 110 \N -21 9606 homo_sapiens_gca018473315v1 HG03540.alt.pat.f1_v2 2022-08 \N population Gambian in Western Division Homo sapiens (Human) - GCA_018473315.1 \N 110 \N -42 9606 homo_sapiens_gca018505825v1 HG02109.pri.mat.f1_v2 2022-07 \N population African from Barbados Homo sapiens (Human) - GCA_018505825.1 \N 110 \N -46 9606 homo_sapiens_gca018505865v1 HG02109.alt.pat.f1_v2 2022-07 \N population African from Barbados Homo sapiens (Human) - GCA_018505865.1 \N 110 \N -6 9606 homo_sapiens_gca018852605v1 HG002.alt.pat.f1_v2 2022-07 \N population European Homo sapiens (Human) - GCA_018852605.1 \N 110 
\N -7 9606 homo_sapiens_gca018852615v1 HG002.pri.mat.f1_v2 2022-07 \N population European Homo sapiens (Human) - GCA_018852615.1 \N 110 \N -8 9606 homo_sapiens_gca021950905v1 HG002.pat.cur.20211005 2022-07 \N population European Homo sapiens (Human) - GCA_021950905.1 \N 110 \N -9 9606 homo_sapiens_gca021951015v1 HG002.mat.cur.20211005 2022-07 \N population European Homo sapiens (Human) - GCA_021951015.1 \N 110 \N -16 559292 saccharomyces_cerevisiae R64-1-1 2018-10 \N strain S288C Saccharomyces cerevisiae \N 110 \N -20 4565 triticum_aestivum IWGSC 2018-04-IWGSC \N cultivar Chinese Spring Triticum aestivum \N 110 \N diff --git a/src/tests/databases/compara_db/table.sql b/src/tests/databases/compara_db/table.sql deleted file mode 100644 index eea37ffe..00000000 --- a/src/tests/databases/compara_db/table.sql +++ /dev/null @@ -1,19 +0,0 @@ -CREATE TABLE genome_db ( - genome_db_id INT unsigned NOT NULL AUTO_INCREMENT, # unique internal id - taxon_id INT unsigned DEFAULT NULL, # KF taxon.taxon_id - name varchar(128) DEFAULT '' NOT NULL, - assembly varchar(100) DEFAULT '' NOT NULL, - genebuild varchar(255) DEFAULT '' NOT NULL, - has_karyotype tinyint(1) NOT NULL DEFAULT 0, - is_good_for_alignment TINYINT(1) NOT NULL DEFAULT 0, - genome_component varchar(5) DEFAULT NULL, - strain_name varchar(100) DEFAULT NULL, - display_name varchar(255) DEFAULT NULL, - locator varchar(400), - first_release smallint, - last_release smallint, - - PRIMARY KEY (genome_db_id), - UNIQUE KEY name (name,assembly,genome_component) - -) COLLATE=latin1_swedish_ci ENGINE=MyISAM; \ No newline at end of file diff --git a/src/tests/databases/core_1.db b/src/tests/databases/core_1.db new file mode 100644 index 00000000..8b0c48ba Binary files /dev/null and b/src/tests/databases/core_1.db differ diff --git a/src/tests/databases/core_1/attrib_type.txt b/src/tests/databases/core_1/attrib_type.txt deleted file mode 100644 index 59c569dd..00000000 --- a/src/tests/databases/core_1/attrib_type.txt +++ /dev/null 
@@ -1,3 +0,0 @@ -6 toplevel Top Level Top Level Non-Redundant Sequence Region -547 sequence_location sequence_location To identify sequence locations / cellular compartments that DNA sequence comes from.Values are supposed to be SO compliant (children of the plastid_sequence SO:0000740 and nuclear_sequence SO:0000738 ): "apicoplast_chromosome", "chloroplast_chromosome", "chromoplast_chromosome", "cyanelle_chromosome", "leucoplast_chromosome", "macronuclear_chromosome", "micronuclear_chromosome", "mitochondrial_chromosome", "nuclear_chromosome". -316 circular_seq Circular sequence Circular chromosome or plasmid molecule \ No newline at end of file diff --git a/src/tests/databases/core_1/coord_system.txt b/src/tests/databases/core_1/coord_system.txt deleted file mode 100644 index 51314bf1..00000000 --- a/src/tests/databases/core_1/coord_system.txt +++ /dev/null @@ -1 +0,0 @@ -1 1 primary_assembly test 1 default_version,sequence_level diff --git a/src/tests/databases/core_1/meta.txt b/src/tests/databases/core_1/meta.txt deleted file mode 100644 index 87844e87..00000000 --- a/src/tests/databases/core_1/meta.txt +++ /dev/null @@ -1,24 +0,0 @@ -12 1 assembly.accession GCF_1111111123.3 -14 1 assembly.default jaber01 -13 1 assembly.name jaber01 -11 1 assembly.ucsc_alias SCARY -15 1 gencode.version 999 -3 1 organism.common_name jabberwocky -7 1 organism.division Ensembl_TEST -6 1 organism.production_name Jabberwocky -4 1 organism.scientific_name carol_jabberwocky -1 1 organism.species_taxonomy_id 10029 -8 1 organism.strain reference -9 1 organism.strain_group testing -2 1 organism.taxonomy_id 10029 -10 1 organism.type monsters -5 1 organism.url Jabbe -17 1 genebuild.version ENS01 -18 1 genebuild.sample_gene ENSAMXG00005000318 -19 1 genebuild.sample_location KB871578.1:9766653-9817473 -20 1 strain.type test -23 1 genebuild.provider_name test -24 1 genebuild.start_date 2023-07-Ensembl -25 1 assembly.alt_accession GCA_0000012345.3 -26 \N schema_version 110 -27 1 
genebuild.last_geneset_update 2023-01 diff --git a/src/tests/databases/core_1/seq_region.txt b/src/tests/databases/core_1/seq_region.txt deleted file mode 100644 index e67ee5a7..00000000 --- a/src/tests/databases/core_1/seq_region.txt +++ /dev/null @@ -1,3 +0,0 @@ -1 TEST1_seqA 1 666666 -2 TEST2_seqB 1 666 -3 TEST3_seqC 1 1666666 diff --git a/src/tests/databases/core_1/seq_region_attrib.txt b/src/tests/databases/core_1/seq_region_attrib.txt deleted file mode 100644 index aad2591e..00000000 --- a/src/tests/databases/core_1/seq_region_attrib.txt +++ /dev/null @@ -1,8 +0,0 @@ -1 6 1 -2 6 1 -3 6 1 -1 547 nuclear_chromosome -2 547 nuclear_chromosome -3 547 mitochondrial_chromosome -1 316 1 -2 316 0 \ No newline at end of file diff --git a/src/tests/databases/core_1/seq_region_synonym.txt b/src/tests/databases/core_1/seq_region_synonym.txt deleted file mode 100644 index de43d915..00000000 --- a/src/tests/databases/core_1/seq_region_synonym.txt +++ /dev/null @@ -1,3 +0,0 @@ -1 1 TEST1_seq 50710 -2 2 TEST2_seq 50710 -3 3 TEST3_seq 50710 diff --git a/src/tests/databases/core_1/table.sql b/src/tests/databases/core_1/table.sql deleted file mode 100644 index 953da984..00000000 --- a/src/tests/databases/core_1/table.sql +++ /dev/null @@ -1,86 +0,0 @@ -CREATE TABLE coord_system -( - coord_system_id int unsigned auto_increment - primary key, - species_id int unsigned default 1 not null, - name varchar(40) not null, - version varchar(255) null, - `rank` int not null, - attrib set ('default_version', 'sequence_level') null, - constraint name_idx - unique (name, version, species_id), - constraint rank_idx - unique (`rank`, species_id) -); - -CREATE INDEX species_idx - on coord_system (species_id); - -CREATE TABLE meta -( - meta_id int auto_increment - primary key, - species_id int unsigned default 1 null, - meta_key varchar(40) not null, - meta_value varchar(255) not null, - constraint species_key_value_idx - unique (species_id, meta_key, meta_value) -); - -CREATE INDEX 
species_value_idx - on meta (species_id, meta_value); - -CREATE TABLE seq_region -( - seq_region_id int unsigned auto_increment - primary key, - name varchar(255) not null, - coord_system_id int unsigned not null, - length int unsigned not null, - constraint name_cs_idx - unique (name, coord_system_id) -); - -CREATE INDEX cs_idx - on seq_region (coord_system_id); - -CREATE TABLE seq_region_attrib -( - seq_region_id int unsigned default 0 not null, - attrib_type_id smallint unsigned default 0 not null, - value text not null, - constraint region_attribx - unique (seq_region_id, attrib_type_id, value(500)) -); - -CREATE INDEX seq_region_idx - on seq_region_attrib (seq_region_id); - -CREATE INDEX type_val_idx - on seq_region_attrib (attrib_type_id, value(40)); - -CREATE INDEX val_only_idx - on seq_region_attrib (value(40)); - -CREATE TABLE seq_region_synonym -( - seq_region_synonym_id int unsigned auto_increment - primary key, - seq_region_id int unsigned not null, - synonym varchar(250) not null, - external_db_id int unsigned null, - constraint syn_idx - unique (synonym, seq_region_id) -); - -CREATE INDEX seq_region_idx - on seq_region_synonym (seq_region_id); - -CREATE TABLE `attrib_type` ( - `attrib_type_id` smallint(5) unsigned NOT NULL AUTO_INCREMENT, - `code` varchar(20) NOT NULL DEFAULT '', - `name` varchar(255) NOT NULL DEFAULT '', - `description` text, - PRIMARY KEY (`attrib_type_id`), - UNIQUE KEY `code_idx` (`code`) -); \ No newline at end of file diff --git a/src/tests/databases/core_2.db b/src/tests/databases/core_2.db new file mode 100644 index 00000000..edd19394 Binary files /dev/null and b/src/tests/databases/core_2.db differ diff --git a/src/tests/databases/core_2/attrib_type.txt b/src/tests/databases/core_2/attrib_type.txt deleted file mode 100644 index de5f1880..00000000 --- a/src/tests/databases/core_2/attrib_type.txt +++ /dev/null @@ -1,2 +0,0 @@ -6 toplevel Top Level Top Level Non-Redundant Sequence Region -547 sequence_location sequence_location 
To identify sequence locations / cellular compartments that DNA sequence comes from.Values are supposed to be SO compliant (children of the plastid_sequence SO:0000740 and nuclear_sequence SO:0000738 ): "apicoplast_chromosome", "chloroplast_chromosome", "chromoplast_chromosome", "cyanelle_chromosome", "leucoplast_chromosome", "macronuclear_chromosome", "micronuclear_chromosome", "mitochondrial_chromosome", "nuclear_chromosome". diff --git a/src/tests/databases/core_2/coord_system.txt b/src/tests/databases/core_2/coord_system.txt deleted file mode 100644 index 51314bf1..00000000 --- a/src/tests/databases/core_2/coord_system.txt +++ /dev/null @@ -1 +0,0 @@ -1 1 primary_assembly test 1 default_version,sequence_level diff --git a/src/tests/databases/core_2/meta.txt b/src/tests/databases/core_2/meta.txt deleted file mode 100644 index 29b7d700..00000000 --- a/src/tests/databases/core_2/meta.txt +++ /dev/null @@ -1,25 +0,0 @@ -12 1 assembly.accession weird01 -14 1 assembly.default jaber01 -13 1 assembly.name jaber01 -11 1 assembly.ucsc_alias SCARY -15 1 gencode.version 999 -16 1 genebuild.last_geneset_update 01 -3 1 organism.common_name jabberwocky -7 1 organism.division Ensembl_TEST -6 1 organism.production_name Jabberwocky -4 1 organism.scientific_name carol_jabberwocky -1 1 organism.species_taxonomy_id 6666666 -8 1 organism.strain reference -9 1 organism.strain_group testing -2 1 organism.taxonomy_id 666668 -10 1 organism.type monsters -5 1 organism.url Jabbe -17 1 genebuild.version ENS01 -18 1 genebuild.sample_gene ENSAMXG00005000318 -19 1 genebuild.sample_location KB871578.1:9766653-9817473 -20 1 strain.type test -21 1 genome.genome_uuid test -23 1 genebuild.provider_name test2 -24 1 genebuild.start_date 2023-07-Ensembl -25 \N schema_version 110 -26 1 genebuild.last_geneset_update 2023-01 diff --git a/src/tests/databases/core_2/seq_region.txt b/src/tests/databases/core_2/seq_region.txt deleted file mode 100644 index a2216feb..00000000 --- 
a/src/tests/databases/core_2/seq_region.txt +++ /dev/null @@ -1,3 +0,0 @@ -1 TEST1_seq 1 666666 -2 TEST2_seq 1 666 -3 TEST3_seq 1 1666666 diff --git a/src/tests/databases/core_2/seq_region_attrib.txt b/src/tests/databases/core_2/seq_region_attrib.txt deleted file mode 100644 index d8dcda33..00000000 --- a/src/tests/databases/core_2/seq_region_attrib.txt +++ /dev/null @@ -1,6 +0,0 @@ -1 6 1 -2 6 1 -3 6 1 -1 547 nuclear_chromosome -2 547 nuclear_chromosome -3 547 mitochondrial_chromosome diff --git a/src/tests/databases/core_2/seq_region_synonym.txt b/src/tests/databases/core_2/seq_region_synonym.txt deleted file mode 100644 index de43d915..00000000 --- a/src/tests/databases/core_2/seq_region_synonym.txt +++ /dev/null @@ -1,3 +0,0 @@ -1 1 TEST1_seq 50710 -2 2 TEST2_seq 50710 -3 3 TEST3_seq 50710 diff --git a/src/tests/databases/core_2/table.sql b/src/tests/databases/core_2/table.sql deleted file mode 100644 index 953da984..00000000 --- a/src/tests/databases/core_2/table.sql +++ /dev/null @@ -1,86 +0,0 @@ -CREATE TABLE coord_system -( - coord_system_id int unsigned auto_increment - primary key, - species_id int unsigned default 1 not null, - name varchar(40) not null, - version varchar(255) null, - `rank` int not null, - attrib set ('default_version', 'sequence_level') null, - constraint name_idx - unique (name, version, species_id), - constraint rank_idx - unique (`rank`, species_id) -); - -CREATE INDEX species_idx - on coord_system (species_id); - -CREATE TABLE meta -( - meta_id int auto_increment - primary key, - species_id int unsigned default 1 null, - meta_key varchar(40) not null, - meta_value varchar(255) not null, - constraint species_key_value_idx - unique (species_id, meta_key, meta_value) -); - -CREATE INDEX species_value_idx - on meta (species_id, meta_value); - -CREATE TABLE seq_region -( - seq_region_id int unsigned auto_increment - primary key, - name varchar(255) not null, - coord_system_id int unsigned not null, - length int unsigned not null, - 
constraint name_cs_idx - unique (name, coord_system_id) -); - -CREATE INDEX cs_idx - on seq_region (coord_system_id); - -CREATE TABLE seq_region_attrib -( - seq_region_id int unsigned default 0 not null, - attrib_type_id smallint unsigned default 0 not null, - value text not null, - constraint region_attribx - unique (seq_region_id, attrib_type_id, value(500)) -); - -CREATE INDEX seq_region_idx - on seq_region_attrib (seq_region_id); - -CREATE INDEX type_val_idx - on seq_region_attrib (attrib_type_id, value(40)); - -CREATE INDEX val_only_idx - on seq_region_attrib (value(40)); - -CREATE TABLE seq_region_synonym -( - seq_region_synonym_id int unsigned auto_increment - primary key, - seq_region_id int unsigned not null, - synonym varchar(250) not null, - external_db_id int unsigned null, - constraint syn_idx - unique (synonym, seq_region_id) -); - -CREATE INDEX seq_region_idx - on seq_region_synonym (seq_region_id); - -CREATE TABLE `attrib_type` ( - `attrib_type_id` smallint(5) unsigned NOT NULL AUTO_INCREMENT, - `code` varchar(20) NOT NULL DEFAULT '', - `name` varchar(255) NOT NULL DEFAULT '', - `description` text, - PRIMARY KEY (`attrib_type_id`), - UNIQUE KEY `code_idx` (`code`) -); \ No newline at end of file diff --git a/src/tests/databases/core_3.db b/src/tests/databases/core_3.db new file mode 100644 index 00000000..47a0c92e Binary files /dev/null and b/src/tests/databases/core_3.db differ diff --git a/src/tests/databases/core_3/attrib_type.txt b/src/tests/databases/core_3/attrib_type.txt deleted file mode 100644 index de5f1880..00000000 --- a/src/tests/databases/core_3/attrib_type.txt +++ /dev/null @@ -1,2 +0,0 @@ -6 toplevel Top Level Top Level Non-Redundant Sequence Region -547 sequence_location sequence_location To identify sequence locations / cellular compartments that DNA sequence comes from.Values are supposed to be SO compliant (children of the plastid_sequence SO:0000740 and nuclear_sequence SO:0000738 ): "apicoplast_chromosome", 
"chloroplast_chromosome", "chromoplast_chromosome", "cyanelle_chromosome", "leucoplast_chromosome", "macronuclear_chromosome", "micronuclear_chromosome", "mitochondrial_chromosome", "nuclear_chromosome". diff --git a/src/tests/databases/core_3/coord_system.txt b/src/tests/databases/core_3/coord_system.txt deleted file mode 100644 index 51314bf1..00000000 --- a/src/tests/databases/core_3/coord_system.txt +++ /dev/null @@ -1 +0,0 @@ -1 1 primary_assembly test 1 default_version,sequence_level diff --git a/src/tests/databases/core_3/meta.txt b/src/tests/databases/core_3/meta.txt deleted file mode 100644 index 28ddd53d..00000000 --- a/src/tests/databases/core_3/meta.txt +++ /dev/null @@ -1,23 +0,0 @@ -12 1 assembly.accession weird02 -13 1 assembly.name jaber02 -11 1 assembly.ucsc_alias SCARYIER -14 1 gencode.version 999 -15 1 genebuild.last_geneset_update 2024-02 -3 1 organism.common_name jabberwocky -7 1 organism.division Ensembl_TEST -6 1 organism.production_name Jabberwocky -4 1 organism.scientific_name carol_jabberwocky -1 1 organism.species_taxonomy_id 6666666 -8 1 organism.strain reference -9 1 organism.strain_group testing -2 1 organism.taxonomy_id 666668 -10 1 organism.type monsters -5 1 organism.url Jabbe -17 1 genebuild.version ENS01 -18 1 genebuild.sample_gene ENSAMXG00005000318 -19 1 genebuild.sample_location KB871578.1:9766653-9817473 -20 1 strain.type test -23 1 genebuild.provider_name test -24 1 genebuild.start_date 2023-07-Ensembl -25 \N schema_version 110 -26 1 genebuild.last_geneset_update 2023-01 diff --git a/src/tests/databases/core_3/seq_region.txt b/src/tests/databases/core_3/seq_region.txt deleted file mode 100644 index a2216feb..00000000 --- a/src/tests/databases/core_3/seq_region.txt +++ /dev/null @@ -1,3 +0,0 @@ -1 TEST1_seq 1 666666 -2 TEST2_seq 1 666 -3 TEST3_seq 1 1666666 diff --git a/src/tests/databases/core_3/seq_region_attrib.txt b/src/tests/databases/core_3/seq_region_attrib.txt deleted file mode 100644 index d8dcda33..00000000 --- 
a/src/tests/databases/core_3/seq_region_attrib.txt +++ /dev/null @@ -1,6 +0,0 @@ -1 6 1 -2 6 1 -3 6 1 -1 547 nuclear_chromosome -2 547 nuclear_chromosome -3 547 mitochondrial_chromosome diff --git a/src/tests/databases/core_3/seq_region_synonym.txt b/src/tests/databases/core_3/seq_region_synonym.txt deleted file mode 100644 index de43d915..00000000 --- a/src/tests/databases/core_3/seq_region_synonym.txt +++ /dev/null @@ -1,3 +0,0 @@ -1 1 TEST1_seq 50710 -2 2 TEST2_seq 50710 -3 3 TEST3_seq 50710 diff --git a/src/tests/databases/core_3/table.sql b/src/tests/databases/core_3/table.sql deleted file mode 100644 index 953da984..00000000 --- a/src/tests/databases/core_3/table.sql +++ /dev/null @@ -1,86 +0,0 @@ -CREATE TABLE coord_system -( - coord_system_id int unsigned auto_increment - primary key, - species_id int unsigned default 1 not null, - name varchar(40) not null, - version varchar(255) null, - `rank` int not null, - attrib set ('default_version', 'sequence_level') null, - constraint name_idx - unique (name, version, species_id), - constraint rank_idx - unique (`rank`, species_id) -); - -CREATE INDEX species_idx - on coord_system (species_id); - -CREATE TABLE meta -( - meta_id int auto_increment - primary key, - species_id int unsigned default 1 null, - meta_key varchar(40) not null, - meta_value varchar(255) not null, - constraint species_key_value_idx - unique (species_id, meta_key, meta_value) -); - -CREATE INDEX species_value_idx - on meta (species_id, meta_value); - -CREATE TABLE seq_region -( - seq_region_id int unsigned auto_increment - primary key, - name varchar(255) not null, - coord_system_id int unsigned not null, - length int unsigned not null, - constraint name_cs_idx - unique (name, coord_system_id) -); - -CREATE INDEX cs_idx - on seq_region (coord_system_id); - -CREATE TABLE seq_region_attrib -( - seq_region_id int unsigned default 0 not null, - attrib_type_id smallint unsigned default 0 not null, - value text not null, - constraint region_attribx 
- unique (seq_region_id, attrib_type_id, value(500)) -); - -CREATE INDEX seq_region_idx - on seq_region_attrib (seq_region_id); - -CREATE INDEX type_val_idx - on seq_region_attrib (attrib_type_id, value(40)); - -CREATE INDEX val_only_idx - on seq_region_attrib (value(40)); - -CREATE TABLE seq_region_synonym -( - seq_region_synonym_id int unsigned auto_increment - primary key, - seq_region_id int unsigned not null, - synonym varchar(250) not null, - external_db_id int unsigned null, - constraint syn_idx - unique (synonym, seq_region_id) -); - -CREATE INDEX seq_region_idx - on seq_region_synonym (seq_region_id); - -CREATE TABLE `attrib_type` ( - `attrib_type_id` smallint(5) unsigned NOT NULL AUTO_INCREMENT, - `code` varchar(20) NOT NULL DEFAULT '', - `name` varchar(255) NOT NULL DEFAULT '', - `description` text, - PRIMARY KEY (`attrib_type_id`), - UNIQUE KEY `code_idx` (`code`) -); \ No newline at end of file diff --git a/src/tests/databases/core_4.db b/src/tests/databases/core_4.db new file mode 100644 index 00000000..2742c71a Binary files /dev/null and b/src/tests/databases/core_4.db differ diff --git a/src/tests/databases/core_4/attrib_type.txt b/src/tests/databases/core_4/attrib_type.txt deleted file mode 100644 index de5f1880..00000000 --- a/src/tests/databases/core_4/attrib_type.txt +++ /dev/null @@ -1,2 +0,0 @@ -6 toplevel Top Level Top Level Non-Redundant Sequence Region -547 sequence_location sequence_location To identify sequence locations / cellular compartments that DNA sequence comes from.Values are supposed to be SO compliant (children of the plastid_sequence SO:0000740 and nuclear_sequence SO:0000738 ): "apicoplast_chromosome", "chloroplast_chromosome", "chromoplast_chromosome", "cyanelle_chromosome", "leucoplast_chromosome", "macronuclear_chromosome", "micronuclear_chromosome", "mitochondrial_chromosome", "nuclear_chromosome". 
diff --git a/src/tests/databases/core_4/coord_system.txt b/src/tests/databases/core_4/coord_system.txt deleted file mode 100644 index 51314bf1..00000000 --- a/src/tests/databases/core_4/coord_system.txt +++ /dev/null @@ -1 +0,0 @@ -1 1 primary_assembly test 1 default_version,sequence_level diff --git a/src/tests/databases/core_4/meta.txt b/src/tests/databases/core_4/meta.txt deleted file mode 100644 index c0ecec65..00000000 --- a/src/tests/databases/core_4/meta.txt +++ /dev/null @@ -1,23 +0,0 @@ -12 1 assembly.accession weird02 -14 1 assembly.default jaber01 -13 1 assembly.name jaber01 -11 1 assembly.ucsc_alias SCARYIER -15 1 gencode.version 999 -3 1 organism.common_name jabberwocky -7 1 organism.division Ensembl_TEST -6 1 organism.production_name Jabberwocky -4 1 organism.scientific_name carol_jabberwocky -1 1 organism.species_taxonomy_id 6666666 -8 1 organism.strain reference -9 1 organism.strain_group testing -2 1 organism.taxonomy_id 666668 -10 1 organism.type monsters -5 1 organism.url Jabbe -17 1 genebuild.version ENS02 -18 1 genebuild.sample_gene ENSAMXG00005000318 -19 1 genebuild.sample_location KB871578.1:9766653-9817473 -20 1 strain.type test -23 1 genebuild.provider_name test -24 1 genebuild.start_date 2023-07-Ensembl -25 \N schema_version 110 -26 1 genebuild.last_geneset_update 2023-01 diff --git a/src/tests/databases/core_4/seq_region.txt b/src/tests/databases/core_4/seq_region.txt deleted file mode 100644 index a2216feb..00000000 --- a/src/tests/databases/core_4/seq_region.txt +++ /dev/null @@ -1,3 +0,0 @@ -1 TEST1_seq 1 666666 -2 TEST2_seq 1 666 -3 TEST3_seq 1 1666666 diff --git a/src/tests/databases/core_4/seq_region_attrib.txt b/src/tests/databases/core_4/seq_region_attrib.txt deleted file mode 100644 index d8dcda33..00000000 --- a/src/tests/databases/core_4/seq_region_attrib.txt +++ /dev/null @@ -1,6 +0,0 @@ -1 6 1 -2 6 1 -3 6 1 -1 547 nuclear_chromosome -2 547 nuclear_chromosome -3 547 mitochondrial_chromosome diff --git 
a/src/tests/databases/core_4/seq_region_synonym.txt b/src/tests/databases/core_4/seq_region_synonym.txt deleted file mode 100644 index de43d915..00000000 --- a/src/tests/databases/core_4/seq_region_synonym.txt +++ /dev/null @@ -1,3 +0,0 @@ -1 1 TEST1_seq 50710 -2 2 TEST2_seq 50710 -3 3 TEST3_seq 50710 diff --git a/src/tests/databases/core_4/table.sql b/src/tests/databases/core_4/table.sql deleted file mode 100644 index 22e5c915..00000000 --- a/src/tests/databases/core_4/table.sql +++ /dev/null @@ -1,87 +0,0 @@ -CREATE TABLE coord_system -( - coord_system_id int unsigned auto_increment - primary key, - species_id int unsigned default 1 not null, - name varchar(40) not null, - version varchar(255) null, - `rank` int not null, - attrib set ('default_version', 'sequence_level') null, - constraint name_idx - unique (name, version, species_id), - constraint rank_idx - unique (`rank`, species_id) -); - -CREATE INDEX species_idx - on coord_system (species_id); - -CREATE TABLE meta -( - meta_id int auto_increment - primary key, - species_id int unsigned default 1 null, - meta_key varchar(40) not null, - meta_value varchar(255) not null, - constraint species_key_value_idx - unique (species_id, meta_key, meta_value) -); - -CREATE INDEX species_value_idx - on meta (species_id, meta_value); - -CREATE TABLE seq_region -( - seq_region_id int unsigned auto_increment - primary key, - name varchar(255) not null, - coord_system_id int unsigned not null, - length int unsigned not null, - constraint name_cs_idx - unique (name, coord_system_id) -); - -CREATE INDEX cs_idx - on seq_region (coord_system_id); - -CREATE TABLE seq_region_attrib -( - seq_region_id int unsigned default 0 not null, - attrib_type_id smallint unsigned default 0 not null, - value text not null, - constraint region_attribx - unique (seq_region_id, attrib_type_id, value(500)) -); - -CREATE INDEX seq_region_idx - on seq_region_attrib (seq_region_id); - -CREATE INDEX type_val_idx - on seq_region_attrib (attrib_type_id, 
value(40)); - -CREATE INDEX val_only_idx - on seq_region_attrib (value(40)); - -CREATE TABLE seq_region_synonym -( - seq_region_synonym_id int unsigned auto_increment - primary key, - seq_region_id int unsigned not null, - synonym varchar(250) not null, - external_db_id int unsigned null, - constraint syn_idx - unique (synonym, seq_region_id) -); - -CREATE INDEX seq_region_idx - on seq_region_synonym (seq_region_id); - -CREATE TABLE `attrib_type` ( - `attrib_type_id` smallint(5) unsigned NOT NULL AUTO_INCREMENT, - `code` varchar(20) NOT NULL DEFAULT '', - `name` varchar(255) NOT NULL DEFAULT '', - `description` text, - PRIMARY KEY (`attrib_type_id`), - UNIQUE KEY `code_idx` (`code`) -); - diff --git a/src/tests/databases/core_5.db b/src/tests/databases/core_5.db new file mode 100644 index 00000000..9e66d495 Binary files /dev/null and b/src/tests/databases/core_5.db differ diff --git a/src/tests/databases/core_5/attrib_type.txt b/src/tests/databases/core_5/attrib_type.txt deleted file mode 100644 index de5f1880..00000000 --- a/src/tests/databases/core_5/attrib_type.txt +++ /dev/null @@ -1,2 +0,0 @@ -6 toplevel Top Level Top Level Non-Redundant Sequence Region -547 sequence_location sequence_location To identify sequence locations / cellular compartments that DNA sequence comes from.Values are supposed to be SO compliant (children of the plastid_sequence SO:0000740 and nuclear_sequence SO:0000738 ): "apicoplast_chromosome", "chloroplast_chromosome", "chromoplast_chromosome", "cyanelle_chromosome", "leucoplast_chromosome", "macronuclear_chromosome", "micronuclear_chromosome", "mitochondrial_chromosome", "nuclear_chromosome". 
diff --git a/src/tests/databases/core_5/coord_system.txt b/src/tests/databases/core_5/coord_system.txt deleted file mode 100644 index 51314bf1..00000000 --- a/src/tests/databases/core_5/coord_system.txt +++ /dev/null @@ -1 +0,0 @@ -1 1 primary_assembly test 1 default_version,sequence_level diff --git a/src/tests/databases/core_5/meta.txt b/src/tests/databases/core_5/meta.txt deleted file mode 100644 index 3432d48d..00000000 --- a/src/tests/databases/core_5/meta.txt +++ /dev/null @@ -1,20 +0,0 @@ -12 1 assembly.accession test1 -14 1 assembly.default test846 -13 1 assembly.name test1 -11 1 assembly.ucsc_alias test1 -7 1 organism.division Ensembl_TEST -6 1 organism.production_name test_case_5 -4 1 organism.scientific_name Hominoide -8 1 organism.strain reference -9 1 organism.strain_group Hominoide -2 1 organism.taxonomy_id 9940 -10 1 organism.type monsters -5 1 organism.url Hominoide -17 1 genebuild.version ENS01 -18 1 genebuild.sample_gene ENSAMXG00005000318 -19 1 genebuild.sample_location KB871578.1:9766653-9817473 -20 1 strain.type test -23 1 genebuild.provider_name removed_for_test -24 1 genebuild.start_date 2023-07-Ensembl -25 \N schema_version 110 -26 1 genebuild.last_geneset_update 2023-01 diff --git a/src/tests/databases/core_5/seq_region.txt b/src/tests/databases/core_5/seq_region.txt deleted file mode 100644 index a2216feb..00000000 --- a/src/tests/databases/core_5/seq_region.txt +++ /dev/null @@ -1,3 +0,0 @@ -1 TEST1_seq 1 666666 -2 TEST2_seq 1 666 -3 TEST3_seq 1 1666666 diff --git a/src/tests/databases/core_5/seq_region_attrib.txt b/src/tests/databases/core_5/seq_region_attrib.txt deleted file mode 100644 index d8dcda33..00000000 --- a/src/tests/databases/core_5/seq_region_attrib.txt +++ /dev/null @@ -1,6 +0,0 @@ -1 6 1 -2 6 1 -3 6 1 -1 547 nuclear_chromosome -2 547 nuclear_chromosome -3 547 mitochondrial_chromosome diff --git a/src/tests/databases/core_5/seq_region_synonym.txt b/src/tests/databases/core_5/seq_region_synonym.txt deleted file mode 100644 
index de43d915..00000000 --- a/src/tests/databases/core_5/seq_region_synonym.txt +++ /dev/null @@ -1,3 +0,0 @@ -1 1 TEST1_seq 50710 -2 2 TEST2_seq 50710 -3 3 TEST3_seq 50710 diff --git a/src/tests/databases/core_5/table.sql b/src/tests/databases/core_5/table.sql deleted file mode 100644 index 22e5c915..00000000 --- a/src/tests/databases/core_5/table.sql +++ /dev/null @@ -1,87 +0,0 @@ -CREATE TABLE coord_system -( - coord_system_id int unsigned auto_increment - primary key, - species_id int unsigned default 1 not null, - name varchar(40) not null, - version varchar(255) null, - `rank` int not null, - attrib set ('default_version', 'sequence_level') null, - constraint name_idx - unique (name, version, species_id), - constraint rank_idx - unique (`rank`, species_id) -); - -CREATE INDEX species_idx - on coord_system (species_id); - -CREATE TABLE meta -( - meta_id int auto_increment - primary key, - species_id int unsigned default 1 null, - meta_key varchar(40) not null, - meta_value varchar(255) not null, - constraint species_key_value_idx - unique (species_id, meta_key, meta_value) -); - -CREATE INDEX species_value_idx - on meta (species_id, meta_value); - -CREATE TABLE seq_region -( - seq_region_id int unsigned auto_increment - primary key, - name varchar(255) not null, - coord_system_id int unsigned not null, - length int unsigned not null, - constraint name_cs_idx - unique (name, coord_system_id) -); - -CREATE INDEX cs_idx - on seq_region (coord_system_id); - -CREATE TABLE seq_region_attrib -( - seq_region_id int unsigned default 0 not null, - attrib_type_id smallint unsigned default 0 not null, - value text not null, - constraint region_attribx - unique (seq_region_id, attrib_type_id, value(500)) -); - -CREATE INDEX seq_region_idx - on seq_region_attrib (seq_region_id); - -CREATE INDEX type_val_idx - on seq_region_attrib (attrib_type_id, value(40)); - -CREATE INDEX val_only_idx - on seq_region_attrib (value(40)); - -CREATE TABLE seq_region_synonym -( - 
seq_region_synonym_id int unsigned auto_increment - primary key, - seq_region_id int unsigned not null, - synonym varchar(250) not null, - external_db_id int unsigned null, - constraint syn_idx - unique (synonym, seq_region_id) -); - -CREATE INDEX seq_region_idx - on seq_region_synonym (seq_region_id); - -CREATE TABLE `attrib_type` ( - `attrib_type_id` smallint(5) unsigned NOT NULL AUTO_INCREMENT, - `code` varchar(20) NOT NULL DEFAULT '', - `name` varchar(255) NOT NULL DEFAULT '', - `description` text, - PRIMARY KEY (`attrib_type_id`), - UNIQUE KEY `code_idx` (`code`) -); - diff --git a/src/tests/databases/core_6.db b/src/tests/databases/core_6.db new file mode 100644 index 00000000..81fb3d8d Binary files /dev/null and b/src/tests/databases/core_6.db differ diff --git a/src/tests/databases/core_6/attrib_type.txt b/src/tests/databases/core_6/attrib_type.txt deleted file mode 100644 index de5f1880..00000000 --- a/src/tests/databases/core_6/attrib_type.txt +++ /dev/null @@ -1,2 +0,0 @@ -6 toplevel Top Level Top Level Non-Redundant Sequence Region -547 sequence_location sequence_location To identify sequence locations / cellular compartments that DNA sequence comes from.Values are supposed to be SO compliant (children of the plastid_sequence SO:0000740 and nuclear_sequence SO:0000738 ): "apicoplast_chromosome", "chloroplast_chromosome", "chromoplast_chromosome", "cyanelle_chromosome", "leucoplast_chromosome", "macronuclear_chromosome", "micronuclear_chromosome", "mitochondrial_chromosome", "nuclear_chromosome". 
diff --git a/src/tests/databases/core_6/coord_system.txt b/src/tests/databases/core_6/coord_system.txt deleted file mode 100644 index 51314bf1..00000000 --- a/src/tests/databases/core_6/coord_system.txt +++ /dev/null @@ -1 +0,0 @@ -1 1 primary_assembly test 1 default_version,sequence_level diff --git a/src/tests/databases/core_6/meta.txt b/src/tests/databases/core_6/meta.txt deleted file mode 100644 index e86f1e13..00000000 --- a/src/tests/databases/core_6/meta.txt +++ /dev/null @@ -1,25 +0,0 @@ -12 1 assembly.accession weird01 -14 1 assembly.default jaber01 -13 1 assembly.name jaber01 -11 1 assembly.ucsc_alias SCARY -15 1 gencode.version 999 -16 1 genebuild.last_geneset_update 01 -3 1 organism.common_name jabberwocky -7 1 organism.division Ensembl_TEST -6 1 organism.production_name Jabberwocky -4 1 organism.scientific_name carol_jabberwocky -1 1 organism.species_taxonomy_id 6666666 -8 1 organism.strain reference -9 1 organism.strain_group testing -2 1 organism.taxonomy_id 666668 -10 1 organism.type monsters -5 1 organism.url Jabbe -17 1 genebuild.version ENS01 -18 1 genebuild.sample_gene ENSAMXG00005000318 -19 1 genebuild.sample_location KB871578.1:9766653-9817473 -20 1 strain.type test -21 1 genome.genome_uuid 90720316-006c-470b-a7dd-82d28f952264 -23 1 genebuild.provider_name test -24 1 genebuild.start_date 2023-07-Ensembl -25 \N schema_version 110 -26 1 genebuild.last_geneset_update 2023-01 diff --git a/src/tests/databases/core_6/seq_region.txt b/src/tests/databases/core_6/seq_region.txt deleted file mode 100644 index a2216feb..00000000 --- a/src/tests/databases/core_6/seq_region.txt +++ /dev/null @@ -1,3 +0,0 @@ -1 TEST1_seq 1 666666 -2 TEST2_seq 1 666 -3 TEST3_seq 1 1666666 diff --git a/src/tests/databases/core_6/seq_region_attrib.txt b/src/tests/databases/core_6/seq_region_attrib.txt deleted file mode 100644 index d8dcda33..00000000 --- a/src/tests/databases/core_6/seq_region_attrib.txt +++ /dev/null @@ -1,6 +0,0 @@ -1 6 1 -2 6 1 -3 6 1 -1 547 
nuclear_chromosome -2 547 nuclear_chromosome -3 547 mitochondrial_chromosome diff --git a/src/tests/databases/core_6/seq_region_synonym.txt b/src/tests/databases/core_6/seq_region_synonym.txt deleted file mode 100644 index de43d915..00000000 --- a/src/tests/databases/core_6/seq_region_synonym.txt +++ /dev/null @@ -1,3 +0,0 @@ -1 1 TEST1_seq 50710 -2 2 TEST2_seq 50710 -3 3 TEST3_seq 50710 diff --git a/src/tests/databases/core_6/table.sql b/src/tests/databases/core_6/table.sql deleted file mode 100644 index 953da984..00000000 --- a/src/tests/databases/core_6/table.sql +++ /dev/null @@ -1,86 +0,0 @@ -CREATE TABLE coord_system -( - coord_system_id int unsigned auto_increment - primary key, - species_id int unsigned default 1 not null, - name varchar(40) not null, - version varchar(255) null, - `rank` int not null, - attrib set ('default_version', 'sequence_level') null, - constraint name_idx - unique (name, version, species_id), - constraint rank_idx - unique (`rank`, species_id) -); - -CREATE INDEX species_idx - on coord_system (species_id); - -CREATE TABLE meta -( - meta_id int auto_increment - primary key, - species_id int unsigned default 1 null, - meta_key varchar(40) not null, - meta_value varchar(255) not null, - constraint species_key_value_idx - unique (species_id, meta_key, meta_value) -); - -CREATE INDEX species_value_idx - on meta (species_id, meta_value); - -CREATE TABLE seq_region -( - seq_region_id int unsigned auto_increment - primary key, - name varchar(255) not null, - coord_system_id int unsigned not null, - length int unsigned not null, - constraint name_cs_idx - unique (name, coord_system_id) -); - -CREATE INDEX cs_idx - on seq_region (coord_system_id); - -CREATE TABLE seq_region_attrib -( - seq_region_id int unsigned default 0 not null, - attrib_type_id smallint unsigned default 0 not null, - value text not null, - constraint region_attribx - unique (seq_region_id, attrib_type_id, value(500)) -); - -CREATE INDEX seq_region_idx - on 
seq_region_attrib (seq_region_id); - -CREATE INDEX type_val_idx - on seq_region_attrib (attrib_type_id, value(40)); - -CREATE INDEX val_only_idx - on seq_region_attrib (value(40)); - -CREATE TABLE seq_region_synonym -( - seq_region_synonym_id int unsigned auto_increment - primary key, - seq_region_id int unsigned not null, - synonym varchar(250) not null, - external_db_id int unsigned null, - constraint syn_idx - unique (synonym, seq_region_id) -); - -CREATE INDEX seq_region_idx - on seq_region_synonym (seq_region_id); - -CREATE TABLE `attrib_type` ( - `attrib_type_id` smallint(5) unsigned NOT NULL AUTO_INCREMENT, - `code` varchar(20) NOT NULL DEFAULT '', - `name` varchar(255) NOT NULL DEFAULT '', - `description` text, - PRIMARY KEY (`attrib_type_id`), - UNIQUE KEY `code_idx` (`code`) -); \ No newline at end of file diff --git a/src/tests/databases/core_7.db b/src/tests/databases/core_7.db new file mode 100644 index 00000000..cad377be Binary files /dev/null and b/src/tests/databases/core_7.db differ diff --git a/src/tests/databases/core_7/attrib_type.txt b/src/tests/databases/core_7/attrib_type.txt deleted file mode 100644 index 59c569dd..00000000 --- a/src/tests/databases/core_7/attrib_type.txt +++ /dev/null @@ -1,3 +0,0 @@ -6 toplevel Top Level Top Level Non-Redundant Sequence Region -547 sequence_location sequence_location To identify sequence locations / cellular compartments that DNA sequence comes from.Values are supposed to be SO compliant (children of the plastid_sequence SO:0000740 and nuclear_sequence SO:0000738 ): "apicoplast_chromosome", "chloroplast_chromosome", "chromoplast_chromosome", "cyanelle_chromosome", "leucoplast_chromosome", "macronuclear_chromosome", "micronuclear_chromosome", "mitochondrial_chromosome", "nuclear_chromosome". 
-316 circular_seq Circular sequence Circular chromosome or plasmid molecule \ No newline at end of file diff --git a/src/tests/databases/core_7/coord_system.txt b/src/tests/databases/core_7/coord_system.txt deleted file mode 100644 index 51314bf1..00000000 --- a/src/tests/databases/core_7/coord_system.txt +++ /dev/null @@ -1 +0,0 @@ -1 1 primary_assembly test 1 default_version,sequence_level diff --git a/src/tests/databases/core_7/meta.txt b/src/tests/databases/core_7/meta.txt deleted file mode 100644 index 4ce72caa..00000000 --- a/src/tests/databases/core_7/meta.txt +++ /dev/null @@ -1,26 +0,0 @@ -12 1 assembly.accession test1 -14 1 assembly.default NewTest -13 1 assembly.name jaber01 -11 1 assembly.ucsc_alias test_alias -15 1 gencode.version 999 -16 1 genebuild.last_geneset_update 01 -3 1 organism.common_name jabberwocky -7 1 organism.division Ensembl_TEST -6 1 organism.production_name Jabberwocky -4 1 organism.scientific_name carol_jabberwocky -1 1 organism.species_taxonomy_id 6666666 -8 1 organism.strain reference -9 1 organism.strain_group testing -2 1 organism.taxonomy_id 666668 -10 1 organism.type monsters -5 1 organism.url Jabbe -17 1 genebuild.version ENS01 -18 1 genebuild.sample_gene ENSAMXG00005000318 -19 1 genebuild.sample_location KB871578.1:9766653-9817473 -20 1 strain.type test -23 1 genebuild.provider_name testfornamenew -24 1 genebuild.start_date 2023-08-Ensembl -25 1 genebuild.havana_datafreeze_date test2 -26 \N schema_version 110 -27 1 assembly.stats.total_coding_sequence_length 8989 -28 1 genebuild.last_geneset_update 2023-01 diff --git a/src/tests/databases/core_7/seq_region.txt b/src/tests/databases/core_7/seq_region.txt deleted file mode 100644 index 535c1393..00000000 --- a/src/tests/databases/core_7/seq_region.txt +++ /dev/null @@ -1,3 +0,0 @@ -1 TEST1_seq_update 1 666666 -2 TEST2_seq_update 1 666 -3 TEST3_seq_update 1 1666666 diff --git a/src/tests/databases/core_7/seq_region_attrib.txt b/src/tests/databases/core_7/seq_region_attrib.txt 
deleted file mode 100644 index aad2591e..00000000 --- a/src/tests/databases/core_7/seq_region_attrib.txt +++ /dev/null @@ -1,8 +0,0 @@ -1 6 1 -2 6 1 -3 6 1 -1 547 nuclear_chromosome -2 547 nuclear_chromosome -3 547 mitochondrial_chromosome -1 316 1 -2 316 0 \ No newline at end of file diff --git a/src/tests/databases/core_7/seq_region_synonym.txt b/src/tests/databases/core_7/seq_region_synonym.txt deleted file mode 100644 index de43d915..00000000 --- a/src/tests/databases/core_7/seq_region_synonym.txt +++ /dev/null @@ -1,3 +0,0 @@ -1 1 TEST1_seq 50710 -2 2 TEST2_seq 50710 -3 3 TEST3_seq 50710 diff --git a/src/tests/databases/core_7/table.sql b/src/tests/databases/core_7/table.sql deleted file mode 100644 index 953da984..00000000 --- a/src/tests/databases/core_7/table.sql +++ /dev/null @@ -1,86 +0,0 @@ -CREATE TABLE coord_system -( - coord_system_id int unsigned auto_increment - primary key, - species_id int unsigned default 1 not null, - name varchar(40) not null, - version varchar(255) null, - `rank` int not null, - attrib set ('default_version', 'sequence_level') null, - constraint name_idx - unique (name, version, species_id), - constraint rank_idx - unique (`rank`, species_id) -); - -CREATE INDEX species_idx - on coord_system (species_id); - -CREATE TABLE meta -( - meta_id int auto_increment - primary key, - species_id int unsigned default 1 null, - meta_key varchar(40) not null, - meta_value varchar(255) not null, - constraint species_key_value_idx - unique (species_id, meta_key, meta_value) -); - -CREATE INDEX species_value_idx - on meta (species_id, meta_value); - -CREATE TABLE seq_region -( - seq_region_id int unsigned auto_increment - primary key, - name varchar(255) not null, - coord_system_id int unsigned not null, - length int unsigned not null, - constraint name_cs_idx - unique (name, coord_system_id) -); - -CREATE INDEX cs_idx - on seq_region (coord_system_id); - -CREATE TABLE seq_region_attrib -( - seq_region_id int unsigned default 0 not null, - 
attrib_type_id smallint unsigned default 0 not null, - value text not null, - constraint region_attribx - unique (seq_region_id, attrib_type_id, value(500)) -); - -CREATE INDEX seq_region_idx - on seq_region_attrib (seq_region_id); - -CREATE INDEX type_val_idx - on seq_region_attrib (attrib_type_id, value(40)); - -CREATE INDEX val_only_idx - on seq_region_attrib (value(40)); - -CREATE TABLE seq_region_synonym -( - seq_region_synonym_id int unsigned auto_increment - primary key, - seq_region_id int unsigned not null, - synonym varchar(250) not null, - external_db_id int unsigned null, - constraint syn_idx - unique (synonym, seq_region_id) -); - -CREATE INDEX seq_region_idx - on seq_region_synonym (seq_region_id); - -CREATE TABLE `attrib_type` ( - `attrib_type_id` smallint(5) unsigned NOT NULL AUTO_INCREMENT, - `code` varchar(20) NOT NULL DEFAULT '', - `name` varchar(255) NOT NULL DEFAULT '', - `description` text, - PRIMARY KEY (`attrib_type_id`), - UNIQUE KEY `code_idx` (`code`) -); \ No newline at end of file diff --git a/src/tests/databases/core_8.db b/src/tests/databases/core_8.db new file mode 100644 index 00000000..b2d77e66 Binary files /dev/null and b/src/tests/databases/core_8.db differ diff --git a/src/tests/databases/core_8/attrib_type.txt b/src/tests/databases/core_8/attrib_type.txt deleted file mode 100644 index 59c569dd..00000000 --- a/src/tests/databases/core_8/attrib_type.txt +++ /dev/null @@ -1,3 +0,0 @@ -6 toplevel Top Level Top Level Non-Redundant Sequence Region -547 sequence_location sequence_location To identify sequence locations / cellular compartments that DNA sequence comes from.Values are supposed to be SO compliant (children of the plastid_sequence SO:0000740 and nuclear_sequence SO:0000738 ): "apicoplast_chromosome", "chloroplast_chromosome", "chromoplast_chromosome", "cyanelle_chromosome", "leucoplast_chromosome", "macronuclear_chromosome", "micronuclear_chromosome", "mitochondrial_chromosome", "nuclear_chromosome". 
-316 circular_seq Circular sequence Circular chromosome or plasmid molecule \ No newline at end of file diff --git a/src/tests/databases/core_8/coord_system.txt b/src/tests/databases/core_8/coord_system.txt deleted file mode 100644 index 51314bf1..00000000 --- a/src/tests/databases/core_8/coord_system.txt +++ /dev/null @@ -1 +0,0 @@ -1 1 primary_assembly test 1 default_version,sequence_level diff --git a/src/tests/databases/core_8/meta.txt b/src/tests/databases/core_8/meta.txt deleted file mode 100644 index 725a1ac8..00000000 --- a/src/tests/databases/core_8/meta.txt +++ /dev/null @@ -1,25 +0,0 @@ -12 1 assembly.accession GCA_000002985.3 -14 1 assembly.default jaber01 -13 1 assembly.name jaber01 -11 1 assembly.ucsc_alias SCARY -15 1 gencode.version 999 -16 1 genebuild.last_geneset_update 01 -3 1 organism.biosample_id SAMN04256190 -3 1 organism.common_name Caenorhabditis elegans (PRJNA13758) -7 1 organism.division Ensembl_TEST -6 1 organism.production_name Caenorhabditis_elegans -4 1 organism.scientific_name Caenorhabditis elegans -1 1 organism.species_taxonomy_id 6239 -8 1 organism.strain N2 -9 1 organism.strain_group testing -2 1 organism.taxonomy_id 6239 -10 1 organism.type monsters -5 1 organism.url Jabbe -17 1 genebuild.version EXT01 -18 1 genebuild.sample_gene ENSAMXG00005000318 -19 1 genebuild.sample_location KB871578.1:9766653-9817473 -20 1 strain.type test -23 1 genebuild.provider_name test -24 1 genebuild.start_date 2023-07-Ensembl -25 \N schema_version 110 -29 1 genebuild.last_geneset_update 2023-01 diff --git a/src/tests/databases/core_8/seq_region.txt b/src/tests/databases/core_8/seq_region.txt deleted file mode 100644 index e67ee5a7..00000000 --- a/src/tests/databases/core_8/seq_region.txt +++ /dev/null @@ -1,3 +0,0 @@ -1 TEST1_seqA 1 666666 -2 TEST2_seqB 1 666 -3 TEST3_seqC 1 1666666 diff --git a/src/tests/databases/core_8/seq_region_attrib.txt b/src/tests/databases/core_8/seq_region_attrib.txt deleted file mode 100644 index aad2591e..00000000 --- 
a/src/tests/databases/core_8/seq_region_attrib.txt +++ /dev/null @@ -1,8 +0,0 @@ -1 6 1 -2 6 1 -3 6 1 -1 547 nuclear_chromosome -2 547 nuclear_chromosome -3 547 mitochondrial_chromosome -1 316 1 -2 316 0 \ No newline at end of file diff --git a/src/tests/databases/core_8/seq_region_synonym.txt b/src/tests/databases/core_8/seq_region_synonym.txt deleted file mode 100644 index de43d915..00000000 --- a/src/tests/databases/core_8/seq_region_synonym.txt +++ /dev/null @@ -1,3 +0,0 @@ -1 1 TEST1_seq 50710 -2 2 TEST2_seq 50710 -3 3 TEST3_seq 50710 diff --git a/src/tests/databases/core_8/table.sql b/src/tests/databases/core_8/table.sql deleted file mode 100644 index 953da984..00000000 --- a/src/tests/databases/core_8/table.sql +++ /dev/null @@ -1,86 +0,0 @@ -CREATE TABLE coord_system -( - coord_system_id int unsigned auto_increment - primary key, - species_id int unsigned default 1 not null, - name varchar(40) not null, - version varchar(255) null, - `rank` int not null, - attrib set ('default_version', 'sequence_level') null, - constraint name_idx - unique (name, version, species_id), - constraint rank_idx - unique (`rank`, species_id) -); - -CREATE INDEX species_idx - on coord_system (species_id); - -CREATE TABLE meta -( - meta_id int auto_increment - primary key, - species_id int unsigned default 1 null, - meta_key varchar(40) not null, - meta_value varchar(255) not null, - constraint species_key_value_idx - unique (species_id, meta_key, meta_value) -); - -CREATE INDEX species_value_idx - on meta (species_id, meta_value); - -CREATE TABLE seq_region -( - seq_region_id int unsigned auto_increment - primary key, - name varchar(255) not null, - coord_system_id int unsigned not null, - length int unsigned not null, - constraint name_cs_idx - unique (name, coord_system_id) -); - -CREATE INDEX cs_idx - on seq_region (coord_system_id); - -CREATE TABLE seq_region_attrib -( - seq_region_id int unsigned default 0 not null, - attrib_type_id smallint unsigned default 0 not null, - 
value text not null, - constraint region_attribx - unique (seq_region_id, attrib_type_id, value(500)) -); - -CREATE INDEX seq_region_idx - on seq_region_attrib (seq_region_id); - -CREATE INDEX type_val_idx - on seq_region_attrib (attrib_type_id, value(40)); - -CREATE INDEX val_only_idx - on seq_region_attrib (value(40)); - -CREATE TABLE seq_region_synonym -( - seq_region_synonym_id int unsigned auto_increment - primary key, - seq_region_id int unsigned not null, - synonym varchar(250) not null, - external_db_id int unsigned null, - constraint syn_idx - unique (synonym, seq_region_id) -); - -CREATE INDEX seq_region_idx - on seq_region_synonym (seq_region_id); - -CREATE TABLE `attrib_type` ( - `attrib_type_id` smallint(5) unsigned NOT NULL AUTO_INCREMENT, - `code` varchar(20) NOT NULL DEFAULT '', - `name` varchar(255) NOT NULL DEFAULT '', - `description` text, - PRIMARY KEY (`attrib_type_id`), - UNIQUE KEY `code_idx` (`code`) -); \ No newline at end of file diff --git a/src/tests/databases/ensembl_genome_metadata.db b/src/tests/databases/ensembl_genome_metadata.db new file mode 100644 index 00000000..cbbab0c0 Binary files /dev/null and b/src/tests/databases/ensembl_genome_metadata.db differ diff --git a/src/tests/databases/ensembl_genome_metadata/assembly.txt b/src/tests/databases/ensembl_genome_metadata/assembly.txt deleted file mode 100644 index 084db5df..00000000 --- a/src/tests/databases/ensembl_genome_metadata/assembly.txt +++ /dev/null @@ -1,19 +0,0 @@ -1 \N GCA_000005845.2 chromosome ASM584v2 \N ASM584v2 \N 2023-09-22 15:01:43.000000 ASM584v2 \N 532aa68f-6500-404e-a470-8afb718a770a 1 asm584v2 -4 \N GCA_018473315.1 primary_assembly HG03540.alt.pat.f1_v2 \N HG03540.alt.pat.f1_v2 \N 2023-09-22 15:02:00.000000 HG03540.alt.pat.f1_v2 \N 9d2dc346-358a-4c70-8fd8-3ff194246a76 0 \N -5 \N GCA_018469415.1 primary_assembly HG03516.alt.pat.f1_v2 \N HG03516.alt.pat.f1_v2 \N 2023-09-22 15:02:01.000000 HG03516.alt.pat.f1_v2 \N 1551e511-bde7-40cf-95cd-de4059678c6f 0 \N -6 \N 
GCA_018469875.1 primary_assembly HG02622.pri.mat.f1_v2 \N HG02622.pri.mat.f1_v2 \N 2023-09-22 15:02:02.000000 HG02622.pri.mat.f1_v2 \N 960de156-eced-4916-ac64-263d9a89dc3b 0 \N -7 \N GCA_018505825.1 primary_assembly HG02109.pri.mat.f1_v2 \N HG02109.pri.mat.f1_v2 \N 2023-09-22 15:02:04.000000 HG02109.pri.mat.f1_v2 \N fc4e0ec5-7230-44b9-92aa-6788356158a8 0 \N -9 \N GCA_018852615.1 primary_assembly HG002.pri.mat.f1_v2 \N HG002.pri.mat.f1_v2 \N 2023-09-22 15:02:11.000000 HG002.pri.mat.f1_v2 \N 96b3f68d-d3d2-4107-a003-39cb0d67075f 0 \N -15 \N GCA_000002765.2 chromosome ASM276v2 \N ASM276v2 \N 2023-09-22 15:03:01.000000 ASM276v2 \N 23d2caa4-5120-4cc7-a73a-42aad4b6b1d9 1 asm276v2 -18 \N GCA_021950905.1 primary_assembly HG002.pat.cur.20211005 \N HG002.pat.cur.20211005 \N 2023-09-22 15:03:01.000000 HG002.pat.cur.20211005 \N 7a191f4e-0840-4aed-9302-8fab1157a361 0 \N -40 hg19 GCA_000001405.14 chromosome GRCh37.p13 \N GRCh37 \N 2023-09-22 15:03:21.000000 GRCh37.p13 \N 9d6b239c-46dd-4c79-bc29-1089f348d31d 0 grch37 -79 \N GCA_900519105.1 chromosome IWGSC \N IWGSC \N 2023-09-22 15:04:29.000000 IWGSC \N 36d6c4f3-8072-4ae3-a485-84a070e725e3 1 iwgsc -92 hg38 GCA_000001405.29 chromosome GRCh38.p14 \N GRCh38 \N 2023-09-22 15:04:45.000000 GRCh38.p14 \N fd7fea38-981a-4d73-a879-6f9daef86f08 1 grch38 -97 \N GCA_018505865.1 primary_assembly HG02109.alt.pat.f1_v2 \N HG02109.alt.pat.f1_v2 \N 2023-09-22 15:04:50.000000 HG02109.alt.pat.f1_v2 \N 373c34c3-d482-4ebb-8f48-baee7c548583 0 \N -100 \N GCA_018852605.1 primary_assembly HG002.alt.pat.f1_v2 \N HG002.alt.pat.f1_v2 \N 2023-09-22 15:04:53.000000 HG002.alt.pat.f1_v2 \N b6883d52-cc9d-43d2-bdbd-97bdaf903cec 0 \N -107 \N GCA_018469925.1 primary_assembly HG02622.alt.pat.f1_v2 \N HG02622.alt.pat.f1_v2 \N 2023-09-22 15:04:56.000000 HG02622.alt.pat.f1_v2 \N a283efd6-d125-47df-8b3c-4757ae496231 0 \N -135 \N GCA_018469425.1 primary_assembly HG03516.pri.mat.f1_v2 \N HG03516.pri.mat.f1_v2 \N 2023-09-22 15:05:37.000000 HG03516.pri.mat.f1_v2 \N 
93f7cd36-49e3-4c89-826b-3b2e4be0c40a 0 \N -180 \N GCA_021951015.1 primary_assembly HG002.mat.cur.20211005 \N HG002.mat.cur.20211005 \N 2023-09-22 15:06:39.000000 HG002.mat.cur.20211005 \N 696aa33e-f239-460e-9fcc-b6bb6908d726 0 \N -186 \N GCA_018473295.1 primary_assembly HG03540.pri.mat.f1_v2 \N HG03540.pri.mat.f1_v2 \N 2023-09-22 15:06:43.000000 HG03540.pri.mat.f1_v2 \N 8c71dc33-a49f-4be3-a3ad-4404fb374344 0 \N -216 \N GCA_000146045.2 chromosome R64-1-1 \N R64-1-1 \N 2023-09-22 15:06:55.000000 R64-1-1 \N 86cb493f-57cf-4c5a-8358-ef69952baf03 1 r64-1-1 -219 \N GCA_000002985.3 chromosome WBcel235 \N WBcel235 \N 2023-09-22 15:06:58.000000 WBcel235 \N 2598e56f-a579-4fec-9525-0939563056bd 1 wbcel235 diff --git a/src/tests/databases/ensembl_genome_metadata/assembly_sequence.txt b/src/tests/databases/ensembl_genome_metadata/assembly_sequence.txt deleted file mode 100644 index d89dd85e..00000000 --- a/src/tests/databases/ensembl_genome_metadata/assembly_sequence.txt +++ /dev/null @@ -1,113 +0,0 @@ -1871 1 1 1 249250621 SO:0000738 1b22b98cdeb4a9304cb5d48026a85128 40 1 S_KjnFVz-FE7M0W6yoaUDgYxLPc1jyWU 0 -1872 10 10 1 135534747 SO:0000738 988c28e000e84c26d552359af1ea2e1d 40 10 -BOZ8Esn8J88qDwNiSEwUr5425UXdiGX 0 -1873 11 11 1 135006516 SO:0000738 98c59049a2df285c76ffb1c6db8f8b96 40 11 XXi2_O1ly-CCOi3HP5TypAw7LtC6niFG 0 -1874 12 12 1 133851895 SO:0000738 51851ac0e1a115847ad36449b0015864 40 12 105bBysLoDFQHhajooTAUyUkNiZ8LJEH 0 -1875 13 13 1 115169878 SO:0000738 283f8d7892baa81b510a015719ca7b0b 40 13 Ewb9qlgTqN6e_XQiRVYpoUfZJHXeiUfH 0 -1876 14 14 1 107349540 SO:0000738 98f3cae32b2a2e9524bc19813927542e 40 14 5Ji6FGEKfejK1U6BMScqrdKJK8GqmIGf 0 -1877 15 15 1 102531392 SO:0000738 e5645a794a8238215b2cd77acb95a078 40 15 zIMZb3Ft7RdWa5XYq0PxIlezLY2ccCgt 0 -1878 16 16 1 90354753 SO:0000738 fc9b1a7b42b97a864f56b348b06095e6 40 16 W6wLoIFOn4G7cjopxPxYNk2lcEqhLQFb 0 -1879 17 17 1 81195210 SO:0000738 351f64d4f4f9ddd45b35336ad97aa6de 40 17 AjWXsI7AkTK35XW9pgd3UbjpC3MAevlz 0 -1880 18 18 1 
78077248 SO:0000738 b15d4b2d29dde9d3e4f93d1d0f2cbc9c 40 18 BTj4BDaaHYoPhD3oY2GdwC_l0uqZ92UD 0 -1893 GL000191.1 GL000191.1 0 106433 SO:0000738 d75b436f50a8214ee9c2a51d30b2c2cc 40 \N aX54PSRCZbj0EVn5QAH4zoO72gsmSTO8 0 -1894 GL000192.1 GL000192.1 0 547496 SO:0000738 325ba9e808f669dfeee210fdd7b470ac 40 \N udMJQEKIyWPe8YGW1Dws6IHk_1NbkY9Y 0 -1895 GL000193.1 GL000193.1 0 189789 SO:0000738 dbb6e8ece0b5de29da56601613007c2a 40 \N be3_RQlT0dXc4jYLkbEiRC6HSl7u1FjF 0 -1896 GL000194.1 GL000194.1 0 191469 SO:0000738 6ac8f815bf8e845bb3031b73f812c012 40 \N WyYCLC4VxJvbBz2b_wBWF5BdQotiUVdB 0 -1897 GL000195.1 GL000195.1 0 182896 SO:0000738 5d9ec007868d517e73543b005ba48535 40 \N 2LEWMcieZGf9Sx4VpEeWSDcULUVHGm0w 0 -1955 1 1 1 248956422 SO:0000738 2648ae1bacce4ec4b6cf337dcae37816 92 1 2YnepKM7OkBoOrKmvHbGqguVfF9amCST 0 -1956 10 10 1 133797422 SO:0000738 907112d17fcb73bcab1ed1c72b97ce68 92 10 P6q4sxSkFfKZpUgEwW73rx2a2ZYY-_pH 0 -1957 11 11 1 135086622 SO:0000738 1511375dc2dd1b633af8cf439ae90cec 92 11 2NkFm8HK88MqeNkCgj78KidCAXgnsfV1 0 -1958 12 12 1 133275309 SO:0000738 e81e16d3f44337034695a29b97708fce 92 12 7dzBrNZj_CM_Dg7zLl--e18KI8wVUxEd 0 -1959 13 13 1 114364328 SO:0000738 17dab79b963ccd8e7377cef59a54fe1c 92 13 0qw_sn8Cl7OmMTFlukjFD2DUejW0T80Y 0 -1960 14 14 1 107043718 SO:0000738 acbd9552c059d9b403e75ed26c1ce5bc 92 14 eK4D2MosgK_ivBkgi6FVPg5UXs1bYESm 0 -1961 15 15 1 101991189 SO:0000738 f036bd11158407596ca6bf3581454706 92 15 AsXvWL1-2i5U_buw6_niVIxD6zTbAuS6 0 -1962 16 16 1 90338345 SO:0000738 24e7cabfba3548a2bb4dff582b9ee870 92 16 EjrUp_S2oCd2b_SdqeZeOYUkEC966iVh 0 -1963 17 17 1 83257441 SO:0000738 a8499ca51d6fb77332c2d242923994eb 92 17 upqChCoU-Gtd_61IidCsln-r8cxUTFeP 0 -1964 18 18 1 80373285 SO:0000738 11eeaa801f6b0e2e36a1138616b8ee9a 92 18 vWwFhJ5lQDMhh-czg06YtlWqu0lvFAZV 0 -53919 1 1 1 640851 SO:0000738 46d861cab72441c63589339b36e644ac 15 1 JfN51lUFS8sH2f2-OQ58B1f-ZFmxtjEO 0 -53920 10 10 1 1687656 SO:0000738 7f3f93983d66669bf5e18266f2565aa4 15 10 nv-h1XrxCbnvlyAlLn612PX8kJBmpf9m 0 
-53921 11 11 1 2038340 SO:0000738 3733d85e3f9fd8c5284e70dc977950b7 15 11 GEeKuO07bJBfzpd4KKpQPasLKNWpBia6 0 -53922 12 12 1 2271494 SO:0000738 81d7ade8026e3099dc4a595a27ce5fe4 15 12 qoUuu7OZJT_jdUqwbk_gk-yvozwqFNdQ 0 -53923 13 13 1 2925236 SO:0000738 2e5f27c4aa13202464e20c520a90bddc 15 13 HuuhxwpHdyAZqDG3ex62hIuMZ-FHd5Io 0 -53924 14 14 1 3291936 SO:0000738 2bfeb85b8b4486aa4b03c3cb4dc56dad 15 14 mJezSE0gh0LS5XVFlfR6PZIRw3-eLyAR 0 -53925 2 2 1 947102 SO:0000738 3264ffcaf0fb7e7c6adf14088c866886 15 2 tpNy-HkCz6Cq62euyOlWNduv91huwg7p 0 -53926 3 3 1 1067971 SO:0000738 1a2a92569065a83856362480f86aaf17 15 3 YqknWYibUjSC4bFoJwSTAcXKBC4uld3x 0 -53927 4 4 1 1200490 SO:0000738 3bb29def1493995037b9426acc5cdc20 15 4 jETGmEu2CRUEy_e3SK9zDfszR9-a9UmS 0 -53928 5 5 1 1343557 SO:0000738 bf203f82beeabc7337d1dcb50bca14cf 15 5 6UHvnqsyEKDTHAzF6FZ0OHgDVVeQ6AZv 0 -53933 I I 1 15072434 SO:0000738 185711aa389cf8d9302ad2ab07bd31e0 219 1 jXmB22vaK688X3rr2HPNzTNxnkQuCTgF 0 -53934 II II 1 15279421 SO:0000738 9e7e67d1e51cdb31791deab89dc31550 219 2 7k5F0INpuaUarMm8nyI6k0nSw46G65xC 0 -53935 III III 1 13783801 SO:0000738 c0f1a58d2bf6ff6a16617839bbc5fe52 219 3 -CzkjDYLb3a8V06zz7_pT3JxsxxGsQO0 0 -53936 IV IV 1 17493829 SO:0000738 2156ab555e19afd8a0ca5aba82fb2a2a 219 4 0Tm-TKE2JHxrM9avtCG0-9cTEnqpfw2b 0 -53937 MtDNA MtDNA 1 13794 SO:0000737 199e147d502d88e45047413dc83c039c 219 7 L1L6FO8ESIZJBNTOTPK7GnZvJYiewKK0 0 -53938 V V 1 20924180 SO:0000738 ffaf018f42f95375d2af6dcd402fef69 219 5 3ID7cGPgmNoJWDTn6-At5Geri2oVP6Rv 0 -53939 X X 1 17718942 SO:0000738 dae3e9ec047e8147337b550dd8564d0b 219 6 mmZ9kKbTbAEyf09gNqHPQW4fZj8YiuTx 0 -60019 Chromosome Chromosome 1 4641652 SO:0000738 482a2b04485ec8c4b5f4eaba2c2002da 1 7 NjjHtoQ2gYdy2RVkAZBKURBiV7xp-8ZS 0 -60020 I I 1 230218 SO:0000738 6681ac2f62509cfc220d78751b8dc524 216 1 lZyxiD_ByprhOUzrR1o1bq0ezO_1gkrn 0 -60021 II II 1 813184 SO:0000738 97a317c689cbdd7e92a5c159acd290d2 216 2 vw8jTiV5SAPDH4TEIZhNGylzNsQM4NC9 0 -60022 III III 1 316620 SO:0000738 
54f4a74aa6392d9e19b82c38aa8ab345 216 3 A_i2Id0FjBI-tQyU4ZaCEdxRzQheDevn 0 -60023 IV IV 1 1531933 SO:0000738 74180788027e20df3de53dcb2367d9e3 216 4 QXSUMoZW_SSsCCN9_wc-xmubKQSOn3Qb 0 -60024 IX IX 1 439888 SO:0000738 4eae53ae7b2029b7e1075461c3eb9aac 216 9 hb1scjdCWL89PtAkR0AVH9-dNH5R0FsN 0 -60025 Mito Mito 1 85779 SO:0000737 71c39cf065b8d574f636b654c274cf1b 216 17 DrOlaWEY9iBBQrAAvbrzXsQlzZRV2J59 0 -60026 V V 1 576874 SO:0000738 d2787193198c8d260f58f2097f9e1e39 216 5 UN_b-wij0EtsgFqQ2xNsbXs_GYQQIbeQ 0 -60027 VI VI 1 270161 SO:0000738 b7ebc601f9a7df2e1ec5863deeae88a3 216 6 z-qJgWoacRBV77zcMgZN9E_utrdzmQsH 0 -60028 VII VII 1 1090940 SO:0000738 a308c7ebf0b67c4926bc190dc4ba8ed8 216 7 9wkqGXgK6bvM0gcjBiTDk9tAaqOZojlR 0 -60029 VIII VIII 1 562643 SO:0000738 f66a4f8eef89fc3c3a393fe0210169f1 216 8 K8ln7Ygob_lcVjNh-C8kUydzZjRt3UDf 0 -3725167 JAGYYT010000001.1 JAGYYT010000001.1 0 46592869 SO:0000738 3ba11daa61cfe7d6244960d4bcc55113 5 \N lkW5j6Yeu6GL6xusZQCvq7KqO-3KGeh3 0 -3725168 JAGYYT010000002.1 JAGYYT010000002.1 0 55482364 SO:0000738 bac357b106d364f8dabc169545765f92 5 \N iajXIfPoEJDR8BdUaRcI6LhzyZmgtXoA 0 -3725169 JAGYYT010000003.1 JAGYYT010000003.1 0 24607739 SO:0000738 4ff9f520e63b14b64aede25a070baea9 5 \N CMg2W54uVjBnLWwm3xjUnBvyEXCvxyQh 0 -3725170 JAGYYT010000004.1 JAGYYT010000004.1 0 10690193 SO:0000738 361cc6768f00f6bbb45ff12756c76cd1 5 \N I7OKvWCdpSHkxh_LZv3bdpb5sLcX_3IB 0 -3725171 JAGYYT010000005.1 JAGYYT010000005.1 0 28045165 SO:0000738 725d218bfe0ce15239206f78f5604781 5 \N EdEqUHN3H05E-RlSmWi9SySm8JB90XzN 0 -3725172 JAGYYT010000006.1 JAGYYT010000006.1 0 158663023 SO:0000738 514c27535b3840a2a2ee405f08cb3446 5 \N vZtOo2P9FiBega_X8LSl_0xj4dmJZmMs 0 -3725173 JAGYYT010000007.1 JAGYYT010000007.1 0 46627313 SO:0000738 55301b7468e9cbc4d73a9253d752b652 5 \N HdIELGLdm6TPwPbrHfRlu_i2Nbs5w8Dp 0 -3725174 JAGYYT010000008.1 JAGYYT010000008.1 0 1975142 SO:0000738 a54f58b59c6061d0c44273897a7c148e 5 \N VLql4yEZYIlHmzeHQLmlVrwou_7rBDxu 0 -3725175 JAGYYT010000009.1 JAGYYT010000009.1 0 
118296892 SO:0000738 ac1cbd267c1a1ce0eef59afd7fda6047 5 \N fTc-MuSRIDKYqLW4zFh7D9SR3UeAQ7tC 0 -3725176 JAGYYT010000010.1 JAGYYT010000010.1 0 8989820 SO:0000738 bbab8e1c1a52042b195975018ff7271e 5 \N NRaeEfjaqeSr3XXxJss_Az8kfYEiJGUA 0 -3727869 JAHAON010000001.1 JAHAON010000001.1 0 108267787 SO:0000738 2bfee5eba76ddf72a9ce1fe104dfb73a 6 \N nK5v5CSf3RTvSj3ynps2rwV9Qlwtq2XP 0 -3727870 JAHAON010000002.1 JAHAON010000002.1 0 102298096 SO:0000738 136adb262594356fde2be5bf32d091ea 6 \N wn703GYvypvgDg3Nwmg-MI6xqflTex53 0 -3727871 JAHAON010000003.1 JAHAON010000003.1 0 28141884 SO:0000738 d2044e19e173fc0af1271a71a20c19e7 6 \N eoWhZT0jMNBS9d50Sg02HkWe2TJA-Rga 0 -3727872 JAHAON010000004.1 JAHAON010000004.1 0 40727531 SO:0000738 c7a23b01e734c6b22fdd078e97c6f1da 6 \N 2klgq3Y-GrPMLPHsGfkeE7lMuAjbnaxe 0 -3727873 JAHAON010000005.1 JAHAON010000005.1 0 111718856 SO:0000738 c3acd9fa5d4a02da10007a9b71d49f0d 6 \N 4h8XGzCJ6_JvovqGcLZ4HGz-WMOcnfwY 0 -3727874 JAHAON010000006.1 JAHAON010000006.1 0 89895720 SO:0000738 c24fc746780a67eba106ec07563849f9 6 \N jSJ3PqRBMXsYUlSdMZUOagnljy_QRUFJ 0 -3727875 JAHAON010000007.1 JAHAON010000007.1 0 39819246 SO:0000738 08a128df8dd3c67aa90fe81490cf9a80 6 \N Ao235-Ye0mxGwEwvzEo94ejs8Gk2l72e 0 -3727876 JAHAON010000008.1 JAHAON010000008.1 0 32367248 SO:0000738 b7fbc0bd6188d59f429e25ee07ab0e8d 6 \N mSJYKw654SvQGfz9HKxNvDP1VWiSK3zp 0 -3727877 JAHAON010000009.1 JAHAON010000009.1 0 56661561 SO:0000738 7ae11c0c030017c8c2d35d67c9ac9316 6 \N Crj1lvdkJ4Tlm0Q8CEgkGZ6vmxadaopS 0 -3727878 JAHAON010000010.1 JAHAON010000010.1 0 139507333 SO:0000738 255d7b0f6a9f1f7d5f171b50eada5d6b 6 \N leJMsG-aQiRxi_QrgNomPJ3Wjpins2Ej 0 -3742614 JAGYVY010000001.1 JAGYVY010000001.1 0 51866122 SO:0000738 c67e160f076badff0d3c09289f711944 4 \N lGYmQZArBanljWYhufm3YzWp46jnEE39 0 -3742615 JAGYVY010000002.1 JAGYVY010000002.1 0 8986677 SO:0000738 c18c0a8433faef15c8947b862607f41e 4 \N 93cXvE8ygIE1LwyeejKgJ2jUBgi0fe8Y 0 -3742616 JAGYVY010000003.1 JAGYVY010000003.1 0 47249189 SO:0000738 
59d40cdafc3b0d91fe836a49ffe7f591 4 \N recqxURiYRbP6f9yq6ck8pWH3o6dPXuh 0 -3742617 JAGYVY010000004.1 JAGYVY010000004.1 0 55363342 SO:0000738 3f9b3c2935d8657fadf86a9c8b6c44e3 4 \N wsdlsVY07wFILtVM4y1mQToK3WsC0x25 0 -3742618 JAGYVY010000005.1 JAGYVY010000005.1 0 12137054 SO:0000738 56b31b794c9d1115a51ae703cae480b1 4 \N QidyPw5USOpHj12iV3zUJp2mOERzzcMQ 0 -3742619 JAGYVY010000006.1 JAGYVY010000006.1 0 54505167 SO:0000738 7fb60047ffae535b15057cb346e543f2 4 \N gfqraPB64YaWSK9LuPzNybVqzY_kJzBv 0 -3742620 JAGYVY010000007.1 JAGYVY010000007.1 0 24869350 SO:0000738 2990dbdb7dd770e092a6baf2a4d57f04 4 \N EY9mIHACnljr9Akv3C9VuXbHTSDuY-rr 0 -3742621 JAGYVY010000008.1 JAGYVY010000008.1 0 42967410 SO:0000738 7fc5b200bbaae4ce510058280e83a2e9 4 \N 8VytWiKv7yC_sKVPrpIcMMFN_hCYU9dU 0 -3742622 JAGYVY010000009.1 JAGYVY010000009.1 0 18572896 SO:0000738 4087350722474ec6169ec1da9fca6e73 4 \N O2GMCPifUdXp2QpLqixsqWLWdJSLPRdv 0 -3742623 JAGYVY010000010.1 JAGYVY010000010.1 0 6843817 SO:0000738 f54904e00811656ff76eed21370fdfcc 4 \N IyX4rd_pWEXWQ0J8jsVFwKdedzCi9_oM 0 -3752279 JAHEPF010000001.1 JAHEPF010000001.1 0 34747916 SO:0000738 9cd36c56739382f5ccd8bf05d7b7a782 7 \N _g7GaDOEVsjK_hf11hZ4ky3pVZTIHINa 0 -3752280 JAHEPF010000002.1 JAHEPF010000002.1 0 35554520 SO:0000738 a3e576310e6fc76eb80a394291fb3204 7 \N dc-s25qNY-HORolo9d0iwoecf2ozWQlE 0 -3752281 JAHEPF010000003.1 JAHEPF010000003.1 0 33855561 SO:0000738 3d070bacf47cee60c022d565c170b6d4 7 \N YnmaKNuXOkW8WvsAMno_7XxJ3vFFCIMP 0 -3752282 JAHEPF010000004.1 JAHEPF010000004.1 0 5384975 SO:0000738 90425435effabb9ed63a2bb2b360a4b2 7 \N teu3AdwsFzzqP2CCTmxwxal5AiQxJX5D 0 -3752283 JAHEPF010000005.1 JAHEPF010000005.1 0 47328102 SO:0000738 4d3dcad460c7997151ab3caf27af487b 7 \N F9p0cz2HQjiWrzGFKJZbzAy5wknT13nd 0 -3752284 JAHEPF010000006.1 JAHEPF010000006.1 0 26659419 SO:0000738 48d097797ada812bfb466838fbddb0d7 7 \N L3gpNYdi6RFEMs_Pzkr_ZGo-E7pfiDFu 0 -3752285 JAHEPF010000007.1 JAHEPF010000007.1 0 20832236 SO:0000738 ca81ce1541e88ae3b27a4ab3a6190510 7 \N 
dxDQLWXbm7mipB0rFMhevOvyRuaTqnws 0 -3752286 JAHEPF010000008.1 JAHEPF010000008.1 0 27516148 SO:0000738 aff3c5deddf48410710cee142d10ba7e 7 \N mjrFMbEfGqydPt4vCe_azkbYwGYYpFnA 0 -3752287 JAHEPF010000009.1 JAHEPF010000009.1 0 12081732 SO:0000738 5b38d7b54682b639c168d7b661f9c876 7 \N qTEIGxHpPXRcqPNOhUBwVpSuWSVLCZyU 0 -3752288 JAHEPF010000010.1 JAHEPF010000010.1 0 4432623 SO:0000738 682b6eafb4b94dfad5d124873ac50812 7 \N nA8EZeMykBVcjHvhcIlhhiWM7ylPm-_g 0 -3760113 JAHKSD010000001.1 JAHKSD010000001.1 0 110635364 SO:0000738 3dc28bf6013947644e3aa841763c7631 9 \N igWakb948tcC73JOgGzs-SDwWLKKuleI 0 -3760114 JAHKSD010000002.1 JAHKSD010000002.1 0 1186550 SO:0000738 bc667c2ec5c2dc662a767e540fafa0c1 9 \N NFg11cJVWZmoQeeJR-oNyB5QT8Cg6_w- 0 -3760115 JAHKSD010000003.1 JAHKSD010000003.1 0 32898 SO:0000738 1d4ad8c5a00a00dbb6ad0b968dbe365f 9 \N GNtztMSKoX5-PG1zYvEE0qyowc8akI3J 0 -3760116 JAHKSD010000004.1 JAHKSD010000004.1 0 111658246 SO:0000738 88bb1aa0877ac906791c96551f542cef 9 \N Tu05HwWwxYR9xPqLU7QUnGrAOCKlMUmX 0 -3760117 JAHKSD010000005.1 JAHKSD010000005.1 0 139957525 SO:0000738 de84bdeaebb942f9f0ebc57fbe60680d 9 \N jphEshZT4l8fr4HMvXAwu6EsqM3Ud8YQ 0 -3760118 JAHKSD010000006.1 JAHKSD010000006.1 0 104451682 SO:0000738 d6a6387b078f4170e723032b48d7f8b6 9 \N SF8WSrHIwx3iITPRWUFqDHkZk6p35Rlu 0 -3760119 JAHKSD010000007.1 JAHKSD010000007.1 0 93427 SO:0000738 3eb7b09435ea2d5e3421cdb77f24fcb1 9 \N OvwIE2BbB6aKm0uNNq5cXZ9lSQpoqVIX 0 -3760120 JAHKSD010000008.1 JAHKSD010000008.1 0 50570566 SO:0000738 d4a40b2b51cd0291b7ec047ca614a953 9 \N zVEcf4soxkzJkLVNcib3nnGaPOxi4cBb 0 -3760121 JAHKSD010000009.1 JAHKSD010000009.1 0 1212238 SO:0000738 6b31e1467a52b4747751e3d155bde949 9 \N 6SQJJlA7VRnm-L_Pf2F-a6TqUnO1IbsW 0 -3760122 JAHKSD010000010.1 JAHKSD010000010.1 0 100646410 SO:0000738 9391399f48bde664b20f9b8dca808704 9 \N 07Ugr7jsN9jhBD3JbtYMNh79DDxOPjio 0 -3785686 1A 1A 1 594102056 SO:0000738 1e85cfd7774c4118a84f1dd62783b31d 79 1 d1TidPwqmfZ775SEnWe1DyCPcKNpYpFO 0 -3785687 1B 1B 1 689851870 SO:0000738 
b917173c52104915e78845d137d922d0 79 2 8WfzIibnnlG1L1iNPZ3Sk0uiwIMK4znm 0 -3785688 1D 1D 1 495453186 SO:0000738 cef89d6e535210757cb10e504cbf9b03 79 3 y3u4DW3vBcXYTjtMBVhsyN7Ly7Rc2dFk 0 -3785689 2A 2A 1 780798557 SO:0000738 080bb4a5ff38e4849bf446fbbe40000a 79 4 2PQ-iGfRjPsojv1K9g18dQfDzNO2lyXq 0 -3785690 2B 2B 1 801256715 SO:0000738 8a52f592bb8a4f44438f7791dcca142a 79 5 keeRxrxBxos9oB3Adk47VryL12KtzINt 0 -3785691 2D 2D 1 651852609 SO:0000738 3fc8c6b5ea64445d7fba64ac55719895 79 6 Gt6hPn3IJboGQ-mwMXzSITaPuYAkfYiD 0 -3785692 3A 3A 1 750843639 SO:0000738 606b5e6749208700ccd9ec246449a1ac 79 7 Rm2Xzny0tMfgjPqmTa7EDn1BYJfcgk66 0 -3785693 3B 3B 1 830829764 SO:0000738 7bae7b0ef4dabf3d7456de792263713c 79 8 Xrjc9MtZuG34jFBE4xY6VuhGKa6G41ya 0 -3785694 3D 3D 1 615552423 SO:0000738 e7feee9ffc854a18889517e36b1fc257 79 9 s-CnQy24wXYDP0EsRUji7tvIkdfnF2qN 0 -3785695 4A 4A 1 744588157 SO:0000738 0f0ac12903101a6d0c6b417066f4fc5d 79 10 Qz1gdFRd4l6QXrOlcreln873gbns69Q0 0 diff --git a/src/tests/databases/ensembl_genome_metadata/attribute.txt b/src/tests/databases/ensembl_genome_metadata/attribute.txt deleted file mode 100644 index b65c6010..00000000 --- a/src/tests/databases/ensembl_genome_metadata/attribute.txt +++ /dev/null @@ -1,104 +0,0 @@ -1 assembly.accession assembly.accession assembly.accession string 1, -2 assembly.stats.chromosomes Chromosomes or plasmids Number of structures in cells containing DNA integer 0, -3 assembly.stats.component_sequences Component sequences Part of the primary sequences in assembly integer 0, -4 assembly.stats.contig_n50 Contig N50 Median size of contigs in a genome assembly bp 0, -5 assembly.date assembly.date assembly.date string 0, -6 assembly.default assembly.default assembly.default string 0, -7 assembly.stats.gc_percentage Average GC content Percentage of nucleotides in DNA that are G or C percent 0, -8 assembly.is_reference assembly.is_reference assembly.is_reference string 0, -9 assembly.level assembly.level assembly.level string 0, -10 
assembly.mapping assembly.mapping assembly.mapping string 0, -11 assembly.name assembly.name assembly.name string 1, -12 assembly.provider_name assembly.provider_name assembly.provider_name string 0, -13 assembly.provider_url assembly.provider_url assembly.provider_url string 0, -14 assembly.stats.spanned_gaps Spanned gaps Number of gaps covered by sequencing reads integer 0, -15 assembly.tolid assembly.tolid assembly.tolid string 0, -16 assembly.stats.toplevel_sequences Top level sequences Primary sequences in a genome assembly integer 0, -17 assembly.stats.total_coding_sequence_length Total coding sequence length Total length of all coding sequences bp 0, -18 assembly.stats.total_gap_length Total gap length Total length of all gaps in a genome assembly bp 0, -19 assembly.stats.total_genome_length Total genome length Total length of all genomic sequences bp 0, -20 assembly.ucsc_alias assembly.ucsc_alias assembly.ucsc_alias string 0, -21 genebuild.stats.average_cds_length Average CDS length Average length of coding sequences float 0, -22 genebuild.stats.average_coding_exons_per_coding_gene Average coding exons per coding gene Average coding exons per coding gene string 0, -23 genebuild.stats.average_coding_exons_per_transcript Average coding exons per transcript Average coding exons per coding transcript float 0, -24 genebuild.stats.average_coding_exon_length Average exon length per coding gene Average length of coding exons bp 0, -25 genebuild.stats.average_exon_length Average exon length Average length of exons bp 0, -26 genebuild.stats.average_genomic_span Average coding genomic span Average length of all genomic regions bp 0, -27 genebuild.stats.average_intron_length Average intron length Average intron length per coding gene bp 0, -28 genebuild.stats.average_sequence_legth Average coding sequence length Average length of sequences in genome bp 0, -29 genebuild.stats.coding_genes Coding genes Genes that code for proteins integer 0, -30 
genebuild.stats.coding_transcripts Coding transcripts Transcripts that code for proteins integer 0, -31 genebuild.stats.coding_transcripts_per_gene Average coding transcripts per gene Average coding transcripts per gene float 0, -32 genebuild.hash genebuild.hash genebuild.hash string 0, -33 genebuild.initial_release_date genebuild.initial_release_date genebuild.initial_release_date string 0, -34 genebuild.last_geneset_update genebuild.last_geneset_update genebuild.last_geneset_update string 1, -35 genebuild.level genebuild.level genebuild.level string 0, -36 genebuild.longest_gene_length Longest coding gene Length of longest gene bp 0, -37 genebuild.method genebuild.method genebuild.method string 0, -38 genebuild.method_display genebuild.method_display genebuild.method_display string 0, -39 genebuild.stats.nc_average_exons_per_transcript Average exons per non-coding transcript Mean exon count per transcript float 0, -40 genebuild.stats.nc_average_exon_length Average exon length per non-coding transcript Mean exon length bp 0, -41 genebuild.stats.nc_average_genomic_span Average non-coding genomic span Mean length of all genomic regions bp 0, -42 genebuild.stats.nc_average_sequence_length Average non-coding sequence length Mean length of all sequences bp 0, -43 genebuild.stats.nc_longest_gene_length Longest non-coding gene Length of longest non-coding gene bp 0, -44 genebuild.stats.nc_long_non_coding_genes Long non-coding genes Long genes not coding for proteins integer 0, -45 genebuild.stats.nc_misc_non_coding_genes Misc. 
non-coding genes Miscellaneous non-coding genes integer 0, -46 genebuild.stats.nc_non_coding_genes Non-coding genes Genes that don't code for proteins integer 0, -47 genebuild.stats.nc_shortest_gene_length Shortest non-coding gene Length of shortest gene bp 0, -48 genebuild.stats.nc_small_non_coding_genes Small non-coding genes Small genes not coding for proteins integer 0, -49 genebuild.stats.nc_total_introns Introns in non-coding genes Total intron count integer 0, -50 genebuild.stats.nc_total_transcripts Non-coding transcripts Total RNA transcript count integer 0, -51 genebuild.stats.nc_transcripts_per_gene Average transcripts per non-coding gene Mean transcripts count per gene float 0, -52 genebuild.stats.ps_average_exons_per_transcript Average intron length per pseudogene Mean exon count per pseudogene transcript float 0, -53 genebuild.stats.ps_average_exon_length Average exon length per pseudogene Mean pseudogene exon length bp 0, -54 genebuild.stats.ps_average_genomic_span Average pseudogene genomic span Mean length of pseudogene regions bp 0, -55 genebuild.stats.ps_average_intron_length Average intron length per pseudogene Mean pseudogene intron length bp 0, -56 genebuild.stats.ps_average_sequence_length Average pseudogene sequence length Mean length of pseudogene sequences bp 0, -57 genebuild.stats.ps_longest_gene_length Longest pseudogene Length of longest pseudogene bp 0, -58 genebuild.stats.ps_pseudogenes Pseudogenes Genes which don't code functional protiens integer 0, -59 genebuild.stats.ps_shortest_gene_length Shortest pseudogene Length of shortest pseudogene bp 0, -60 genebuild.stats.ps_total_exons Exons in pseudogenes Total exon count in pseudogenes integer 0, -61 genebuild.stats.ps_total_introns Introns in pseudogenes Total intron count in pseudogenes integer 0, -62 genebuild.stats.ps_total_transcripts Transcripts in pseudogenes Total pseudogene RNA transcript count integer 0, -63 genebuild.stats.ps_transcripts_per_gene Average transcripts per 
pseudogene Mean pseudogene transcripts count per pseudogene float 0, -64 genebuild.stats.shortest_gene_length Shortest coding gene Length of shortest gene bp 0, -65 genebuild.start_date genebuild.start_date genebuild.start_date string 1, -66 genebuild.stats.total_coding_exons Exons in coding genes Total number of coding exons integer 0, -67 genebuild.stats.total_exons Exons in genes Total number of exons integer 0, -68 genebuild.stats.total_introns Introns in coding genes Total number of introns integer 0, -69 genebuild.stats.total_transcripts Transcripts in coding genes Total number of RNA transcripts integer 0, -70 genebuild.stats.transcripts_per_gene Average transcripts per coding gene Average number of transcripts per gene float 0, -71 genebuild.version genebuild.version genebuild.version string 1, -72 genebuild.sample_gene genebuild.sample_gene Sample Gene Data string 1, -73 genebuild.sample_location genebuild.sample_location Sample Location Data string 1, -74 assembly.stats.coverage_depth assembly.coverage_depth assembly.coverage_depth string 0, -75 assembly.web_accession_source assembly.web_accession_source assembly.web_accession_source string 0, -76 assembly.web_accession_type assembly.web_accession_type assembly.web_accession_type string 0, -77 genebuild.id genebuild.id genebuild.id string 0, -78 genebuild.stats.nc_average_intron_length Average intron length per non-coding transcript Mean intron length bp 0, -79 genebuild.projection_source_db genebuild.projection_source_db genebuild.projection_source_db string 0, -80 assembly.long_name assembly.long_name assembly.long_name string 0, -81 assembly.url_name assembly.url_name assembly.url_name string 0, -82 genebuild.havana_datafreeze_date genebuild.havana_datafreeze_date genebuild.havana_datafreeze_date string 0, -83 assembly.version assembly.version assembly.version string 0, -84 genebuild.provider_name genebuild.provider_name genebuild.provider_name string 1, -85 genebuild.provider_url 
genebuild.provider_url genebuild.provider_url string 1, -119 variation.short_variants Short variants Small-scale genetic variations integer 0, -120 variation.sample_variant variation.sample_variant variation.sample_variant string 0, -123 variation.short_variants_with_phenotype_assertions Short variation with phenotype assertion Short variation with phenotype assertion string 0, -161 compara.stats.homology_coverage compara.homology_coverage compara.homology_coverage float 0, -162 compara.homology_reference_species compara.homology_reference_species compara.homology_reference_species string 0, -163 regulation.stats.open_chromatin_count regulation.open_chromatin_count Number of open chromatin regions integer 0, -164 regulation.stats.promoter_count regulation.promoter_count Number of promoters integer 0, -165 regulation.stats.enhancer_count regulation.enhancer_count Number of enhancers integer 0, -166 regulation.stats.ctcf_count regulation.ctcf_count Number of CTCF binding sites integer 0, -167 regulation.stats.tfbs_count regulation.tfbs_count Number of regions enriched for transcription factor binding integer 0, -168 assembly.tol_id assembly.tol_id assembly.tol_id string 0, -169 genebuild.annotation_source genebuild.annotation_source genebuild.annotation_source string 1, -170 genebuild.stats.nc_total_exons Exons in non-coding genes Total exon count integer 0, -179 assembly.description assembly.description assembly.description string 0, -180 assembly.master_accession assembly.master_accession assembly.master_accession string 0, -181 assembly.alt_accession assembly.alt_accession assembly.alt_accession string 0, -182 dataset.build_start Dataset Build start date Dataset Build start date string 0, -183 dataset.build_end Dataset Build completed Dataset Build completed string 0, -197 genebuild.provider_version genebuild.provider_version genebuild.provider_version string 0, diff --git a/src/tests/databases/ensembl_genome_metadata/dataset.txt 
b/src/tests/databases/ensembl_genome_metadata/dataset.txt deleted file mode 100644 index 77181619..00000000 --- a/src/tests/databases/ensembl_genome_metadata/dataset.txt +++ /dev/null @@ -1,499 +0,0 @@ -1 02104faf-3fee-4f28-b53c-605843dac941 assembly \N 2023-09-22 15:01:44.000000 GCA_000005845.2 1 1 Released \N -2 cc3c7f95-b5dc-4cc1-aa15-2817c89bd1e2 genebuild EXT01 2023-09-22 15:01:44.000000 GCA_000005845.2_EXT01 1 2 Released \N -7 dc1b508e-f148-4a6f-b051-1a0c53142cf5 assembly \N 2023-09-22 15:02:00.000000 GCA_018473315.1 4 1 Released \N -9 45aec801-4fe7-4ac2-9afa-19aea2a8409e assembly \N 2023-09-22 15:02:01.000000 GCA_018469415.1 5 1 Released \N -11 5cda83f4-521c-4713-b2fe-b3ede2f1a51e assembly \N 2023-09-22 15:02:02.000000 GCA_018469875.1 6 1 Processed \N -13 3f9bf8d6-1514-4657-9f73-38a7354a80b8 assembly \N 2023-09-22 15:02:04.000000 GCA_018505825.1 7 1 Processed \N -14 53936715-1371-4343-95af-f39d06943db7 genebuild ENS01 2023-09-22 15:02:04.000000 GCA_018505825.1_ENS01 7 2 Processed \N -17 d641779c-2add-46ce-acf4-a2b6f15274b1 assembly \N 2023-09-22 15:02:11.000000 GCA_018852615.1 9 1 Processed \N -23 06b4892b-8e34-49bc-be84-8126e5a7cf93 assembly \N 2023-09-22 15:03:01.000000 GCA_000002765.2 14 1 Released \N -24 f202cd36-d0dc-40df-9dd6-a8218e0d1366 genebuild EXT01 2023-09-22 15:03:01.000000 GCA_000002765.2_EXT01 14 2 Released \N -37 6f8bd121-0345-4b77-9dc1-d567ac13447d assembly \N 2023-09-22 15:03:02.000000 GCA_021950905.1 18 1 Processed \N -38 2ef7c056-847e-4742-a68b-18c3ece068aa genebuild ENS01 2023-09-22 15:03:02.000000 GCA_021950905.1_ENS01 18 2 Submitted \N -61 3474e0d6-d031-40bc-a4ae-230236886568 assembly \N 2023-09-22 15:03:22.000000 GCA_000001405.14 40 1 Released \N -62 e456d1c2-eea2-40f1-83ee-31912905b695 genebuild GENCODE19 2023-09-22 15:03:22.000000 GCA_000001405.14_GENCODE19 40 2 Released \N -147 999315f6-6d25-481f-a017-297f7e1490c8 assembly \N 2023-09-22 15:04:29.000000 GCA_900519105.1 79 1 Released \N -148 287a5483-55a4-46e6-a58b-a84ba0ddacd6 
genebuild EXT01 2023-09-22 15:04:29.000000 GCA_900519105.1_EXT01 79 2 Released \N -171 c813f7b7-645c-45ac-8536-08190fd7daa0 assembly \N 2023-09-22 15:04:45.000000 GCA_000001405.29 92 1 Released \N -172 949defef-c4d2-4ab1-8a73-f41d2b3c7719 genebuild GENCODE44 2023-09-22 15:04:45.000000 GCA_000001405.29_GENCODE44 92 2 Released \N -177 3c67123a-e9e1-41ef-9014-2aadc8acf12a assembly \N 2023-09-22 15:04:50.000000 GCA_018505865.1 97 1 Processed \N -178 ed8ca387-38e3-4bfe-8b85-757a59b95126 genebuild ENS01 2023-09-22 15:04:50.000000 GCA_018505865.1_ENS01 97 2 Processed \N -183 8801edaf-86ec-4799-8fd4-a59077f04c05 assembly \N 2023-09-22 15:04:53.000000 GCA_018852605.1 100 1 Processed \N -184 11a0be7f-99ae-45d3-a004-dc19bb562330 genebuild ENS01 2023-09-22 15:04:53.000000 GCA_018852605.1_ENS01 100 2 Processed \N -197 fd27883a-e5d3-4502-b774-65d3cc4f4e18 assembly \N 2023-09-22 15:04:56.000000 GCA_018469925.1 107 1 Processed \N -249 786344d1-a71f-4bab-aa37-6ee315ed60a4 assembly \N 2023-09-22 15:05:37.000000 GCA_018469425.1 135 1 Processed \N -250 2bc8874e-6672-4293-89d6-0b837005177c genebuild ENS01 2023-09-22 15:05:37.000000 GCA_018469425.1_ENS01 135 2 Processed \N -337 eb451e00-7abb-4462-82bf-f29f6ed3dc1b assembly \N 2023-09-22 15:06:39.000000 GCA_021951015.1 179 1 Processed \N -338 bd63a676-45ff-494a-b26f-2b779cb6c180 genebuild ENS01 2023-09-22 15:06:39.000000 GCA_021951015.1_ENS01 179 2 Processed \N -347 6790a2a6-b178-4ab2-a12b-aad3d5511713 assembly \N 2023-09-22 15:06:43.000000 GCA_018473295.1 185 1 Released \N -348 23d52e01-2e3d-495f-b345-df41c605caa9 genebuild ENS01 2023-09-22 15:06:43.000000 GCA_018473295.1_ENS01 185 2 Released \N -401 3b58ee8a-8f8d-4dfe-bb58-44c2ed57f229 assembly \N 2023-09-22 15:06:55.000000 GCA_000146045.2 214 1 Released \N -402 cfef61f8-7e24-4ed6-945f-baca1b2664a3 genebuild EXT01 2023-09-22 15:06:55.000000 GCA_000146045.2_EXT01 214 2 Released \N -405 6c1896f9-10dd-423e-a1ff-db8b5815cb66 assembly \N 2023-09-22 15:06:58.000000 GCA_000002985.3 217 1 
Released \N -406 ea69f164-cc77-4671-bf97-c7f537dc400e genebuild EXT01 2023-09-22 15:06:58.000000 GCA_000002985.3_EXT01 217 2 Released \N -888 9d717ead-ffe0-4fc1-b58c-3c057b754021 genebuild ENS01 2023-11-07 11:18:55.000000 GCA_018473315.1_ENS01 4 2 Released \N -890 1c759aca-63a1-4eea-abe8-ef5f298fe6e2 genebuild ENS01 2023-11-07 11:18:57.000000 GCA_018469415.1_ENS01 5 2 Released \N -892 664088c7-356e-418c-adb2-15945b7ebc4b genebuild ENS01 2023-11-07 11:19:00.000000 GCA_018469875.1_ENS01 6 2 Processed \N -896 f9690d7e-26c1-459d-8102-0c4a1a468806 genebuild ENS01 2023-11-07 11:19:16.000000 GCA_018852615.1_ENS01 9 2 Processed \N -1006 66db32ae-974f-480c-a60b-63cc49d00f68 genebuild ENS01 2023-11-07 11:22:53.000000 GCA_018469925.1_ENS01 107 2 Processed \N -1391 bf1f5064-8520-4f19-84e4-449aa6c1c1e2 variation 1.0 2023-11-09 12:49:25.273751 GRCh38 673 3 Released \N -1392 5b869bbb-098f-4827-afc0-532a2bc88903 variation 1.0 2023-11-09 12:49:25.797822 GRCh37 674 3 Released \N -1393 5717d47e-dad9-4c2d-b015-c055bc93e831 evidence 1.0 2023-11-09 12:49:26.267728 GRCh38 673 4 Released \N -1394 e0202b0e-578d-493f-aeb4-464f5d2e233a evidence 1.0 2023-11-09 12:49:26.756909 GRCh37 674 4 Released \N -1405 bfa00db1-7925-4caa-8c7c-fa48bc5701a5 evidence 1.0 2023-11-09 12:49:31.245732 R64-1-1 644 4 Released \N -1414 c9d18d30-f7ad-44f4-becf-9b2c3606ac4f evidence 1.0 2023-11-09 12:49:34.817886 ASM276v2 653 4 Released \N -1421 80babe97-b289-407d-af70-b46ff5478f2e evidence 1.0 2023-11-09 12:49:37.663593 IWGSC 660 4 Released \N -1464 5c2d6ef7-fe03-4f1a-bcc2-fb72af9ffa46 variation 1.0 2023-11-09 12:49:58.762831 HG03516.pri.mat.f1_v2 565 3 Processed \N -1470 69f38cd5-5774-4b29-9ef4-181441ca0eb8 variation 1.0 2023-11-09 12:50:01.822030 HG02622.alt.pat.f1_v2 571 3 Processed \N -1475 08543d8d-2110-46f3-a9b6-ac58c4af8202 variation 1.0 2023-11-09 12:50:04.431570 HG03540.alt.pat.f1_v2 576 3 Released \N -1491 a4c858c4-2e55-43c9-af8b-7adc7ce24c58 variation 1.0 2023-11-09 12:50:12.565449 HG02622.pri.mat.f1_v2 
592 3 Processed \N -1494 79e1a7b8-85dc-4bfe-b606-6992ca109ede variation 1.0 2023-11-09 12:50:14.106939 HG02109.pri.mat.f1_v2 595 3 Processed \N -1496 0c1cdfea-cb10-4a08-8f75-4158658d6a02 variation 1.0 2023-11-09 12:50:15.157829 HG02109.alt.pat.f1_v2 597 3 Processed \N -1507 dd28ea84-6890-4af8-bd2b-2caa9917f221 variation 1.0 2023-11-09 12:50:20.833289 HG03516.alt.pat.f1_v2 608 3 Released \N -1509 b876cb36-6e84-4a2c-8af2-14e096d48df9 variation 1.0 2023-11-09 12:50:21.809661 HG03540.pri.mat.f1_v2 610 3 Released \N -1528 0a0bed83-72c7-4f8a-a1cb-97450ef82495 variation 1.0 2023-11-09 12:50:31.531084 R64-1-1 644 3 Released \N -1537 ff7cb333-fc39-4f00-93e0-65a0d5eb596b variation 1.0 2023-11-09 12:50:36.212197 ASM276v2 653 3 Released \N -1544 e659bef9-22f7-4ad2-8215-4a48ecd228df variation 1.0 2023-11-09 12:50:39.743563 IWGSC 660 3 Released \N -2276 0bdc7428-6f81-4c96-a8d3-f5d5bff5a9b8 compara_homologies 1.0 2023-11-14 16:49:59.780613 Compara homologies 251 6 Released \N -2291 4b02b11e-397c-4a4f-8c13-8b65efb87030 compara_homologies 1.0 2023-11-14 16:50:13.529466 Compara homologies 266 6 Released \N -2319 35f56606-9186-432f-b033-6e7204708f3b compara_homologies 1.0 2023-11-14 16:50:38.690925 Compara homologies 294 6 Processed \N -2348 aebf0b81-4234-4aa9-85cc-abfe91f5eac2 compara_homologies 1.0 2023-11-14 16:51:03.416326 Compara homologies 323 6 Released \N -2357 caac6097-4921-4c10-bfc0-1c3e9b2604dc compara_homologies 1.0 2023-11-14 16:51:11.265120 Compara homologies 332 6 Released \N -2384 9f45f1a6-d4d0-4c02-9509-dec5a0d523fb compara_homologies 1.0 2023-11-14 16:51:33.936250 Compara homologies 359 6 Released \N -2394 b67e1761-3341-4965-9a5b-041cb8230cb3 compara_homologies 1.0 2023-11-14 16:51:42.824153 Compara homologies 369 6 Released \N -2408 0571d77c-5cc6-4819-80bf-34a42acfc3f6 compara_homologies 1.0 2023-11-14 16:51:54.939579 Compara homologies 383 6 Released \N -2449 e6df4d05-8567-4143-8ea0-c6ad1b5a3fc2 compara_homologies 1.0 2023-11-14 16:52:30.671955 Compara homologies 
424 6 Released \N -2454 f2734f34-36a0-4594-871d-f7f6d317d05a compara_homologies 1.0 2023-11-14 16:52:35.328579 Compara homologies 429 6 Submitted \N -2458 58df568e-48c1-4a3b-838b-448540392f9c compara_homologies 1.0 2023-11-14 16:52:38.647877 Compara homologies 433 6 Released \N -2482 832c1885-6b98-4bf0-b4cf-70e317399bd4 compara_homologies 1.0 2023-11-14 16:52:59.648742 Compara homologies 457 6 Released \N -2494 f32b7f9a-97fd-41cd-86be-a5fb5becd335 compara_homologies 1.0 2023-11-14 16:53:10.265485 Compara homologies 469 6 Processed \N -2518 679d6452-799c-4a2f-8906-0db6c639e498 regulatory_features 1.0 2023-11-15 15:07:12.410801 Regulatory Annotation 670 7 Released \N -6593 8ebbce8e-dcc7-49f8-b520-4d479aef2a65 compara_homologies 1.0 2024-02-06 21:29:06.353565 Compara homologies 457 6 Processed \N -6623 60fae37a-afe2-4bb1-9da0-321911dd2856 compara_homologies 1.0 2024-02-06 21:29:21.311224 Compara homologies 266 6 Processed \N -6699 06debe4a-2b3e-4fab-991e-26863dbe8af8 compara_homologies 1.0 2024-02-06 21:29:59.006535 Compara homologies 323 6 Processed \N -6849 f93d21ca-9a24-4c31-ae11-b0f8d3deab6d compara_homologies 1.0 2024-02-06 21:31:13.258218 Compara homologies 423 6 Submitted \N -6896 f3abf167-6a8f-45cc-b753-22a955123758 compara_homologies 1.0 2024-02-06 21:31:36.368299 Compara homologies 387 6 Processed \N -7069 9681f4c2-afb4-4a08-8e4d-f26363f65ddf compara_homologies 1.0 2024-02-06 21:33:02.240258 Compara homologies 369 6 Processed \N -7177 d57040b6-0ef5-4e6b-97ef-be0ad94d3a61 compara_homologies 1.0 2024-02-06 21:33:55.672317 Compara homologies 424 6 Processed \N -7320 c4f0eec5-f4b3-4cef-8369-7b13932509e1 compara_homologies 1.0 2024-02-06 21:35:06.396207 Compara homologies 284 6 Processed \N -7535 d51ab85a-f037-47a3-ba53-423ad8e42669 compara_homologies 1.0 2024-02-06 21:36:52.499221 Compara homologies 383 6 Processed \N -7603 254a68c7-f512-446d-a958-983a2713daf2 compara_homologies 1.0 2024-02-06 21:37:25.921259 Compara homologies 359 6 Processed \N -7785 
9f2a7c92-e04a-443f-a991-1481a9466456 compara_homologies 1.0 2024-02-06 21:38:56.300529 Compara homologies 313 6 Processed \N -7820 681ceca3-3336-4b92-ac11-43b5fcabec62 compara_homologies 1.0 2024-02-06 21:39:13.565274 Compara homologies 251 6 Processed \N -7847 d78259af-f491-42de-9cbf-de744b09efee compara_homologies 1.0 2024-02-06 21:39:26.897641 Compara homologies 332 6 Processed \N -8130 5b618784-a5ff-46cc-8102-b082ffb6e447 compara_homologies 1.0 2024-02-06 21:41:47.150011 Compara homologies 368 6 Submitted \N -8392 b6472939-9e49-4d46-b93e-304910acabf3 compara_homologies 1.0 2024-02-06 21:44:00.982498 Compara homologies 4352 6 Processed \N -8661 a5bf42be-63c1-4616-9af1-bc03aea92643 compara_homologies 1.0 2024-02-06 21:46:14.099319 Compara homologies 443 6 Submitted \N -8662 af8eee44-ca56-4baf-a5f1-ad60d1165f3a genebuild_compute ENS01 2024-04-24 10:18:12.000000 From 23d52e01-2e3d-495f-b345-df41c605caa9 185 8 Released 348 -8663 a0fef323-23b9-4d4c-87b3-42f290dffbc7 xrefs ENS01 2024-04-24 10:18:12.000000 From af8eee44-ca56-4baf-a5f1-ad60d1165f3a 185 13 Released 8662 -8664 5d12b439-f994-408b-a7cc-88a0ce2a1c5e protein_features ENS01 2024-04-24 10:18:12.000000 From af8eee44-ca56-4baf-a5f1-ad60d1165f3a 185 14 Released 8662 -8665 fd7c81b9-bd96-4074-a78f-ce86059d3a55 alpha_fold ENS01 2024-04-24 10:18:12.000000 From af8eee44-ca56-4baf-a5f1-ad60d1165f3a 185 15 Released 8662 -8666 e4630a33-1d85-4a93-9c3d-ba23f531e900 genebuild_files ENS01 2024-04-24 10:18:12.000000 From 23d52e01-2e3d-495f-b345-df41c605caa9 185 9 Released 348 -8667 5dc9ebba-a6f0-4380-9f9b-5735855c8c0b blast ENS01 2024-04-24 10:18:12.000000 From e4630a33-1d85-4a93-9c3d-ba23f531e900 185 16 Released 8666 -8668 12304159-93ac-4597-bbfb-fc487a580bd2 ftp_dumps ENS01 2024-04-24 10:18:12.000000 From e4630a33-1d85-4a93-9c3d-ba23f531e900 185 17 Released 8666 -8669 aaf2c600-821f-4ade-a3e7-fde21c333060 genebuild_web ENS01 2024-04-24 10:18:12.000000 From 23d52e01-2e3d-495f-b345-df41c605caa9 185 11 Released 348 -8670 
97e9f169-4ba2-4d44-b958-7fc3233c2c24 thoas_dumps ENS01 2024-04-24 10:18:12.000000 From aaf2c600-821f-4ade-a3e7-fde21c333060 185 18 Released 8669 -8671 45c72ec3-7b5e-4b5e-83f8-1fc5790b1ad4 browser_files ENS01 2024-04-24 10:18:12.000000 From aaf2c600-821f-4ade-a3e7-fde21c333060 185 20 Released 8669 -8672 7a33c596-7883-4638-86d4-9aa4ac266110 checksums ENS01 2024-04-24 10:18:12.000000 From aaf2c600-821f-4ade-a3e7-fde21c333060 185 23 Released 8669 -8673 d6f3a493-9abf-4cef-81b7-dbf7bd0a8c4c genebuild_web ENS01 2024-04-24 10:18:12.000000 From 23d52e01-2e3d-495f-b345-df41c605caa9 185 12 Released 348 -8674 1ff09568-0971-4907-b023-2e81b9d73e61 thoas_load ENS01 2024-04-24 10:18:12.000000 From d6f3a493-9abf-4cef-81b7-dbf7bd0a8c4c 185 19 Released 8673 -8675 ff3d51eb-154b-4665-887c-c406cc3bc78b genebuild_track ENS01 2024-04-24 10:18:12.000000 From d6f3a493-9abf-4cef-81b7-dbf7bd0a8c4c 185 21 Released 8673 -8676 76bc09f9-ab83-49ad-9b14-b81dd2ee5eb0 refget_load ENS01 2024-04-24 10:18:12.000000 From d6f3a493-9abf-4cef-81b7-dbf7bd0a8c4c 185 24 Released 8673 -8677 5fb17152-b5bd-4a8f-8b00-1e87b1bdd036 genebuild_compute ENS01 2024-04-24 10:18:12.000000 From 9d717ead-ffe0-4fc1-b58c-3c057b754021 4 8 Released 888 -8678 8a49f103-b405-4f54-8714-980007cfe776 xrefs ENS01 2024-04-24 10:18:12.000000 From 5fb17152-b5bd-4a8f-8b00-1e87b1bdd036 4 13 Released 8677 -8679 129e0e2b-b778-47d5-9252-822af8adbf5b protein_features ENS01 2024-04-24 10:18:12.000000 From 5fb17152-b5bd-4a8f-8b00-1e87b1bdd036 4 14 Released 8677 -8680 d811d1ff-8e73-4215-b622-4da5d1ae68bc alpha_fold ENS01 2024-04-24 10:18:12.000000 From 5fb17152-b5bd-4a8f-8b00-1e87b1bdd036 4 15 Released 8677 -8681 b57a6524-00c5-423a-b569-57e2039d5f75 genebuild_files ENS01 2024-04-24 10:18:12.000000 From 9d717ead-ffe0-4fc1-b58c-3c057b754021 4 9 Released 888 -8682 90d32255-8476-4d33-8aca-cd05b65f6b6f blast ENS01 2024-04-24 10:18:12.000000 From b57a6524-00c5-423a-b569-57e2039d5f75 4 16 Released 8681 -8683 3243f1ac-38aa-412b-9a2a-c5edf0336a2f ftp_dumps 
ENS01 2024-04-24 10:18:12.000000 From b57a6524-00c5-423a-b569-57e2039d5f75 4 17 Released 8681 -8684 c68f4857-0f85-48eb-a9d1-37e7e9cb5d71 genebuild_web ENS01 2024-04-24 10:18:12.000000 From 9d717ead-ffe0-4fc1-b58c-3c057b754021 4 11 Released 888 -8685 98cb07a1-2a1d-496a-a0a7-168662eda07b thoas_dumps ENS01 2024-04-24 10:18:12.000000 From c68f4857-0f85-48eb-a9d1-37e7e9cb5d71 4 18 Released 8684 -8686 3b9fee1b-0c94-4345-9599-919ad721a7da browser_files ENS01 2024-04-24 10:18:12.000000 From c68f4857-0f85-48eb-a9d1-37e7e9cb5d71 4 20 Released 8684 -8687 8978bd71-c1b1-40b5-8628-1dd84115badd checksums ENS01 2024-04-24 10:18:12.000000 From c68f4857-0f85-48eb-a9d1-37e7e9cb5d71 4 23 Released 8684 -8688 17e767e8-4f6a-40a2-aaa1-ee4f03e37567 genebuild_web ENS01 2024-04-24 10:18:12.000000 From 9d717ead-ffe0-4fc1-b58c-3c057b754021 4 12 Released 888 -8689 53979850-c127-4a85-a680-9183978bb250 thoas_load ENS01 2024-04-24 10:18:12.000000 From 17e767e8-4f6a-40a2-aaa1-ee4f03e37567 4 19 Released 8688 -8690 2b8d9066-8a02-4d47-ab29-c39f43ccfc53 genebuild_track ENS01 2024-04-24 10:18:12.000000 From 17e767e8-4f6a-40a2-aaa1-ee4f03e37567 4 21 Released 8688 -8691 4d418be3-ea1b-4f36-afa4-c40d113b3910 refget_load ENS01 2024-04-24 10:18:12.000000 From 17e767e8-4f6a-40a2-aaa1-ee4f03e37567 4 24 Released 8688 -8692 3cfe16ce-8a7e-49c8-b719-2affce984771 genebuild_compute ENS01 2024-04-24 10:18:13.000000 From 1c759aca-63a1-4eea-abe8-ef5f298fe6e2 5 8 Released 890 -8693 f5eeacaa-8ee4-4739-8aed-a6aeaaadd13e xrefs ENS01 2024-04-24 10:18:13.000000 From 3cfe16ce-8a7e-49c8-b719-2affce984771 5 13 Released 8692 -8694 808c43bc-7950-4e32-935b-ef13f1a869c1 protein_features ENS01 2024-04-24 10:18:13.000000 From 3cfe16ce-8a7e-49c8-b719-2affce984771 5 14 Released 8692 -8695 7413ae88-d840-47ca-a602-713e03e6b123 alpha_fold ENS01 2024-04-24 10:18:13.000000 From 3cfe16ce-8a7e-49c8-b719-2affce984771 5 15 Released 8692 -8696 6a5f98e4-2f2f-4c88-9172-d02dc623c42f genebuild_files ENS01 2024-04-24 10:18:13.000000 From 
1c759aca-63a1-4eea-abe8-ef5f298fe6e2 5 9 Released 890 -8697 cd9f881d-1716-425d-9879-472193cbf337 blast ENS01 2024-04-24 10:18:13.000000 From 6a5f98e4-2f2f-4c88-9172-d02dc623c42f 5 16 Released 8696 -8698 84d2b150-1e5b-49b2-a35c-14596d503ae0 ftp_dumps ENS01 2024-04-24 10:18:13.000000 From 6a5f98e4-2f2f-4c88-9172-d02dc623c42f 5 17 Released 8696 -8699 5b63c887-d867-411c-b138-536ed0c430de genebuild_web ENS01 2024-04-24 10:18:13.000000 From 1c759aca-63a1-4eea-abe8-ef5f298fe6e2 5 11 Released 890 -8700 3a7c3603-acfa-4803-8c0c-c413501c6180 thoas_dumps ENS01 2024-04-24 10:18:13.000000 From 5b63c887-d867-411c-b138-536ed0c430de 5 18 Released 8699 -8701 6a52d021-242b-4e7a-84aa-b2f08d6b1e89 browser_files ENS01 2024-04-24 10:18:13.000000 From 5b63c887-d867-411c-b138-536ed0c430de 5 20 Released 8699 -8702 30f5310e-7e78-4027-aa32-82de71946e20 checksums ENS01 2024-04-24 10:18:13.000000 From 5b63c887-d867-411c-b138-536ed0c430de 5 23 Released 8699 -8703 f67689b2-8c52-4c3e-89da-70520e0613d8 genebuild_web ENS01 2024-04-24 10:18:13.000000 From 1c759aca-63a1-4eea-abe8-ef5f298fe6e2 5 12 Released 890 -8704 e4e75159-a56c-4a38-ac81-b74f7e89c022 thoas_load ENS01 2024-04-24 10:18:13.000000 From f67689b2-8c52-4c3e-89da-70520e0613d8 5 19 Released 8703 -8705 981eb155-b2c5-4571-955f-f2d7574ef5eb genebuild_track ENS01 2024-04-24 10:18:13.000000 From f67689b2-8c52-4c3e-89da-70520e0613d8 5 21 Released 8703 -8706 078f49f0-cc6e-4e4a-bfee-f8fc240a635a refget_load ENS01 2024-04-24 10:18:13.000000 From f67689b2-8c52-4c3e-89da-70520e0613d8 5 24 Released 8703 -8707 705c3da3-186f-42f7-bd2d-795285e9b246 evidence 1.0 2024-04-24 10:18:13.000000 From 08543d8d-2110-46f3-a9b6-ac58c4af8202 576 4 Released 1475 -8708 bb98fc3a-30af-41b2-9dac-29d580b42b68 short_variant 1.0 2024-04-24 10:18:13.000000 From 08543d8d-2110-46f3-a9b6-ac58c4af8202 576 5 Released 1475 -8709 4b9b9585-8570-4f26-8819-a45f92df8d23 variation_ftp 1.0 2024-04-24 10:18:13.000000 From 08543d8d-2110-46f3-a9b6-ac58c4af8202 576 29 Released 1475 -8710 
7ca071a6-2ab3-4535-8c52-a21cb012fe0c browser_files 1.0 2024-04-24 10:18:13.000000 From 08543d8d-2110-46f3-a9b6-ac58c4af8202 576 31 Released 1475 -8711 9d5325d8-49ba-4606-aaa2-c7269a19f5f7 variation_track 1.0 2024-04-24 10:18:13.000000 From 08543d8d-2110-46f3-a9b6-ac58c4af8202 576 32 Released 1475 -8712 d6d6f12a-c806-4db0-99dd-a667fbd7c191 evidence 1.0 2024-04-24 10:18:13.000000 From dd28ea84-6890-4af8-bd2b-2caa9917f221 608 4 Released 1507 -8713 8ed8f4b5-423e-44f1-a01d-bf2c19857374 short_variant 1.0 2024-04-24 10:18:13.000000 From dd28ea84-6890-4af8-bd2b-2caa9917f221 608 5 Released 1507 -8714 e17b4956-cf44-4ce3-bad6-c141fae5148c variation_ftp 1.0 2024-04-24 10:18:13.000000 From dd28ea84-6890-4af8-bd2b-2caa9917f221 608 29 Released 1507 -8715 48876d82-b51c-41d5-818f-5af04bcf8fc3 browser_files 1.0 2024-04-24 10:18:13.000000 From dd28ea84-6890-4af8-bd2b-2caa9917f221 608 31 Released 1507 -8716 c2dab1e0-5d72-4d50-9ec7-c8e90746ec65 variation_track 1.0 2024-04-24 10:18:13.000000 From dd28ea84-6890-4af8-bd2b-2caa9917f221 608 32 Released 1507 -8717 6d799ad1-5fe6-477f-8ddb-6a16ab3ea33a evidence 1.0 2024-04-24 10:18:13.000000 From b876cb36-6e84-4a2c-8af2-14e096d48df9 610 4 Released 1509 -8718 c06b0534-4708-4cec-913b-8b354fda0c6b short_variant 1.0 2024-04-24 10:18:13.000000 From b876cb36-6e84-4a2c-8af2-14e096d48df9 610 5 Released 1509 -8719 fff94973-5318-4821-9afa-3fd2fc0b7a4e variation_ftp 1.0 2024-04-24 10:18:13.000000 From b876cb36-6e84-4a2c-8af2-14e096d48df9 610 29 Released 1509 -8720 b5a2b993-1252-4495-bdf3-9eae0322cf39 browser_files 1.0 2024-04-24 10:18:13.000000 From b876cb36-6e84-4a2c-8af2-14e096d48df9 610 31 Released 1509 -8721 db33d965-62b4-4d83-9738-ade69df4cff5 variation_track 1.0 2024-04-24 10:18:13.000000 From b876cb36-6e84-4a2c-8af2-14e096d48df9 610 32 Released 1509 -8722 7780e084-b3ca-4df4-ba40-5a8bfea6e9a1 homology_compute 1.0 2024-04-24 10:18:13.000000 From 0bdc7428-6f81-4c96-a8d3-f5d5bff5a9b8 251 25 Released 2276 -8723 71fe75e0-50ca-4b74-a8a6-3d8d016e4227 
homology_load 1.0 2024-04-24 10:18:13.000000 From 0bdc7428-6f81-4c96-a8d3-f5d5bff5a9b8 251 26 Released 2276 -8724 10879879-9a3a-4dfd-b0eb-c06699f0aada homology_ftp 1.0 2024-04-24 10:18:13.000000 From 0bdc7428-6f81-4c96-a8d3-f5d5bff5a9b8 251 27 Released 2276 -8725 2925c0ee-f987-4102-b792-4904c7b98d19 homology_compute 1.0 2024-04-24 10:18:13.000000 From 0571d77c-5cc6-4819-80bf-34a42acfc3f6 383 25 Released 2408 -8726 6f0b5633-abef-4daf-be84-489a979f8b0b homology_load 1.0 2024-04-24 10:18:13.000000 From 0571d77c-5cc6-4819-80bf-34a42acfc3f6 383 26 Released 2408 -8727 3e44f562-0cae-4165-9ef5-75fd6593d2e1 homology_ftp 1.0 2024-04-24 10:18:13.000000 From 0571d77c-5cc6-4819-80bf-34a42acfc3f6 383 27 Released 2408 -8728 80a73415-7eda-4c22-80a1-93508c1ebc03 homology_compute 1.0 2024-04-24 10:18:13.000000 From 832c1885-6b98-4bf0-b4cf-70e317399bd4 457 25 Released 2482 -8729 11185ed4-ea77-406e-bbbd-829601db2463 homology_load 1.0 2024-04-24 10:18:13.000000 From 832c1885-6b98-4bf0-b4cf-70e317399bd4 457 26 Released 2482 -8730 745f6eba-367b-4db9-92f7-7353603ce4ce homology_ftp 1.0 2024-04-24 10:18:13.000000 From 832c1885-6b98-4bf0-b4cf-70e317399bd4 457 27 Released 2482 -8731 1dcbaf3e-5179-434c-beae-1416149f30cf genebuild_compute ENS01 2024-04-24 10:18:13.000000 From 53936715-1371-4343-95af-f39d06943db7 7 8 Processed 14 -8732 d340ac5b-2f9b-44d7-bab8-99ff17516053 xrefs ENS01 2024-04-24 10:18:13.000000 From 1dcbaf3e-5179-434c-beae-1416149f30cf 7 13 Processed 8731 -8733 5a2fd22b-31ac-4e69-ad20-137b6d297cf8 protein_features ENS01 2024-04-24 10:18:13.000000 From 1dcbaf3e-5179-434c-beae-1416149f30cf 7 14 Processed 8731 -8734 7f988fe4-f4d5-4bd7-8516-2cfc767d7ec6 alpha_fold ENS01 2024-04-24 10:18:13.000000 From 1dcbaf3e-5179-434c-beae-1416149f30cf 7 15 Processed 8731 -8735 81bdc51c-4c4f-4e07-850b-562d0d964269 genebuild_files ENS01 2024-04-24 10:18:13.000000 From 53936715-1371-4343-95af-f39d06943db7 7 9 Processed 14 -8736 871842a4-566d-4b44-b883-caea737dbe70 blast ENS01 2024-04-24 
10:18:13.000000 From 81bdc51c-4c4f-4e07-850b-562d0d964269 7 16 Processed 8735 -8737 5e7deba4-4293-4d6a-b954-73dfdc3be208 ftp_dumps ENS01 2024-04-24 10:18:13.000000 From 81bdc51c-4c4f-4e07-850b-562d0d964269 7 17 Processed 8735 -8738 2563efa7-e2be-401a-a89f-79ea71c17452 genebuild_web ENS01 2024-04-24 10:18:13.000000 From 53936715-1371-4343-95af-f39d06943db7 7 11 Processed 14 -8739 430221cd-df0d-4727-bd3f-8bdd1e69fb62 thoas_dumps ENS01 2024-04-24 10:18:13.000000 From 2563efa7-e2be-401a-a89f-79ea71c17452 7 18 Processed 8738 -8740 c918aa39-6dd4-4032-87da-5282e90c4142 browser_files ENS01 2024-04-24 10:18:13.000000 From 2563efa7-e2be-401a-a89f-79ea71c17452 7 20 Processed 8738 -8741 ce889e08-ab8b-4420-891b-9fb3ab5f4e81 checksums ENS01 2024-04-24 10:18:13.000000 From 2563efa7-e2be-401a-a89f-79ea71c17452 7 23 Processed 8738 -8742 96331294-fb65-457b-a3bd-5e41f1818044 genebuild_web ENS01 2024-04-24 10:18:13.000000 From 53936715-1371-4343-95af-f39d06943db7 7 12 Processed 14 -8743 a31eb1a1-1ebe-478e-b6a5-fd04ffbb0e3c thoas_load ENS01 2024-04-24 10:18:13.000000 From 96331294-fb65-457b-a3bd-5e41f1818044 7 19 Processed 8742 -8744 a1fae0e7-c124-4849-bfe5-e68c583b7826 genebuild_track ENS01 2024-04-24 10:18:13.000000 From 96331294-fb65-457b-a3bd-5e41f1818044 7 21 Processed 8742 -8745 145ad879-9c5e-4833-9645-ec0e9fb35079 refget_load ENS01 2024-04-24 10:18:13.000000 From 96331294-fb65-457b-a3bd-5e41f1818044 7 24 Processed 8742 -8746 8d55a4f8-0550-4770-aac2-c7963bfa1176 genebuild_compute ENS01 2024-04-24 10:18:13.000000 From 2bc8874e-6672-4293-89d6-0b837005177c 135 8 Processed 250 -8747 d92da251-954c-417b-8e89-03c677a60553 xrefs ENS01 2024-04-24 10:18:13.000000 From 8d55a4f8-0550-4770-aac2-c7963bfa1176 135 13 Processed 8746 -8748 819805e5-7aeb-437c-bb55-3918a7c94e48 protein_features ENS01 2024-04-24 10:18:13.000000 From 8d55a4f8-0550-4770-aac2-c7963bfa1176 135 14 Processed 8746 -8749 9c57cc1b-ee2b-4adf-968b-8b6cec556f95 alpha_fold ENS01 2024-04-24 10:18:13.000000 From 
8d55a4f8-0550-4770-aac2-c7963bfa1176 135 15 Processed 8746 -8750 d4c1e848-8a50-4aae-95fe-5efb85833613 genebuild_files ENS01 2024-04-24 10:18:13.000000 From 2bc8874e-6672-4293-89d6-0b837005177c 135 9 Processed 250 -8751 24fbb4e6-45a2-4bb9-a1b7-2b14fafc5135 blast ENS01 2024-04-24 10:18:13.000000 From d4c1e848-8a50-4aae-95fe-5efb85833613 135 16 Processed 8750 -8752 32b8b190-ccb8-4d4e-8080-686fdb2bf853 ftp_dumps ENS01 2024-04-24 10:18:13.000000 From d4c1e848-8a50-4aae-95fe-5efb85833613 135 17 Processed 8750 -8753 ebd92c3c-1aaf-43c6-ae72-dd649e624ea2 genebuild_web ENS01 2024-04-24 10:18:13.000000 From 2bc8874e-6672-4293-89d6-0b837005177c 135 11 Processed 250 -8754 b41d8022-6a58-4a2b-a0fb-6776a722b7f0 thoas_dumps ENS01 2024-04-24 10:18:13.000000 From ebd92c3c-1aaf-43c6-ae72-dd649e624ea2 135 18 Processed 8753 -8755 9d4373b8-4248-4afe-ab14-d6c14c1b19ea browser_files ENS01 2024-04-24 10:18:13.000000 From ebd92c3c-1aaf-43c6-ae72-dd649e624ea2 135 20 Processed 8753 -8756 cc668846-b60c-4544-b151-84e77308595e checksums ENS01 2024-04-24 10:18:13.000000 From ebd92c3c-1aaf-43c6-ae72-dd649e624ea2 135 23 Processed 8753 -8757 7f200073-0c6e-4487-ac0b-f5cb160151f5 genebuild_web ENS01 2024-04-24 10:18:14.000000 From 2bc8874e-6672-4293-89d6-0b837005177c 135 12 Processed 250 -8758 4bedae91-0f82-478f-bba4-23f8dcb83ef0 thoas_load ENS01 2024-04-24 10:18:14.000000 From 7f200073-0c6e-4487-ac0b-f5cb160151f5 135 19 Processed 8757 -8759 8389b690-fd8d-40bf-8bb7-05254261be6b genebuild_track ENS01 2024-04-24 10:18:14.000000 From 7f200073-0c6e-4487-ac0b-f5cb160151f5 135 21 Processed 8757 -8760 e6f310ea-3a25-4adb-aa01-f514fe4d4183 refget_load ENS01 2024-04-24 10:18:14.000000 From 7f200073-0c6e-4487-ac0b-f5cb160151f5 135 24 Processed 8757 -8761 05125758-03b0-43c3-b4eb-973f05293e42 genebuild_compute ENS01 2024-04-24 10:18:14.000000 From 664088c7-356e-418c-adb2-15945b7ebc4b 6 8 Processed 892 -8762 055d833f-a5e3-4dc2-bd19-0827d05a576c xrefs ENS01 2024-04-24 10:18:14.000000 From 
05125758-03b0-43c3-b4eb-973f05293e42 6 13 Processed 8761 -8763 966759bd-d77b-4f97-8502-307ba251adc8 protein_features ENS01 2024-04-24 10:18:14.000000 From 05125758-03b0-43c3-b4eb-973f05293e42 6 14 Processed 8761 -8764 154cbdc9-e1de-4d40-9e99-21a6d18cacaf alpha_fold ENS01 2024-04-24 10:18:14.000000 From 05125758-03b0-43c3-b4eb-973f05293e42 6 15 Processed 8761 -8765 9ab20e16-0d40-4145-8ad5-32e498b4cff4 genebuild_files ENS01 2024-04-24 10:18:14.000000 From 664088c7-356e-418c-adb2-15945b7ebc4b 6 9 Processed 892 -8766 0a975f98-3a5d-4270-9770-73cf4c48107b blast ENS01 2024-04-24 10:18:14.000000 From 9ab20e16-0d40-4145-8ad5-32e498b4cff4 6 16 Processed 8765 -8767 4823d7a3-b2c8-4220-8652-20436a20d9ca ftp_dumps ENS01 2024-04-24 10:18:14.000000 From 9ab20e16-0d40-4145-8ad5-32e498b4cff4 6 17 Processed 8765 -8768 a36bfaba-8751-403c-9024-ac00809cb748 genebuild_web ENS01 2024-04-24 10:18:14.000000 From 664088c7-356e-418c-adb2-15945b7ebc4b 6 11 Processed 892 -8769 d781cd94-54f7-403a-8a03-1114db2ccfe6 thoas_dumps ENS01 2024-04-24 10:18:14.000000 From a36bfaba-8751-403c-9024-ac00809cb748 6 18 Processed 8768 -8770 79d32d79-6346-4453-83d1-517ed275840b browser_files ENS01 2024-04-24 10:18:14.000000 From a36bfaba-8751-403c-9024-ac00809cb748 6 20 Processed 8768 -8771 cd5a8672-9177-4e08-8eb2-8a770ee58ce7 checksums ENS01 2024-04-24 10:18:14.000000 From a36bfaba-8751-403c-9024-ac00809cb748 6 23 Processed 8768 -8772 f6561cb1-4cae-47e7-ac63-ad2151f4b927 genebuild_web ENS01 2024-04-24 10:18:14.000000 From 664088c7-356e-418c-adb2-15945b7ebc4b 6 12 Processed 892 -8773 161453c1-3e3c-423e-a4f4-0f048d7c134c thoas_load ENS01 2024-04-24 10:18:14.000000 From f6561cb1-4cae-47e7-ac63-ad2151f4b927 6 19 Processed 8772 -8774 7f771283-0afa-4703-b534-3844646bc8e1 genebuild_track ENS01 2024-04-24 10:18:14.000000 From f6561cb1-4cae-47e7-ac63-ad2151f4b927 6 21 Processed 8772 -8775 00f2c284-5eea-43c0-a225-6bcc319a0b7f refget_load ENS01 2024-04-24 10:18:14.000000 From f6561cb1-4cae-47e7-ac63-ad2151f4b927 6 24 
Processed 8772 -8776 da20e2b5-1809-494e-893f-7fb90e8032a1 genebuild_compute ENS01 2024-04-24 10:18:14.000000 From 66db32ae-974f-480c-a60b-63cc49d00f68 107 8 Processed 1006 -8777 8ec9f005-91d7-4015-be09-7b61b6d62c54 xrefs ENS01 2024-04-24 10:18:14.000000 From da20e2b5-1809-494e-893f-7fb90e8032a1 107 13 Processed 8776 -8778 fdd6e615-8ac7-41fc-b8b2-aff7aeb9c99a protein_features ENS01 2024-04-24 10:18:14.000000 From da20e2b5-1809-494e-893f-7fb90e8032a1 107 14 Processed 8776 -8779 f6d9a2a5-d744-4a90-a9b4-8656108bf921 alpha_fold ENS01 2024-04-24 10:18:14.000000 From da20e2b5-1809-494e-893f-7fb90e8032a1 107 15 Processed 8776 -8780 3f66717f-fcc1-4ea6-a6ae-1b038ec5f0cd genebuild_files ENS01 2024-04-24 10:18:14.000000 From 66db32ae-974f-480c-a60b-63cc49d00f68 107 9 Processed 1006 -8781 656bdb4a-c2f0-4ff1-93a8-6a780ba47e26 blast ENS01 2024-04-24 10:18:14.000000 From 3f66717f-fcc1-4ea6-a6ae-1b038ec5f0cd 107 16 Processed 8780 -8782 503fe667-0304-45db-ad36-860b9967290e ftp_dumps ENS01 2024-04-24 10:18:14.000000 From 3f66717f-fcc1-4ea6-a6ae-1b038ec5f0cd 107 17 Processed 8780 -8783 472c2bcc-3de5-446b-8b05-e33c3975acdb genebuild_web ENS01 2024-04-24 10:18:14.000000 From 66db32ae-974f-480c-a60b-63cc49d00f68 107 11 Processed 1006 -8784 83f43633-77ce-4164-8ec4-655707a4029d thoas_dumps ENS01 2024-04-24 10:18:14.000000 From 472c2bcc-3de5-446b-8b05-e33c3975acdb 107 18 Processed 8783 -8785 722acf20-f184-4ac4-b9ad-947de51b051e browser_files ENS01 2024-04-24 10:18:14.000000 From 472c2bcc-3de5-446b-8b05-e33c3975acdb 107 20 Processed 8783 -8786 8e15dcb4-cbd2-4971-a155-8d5956a38c41 checksums ENS01 2024-04-24 10:18:14.000000 From 472c2bcc-3de5-446b-8b05-e33c3975acdb 107 23 Processed 8783 -8787 aa4b8122-4480-4595-b2bf-c8c8f51537ce genebuild_web ENS01 2024-04-24 10:18:14.000000 From 66db32ae-974f-480c-a60b-63cc49d00f68 107 12 Processed 1006 -8788 4e7f51fc-b5f6-4aa3-ab31-b22a23d080b2 thoas_load ENS01 2024-04-24 10:18:14.000000 From aa4b8122-4480-4595-b2bf-c8c8f51537ce 107 19 Processed 8787 -8789 
9770c787-1b91-4e65-bf76-5dc0d1c5c75f genebuild_track ENS01 2024-04-24 10:18:14.000000 From aa4b8122-4480-4595-b2bf-c8c8f51537ce 107 21 Processed 8787 -8790 0f140a16-de49-4566-a41d-8e1ccbc8f5d0 refget_load ENS01 2024-04-24 10:18:14.000000 From aa4b8122-4480-4595-b2bf-c8c8f51537ce 107 24 Processed 8787 -8791 e0a75f02-6ac6-4dfa-8196-50cb4803a9b8 evidence 1.0 2024-04-24 10:18:14.000000 From 5c2d6ef7-fe03-4f1a-bcc2-fb72af9ffa46 565 4 Processed 1464 -8792 04892d42-1689-4e76-a158-717f1c773a3d short_variant 1.0 2024-04-24 10:18:14.000000 From 5c2d6ef7-fe03-4f1a-bcc2-fb72af9ffa46 565 5 Processed 1464 -8793 4b9774b2-eabd-4981-b098-521b5b8a13a0 browser_files 1.0 2024-04-24 10:18:14.000000 From 5c2d6ef7-fe03-4f1a-bcc2-fb72af9ffa46 565 31 Processed 1464 -8794 f8c7383b-aaac-41cf-9ac8-dce5f99b5338 variation_ftp 1.0 2024-04-24 10:18:14.000000 From 5c2d6ef7-fe03-4f1a-bcc2-fb72af9ffa46 565 29 Processed 1464 -8795 3aeb9633-f9a6-4693-b51c-875935bb3e16 variation_track 1.0 2024-04-24 10:18:14.000000 From 5c2d6ef7-fe03-4f1a-bcc2-fb72af9ffa46 565 32 Processed 1464 -8796 628143db-4adb-4086-9168-1f7e875bbafe evidence 1.0 2024-04-24 10:18:14.000000 From 69f38cd5-5774-4b29-9ef4-181441ca0eb8 571 4 Processed 1470 -8797 0c150044-3236-4cbd-ba06-19e19d10000a short_variant 1.0 2024-04-24 10:18:14.000000 From 69f38cd5-5774-4b29-9ef4-181441ca0eb8 571 5 Processed 1470 -8798 e488f2c0-8926-4cd4-a3f4-5085885273ba browser_files 1.0 2024-04-24 10:18:14.000000 From 69f38cd5-5774-4b29-9ef4-181441ca0eb8 571 31 Processed 1470 -8799 0b1d6792-050b-461b-b6e7-8013f03caace variation_ftp 1.0 2024-04-24 10:18:14.000000 From 69f38cd5-5774-4b29-9ef4-181441ca0eb8 571 29 Processed 1470 -8800 b55eedc6-c8e9-46f9-8f3a-5487b590d563 variation_track 1.0 2024-04-24 10:18:14.000000 From 69f38cd5-5774-4b29-9ef4-181441ca0eb8 571 32 Processed 1470 -8801 4f615d55-8b10-4004-88c0-169e1016032c evidence 1.0 2024-04-24 10:18:14.000000 From a4c858c4-2e55-43c9-af8b-7adc7ce24c58 592 4 Processed 1491 -8802 
9bdb03db-aed4-41af-be5c-7912d5cf82ad short_variant 1.0 2024-04-24 10:18:14.000000 From a4c858c4-2e55-43c9-af8b-7adc7ce24c58 592 5 Processed 1491 -8803 1bc869fb-7586-4394-8da3-40502ce06f28 browser_files 1.0 2024-04-24 10:18:14.000000 From a4c858c4-2e55-43c9-af8b-7adc7ce24c58 592 31 Processed 1491 -8804 0d05a7a3-6a4d-4f61-bcfe-76728865fe0a variation_ftp 1.0 2024-04-24 10:18:14.000000 From a4c858c4-2e55-43c9-af8b-7adc7ce24c58 592 29 Processed 1491 -8805 3f87e8e7-21f6-441c-83a4-a5c7204c5320 variation_track 1.0 2024-04-24 10:18:14.000000 From a4c858c4-2e55-43c9-af8b-7adc7ce24c58 592 32 Processed 1491 -8806 206879dd-6d27-451d-b5aa-8330696afc1a evidence 1.0 2024-04-24 10:18:14.000000 From 79e1a7b8-85dc-4bfe-b606-6992ca109ede 595 4 Processed 1494 -8807 1f82e0d6-2232-4dc7-9dba-c1c62b42c24f short_variant 1.0 2024-04-24 10:18:14.000000 From 79e1a7b8-85dc-4bfe-b606-6992ca109ede 595 5 Processed 1494 -8808 4c9ad5d4-3a35-40b7-ba30-a09b958a205b browser_files 1.0 2024-04-24 10:18:14.000000 From 79e1a7b8-85dc-4bfe-b606-6992ca109ede 595 31 Processed 1494 -8809 1c37a8bf-43d6-4fc8-98d8-b4e0d3a31931 variation_ftp 1.0 2024-04-24 10:18:14.000000 From 79e1a7b8-85dc-4bfe-b606-6992ca109ede 595 29 Processed 1494 -8810 460598d9-1266-4652-8ef8-536e8c1391fe variation_track 1.0 2024-04-24 10:18:14.000000 From 79e1a7b8-85dc-4bfe-b606-6992ca109ede 595 32 Processed 1494 -8811 6c28c1ae-af07-4199-9da9-b48ae0fcb7b7 homology_compute 1.0 2024-04-24 10:18:14.000000 From 35f56606-9186-432f-b033-6e7204708f3b 294 25 Processed 2319 -8812 4d9b87ea-6313-4aee-ad8b-ab2a2813e22a homology_load 1.0 2024-04-24 10:18:14.000000 From 35f56606-9186-432f-b033-6e7204708f3b 294 26 Processed 2319 -8813 9e3a4f8a-95f2-415c-88ec-5da6eb3e7c10 homology_ftp 1.0 2024-04-24 10:18:14.000000 From 35f56606-9186-432f-b033-6e7204708f3b 294 27 Processed 2319 -8814 cb8f93b0-b903-49c8-ad48-1cc4f4ceee6a homology_compute 1.0 2024-04-24 10:18:14.000000 From 8ebbce8e-dcc7-49f8-b520-4d479aef2a65 457 25 Processed 6593 -8815 
9045fc64-a2f2-4cfa-b10e-8b55e6e631e2 homology_load 1.0 2024-04-24 10:18:14.000000 From 8ebbce8e-dcc7-49f8-b520-4d479aef2a65 457 26 Processed 6593 -8816 b9c88135-824b-425b-ab1b-156cd58a0bde homology_ftp 1.0 2024-04-24 10:18:15.000000 From 8ebbce8e-dcc7-49f8-b520-4d479aef2a65 457 27 Processed 6593 -8817 fd821999-6a13-407c-b6ae-bda323fc1795 homology_compute 1.0 2024-04-24 10:18:15.000000 From 60fae37a-afe2-4bb1-9da0-321911dd2856 266 25 Processed 6623 -8818 f9634579-f88a-4892-b2c1-1762da95b69e homology_load 1.0 2024-04-24 10:18:15.000000 From 60fae37a-afe2-4bb1-9da0-321911dd2856 266 26 Processed 6623 -8819 5bce46a4-6eea-4b4d-8843-681ed932e251 homology_ftp 1.0 2024-04-24 10:18:15.000000 From 60fae37a-afe2-4bb1-9da0-321911dd2856 266 27 Processed 6623 -8820 1e3c7c73-8748-4b80-afc5-37a4045e0f71 homology_compute 1.0 2024-04-24 10:18:15.000000 From 06debe4a-2b3e-4fab-991e-26863dbe8af8 323 25 Processed 6699 -8821 0c048acb-d4e2-441f-8d8c-86b83e9bf23d homology_load 1.0 2024-04-24 10:18:15.000000 From 06debe4a-2b3e-4fab-991e-26863dbe8af8 323 26 Processed 6699 -8822 021b89f1-0f7f-4478-97aa-0accd0a7606e homology_ftp 1.0 2024-04-24 10:18:15.000000 From 06debe4a-2b3e-4fab-991e-26863dbe8af8 323 27 Processed 6699 -8823 c929d7cd-aa29-4605-863d-60072b9eccc8 homology_compute 1.0 2024-04-24 10:18:15.000000 From f3abf167-6a8f-45cc-b753-22a955123758 387 25 Processed 6896 -8824 ecdd3f2d-74ba-4a2b-96e9-c43bce42e266 homology_load 1.0 2024-04-24 10:18:15.000000 From f3abf167-6a8f-45cc-b753-22a955123758 387 26 Processed 6896 -8825 f7834e60-f18a-4ec6-b8c3-def92135f691 homology_ftp 1.0 2024-04-24 10:18:15.000000 From f3abf167-6a8f-45cc-b753-22a955123758 387 27 Processed 6896 -8826 2f3e7f30-2c95-4bcf-a02f-3f8819ab6562 homology_compute 1.0 2024-04-24 10:18:15.000000 From 9681f4c2-afb4-4a08-8e4d-f26363f65ddf 369 25 Processed 7069 -8827 afdc661a-4e30-411f-b453-14ecf1973672 homology_load 1.0 2024-04-24 10:18:15.000000 From 9681f4c2-afb4-4a08-8e4d-f26363f65ddf 369 26 Processed 7069 -8828 
f9fa5df3-e813-4667-aea2-2df4d67ffdb4 homology_ftp 1.0 2024-04-24 10:18:15.000000 From 9681f4c2-afb4-4a08-8e4d-f26363f65ddf 369 27 Processed 7069 -8829 85b782ae-d1a6-4bee-8a48-3dae3c9da7aa homology_compute 1.0 2024-04-24 10:18:15.000000 From d57040b6-0ef5-4e6b-97ef-be0ad94d3a61 424 25 Processed 7177 -8830 3b89fcc0-1809-4de6-ae4e-e82bf34c26bf homology_load 1.0 2024-04-24 10:18:15.000000 From d57040b6-0ef5-4e6b-97ef-be0ad94d3a61 424 26 Processed 7177 -8831 e1a9a2ed-318a-4e57-9136-ecb33082d71c homology_ftp 1.0 2024-04-24 10:18:15.000000 From d57040b6-0ef5-4e6b-97ef-be0ad94d3a61 424 27 Processed 7177 -8832 003c2932-ba1d-47b8-8f32-88b27bd79d87 homology_compute 1.0 2024-04-24 10:18:15.000000 From c4f0eec5-f4b3-4cef-8369-7b13932509e1 284 25 Processed 7320 -8833 9fa37967-8ace-4f6d-925b-dce75a70ab79 homology_load 1.0 2024-04-24 10:18:15.000000 From c4f0eec5-f4b3-4cef-8369-7b13932509e1 284 26 Processed 7320 -8834 94360425-5f48-4bc9-8863-2011b3e8115d homology_ftp 1.0 2024-04-24 10:18:15.000000 From c4f0eec5-f4b3-4cef-8369-7b13932509e1 284 27 Processed 7320 -8835 9c874e53-bf5c-4c49-8a31-123c1025a7a6 homology_compute 1.0 2024-04-24 10:18:15.000000 From d51ab85a-f037-47a3-ba53-423ad8e42669 383 25 Processed 7535 -8836 ff6239d9-bd90-49f9-b3a5-0cee348267b7 homology_load 1.0 2024-04-24 10:18:15.000000 From d51ab85a-f037-47a3-ba53-423ad8e42669 383 26 Processed 7535 -8837 b7b5b776-30f0-469f-bd19-707ba2fa8f3c homology_ftp 1.0 2024-04-24 10:18:15.000000 From d51ab85a-f037-47a3-ba53-423ad8e42669 383 27 Processed 7535 -8838 18783f74-ffd9-4e46-90bd-f18d7ca00896 homology_compute 1.0 2024-04-24 10:18:15.000000 From 254a68c7-f512-446d-a958-983a2713daf2 359 25 Processed 7603 -8839 e2ea6dca-fd18-4cc5-8120-d724bea5a5bd homology_load 1.0 2024-04-24 10:18:15.000000 From 254a68c7-f512-446d-a958-983a2713daf2 359 26 Processed 7603 -8840 86288d80-6d0e-4da8-9c3c-a2a78b740f63 homology_ftp 1.0 2024-04-24 10:18:15.000000 From 254a68c7-f512-446d-a958-983a2713daf2 359 27 Processed 7603 -8841 
c40f1ff1-2da6-4569-9c95-dae534bdb59c homology_compute 1.0 2024-04-24 10:18:15.000000 From 9f2a7c92-e04a-443f-a991-1481a9466456 313 25 Processed 7785 -8842 9e45ec23-6f77-40b3-b487-717981b6789a homology_load 1.0 2024-04-24 10:18:15.000000 From 9f2a7c92-e04a-443f-a991-1481a9466456 313 26 Processed 7785 -8843 ea7e6915-8d77-4eee-bb3c-35e93a41e7e9 homology_ftp 1.0 2024-04-24 10:18:15.000000 From 9f2a7c92-e04a-443f-a991-1481a9466456 313 27 Processed 7785 -8844 3c7d8319-a57c-447b-b47f-5a1873a0b018 homology_compute 1.0 2024-04-24 10:18:15.000000 From 681ceca3-3336-4b92-ac11-43b5fcabec62 251 25 Processed 7820 -8845 b9dac26c-ecc2-4ec4-a8ac-d240cd0b748e homology_load 1.0 2024-04-24 10:18:15.000000 From 681ceca3-3336-4b92-ac11-43b5fcabec62 251 26 Processed 7820 -8846 c2772f23-1c2f-4e55-8453-b6985475a629 homology_ftp 1.0 2024-04-24 10:18:15.000000 From 681ceca3-3336-4b92-ac11-43b5fcabec62 251 27 Processed 7820 -8847 7e892555-9d54-479e-9d2c-1fab7e45c03f homology_compute 1.0 2024-04-24 10:18:15.000000 From d78259af-f491-42de-9cbf-de744b09efee 332 25 Processed 7847 -8848 3e2a34c6-cd2b-4e1f-bac4-9acf67254bf8 homology_load 1.0 2024-04-24 10:18:15.000000 From d78259af-f491-42de-9cbf-de744b09efee 332 26 Processed 7847 -8849 4b97ed95-560c-4a4c-95c1-44b654ad0fa0 homology_ftp 1.0 2024-04-24 10:18:15.000000 From d78259af-f491-42de-9cbf-de744b09efee 332 27 Processed 7847 -8850 10e0efaa-e444-4e8d-927e-a761aa30969f homology_compute 1.0 2024-04-24 10:18:15.000000 From b6472939-9e49-4d46-b93e-304910acabf3 4352 25 Processed 8392 -8851 369f5f6b-763f-44ea-8f13-862c06461346 homology_load 1.0 2024-04-24 10:18:15.000000 From b6472939-9e49-4d46-b93e-304910acabf3 4352 26 Processed 8392 -8852 f04c6b1b-4458-4d28-aa47-b43ea12b08c9 homology_ftp 1.0 2024-04-24 10:18:15.000000 From b6472939-9e49-4d46-b93e-304910acabf3 4352 27 Processed 8392 -8853 9ca4293d-6710-4903-9fc1-8417265ce27d genebuild_compute ENS01 2024-04-24 10:18:15.000000 From ed8ca387-38e3-4bfe-8b85-757a59b95126 97 8 Processed 178 -8854 
570f2d24-a2c9-4bb2-895f-8653c0def936 xrefs ENS01 2024-04-24 10:18:15.000000 From 9ca4293d-6710-4903-9fc1-8417265ce27d 97 13 Processed 8853 -8855 0f80b03b-1145-4cdd-bf93-9df96b65f160 protein_features ENS01 2024-04-24 10:18:15.000000 From 9ca4293d-6710-4903-9fc1-8417265ce27d 97 14 Processed 8853 -8856 11584801-c419-4acf-bb1f-119303afc0d4 alpha_fold ENS01 2024-04-24 10:18:15.000000 From 9ca4293d-6710-4903-9fc1-8417265ce27d 97 15 Processed 8853 -8857 8ab46307-5632-4b68-9815-83997bf53b94 genebuild_files ENS01 2024-04-24 10:18:15.000000 From ed8ca387-38e3-4bfe-8b85-757a59b95126 97 9 Processed 178 -8858 1848bff4-5285-4c3a-abcc-62869d41dd87 blast ENS01 2024-04-24 10:18:15.000000 From 8ab46307-5632-4b68-9815-83997bf53b94 97 16 Processed 8857 -8859 1861572a-1bde-4720-bbfa-951b9d8a4456 ftp_dumps ENS01 2024-04-24 10:18:15.000000 From 8ab46307-5632-4b68-9815-83997bf53b94 97 17 Processed 8857 -8860 2109cb44-362b-4571-94da-67859d194824 genebuild_web ENS01 2024-04-24 10:18:15.000000 From ed8ca387-38e3-4bfe-8b85-757a59b95126 97 11 Processed 178 -8861 2f09c74f-c2cc-4fea-8d86-6e7461b3366d thoas_dumps ENS01 2024-04-24 10:18:15.000000 From 2109cb44-362b-4571-94da-67859d194824 97 18 Processed 8860 -8862 fdca5cd5-9a2f-4c94-bc52-fcdda4070e4e browser_files ENS01 2024-04-24 10:18:15.000000 From 2109cb44-362b-4571-94da-67859d194824 97 20 Processed 8860 -8863 a2a1216d-2cad-41d8-b115-a6711989abd5 checksums ENS01 2024-04-24 10:18:15.000000 From 2109cb44-362b-4571-94da-67859d194824 97 23 Processed 8860 -8864 7ed8ac66-3af1-419f-ab9c-08d2a40887a2 genebuild_compute ENS01 2024-04-24 10:18:16.000000 From f9690d7e-26c1-459d-8102-0c4a1a468806 9 8 Processed 896 -8865 f2e96294-16ce-4575-af45-f9a2c46383d7 xrefs ENS01 2024-04-24 10:18:16.000000 From 7ed8ac66-3af1-419f-ab9c-08d2a40887a2 9 13 Processed 8864 -8866 165a40b7-2eee-4ba4-b4dc-8b6ad2402004 protein_features ENS01 2024-04-24 10:18:16.000000 From 7ed8ac66-3af1-419f-ab9c-08d2a40887a2 9 14 Processed 8864 -8867 d3aee3e0-6b4b-43c9-8c52-19a18f91f824 
alpha_fold ENS01 2024-04-24 10:18:16.000000 From 7ed8ac66-3af1-419f-ab9c-08d2a40887a2 9 15 Processed 8864 -8868 b66be831-e87b-4bd5-ba89-fc283cc50193 genebuild_files ENS01 2024-04-24 10:18:16.000000 From f9690d7e-26c1-459d-8102-0c4a1a468806 9 9 Processed 896 -8869 79505983-6bf5-4828-8240-4d51a36a3171 blast ENS01 2024-04-24 10:18:16.000000 From b66be831-e87b-4bd5-ba89-fc283cc50193 9 16 Processed 8868 -8870 2b747467-a74c-488d-a900-3f6385c1f26a ftp_dumps ENS01 2024-04-24 10:18:16.000000 From b66be831-e87b-4bd5-ba89-fc283cc50193 9 17 Processed 8868 -8871 b42fdfd2-7bca-4cd1-bee4-46287457ee93 genebuild_web ENS01 2024-04-24 10:18:16.000000 From f9690d7e-26c1-459d-8102-0c4a1a468806 9 11 Processed 896 -8872 1568bc5d-9fba-42df-87b3-52f77a8552b7 thoas_dumps ENS01 2024-04-24 10:18:16.000000 From b42fdfd2-7bca-4cd1-bee4-46287457ee93 9 18 Processed 8871 -8873 cff11c40-12e8-469d-945f-c1c6ffb852f5 browser_files ENS01 2024-04-24 10:18:16.000000 From b42fdfd2-7bca-4cd1-bee4-46287457ee93 9 20 Processed 8871 -8874 11e13c3e-5773-4f30-86c1-6c251f9c4c70 checksums ENS01 2024-04-24 10:18:16.000000 From b42fdfd2-7bca-4cd1-bee4-46287457ee93 9 23 Processed 8871 -8875 e79f65ba-08a5-4aca-b3b9-08ff7c36ba70 evidence 1.0 2024-04-24 10:18:16.000000 From 0c1cdfea-cb10-4a08-8f75-4158658d6a02 597 4 Processed 1496 -8876 7d15f5de-2e98-44b0-ba74-9c70bfd450c6 short_variant 1.0 2024-04-24 10:18:16.000000 From 0c1cdfea-cb10-4a08-8f75-4158658d6a02 597 5 Processed 1496 -8877 ea922b02-b8fe-4f7c-ac4d-a133acc5f532 browser_files 1.0 2024-04-24 10:18:16.000000 From 0c1cdfea-cb10-4a08-8f75-4158658d6a02 597 31 Processed 1496 -8878 7a89be81-4103-4eb8-98b2-23e96e0c4f76 genebuild_compute EXT01 2024-04-24 10:18:16.000000 From cc3c7f95-b5dc-4cc1-aa15-2817c89bd1e2 1 8 Released 2 -8879 20c06eab-391d-4b06-943c-0754f0fef146 xrefs EXT01 2024-04-24 10:18:16.000000 From 7a89be81-4103-4eb8-98b2-23e96e0c4f76 1 13 Released 8878 -8880 a29813e0-c950-40fc-b970-a360a10a15b6 protein_features EXT01 2024-04-24 10:18:16.000000 From 
7a89be81-4103-4eb8-98b2-23e96e0c4f76 1 14 Released 8878 -8881 5c0b59d2-faf4-4297-b10d-c304e1f55998 alpha_fold EXT01 2024-04-24 10:18:16.000000 From 7a89be81-4103-4eb8-98b2-23e96e0c4f76 1 15 Released 8878 -8882 8d33dbd0-93d9-4279-bdfe-21f756afc898 genebuild_files EXT01 2024-04-24 10:18:16.000000 From cc3c7f95-b5dc-4cc1-aa15-2817c89bd1e2 1 9 Released 2 -8883 7e0ec09a-0ed9-461f-abb4-bb8de9f9b842 blast EXT01 2024-04-24 10:18:16.000000 From 8d33dbd0-93d9-4279-bdfe-21f756afc898 1 16 Released 8882 -8884 551593dc-42ad-45ec-8311-c052330feaac ftp_dumps EXT01 2024-04-24 10:18:16.000000 From 8d33dbd0-93d9-4279-bdfe-21f756afc898 1 17 Released 8882 -8885 7b5a6b07-d345-479f-95a0-c9a6712eb747 genebuild_web EXT01 2024-04-24 10:18:16.000000 From cc3c7f95-b5dc-4cc1-aa15-2817c89bd1e2 1 11 Released 2 -8886 711ea653-09ba-47da-b6ff-585c25548546 thoas_dumps EXT01 2024-04-24 10:18:16.000000 From 7b5a6b07-d345-479f-95a0-c9a6712eb747 1 18 Released 8885 -8887 1f7df867-7a26-4b59-98cb-866e63a215f9 browser_files EXT01 2024-04-24 10:18:16.000000 From 7b5a6b07-d345-479f-95a0-c9a6712eb747 1 20 Released 8885 -8888 34d43bd3-3061-43ac-b0ab-7eb4d7edd3b3 checksums EXT01 2024-04-24 10:18:16.000000 From 7b5a6b07-d345-479f-95a0-c9a6712eb747 1 23 Released 8885 -8889 fd8f0c4f-59c6-4d40-8dc9-1784f312b935 genebuild_web EXT01 2024-04-24 10:18:16.000000 From cc3c7f95-b5dc-4cc1-aa15-2817c89bd1e2 1 12 Released 2 -8890 2b3158ad-aedc-464d-bad1-7dc448a1623a thoas_load EXT01 2024-04-24 10:18:16.000000 From fd8f0c4f-59c6-4d40-8dc9-1784f312b935 1 19 Released 8889 -8891 62b23f57-bc61-4ade-aa1d-77ecf7f8b18c genebuild_track EXT01 2024-04-24 10:18:16.000000 From fd8f0c4f-59c6-4d40-8dc9-1784f312b935 1 21 Released 8889 -8892 620df1db-d908-4406-9f1d-e97b11c798c7 refget_load EXT01 2024-04-24 10:18:16.000000 From fd8f0c4f-59c6-4d40-8dc9-1784f312b935 1 24 Released 8889 -8893 35485780-095a-44ad-a85b-ed37aff1f5ac genebuild_compute EXT01 2024-04-24 10:18:16.000000 From f202cd36-d0dc-40df-9dd6-a8218e0d1366 14 8 Released 24 -8894 
7760299d-bc48-4424-82ba-d069153212a1 xrefs EXT01 2024-04-24 10:18:16.000000 From 35485780-095a-44ad-a85b-ed37aff1f5ac 14 13 Released 8893 -8895 95b0c4a8-15fe-4d7e-b07a-3ebc08a95ddf protein_features EXT01 2024-04-24 10:18:16.000000 From 35485780-095a-44ad-a85b-ed37aff1f5ac 14 14 Released 8893 -8896 256fe2d6-3e6f-4c89-9d46-1616de1bac53 alpha_fold EXT01 2024-04-24 10:18:16.000000 From 35485780-095a-44ad-a85b-ed37aff1f5ac 14 15 Released 8893 -8897 273822b7-89dd-4eef-a0d7-c6aae2322939 genebuild_files EXT01 2024-04-24 10:18:16.000000 From f202cd36-d0dc-40df-9dd6-a8218e0d1366 14 9 Released 24 -8898 c090e67c-df71-401f-a90c-1fca8aee34fe blast EXT01 2024-04-24 10:18:16.000000 From 273822b7-89dd-4eef-a0d7-c6aae2322939 14 16 Released 8897 -8899 c51e1970-4e48-41cc-8955-be9172cf5f23 ftp_dumps EXT01 2024-04-24 10:18:16.000000 From 273822b7-89dd-4eef-a0d7-c6aae2322939 14 17 Released 8897 -8900 1dfc15fc-d7d2-499b-beb4-e6ce0f06f743 genebuild_web EXT01 2024-04-24 10:18:16.000000 From f202cd36-d0dc-40df-9dd6-a8218e0d1366 14 11 Released 24 -8901 d5e0ebee-7117-494d-bc1b-6d2c70d9491c thoas_dumps EXT01 2024-04-24 10:18:16.000000 From 1dfc15fc-d7d2-499b-beb4-e6ce0f06f743 14 18 Released 8900 -8902 6f8f8f0b-e755-45cc-97eb-d9c182e873b0 browser_files EXT01 2024-04-24 10:18:16.000000 From 1dfc15fc-d7d2-499b-beb4-e6ce0f06f743 14 20 Released 8900 -8903 ff6ccd65-e644-42e9-8f38-04a8f253bef9 checksums EXT01 2024-04-24 10:18:16.000000 From 1dfc15fc-d7d2-499b-beb4-e6ce0f06f743 14 23 Released 8900 -8904 8945cfec-17e5-48af-83f4-79907740fddd genebuild_web EXT01 2024-04-24 10:18:16.000000 From f202cd36-d0dc-40df-9dd6-a8218e0d1366 14 12 Released 24 -8905 2bdb1dcf-b45b-4d91-82ba-d1c9137ae7b9 thoas_load EXT01 2024-04-24 10:18:16.000000 From 8945cfec-17e5-48af-83f4-79907740fddd 14 19 Released 8904 -8906 a88f003b-d5bf-479f-aeeb-4696a3de728a genebuild_track EXT01 2024-04-24 10:18:16.000000 From 8945cfec-17e5-48af-83f4-79907740fddd 14 21 Released 8904 -8907 fce24263-d668-4daa-9eac-27abdebf3a90 refget_load EXT01 
2024-04-24 10:18:16.000000 From 8945cfec-17e5-48af-83f4-79907740fddd 14 24 Released 8904 -8908 07f1ea6a-bc29-4426-ab28-db9e8df67135 genebuild_compute GENCODE19 2024-04-24 10:18:16.000000 From e456d1c2-eea2-40f1-83ee-31912905b695 40 8 Released 62 -8909 50c5df76-ad83-4cdd-822b-59a0a5a5caaa xrefs GENCODE19 2024-04-24 10:18:16.000000 From 07f1ea6a-bc29-4426-ab28-db9e8df67135 40 13 Released 8908 -8910 f45160df-1af8-4bb4-b52b-099c9f5ce005 protein_features GENCODE19 2024-04-24 10:18:16.000000 From 07f1ea6a-bc29-4426-ab28-db9e8df67135 40 14 Released 8908 -8911 9bb457f3-a4c6-43ea-a5d8-df8193e33e2b alpha_fold GENCODE19 2024-04-24 10:18:16.000000 From 07f1ea6a-bc29-4426-ab28-db9e8df67135 40 15 Released 8908 -8912 1cd2a36c-9459-4aae-add4-594bdf9570ae genebuild_files GENCODE19 2024-04-24 10:18:16.000000 From e456d1c2-eea2-40f1-83ee-31912905b695 40 9 Released 62 -8913 f3ac13c6-0943-45e5-b553-6e6ecd5febb0 blast GENCODE19 2024-04-24 10:18:16.000000 From 1cd2a36c-9459-4aae-add4-594bdf9570ae 40 16 Released 8912 -8914 f817d89d-e6c7-474f-b585-a816c9a19926 ftp_dumps GENCODE19 2024-04-24 10:18:16.000000 From 1cd2a36c-9459-4aae-add4-594bdf9570ae 40 17 Released 8912 -8915 6af4ed97-8fde-41f8-9e28-30e4f2ff2d62 genebuild_web GENCODE19 2024-04-24 10:18:16.000000 From e456d1c2-eea2-40f1-83ee-31912905b695 40 11 Released 62 -8916 e055d56a-3878-4ccb-ac8b-56748d103fbd thoas_dumps GENCODE19 2024-04-24 10:18:16.000000 From 6af4ed97-8fde-41f8-9e28-30e4f2ff2d62 40 18 Released 8915 -8917 9c4831c4-8ee3-4058-8325-54a1d642e0a5 browser_files GENCODE19 2024-04-24 10:18:16.000000 From 6af4ed97-8fde-41f8-9e28-30e4f2ff2d62 40 20 Released 8915 -8918 8aa89c54-db39-4f69-9c23-78a2f7077548 checksums GENCODE19 2024-04-24 10:18:16.000000 From 6af4ed97-8fde-41f8-9e28-30e4f2ff2d62 40 23 Released 8915 -8919 c0993f4c-eda0-40e3-807d-ad7ea361e285 genebuild_web GENCODE19 2024-04-24 10:18:16.000000 From e456d1c2-eea2-40f1-83ee-31912905b695 40 12 Released 62 -8920 1bf39fbc-6863-4c23-8960-975a027556e6 thoas_load GENCODE19 
2024-04-24 10:18:16.000000 From c0993f4c-eda0-40e3-807d-ad7ea361e285 40 19 Released 8919 -8921 3c1f9d35-8013-40ff-98b5-b62c764f284f genebuild_track GENCODE19 2024-04-24 10:18:16.000000 From c0993f4c-eda0-40e3-807d-ad7ea361e285 40 21 Released 8919 -8922 36d517a8-f972-4350-a44c-88d04d44286f refget_load GENCODE19 2024-04-24 10:18:16.000000 From c0993f4c-eda0-40e3-807d-ad7ea361e285 40 24 Released 8919 -8923 9687952e-9dc8-4240-aece-a968dadbe909 genebuild_compute EXT01 2024-04-24 10:18:17.000000 From 287a5483-55a4-46e6-a58b-a84ba0ddacd6 79 8 Released 148 -8924 3ed69d61-78ad-4cd0-9244-8337c6896de7 xrefs EXT01 2024-04-24 10:18:17.000000 From 9687952e-9dc8-4240-aece-a968dadbe909 79 13 Released 8923 -8925 90cbc64d-d88f-4bdf-ac6c-99e0837f4253 protein_features EXT01 2024-04-24 10:18:17.000000 From 9687952e-9dc8-4240-aece-a968dadbe909 79 14 Released 8923 -8926 a70bc6ec-1678-487e-89a9-c39b338bc624 alpha_fold EXT01 2024-04-24 10:18:17.000000 From 9687952e-9dc8-4240-aece-a968dadbe909 79 15 Released 8923 -8927 704e4912-c270-4647-99bb-e8789d092949 genebuild_files EXT01 2024-04-24 10:18:17.000000 From 287a5483-55a4-46e6-a58b-a84ba0ddacd6 79 9 Released 148 -8928 b90ce8e9-0ddd-4591-b118-ecce2ab04b37 blast EXT01 2024-04-24 10:18:17.000000 From 704e4912-c270-4647-99bb-e8789d092949 79 16 Released 8927 -8929 935a9cee-4712-47ea-8f56-5aacf09b3883 ftp_dumps EXT01 2024-04-24 10:18:17.000000 From 704e4912-c270-4647-99bb-e8789d092949 79 17 Released 8927 -8930 26fe8cb4-05b4-4f6c-83d9-a4b34c32ca08 genebuild_web EXT01 2024-04-24 10:18:17.000000 From 287a5483-55a4-46e6-a58b-a84ba0ddacd6 79 11 Released 148 -8931 b4f77b8b-00e0-4977-9c75-5c2d65a07f0d thoas_dumps EXT01 2024-04-24 10:18:17.000000 From 26fe8cb4-05b4-4f6c-83d9-a4b34c32ca08 79 18 Released 8930 -8932 c82b305f-f783-40ab-af1a-8c2375e8816a browser_files EXT01 2024-04-24 10:18:17.000000 From 26fe8cb4-05b4-4f6c-83d9-a4b34c32ca08 79 20 Released 8930 -8933 fcd75ab2-a057-4d7d-ad37-b7604a7d0764 checksums EXT01 2024-04-24 10:18:17.000000 From 
26fe8cb4-05b4-4f6c-83d9-a4b34c32ca08 79 23 Released 8930 -8934 a81f1f36-c2e4-498e-ba53-8c8a28759a4c genebuild_web EXT01 2024-04-24 10:18:17.000000 From 287a5483-55a4-46e6-a58b-a84ba0ddacd6 79 12 Released 148 -8935 56fcabc6-a26f-4007-aef7-28440b60a990 thoas_load EXT01 2024-04-24 10:18:17.000000 From a81f1f36-c2e4-498e-ba53-8c8a28759a4c 79 19 Released 8934 -8936 f9ef4142-f4c9-4def-84af-c9480934d408 genebuild_track EXT01 2024-04-24 10:18:17.000000 From a81f1f36-c2e4-498e-ba53-8c8a28759a4c 79 21 Released 8934 -8937 329e59f5-ff09-4e8f-8398-c5bb37d25a16 refget_load EXT01 2024-04-24 10:18:17.000000 From a81f1f36-c2e4-498e-ba53-8c8a28759a4c 79 24 Released 8934 -8938 d732f47d-4783-4cf3-80ee-566347f27fe5 genebuild_compute GENCODE44 2024-04-24 10:18:17.000000 From 949defef-c4d2-4ab1-8a73-f41d2b3c7719 92 8 Released 172 -8939 f0e56fd2-53a7-4a5d-8c6e-a0e425659e2d xrefs GENCODE44 2024-04-24 10:18:17.000000 From d732f47d-4783-4cf3-80ee-566347f27fe5 92 13 Released 8938 -8940 9e50213a-a3dc-4d86-bff2-2c607ea97be6 protein_features GENCODE44 2024-04-24 10:18:17.000000 From d732f47d-4783-4cf3-80ee-566347f27fe5 92 14 Released 8938 -8941 c683f337-fe1a-4080-8fa6-b2f5921c82f5 alpha_fold GENCODE44 2024-04-24 10:18:17.000000 From d732f47d-4783-4cf3-80ee-566347f27fe5 92 15 Released 8938 -8942 a26a6832-2081-4f10-bdec-9034f9682f88 genebuild_files GENCODE44 2024-04-24 10:18:17.000000 From 949defef-c4d2-4ab1-8a73-f41d2b3c7719 92 9 Released 172 -8943 1bbc28e3-55b6-42aa-a806-231f4d645d79 blast GENCODE44 2024-04-24 10:18:17.000000 From a26a6832-2081-4f10-bdec-9034f9682f88 92 16 Released 8942 -8944 6353961a-be42-43f6-be0e-98dff9a0e6fa ftp_dumps GENCODE44 2024-04-24 10:18:17.000000 From a26a6832-2081-4f10-bdec-9034f9682f88 92 17 Released 8942 -8945 698fc95c-9f61-4a92-8ad8-8dd3f1ec3c77 genebuild_web GENCODE44 2024-04-24 10:18:17.000000 From 949defef-c4d2-4ab1-8a73-f41d2b3c7719 92 11 Released 172 -8946 4f8b970e-054f-48a2-82ee-f638d42cb88c thoas_dumps GENCODE44 2024-04-24 10:18:17.000000 From 
698fc95c-9f61-4a92-8ad8-8dd3f1ec3c77 92 18 Released 8945 -8947 893ab5b4-cf72-4ac8-93a9-6b263ef24fec browser_files GENCODE44 2024-04-24 10:18:17.000000 From 698fc95c-9f61-4a92-8ad8-8dd3f1ec3c77 92 20 Released 8945 -8948 0d2e1d80-ec5d-4c83-a777-d0e12dd3a20d checksums GENCODE44 2024-04-24 10:18:17.000000 From 698fc95c-9f61-4a92-8ad8-8dd3f1ec3c77 92 23 Released 8945 -8949 d0c2c132-d6e9-4cc6-b7ea-90aae49fdd97 genebuild_web GENCODE44 2024-04-24 10:18:17.000000 From 949defef-c4d2-4ab1-8a73-f41d2b3c7719 92 12 Released 172 -8950 7cc832cd-2a22-4326-95ec-3b440ad218d0 thoas_load GENCODE44 2024-04-24 10:18:17.000000 From d0c2c132-d6e9-4cc6-b7ea-90aae49fdd97 92 19 Released 8949 -8951 743e6c8d-5fe5-4417-9673-d807b4b494e8 genebuild_track GENCODE44 2024-04-24 10:18:17.000000 From d0c2c132-d6e9-4cc6-b7ea-90aae49fdd97 92 21 Released 8949 -8952 775b793f-124c-4b97-8734-cf38aa2e66d4 refget_load GENCODE44 2024-04-24 10:18:17.000000 From d0c2c132-d6e9-4cc6-b7ea-90aae49fdd97 92 24 Released 8949 -8953 6c9c5c7a-b58a-4fd6-92a2-8c1d2c8e155c genebuild_compute EXT01 2024-04-24 10:18:17.000000 From cfef61f8-7e24-4ed6-945f-baca1b2664a3 214 8 Released 402 -8954 7e303177-8107-4467-affd-60fcb3cb9fe9 xrefs EXT01 2024-04-24 10:18:17.000000 From 6c9c5c7a-b58a-4fd6-92a2-8c1d2c8e155c 214 13 Released 8953 -8955 5ea5d208-db3d-409a-ad3b-90fc25357975 protein_features EXT01 2024-04-24 10:18:17.000000 From 6c9c5c7a-b58a-4fd6-92a2-8c1d2c8e155c 214 14 Released 8953 -8956 bde17d27-1505-4d55-81e1-589b2c160758 alpha_fold EXT01 2024-04-24 10:18:17.000000 From 6c9c5c7a-b58a-4fd6-92a2-8c1d2c8e155c 214 15 Released 8953 -8957 a312a91d-c738-44d2-9117-3289689074bf genebuild_files EXT01 2024-04-24 10:18:17.000000 From cfef61f8-7e24-4ed6-945f-baca1b2664a3 214 9 Released 402 -8958 cabdf704-d7a7-43c6-91b1-ef13643fe743 blast EXT01 2024-04-24 10:18:17.000000 From a312a91d-c738-44d2-9117-3289689074bf 214 16 Released 8957 -8959 35242e52-ac0a-4232-a04e-602712188981 ftp_dumps EXT01 2024-04-24 10:18:17.000000 From 
a312a91d-c738-44d2-9117-3289689074bf 214 17 Released 8957 -8960 c679cde3-49b9-4b2e-a415-f5e41a2584ad genebuild_web EXT01 2024-04-24 10:18:17.000000 From cfef61f8-7e24-4ed6-945f-baca1b2664a3 214 11 Released 402 -8961 15392806-6489-4ca8-8dfe-4c068b8cbfc5 thoas_dumps EXT01 2024-04-24 10:18:17.000000 From c679cde3-49b9-4b2e-a415-f5e41a2584ad 214 18 Released 8960 -8962 30b00998-cf48-4e19-87a5-0d77a7d8eab8 browser_files EXT01 2024-04-24 10:18:17.000000 From c679cde3-49b9-4b2e-a415-f5e41a2584ad 214 20 Released 8960 -8963 7405d0be-af0e-4a25-909d-79804221ab66 checksums EXT01 2024-04-24 10:18:17.000000 From c679cde3-49b9-4b2e-a415-f5e41a2584ad 214 23 Released 8960 -8964 63902fec-bfaa-49bd-90c7-38910ad11921 genebuild_web EXT01 2024-04-24 10:18:17.000000 From cfef61f8-7e24-4ed6-945f-baca1b2664a3 214 12 Released 402 -8965 84157ee7-4328-4489-a235-062c76cc6bc9 thoas_load EXT01 2024-04-24 10:18:17.000000 From 63902fec-bfaa-49bd-90c7-38910ad11921 214 19 Released 8964 -8966 b1b03468-d47e-4033-b164-e24a77c1419a genebuild_track EXT01 2024-04-24 10:18:17.000000 From 63902fec-bfaa-49bd-90c7-38910ad11921 214 21 Released 8964 -8967 6f136ae5-07b4-457c-9452-b7272490214a refget_load EXT01 2024-04-24 10:18:17.000000 From 63902fec-bfaa-49bd-90c7-38910ad11921 214 24 Released 8964 -8968 0c0887c2-839a-4df9-b9dc-85c7f1605e16 genebuild_compute EXT01 2024-04-24 10:18:17.000000 From ea69f164-cc77-4671-bf97-c7f537dc400e 217 8 Released 406 -8969 8543b0f0-0798-4af5-9ee2-68ab081ce2f9 xrefs EXT01 2024-04-24 10:18:17.000000 From 0c0887c2-839a-4df9-b9dc-85c7f1605e16 217 13 Released 8968 -8970 915f67e9-46aa-419c-b1f1-3e3fe8d37c9f protein_features EXT01 2024-04-24 10:18:17.000000 From 0c0887c2-839a-4df9-b9dc-85c7f1605e16 217 14 Released 8968 -8971 9429df49-33dd-4c4b-bb1e-a086f9be1311 alpha_fold EXT01 2024-04-24 10:18:17.000000 From 0c0887c2-839a-4df9-b9dc-85c7f1605e16 217 15 Released 8968 -8972 b3979b48-55ad-42a3-9f03-b0e6eb1b8408 genebuild_files EXT01 2024-04-24 10:18:17.000000 From 
ea69f164-cc77-4671-bf97-c7f537dc400e 217 9 Released 406 -8973 4d258abd-6847-486c-9196-bb4da2a13cd0 blast EXT01 2024-04-24 10:18:17.000000 From b3979b48-55ad-42a3-9f03-b0e6eb1b8408 217 16 Released 8972 -8974 fe818538-d329-4e46-b311-71a13f546eb7 ftp_dumps EXT01 2024-04-24 10:18:17.000000 From b3979b48-55ad-42a3-9f03-b0e6eb1b8408 217 17 Released 8972 -8975 537ac1ca-5839-4d21-8f66-815253a29de8 genebuild_web EXT01 2024-04-24 10:18:17.000000 From ea69f164-cc77-4671-bf97-c7f537dc400e 217 11 Released 406 -8976 e6c747ba-6515-447c-9dec-223a5e7b5ab2 thoas_dumps EXT01 2024-04-24 10:18:17.000000 From 537ac1ca-5839-4d21-8f66-815253a29de8 217 18 Released 8975 -8977 2cd06f2e-8ebb-400a-9cd2-6a313dd67b79 browser_files EXT01 2024-04-24 10:18:17.000000 From 537ac1ca-5839-4d21-8f66-815253a29de8 217 20 Released 8975 -8978 e3282d40-5aec-4970-924c-20fb943324fe checksums EXT01 2024-04-24 10:18:18.000000 From 537ac1ca-5839-4d21-8f66-815253a29de8 217 23 Released 8975 -8979 8d7811aa-63db-480f-b2cd-28fbd0e414e7 genebuild_web EXT01 2024-04-24 10:18:18.000000 From ea69f164-cc77-4671-bf97-c7f537dc400e 217 12 Released 406 -8980 bb86329a-3ff5-4aa1-bccb-fa10866c0400 thoas_load EXT01 2024-04-24 10:18:18.000000 From 8d7811aa-63db-480f-b2cd-28fbd0e414e7 217 19 Released 8979 -8981 58a02fb2-387a-4d93-9798-b8a2ec2a990b genebuild_track EXT01 2024-04-24 10:18:18.000000 From 8d7811aa-63db-480f-b2cd-28fbd0e414e7 217 21 Released 8979 -8982 a9e9630d-7e6c-4a03-8b76-3eea9d58ddd5 refget_load EXT01 2024-04-24 10:18:18.000000 From 8d7811aa-63db-480f-b2cd-28fbd0e414e7 217 24 Released 8979 -8983 b847fdd0-205d-4010-a216-a150eb9dcf62 evidence 1.0 2024-04-24 10:18:18.000000 From bf1f5064-8520-4f19-84e4-449aa6c1c1e2 673 4 Released 1391 -8984 d6a11f1e-41fd-409b-a42c-6bb5eed4536d short_variant 1.0 2024-04-24 10:18:18.000000 From bf1f5064-8520-4f19-84e4-449aa6c1c1e2 673 5 Released 1391 -8985 8ba10f26-694b-4e8d-9888-aaa860581af7 variation_ftp 1.0 2024-04-24 10:18:18.000000 From bf1f5064-8520-4f19-84e4-449aa6c1c1e2 673 29 
Released 1391 -8986 62bad9a6-2406-4a93-bc6d-1a3c871fce94 browser_files 1.0 2024-04-24 10:18:18.000000 From bf1f5064-8520-4f19-84e4-449aa6c1c1e2 673 31 Released 1391 -8987 48d069c5-f099-414d-bac1-b682f78a1fde variation_track 1.0 2024-04-24 10:18:18.000000 From bf1f5064-8520-4f19-84e4-449aa6c1c1e2 673 32 Released 1391 -8988 07a02fee-c19f-4c11-b35a-2e2b51d67378 evidence 1.0 2024-04-24 10:18:18.000000 From 5b869bbb-098f-4827-afc0-532a2bc88903 674 4 Released 1392 -8989 cc377779-9dc4-4cfb-ba14-81531a7fb69b short_variant 1.0 2024-04-24 10:18:18.000000 From 5b869bbb-098f-4827-afc0-532a2bc88903 674 5 Released 1392 -8990 d2261995-a526-4728-a6a8-568b17c217d3 variation_ftp 1.0 2024-04-24 10:18:18.000000 From 5b869bbb-098f-4827-afc0-532a2bc88903 674 29 Released 1392 -8991 97b81726-34e3-4dbc-976a-ac2e9fa64c17 browser_files 1.0 2024-04-24 10:18:18.000000 From 5b869bbb-098f-4827-afc0-532a2bc88903 674 31 Released 1392 -8992 49d855db-8570-4a4d-9290-d2fc3b1ba02d variation_track 1.0 2024-04-24 10:18:18.000000 From 5b869bbb-098f-4827-afc0-532a2bc88903 674 32 Released 1392 -8993 a487cac9-c58c-48d9-a0b3-2808346ce541 evidence 1.0 2024-04-24 10:18:18.000000 From 0a0bed83-72c7-4f8a-a1cb-97450ef82495 644 4 Released 1528 -8994 c5dec575-8720-46f3-ae5d-a86f22760b30 short_variant 1.0 2024-04-24 10:18:18.000000 From 0a0bed83-72c7-4f8a-a1cb-97450ef82495 644 5 Released 1528 -8995 05e09f88-0d11-47f8-8db0-92714ce6ba42 variation_ftp 1.0 2024-04-24 10:18:18.000000 From 0a0bed83-72c7-4f8a-a1cb-97450ef82495 644 29 Released 1528 -8996 96da2064-5735-4a91-bd1f-dbeff5548b50 browser_files 1.0 2024-04-24 10:18:18.000000 From 0a0bed83-72c7-4f8a-a1cb-97450ef82495 644 31 Released 1528 -8997 c50d02a2-16a0-44a2-be1a-32b63ab04deb variation_track 1.0 2024-04-24 10:18:18.000000 From 0a0bed83-72c7-4f8a-a1cb-97450ef82495 644 32 Released 1528 -8998 c1829f04-cccd-436d-9f0a-1d82ed117064 evidence 1.0 2024-04-24 10:18:18.000000 From ff7cb333-fc39-4f00-93e0-65a0d5eb596b 653 4 Released 1537 -8999 
9bf0c2f7-2dc8-4f27-b578-a1cb277e1a63 short_variant 1.0 2024-04-24 10:18:18.000000 From ff7cb333-fc39-4f00-93e0-65a0d5eb596b 653 5 Released 1537 -9000 f3206998-32e5-465b-8b76-d21e3c24bb18 variation_ftp 1.0 2024-04-24 10:18:18.000000 From ff7cb333-fc39-4f00-93e0-65a0d5eb596b 653 29 Released 1537 -9001 dd7a4ab7-d890-488e-b014-b6e9eacf8a3a browser_files 1.0 2024-04-24 10:18:18.000000 From ff7cb333-fc39-4f00-93e0-65a0d5eb596b 653 31 Released 1537 -9002 263fda44-a84a-4879-912e-4de5e75be0ea variation_track 1.0 2024-04-24 10:18:18.000000 From ff7cb333-fc39-4f00-93e0-65a0d5eb596b 653 32 Released 1537 -9003 963a3baf-5f86-4372-b0ce-79726329ce59 evidence 1.0 2024-04-24 10:18:18.000000 From e659bef9-22f7-4ad2-8215-4a48ecd228df 660 4 Released 1544 -9004 cdabf7a3-e5bf-4afa-ae41-80eeca2ec76d short_variant 1.0 2024-04-24 10:18:18.000000 From e659bef9-22f7-4ad2-8215-4a48ecd228df 660 5 Released 1544 -9005 c708b53e-6fbb-49d9-b9c3-a8a09c3b7f0d variation_ftp 1.0 2024-04-24 10:18:18.000000 From e659bef9-22f7-4ad2-8215-4a48ecd228df 660 29 Released 1544 -9006 029df488-7091-47ed-9db7-e1abf23cc429 browser_files 1.0 2024-04-24 10:18:18.000000 From e659bef9-22f7-4ad2-8215-4a48ecd228df 660 31 Released 1544 -9007 b4863deb-6e3f-4f56-9904-f64cb1783409 variation_track 1.0 2024-04-24 10:18:18.000000 From e659bef9-22f7-4ad2-8215-4a48ecd228df 660 32 Released 1544 -9008 9feb9d70-8966-49f4-a385-3777b66ca2a2 homology_compute 1.0 2024-04-24 10:18:18.000000 From 4b02b11e-397c-4a4f-8c13-8b65efb87030 266 25 Released 2291 -9009 da2808ff-be03-4b52-aeb8-5415e509a8b1 homology_load 1.0 2024-04-24 10:18:18.000000 From 4b02b11e-397c-4a4f-8c13-8b65efb87030 266 26 Released 2291 -9010 9aaf2c4c-f026-4c6c-8b65-c475e3be683a homology_ftp 1.0 2024-04-24 10:18:18.000000 From 4b02b11e-397c-4a4f-8c13-8b65efb87030 266 27 Released 2291 -9011 049cc616-bf6e-4f21-b3b3-6ba0672e79ac homology_compute 1.0 2024-04-24 10:18:18.000000 From aebf0b81-4234-4aa9-85cc-abfe91f5eac2 323 25 Released 2348 -9012 
0d4335ae-fcd5-4ac4-a1a7-d766c4dad6bc homology_load 1.0 2024-04-24 10:18:18.000000 From aebf0b81-4234-4aa9-85cc-abfe91f5eac2 323 26 Released 2348 -9013 7642b849-ba92-4f47-960c-9dc07b2488c8 homology_ftp 1.0 2024-04-24 10:18:18.000000 From aebf0b81-4234-4aa9-85cc-abfe91f5eac2 323 27 Released 2348 -9014 b2f53226-7a8f-41cc-bef3-f1dc6d7324ad homology_compute 1.0 2024-04-24 10:18:18.000000 From caac6097-4921-4c10-bfc0-1c3e9b2604dc 332 25 Released 2357 -9015 d174a9a1-39df-4e81-94bd-98db30505730 homology_load 1.0 2024-04-24 10:18:18.000000 From caac6097-4921-4c10-bfc0-1c3e9b2604dc 332 26 Released 2357 -9016 23f92c2d-83a6-43de-87c7-a83aad6406a5 homology_ftp 1.0 2024-04-24 10:18:18.000000 From caac6097-4921-4c10-bfc0-1c3e9b2604dc 332 27 Released 2357 -9017 41b03ede-f8f2-4c4c-b1f7-86d506798835 homology_compute 1.0 2024-04-24 10:18:18.000000 From 9f45f1a6-d4d0-4c02-9509-dec5a0d523fb 359 25 Released 2384 -9018 1e809aac-bfe5-4a89-9472-c2f183205f3b homology_load 1.0 2024-04-24 10:18:18.000000 From 9f45f1a6-d4d0-4c02-9509-dec5a0d523fb 359 26 Released 2384 -9019 8c572ee1-140c-43f4-9c6f-287662793018 homology_ftp 1.0 2024-04-24 10:18:18.000000 From 9f45f1a6-d4d0-4c02-9509-dec5a0d523fb 359 27 Released 2384 -9020 70452894-16f7-4b8c-a780-1dd869bec2fa homology_compute 1.0 2024-04-24 10:18:18.000000 From b67e1761-3341-4965-9a5b-041cb8230cb3 369 25 Released 2394 -9021 ba8bd474-977f-4c52-88b4-7cd99f288f2c homology_load 1.0 2024-04-24 10:18:18.000000 From b67e1761-3341-4965-9a5b-041cb8230cb3 369 26 Released 2394 -9022 8b148701-df21-4cdc-8123-66ba2a7a59e0 homology_ftp 1.0 2024-04-24 10:18:18.000000 From b67e1761-3341-4965-9a5b-041cb8230cb3 369 27 Released 2394 -9023 f1ba169e-f9a4-4a01-80cc-f2e2085a07cb homology_compute 1.0 2024-04-24 10:18:18.000000 From e6df4d05-8567-4143-8ea0-c6ad1b5a3fc2 424 25 Released 2449 -9024 794e6ba8-771e-4a7e-b851-f1b48f20806f homology_load 1.0 2024-04-24 10:18:19.000000 From e6df4d05-8567-4143-8ea0-c6ad1b5a3fc2 424 26 Released 2449 -9025 
ae16f773-4395-4908-82f1-d5517b75ace5 homology_ftp 1.0 2024-04-24 10:18:19.000000 From e6df4d05-8567-4143-8ea0-c6ad1b5a3fc2 424 27 Released 2449 -9026 d84af6e2-313c-4930-bbac-8b74def7c6b7 homology_compute 1.0 2024-04-24 10:18:19.000000 From 58df568e-48c1-4a3b-838b-448540392f9c 433 25 Released 2458 -9027 b6156c27-8aef-4172-b150-6d6b27b0f4c6 homology_load 1.0 2024-04-24 10:18:19.000000 From 58df568e-48c1-4a3b-838b-448540392f9c 433 26 Released 2458 -9028 73f4fa74-7d7c-4bb4-a71e-1194ef1c244a homology_ftp 1.0 2024-04-24 10:18:19.000000 From 58df568e-48c1-4a3b-838b-448540392f9c 433 27 Released 2458 -9029 078ff308-f035-4c75-aa83-66e0697da057 browser_files 1.0 2024-04-24 10:18:19.000000 From 679d6452-799c-4a2f-8906-0db6c639e498 670 33 Released 2518 -9030 9b6fc878-fa10-49b6-bd99-98ade5e0252f regulation_track 1.0 2024-04-24 10:18:19.000000 From 679d6452-799c-4a2f-8906-0db6c639e498 670 34 Released 2518 -9031 852986f9-e63a-44b7-b182-7182d7070e1f regulation_ftp 1.0 2024-04-24 10:18:19.000000 From 679d6452-799c-4a2f-8906-0db6c639e498 670 35 Released 2518 -9032 7573b939-da2c-4997-8002-9da717ba79d2 genebuild_compute ENS01 2024-04-24 16:07:22.000000 From 2ef7c056-847e-4742-a68b-18c3ece068aa 18 8 Submitted 38 -9033 7bb8919c-d9e0-4eca-9a49-7a6d9e311c8d xrefs ENS01 2024-04-24 16:07:22.000000 From 7573b939-da2c-4997-8002-9da717ba79d2 18 13 Submitted 9032 -9034 a6a43d07-4ddd-4935-96f3-137882be6b5f protein_features ENS01 2024-04-24 16:07:22.000000 From 7573b939-da2c-4997-8002-9da717ba79d2 18 14 Submitted 9032 -9035 3286e886-cdde-45e2-a92c-2a5b7a43744b alpha_fold ENS01 2024-04-24 16:07:22.000000 From 7573b939-da2c-4997-8002-9da717ba79d2 18 15 Submitted 9032 -9036 a41c7eb3-8dd9-4449-bef3-8a2798d324c9 genebuild_files ENS01 2024-04-24 16:07:22.000000 From 2ef7c056-847e-4742-a68b-18c3ece068aa 18 9 Submitted 38 -9037 384e30bb-1940-475b-a7f1-94c3b5fa6251 blast ENS01 2024-04-24 16:07:22.000000 From a41c7eb3-8dd9-4449-bef3-8a2798d324c9 18 16 Submitted 9036 -9038 
b0d8755a-d01b-4910-b84c-0e15ef1293ba ftp_dumps ENS01 2024-04-24 16:07:22.000000 From a41c7eb3-8dd9-4449-bef3-8a2798d324c9 18 17 Submitted 9036 -9039 3666e777-8cb5-420e-8f45-7469253db5f6 genebuild_web ENS01 2024-04-24 16:07:22.000000 From 2ef7c056-847e-4742-a68b-18c3ece068aa 18 11 Submitted 38 -9040 282e982f-493b-4f13-a927-3f9e3dc9a8a8 thoas_dumps ENS01 2024-04-24 16:07:22.000000 From 3666e777-8cb5-420e-8f45-7469253db5f6 18 18 Submitted 9039 -9041 f39ac854-157a-48a8-8b81-4345391c59c3 browser_files ENS01 2024-04-24 16:07:22.000000 From 3666e777-8cb5-420e-8f45-7469253db5f6 18 20 Submitted 9039 -9042 6a86ad7d-67d8-4c0b-a504-966225539fc0 checksums ENS01 2024-04-24 16:07:22.000000 From 3666e777-8cb5-420e-8f45-7469253db5f6 18 23 Submitted 9039 -9043 a128c1b9-6f98-40cf-a3ae-321d5e4e1106 genebuild_compute ENS01 2024-04-24 16:07:22.000000 From 11a0be7f-99ae-45d3-a004-dc19bb562330 100 8 Processed 184 -9044 65bacf69-42d4-439c-a436-f76208677771 xrefs ENS01 2024-04-24 16:07:22.000000 From a128c1b9-6f98-40cf-a3ae-321d5e4e1106 100 13 Processed 9043 -9045 d4716792-c4af-4ec9-a14c-220f4768ed88 protein_features ENS01 2024-04-24 16:07:22.000000 From a128c1b9-6f98-40cf-a3ae-321d5e4e1106 100 14 Processed 9043 -9046 17bc6764-fc30-4fe3-8cfc-18d10e5357d3 alpha_fold ENS01 2024-04-24 16:07:22.000000 From a128c1b9-6f98-40cf-a3ae-321d5e4e1106 100 15 Processed 9043 -9047 7e8844b3-733d-4962-a144-70e8cc69a3a6 genebuild_files ENS01 2024-04-24 16:07:22.000000 From 11a0be7f-99ae-45d3-a004-dc19bb562330 100 9 Processed 184 -9048 bec4dc62-aac5-4993-98ef-da92da3c3975 blast ENS01 2024-04-24 16:07:22.000000 From 7e8844b3-733d-4962-a144-70e8cc69a3a6 100 16 Processed 9047 -9049 a187630f-56ea-4012-b10f-96d4eee7e280 ftp_dumps ENS01 2024-04-24 16:07:22.000000 From 7e8844b3-733d-4962-a144-70e8cc69a3a6 100 17 Processed 9047 -9050 372c9ef4-5068-491e-bc9d-f173de3779d4 genebuild_web ENS01 2024-04-24 16:07:22.000000 From 11a0be7f-99ae-45d3-a004-dc19bb562330 100 11 Processed 184 -9051 
1f9b9cf6-af64-49da-9f6a-e91fcfe3748b thoas_dumps ENS01 2024-04-24 16:07:22.000000 From 372c9ef4-5068-491e-bc9d-f173de3779d4 100 18 Processed 9050 -9052 14224fd8-39fc-4ce4-955a-9ceb53b7fe17 browser_files ENS01 2024-04-24 16:07:22.000000 From 372c9ef4-5068-491e-bc9d-f173de3779d4 100 20 Processed 9050 -9053 a2d8c490-6152-4d44-8cd6-6318be80c6d0 checksums ENS01 2024-04-24 16:07:22.000000 From 372c9ef4-5068-491e-bc9d-f173de3779d4 100 23 Processed 9050 -9054 569f3264-1e67-474a-bcca-d1f971bdfb6d genebuild_compute ENS01 2024-04-24 16:07:22.000000 From bd63a676-45ff-494a-b26f-2b779cb6c180 179 8 Processed 338 -9055 2f75afb7-07b9-4f26-914b-447609ae9661 xrefs ENS01 2024-04-24 16:07:22.000000 From 569f3264-1e67-474a-bcca-d1f971bdfb6d 179 13 Processed 9054 -9056 4030627d-9f93-418b-b162-ef4c4ea7187c protein_features ENS01 2024-04-24 16:07:22.000000 From 569f3264-1e67-474a-bcca-d1f971bdfb6d 179 14 Processed 9054 -9057 03698609-b0cc-4ebc-ba4e-9c1839c07375 alpha_fold ENS01 2024-04-24 16:07:22.000000 From 569f3264-1e67-474a-bcca-d1f971bdfb6d 179 15 Processed 9054 -9058 5d221954-3022-434b-8167-4837bcb83cdf genebuild_files ENS01 2024-04-24 16:07:22.000000 From bd63a676-45ff-494a-b26f-2b779cb6c180 179 9 Processed 338 -9059 0a2ed457-8cb8-4c32-9670-f2d29accf899 blast ENS01 2024-04-24 16:07:22.000000 From 5d221954-3022-434b-8167-4837bcb83cdf 179 16 Processed 9058 -9060 b8aa05bd-9375-49aa-bdd4-2cd81dded467 ftp_dumps ENS01 2024-04-24 16:07:22.000000 From 5d221954-3022-434b-8167-4837bcb83cdf 179 17 Processed 9058 -9061 032c5450-032b-4bd2-91e5-8b00482bb51f genebuild_web ENS01 2024-04-24 16:07:22.000000 From bd63a676-45ff-494a-b26f-2b779cb6c180 179 11 Processed 338 -9062 900b59d2-8ac7-4e41-a588-a4314dadfe9d thoas_dumps ENS01 2024-04-24 16:07:22.000000 From 032c5450-032b-4bd2-91e5-8b00482bb51f 179 18 Processed 9061 -9063 c813b3e5-9756-4431-86b9-1d78e3242ffc browser_files ENS01 2024-04-24 16:07:22.000000 From 032c5450-032b-4bd2-91e5-8b00482bb51f 179 20 Processed 9061 -9064 
6677ee8d-e814-4991-87b1-967d752652f0 checksums ENS01 2024-04-24 16:07:22.000000 From 032c5450-032b-4bd2-91e5-8b00482bb51f 179 23 Processed 9061 -9065 b60e4ed3-9260-42fd-bb44-648e2240c0fd homology_compute 1.0 2024-04-24 16:07:22.000000 From f2734f34-36a0-4594-871d-f7f6d317d05a 429 25 Submitted 2454 -9066 ef13256d-516a-475d-9769-8ec0b487c39a homology_compute 1.0 2024-04-24 16:07:22.000000 From f32b7f9a-97fd-41cd-86be-a5fb5becd335 469 25 Processed 2494 -9067 6246ba7c-ae8c-4e66-b696-0aaceb586d75 homology_compute 1.0 2024-04-24 16:07:22.000000 From f93d21ca-9a24-4c31-ae11-b0f8d3deab6d 423 25 Submitted 6849 -9068 23bac8a9-553f-4e00-85f3-2844d6634364 homology_compute 1.0 2024-04-24 16:07:22.000000 From 5b618784-a5ff-46cc-8102-b082ffb6e447 368 25 Submitted 8130 -9069 dc06cef3-40c1-4924-82aa-d95003b033d0 homology_compute 1.0 2024-04-24 16:07:22.000000 From a5bf42be-63c1-4616-9af1-bc03aea92643 443 25 Submitted 8661 -9070 bf1f5064-8520-abcd-84e4-449aa6c1c1e2 variation 2.0 2023-11-09 12:49:25.273751 GRCh38 673 3 Faulty \N -9071 bf1f5064-8520-abcd-84e4-449aa6c221e2 variation 2.0 2023-11-09 12:49:25.273751 GRCh38 673 3 Processing \N -9072 99999999-847e-4742-a68b-18c3ece068aa genebuild ENS01 2023-09-22 15:03:02.000000 GCA_021950905.1_ENS01 18 2 Submitted \N -9073 99999999-da2c-4997-8002-9da717ba79d2 genebuild_compute ENS01 2024-04-24 16:07:22.000000 From 2ef7c056-847e-4742-a68b-18c3ece068aa 18 8 Submitted 9072 -9074 99999999-d9e0-4eca-9a49-7a6d9e311c8d xrefs ENS01 2024-04-24 16:07:22.000000 From 7573b939-da2c-4997-8002-9da717ba79d2 18 13 Submitted 9073 diff --git a/src/tests/databases/ensembl_genome_metadata/dataset_attribute.txt b/src/tests/databases/ensembl_genome_metadata/dataset_attribute.txt deleted file mode 100644 index acce8478..00000000 --- a/src/tests/databases/ensembl_genome_metadata/dataset_attribute.txt +++ /dev/null @@ -1,2246 +0,0 @@ -201067 GCA_000005845.2 1 1 -201085 1 2 1 -201084 1 3 1 -201078 2013-11 5 1 -201066 ASM584v2 6 1 -201083 50.79 7 1 -201087 1 8 1 
-201077 complete genome 9 1 -201071 chromosome:ASM584v2#contig 10 1 -201068 ASM584v2 11 1 -201069 Univ. Wisconsin 12 1 -201070 http://www.ebi.ac.uk/ena/data/view/GCA_000005845 13 1 -201072 0 14 1 -201086 1 16 1 -201081 3977025 17 1 -201074 0 18 1 -201082 4641652 19 1 -201075 ASM584v2 81 1 -201007 938.55 21 2 -201032 1.00 22 2 -201035 1.00 23 2 -201029 937.89 24 2 -201038 939.17 25 2 -201011 939.92 26 2 -201027 1.67 27 2 -201030 939.92 28 2 -201008 4240 29 2 -201031 4239 30 2 -201039 1.00 31 2 -201018 d41d8cd98f00b204e9800998ecf8427e 32 2 -201015 2006-02 33 2 -201016 2018-09 34 2 -201012 toplevel 35 2 -201028 7077 36 2 -201017 import 37 2 -201062 Import 38 2 -201048 1.00 39 2 -201043 269.83 40 2 -201040 269.83 41 2 -201049 269.83 42 2 -201044 2905 43 2 -201020 0 44 2 -201019 0 45 2 -201047 179 46 2 -201046 53 47 2 -201042 179 48 2 -201023 0 49 2 -201041 179 50 2 -201045 1.00 51 2 -201053 1.17 52 2 -201060 783.57 53 2 -201057 1103.18 54 2 -201059 1150.95 55 2 -201051 913.03 56 2 -201055 8622 57 2 -201058 115 58 2 -201050 51 59 2 -201056 134 60 2 -201052 19 61 2 -201054 115 62 2 -201061 1.00 63 2 -201009 42 64 2 -201013 2018-09-UnivWisconsin 65 2 -201034 4242 66 2 -201010 4245 67 2 -201036 3 68 2 -201033 4242 69 2 -201037 1.00 70 2 -201014 EXT01 71 2 -201063 b2992 72 2 -201064 Chromosome:3140311-3140799 73 2 -201025 ENA 84 2 -201024 https://ebi.ac.uk/ena 85 2 -201026 community 169 2 -201022 179 170 2 -187255 GCA_018473315.1 1 7 -187269 512 3 7 -187272 34159233 4 7 -187254 2021-05 5 7 -187256 HG03540.alt.pat.f1_v2 6 7 -187268 40.83 7 7 -187265 scaffold 9 7 -187257 HG03540.alt.pat.f1_v2 11 7 -187273 1 14 7 -187264 NULL 15 7 -187274 512 16 7 -187266 34182815 17 7 -187271 26 18 7 -187267 3065276644 19 7 -187260 high 74 7 -187258 NCBI 75 7 -187259 INSDC Assembly ID 76 7 -187337 GCA_018469415.1 1 9 -187351 369 3 9 -187354 55482364 4 9 -187336 2021-05 5 9 -187338 HG03516.alt.pat.f1_v2 6 9 -187350 40.85 7 9 -187347 scaffold 9 9 -187339 HG03516.alt.pat.f1_v2 11 9 -187355 1 14 
9 -187346 NULL 15 9 -187356 369 16 9 -187348 34407519 17 9 -187353 26 18 9 -187349 3067004974 19 9 -187342 high 74 9 -187340 NCBI 75 9 -187341 INSDC Assembly ID 76 9 -187419 GCA_018469875.1 1 11 -187434 1 2 11 -187433 292 3 11 -187436 60041455 4 11 -187418 2021-05 5 11 -187420 HG02622.pri.mat.f1_v2 6 11 -187432 40.86 7 11 -187429 contig 9 11 -187421 HG02622.pri.mat.f1_v2 11 11 -187428 NULL 15 11 -187438 292 16 11 -187430 34180661 17 11 -187431 3046105980 19 11 -187424 high 74 11 -187422 NCBI 75 11 -187423 INSDC Assembly ID 76 11 -187501 GCA_018505825.1 1 13 -187516 1 2 13 -187515 445 3 13 -187518 23115113 4 13 -187500 2021-05 5 13 -187502 HG02109.pri.mat.f1_v2 6 13 -187514 40.86 7 13 -187511 contig 9 13 -187503 HG02109.pri.mat.f1_v2 11 13 -187510 NULL 15 13 -187520 445 16 13 -187512 34073218 17 13 -187513 3026115826 19 13 -187506 high 74 13 -187504 NCBI 75 13 -187505 INSDC Assembly ID 76 13 -187453 1163.76 21 14 -187459 7.79 22 14 -187462 7.79 23 14 -187456 149.30 24 14 -187465 247.24 25 14 -187449 66114.95 26 14 -187454 6200.76 27 14 -187457 3474.93 28 14 -187452 19809 29 14 -187458 104402 30 14 -187466 5.27 31 14 -187441 2022-08 33 14 -187442 2022-07 34 14 -187447 toplevel 35 14 -187455 2474686 36 14 -187445 projection_build 37 14 -187446 Mapping from reference 38 14 -187478 3.30 39 14 -187472 350.03 40 14 -187468 21565.87 41 14 -187480 1070.54 42 14 -187473 1375430 43 14 -187477 17068 44 14 -187479 2299 45 14 -187476 24448 46 14 -187475 4 47 14 -187470 5081 48 14 -187467 125785 49 14 -187469 54723 50 14 -187474 2.24 51 14 -187484 2.36 52 14 -187491 354.76 53 14 -187488 4120.64 54 14 -187490 4308.30 55 14 -187482 804.48 56 14 -187486 909446 57 14 -187489 15648 58 14 -187481 2 59 14 -187487 44991 60 14 -187483 25925 61 14 -187485 19066 62 14 -187492 1.22 63 14 -187451 8 64 14 -187440 2022-08-Ensembl 65 14 -187461 813720 66 14 -187450 1247235 67 14 -187463 1087119 68 14 -187460 160116 69 14 -187464 8.08 70 14 -187448 ENS01 71 14 -187444 39 77 14 -187471 15995.34 78 
14 -187443 homo_sapiens_core_104_38 79 14 -187494 Ensembl 84 14 -187495 https://rapid.ensembl.org/info/genome/genebuild/full_genebuild.html 85 14 -187493 ensembl 169 14 -187496 180508 170 14 -187661 GCA_018852615.1 1 17 -187676 1 2 17 -187675 445 3 17 -187678 72699918 4 17 -187660 2021-06 5 17 -187662 HG002.pri.mat.f1_v2 6 17 -187674 40.85 7 17 -187671 scaffold 9 17 -187663 HG002.pri.mat.f1_v2 11 17 -187679 6 14 17 -187670 NULL 15 17 -187680 445 16 17 -187672 34165134 17 17 -187677 372 18 17 -187673 3060609068 19 17 -187666 high 74 17 -187664 NCBI 75 17 -187665 INSDC Assembly ID 76 17 -193584 GCA_000002765.2 1 23 -193597 14 2 23 -193596 14 3 23 -193591 2017-10 5 23 -193585 ASM276v2 6 23 -193595 19.34 7 23 -193602 1 8 23 -193590 complete genome 9 23 -193586 chromosome:ASM276v2#contig 10 23 -193587 ASM276v2 11 23 -193588 Naval Medical Research Institute 12 23 -193589 http://www.ebi.ac.uk/ena/data/view/GCA_000002765 13 23 -193601 14 16 23 -193593 12309897 17 23 -193594 23292622 19 23 -193605 ASM276v2 81 23 -193538 2297.48 21 24 -193544 2.64 22 24 -193547 2.64 23 24 -193541 870.63 24 24 -193550 870.38 25 24 -193534 2569.88 26 24 -193539 167.04 27 24 -193542 2296.23 28 24 -193537 5362 29 24 -193543 5358 30 24 -193551 1.00 31 24 -193526 d41d8cd98f00b204e9800998ecf8427e 32 24 -193527 2002-10 33 24 -193528 2017-10 34 24 -193529 toplevel 35 24 -193540 30864 36 24 -193530 import 37 24 -193532 Import 38 24 -193563 1.00 39 24 -193557 553.68 40 24 -193553 553.68 41 24 -193565 553.68 42 24 -193558 6175 43 24 -193561 252 46 24 -193560 68 47 24 -193555 252 48 24 -193554 252 50 24 -193559 1.00 51 24 -193569 3.01 52 24 -193576 520.40 53 24 -193573 1665.66 54 24 -193575 48.51 55 24 -193567 1568.01 56 24 -193571 11291 57 24 -193574 153 58 24 -193566 72 59 24 -193572 461 60 24 -193568 308 61 24 -193570 153 62 24 -193577 1.00 63 24 -193536 90 64 24 -193531 2017-10-ENA 65 24 -193546 14139 66 24 -193535 14146 67 24 -193548 8784 68 24 -193545 5362 69 24 -193549 1.00 70 24 -193533 EXT01 71 
24 -193581 PF3D7_0532500 72 24 -193582 5:1316342-1316846 73 24 -193578 ENA 84 24 -193579 https://ebi.ac.uk/ena 85 24 -193580 252 170 24 -189538 GCA_021950905.1 1 37 -189553 23 2 37 -189552 514 3 37 -189555 84927121 4 37 -189537 2022-02 5 37 -189539 HG002.pat.cur.20211005 6 37 -189551 40.85 7 37 -189548 chromosome 9 37 -189540 HG002.pat.cur.20211005 11 37 -189556 117 14 37 -189547 NULL 15 37 -189557 514 16 37 -189549 32920283 17 37 -189554 966735 18 37 -189550 2959277077 19 37 -189543 high 74 37 -189541 NCBI 75 37 -189542 INSDC Assembly ID 76 37 -189490 1162.01 21 38 -189496 7.81 22 38 -189499 7.79 23 38 -189493 148.84 24 38 -189502 246.40 25 38 -189486 66612.77 26 38 -189491 6190.13 27 38 -189494 3492.51 28 38 -189489 19027 29 38 -189495 101295 30 38 -189503 5.32 31 38 -189478 2022-08 33 38 -189479 2022-07 34 38 -189484 toplevel 35 38 -189492 2481019 36 38 -189482 projection_build 37 38 -189483 Mapping from reference 38 38 -189515 3.30 39 38 -189509 347.19 40 38 -189505 21661.31 41 38 -189517 1062.19 42 38 -189510 1375441 43 38 -189514 16813 44 38 -189516 2218 45 38 -189513 23848 46 38 -189512 4 47 38 -189507 4817 48 38 -189504 123368 49 38 -189506 53525 50 38 -189511 2.24 51 38 -189521 2.41 52 38 -189528 349.77 53 38 -189525 4335.99 54 38 -189527 4378.19 55 38 -189519 809.54 56 38 -189523 909725 57 38 -189526 15206 58 38 -189518 7 59 38 -189524 44601 60 38 -189520 26063 61 38 -189522 18538 62 38 -189529 1.22 63 38 -189488 8 64 38 -189477 2022-10-Ensembl 65 38 -189498 790726 66 38 -189487 1213018 67 38 -189500 1057228 68 38 -189497 155790 69 38 -189501 8.19 70 38 -189485 ENS01 71 38 -189481 51 77 38 -189508 16006.39 78 38 -189480 homo_sapiens_core_104_38 79 38 -189531 Ensembl 84 38 -189532 https://rapid.ensembl.org/info/genome/genebuild/full_genebuild.html 85 38 -189530 ensembl 169 38 -189533 176893 170 38 -192935 GCA_000001405.14 1 61 -192946 25 2 61 -192945 27948 3 61 -192948 38440852 4 61 -192931 2009-02 5 61 -192930 GRCh37 6 61 -192944 37.78 7 61 -192951 
chromosome 9 61 -192933 chromosome:GRCh37#chromosome:GRCh38 10 61 -192932 GRCh37.p13 11 61 -192953 GENCODE 12 61 -192949 242 14 61 -192950 297 16 61 -192942 33914472 17 61 -192947 243146473 18 61 -192943 3234834689 19 61 -192940 hg19 20 61 -192936 high 74 61 -192937 NCBI 75 61 -192938 GenBank Assembly ID 76 61 -192934 Genome Reference Consortium Human Reference 37 80 61 -192939 GRCh37 81 61 -192886 1141.09 21 62 -192892 7.60 22 62 -192895 7.34 23 62 -192889 150.21 24 62 -192898 231.84 25 62 -192882 63989.10 26 62 -192887 6220.21 27 62 -192890 3136.38 28 62 -192885 20787 29 62 -192891 95346 30 62 -192899 4.59 31 62 -192875 2011-04 33 62 -192876 2013-09 34 62 -192878 toplevel 35 62 -192888 2304638 36 62 -192877 full_genebuild 37 62 -192880 Manual annotation 38 62 -192911 2.51 39 62 -192905 289.17 40 62 -192901 15206.48 41 62 -192913 712.20 42 62 -192906 1536213 43 62 -192910 13860 44 62 -192912 2038 45 62 -192909 22948 46 62 -192908 35 47 62 -192903 7050 48 62 -192900 49857 49 62 -192902 32971 50 62 -192907 1.44 51 62 -192917 2.31 52 62 -192924 362.52 53 62 -192921 3525.30 54 62 -192923 3049.46 55 62 -192915 809.46 56 62 -192919 586570 57 62 -192922 14170 58 62 -192914 23 59 62 -192920 40183 60 62 -192916 22805 61 62 -192918 17378 62 62 -192925 1.23 63 62 -192884 8 64 62 -192874 2010-07-Ensembl 65 62 -192894 724218 66 62 -192883 1072753 67 62 -192896 926601 68 62 -192893 146152 69 62 -192897 7.03 70 62 -192881 GENCODE19 71 62 -192927 ENSG00000139618 72 62 -192928 17:64155265-64255266 73 62 -192873 39 77 62 -192904 12156.05 78 62 -192879 2013-07-22 82 62 -192926 82828 170 62 -197309 GCA_900519105.1 1 147 -197323 22 2 147 -197322 22 3 147 -197325 51842 4 147 -197310 2018-07 5 147 -197311 IWGSC 6 147 -197321 45.18 7 147 -197329 1 8 147 -197316 chromosome 9 147 -197312 chromosome:IWGSC#scaffold:TGACv1 10 147 -197313 iwgsc_refseqv1.0 11 147 -197314 International Wheat Genome Sequencing Consortium 12 147 -197315 https://www.ebi.ac.uk/ena/data/view/GCA_900519105.1 13 147 
-197326 692976 14 147 -197327 22 16 147 -197319 133312441 17 147 -197324 275682619 18 147 -197320 14547261565 19 147 -197328 IWGSC 81 147 -197261 1332.42 21 148 -197267 5.34 22 148 -197270 5.60 23 148 -197264 249.47 24 148 -197273 303.33 25 148 -197257 3488.91 26 148 -197262 491.21 27 148 -197265 1570.98 28 148 -197260 107891 29 148 -197266 133346 30 148 -197274 1.24 31 148 -197252 c1cecf20d9c2ae3e2f568924e212ab27 32 148 -197253 toplevel 35 148 -197263 124945 36 148 -197254 import 37 148 -197301 Import 38 148 -197286 1.00 39 148 -197280 149.42 40 148 -197276 149.42 41 148 -197288 149.42 42 148 -197281 5792 43 148 -197285 362 44 148 -197284 12853 46 148 -197283 42 47 148 -197278 12491 48 148 -197277 12853 50 148 -197282 1.00 51 148 -197259 54 64 148 -197255 2018-04-IWGSC 65 148 -197269 712204 66 148 -197258 749233 67 148 -197271 615489 68 148 -197268 133744 69 148 -197272 1.24 70 148 -197256 EXT01 71 148 -197306 TraesCS3D02G273600 72 148 -197307 3D:2585940-2634711 73 148 -197303 PGSB 84 148 -197304 https://www.helmholtz-munich.de/en/pgsb 85 148 -197302 community 169 148 -197305 12853 170 148 -193402 GCA_000001405.29 1 171 -193432 25 2 171 -193430 36829 3 171 -193436 54806562 4 171 -193396 2013-12 5 171 -193400 GRCh38 6 171 -193428 38.88 7 171 -193412 1 8 171 -193420 chromosome 9 171 -193392 chromosome:NCBI36#chromosome:NCBI35 10 171 -193397 GRCh38.p14 11 171 -193414 Genome Reference Consortium 12 171 -193416 https://www.ncbi.nlm.nih.gov/grc 13 171 -193438 663 14 171 -193440 709 16 171 -193424 34493611 17 171 -193434 161611139 18 171 -193426 3298912062 19 171 -193410 hg38 20 171 -193393 high 74 171 -193404 NCBI 75 171 -193406 GenBank Assembly ID 76 171 -193408 Genome Reference Consortium Human Build 38 80 171 -193418 GRCh38 81 171 -193304 1191.97 21 172 -193316 7.98 22 172 -193321 8.13 23 172 -193310 149.38 24 172 -193326 250.15 25 172 -193296 67396.48 26 172 -193306 6172.48 27 172 -193312 3566.92 28 172 -193302 20481 29 172 -193314 111076 30 172 -193329 5.42 31 172 
-193284 2014-07 33 172 -193281 2023-03 34 172 -193282 ensembl 169 172 -193292 toplevel 35 172 -193308 2473539 36 172 -193286 full_genebuild 37 172 -193290 Manual annotation 38 172 -193353 3.50 39 172 -193340 339.13 40 172 -193334 22981.34 41 172 -193357 967.28 42 172 -193342 1375317 43 172 -193351 18874 44 172 -193355 2221 45 172 -193349 25959 46 172 -193346 41 47 172 -193337 4864 48 172 -193331 160555 49 172 -193335 64262 50 172 -193344 2.48 51 172 -193366 2.11 52 172 -193380 371.37 53 172 -193374 3412.92 54 172 -193378 4117.36 55 172 -193361 725.47 56 172 -193370 909387 57 172 -193376 15239 58 172 -193360 23 59 172 -193373 35229 60 172 -193363 18526 61 172 -193368 16703 62 172 -193382 1.10 63 172 -193300 8 64 172 -193283 2014-01-Ensembl 65 172 -193320 886243 66 172 -193298 1388435 67 172 -193323 1217602 68 172 -193318 170833 69 172 -193325 8.34 70 172 -193294 GENCODE44 71 172 -193385 ENSG00000221914 72 172 -193387 8:26291508-26372680 73 172 -193279 39 77 172 -193339 14932.57 78 172 -193288 19-12-2022 82 172 -193384 224817 170 172 -199440 GCA_018505865.1 1 177 -199454 481 3 177 -199457 24098322 4 177 -199439 2021-05 5 177 -199441 HG02109.alt.pat.f1_v2 6 177 -199453 40.84 7 177 -199450 scaffold 9 177 -199442 HG02109.alt.pat.f1_v2 11 177 -199458 2 14 177 -199449 NULL 15 177 -199459 481 16 177 -199451 34164067 17 177 -199456 70 18 177 -199452 3037645976 19 177 -199445 high 74 177 -199443 NCBI 75 177 -199444 INSDC Assembly ID 76 177 -199392 1164.26 21 178 -199398 7.80 22 178 -199401 7.79 23 178 -199395 149.30 24 178 -199404 247.32 25 178 -199388 65963.23 26 178 -199393 6193.22 27 178 -199396 3473.08 28 178 -199391 19876 29 178 -199397 104530 30 178 -199405 5.26 31 178 -199380 2022-08 33 178 -199381 2022-07 34 178 -199386 toplevel 35 178 -199394 2475405 36 178 -199384 projection_build 37 178 -199385 Mapping from reference 38 178 -199417 3.29 39 178 -199411 350.32 40 178 -199407 21621.28 41 178 -199419 1066.54 42 178 -199412 1375866 43 178 -199416 17097 44 178 -199418 
2291 45 178 -199415 24484 46 178 -199414 4 47 178 -199409 5096 48 178 -199406 125264 49 178 -199408 54631 50 178 -199413 2.23 51 178 -199423 2.38 52 178 -199430 353.31 53 178 -199427 4232.62 54 178 -199429 4367.32 55 178 -199421 807.42 56 178 -199425 909934 57 178 -199428 15617 58 178 -199420 2 59 178 -199426 45313 60 178 -199422 26240 61 178 -199424 19073 62 178 -199431 1.22 63 178 -199390 8 64 178 -199379 2022-08-Ensembl 65 178 -199400 815082 66 178 -199389 1248811 67 178 -199402 1088549 68 178 -199399 160262 69 178 -199403 8.06 70 178 -199387 ENS01 71 178 -199383 39 77 178 -199410 16095.84 78 178 -199382 homo_sapiens_core_104_38 79 178 -199433 Ensembl 84 178 -199434 https://rapid.ensembl.org/info/genome/genebuild/full_genebuild.html 85 178 -199432 ensembl 169 178 -199435 179895 170 178 -193993 GCA_018852605.1 1 183 -194007 610 3 183 -194010 84969781 4 183 -193992 2021-06 5 183 -193994 HG002.alt.pat.f1_v2 6 183 -194006 40.87 7 183 -194003 scaffold 9 183 -193995 HG002.alt.pat.f1_v2 11 183 -194011 5 14 183 -194002 NULL 15 183 -194012 610 16 183 -194004 32931770 17 183 -194009 256 18 183 -194005 2958633312 19 183 -193998 high 74 183 -193996 NCBI 75 183 -193997 INSDC Assembly ID 76 183 -193945 1162.00 21 184 -193951 7.80 22 184 -193954 7.78 23 184 -193948 148.90 24 184 -193957 246.44 25 184 -193941 66617.42 26 184 -193946 6191.91 27 184 -193949 3493.15 28 184 -193944 19028 29 184 -193950 101278 30 184 -193958 5.32 31 184 -193933 2022-08 33 184 -193934 2022-07 34 184 -193939 toplevel 35 184 -193947 2481019 36 184 -193937 projection_build 37 184 -193938 Mapping from reference 38 184 -193970 3.30 39 184 -193964 347.32 40 184 -193960 21645.12 41 184 -193972 1061.66 42 184 -193965 1375441 43 184 -193969 16820 44 184 -193971 2220 45 184 -193968 23857 46 184 -193967 4 47 184 -193962 4817 48 184 -193959 123159 49 184 -193961 53487 50 184 -193966 2.24 51 184 -193976 2.43 52 184 -193983 347.41 53 184 -193980 4352.59 54 184 -193982 4365.72 55 184 -193974 811.39 56 184 -193978 
909725 57 184 -193981 15198 58 184 -193973 7 59 184 -193979 45295 60 184 -193975 26650 61 184 -193977 18645 62 184 -193984 1.23 63 184 -193943 8 64 184 -193932 2022-08-Ensembl 65 184 -193953 790264 66 184 -193942 1212568 67 184 -193955 1056783 68 184 -193952 155785 69 184 -193956 8.19 70 184 -193940 ENS01 71 184 -193936 32 77 184 -193963 16006.56 78 184 -193935 homo_sapiens_core_104_38 79 184 -193986 Ensembl 84 184 -193987 https://rapid.ensembl.org/info/genome/genebuild/full_genebuild.html 85 184 -193985 ensembl 169 184 -193988 176646 170 184 -192770 GCA_018469925.1 1 197 -192784 270 3 197 -192787 51206351 4 197 -192769 2021-05 5 197 -192771 HG02622.alt.pat.f1_v2 6 197 -192783 40.84 7 197 -192780 scaffold 9 197 -192772 HG02622.alt.pat.f1_v2 11 197 -192788 1 14 197 -192779 NULL 15 197 -192789 270 16 197 -192781 34116085 17 197 -192786 27 18 197 -192782 3043426064 19 197 -192775 high 74 197 -192773 NCBI 75 197 -192774 INSDC Assembly ID 76 197 -202291 GCA_018469425.1 1 249 -202306 1 2 249 -202305 320 3 249 -202308 44773628 4 249 -202290 2021-05 5 249 -202292 HG03516.pri.mat.f1_v2 6 249 -202304 40.85 7 249 -202301 contig 9 249 -202293 HG03516.pri.mat.f1_v2 11 249 -202300 NULL 15 249 -202310 320 16 249 -202302 34212357 17 249 -202303 3033479640 19 249 -202296 high 74 249 -202294 NCBI 75 249 -202295 INSDC Assembly ID 76 249 -202243 1163.20 21 250 -202249 7.79 22 250 -202252 7.79 23 250 -202246 149.22 24 250 -202255 247.21 25 250 -202239 66186.33 26 250 -202244 6201.42 27 250 -202247 3474.20 28 250 -202242 19890 29 250 -202248 104723 30 250 -202256 5.27 31 250 -202231 2022-08 33 250 -202232 2022-07 34 250 -202237 toplevel 35 250 -202245 2475163 36 250 -202235 projection_build 37 250 -202236 Mapping from reference 38 250 -202268 3.30 39 250 -202262 350.95 40 250 -202258 21740.82 41 250 -202270 1075.58 42 250 -202263 1375855 43 250 -202267 17173 44 250 -202269 2288 45 250 -202266 24496 46 250 -202265 4 47 250 -202260 5035 48 250 -202257 125629 49 250 -202259 54633 50 250 
-202264 2.23 51 250 -202274 2.37 52 250 -202281 355.21 53 250 -202278 4124.86 54 250 -202280 4302.59 55 250 -202272 809.71 56 250 -202276 909733 57 250 -202279 15708 58 250 -202271 2 59 250 -202277 45203 60 250 -202273 26114 61 250 -202275 19089 62 250 -202282 1.22 63 250 -202241 8 64 250 -202230 2022-08-Ensembl 65 250 -202251 816265 66 250 -202240 1250909 67 250 -202253 1090338 68 250 -202250 160571 69 250 -202254 8.07 70 250 -202238 ENS01 71 250 -202234 32 77 250 -202261 16131.44 78 250 -202233 homo_sapiens_core_104_38 79 250 -202284 Ensembl 84 250 -202285 https://rapid.ensembl.org/info/genome/genebuild/full_genebuild.html 85 250 -202283 ensembl 169 250 -202286 180262 170 250 -201391 GCA_021951015.1 1 337 -201406 23 2 337 -201405 355 3 337 -201408 62879740 4 337 -201390 2022-02 5 337 -201392 HG002.mat.cur.20211005 6 337 -201404 40.83 7 337 -201401 chromosome 9 337 -201393 HG002.mat.cur.20211005 11 337 -201409 109 14 337 -201400 NULL 15 337 -201410 355 16 337 -201402 34181334 17 337 -201407 1510078 18 337 -201403 3061735012 19 337 -201396 high 74 337 -201394 NCBI 75 337 -201395 INSDC Assembly ID 76 337 -201343 1163.22 21 338 -201349 7.80 22 338 -201352 7.79 23 338 -201346 149.12 24 338 -201355 247.16 25 338 -201339 66336.70 26 338 -201344 6211.72 27 338 -201347 3479.81 28 338 -201342 19845 29 338 -201348 104818 30 338 -201356 5.28 31 338 -201331 2022-08 33 338 -201332 2022-07 34 338 -201337 toplevel 35 338 -201345 2474673 36 338 -201335 projection_build 37 338 -201336 Mapping from reference 38 338 -201368 3.30 39 338 -201362 350.54 40 338 -201358 21646.34 41 338 -201370 1070.35 42 338 -201363 1374470 43 338 -201367 17156 44 338 -201369 2312 45 338 -201366 24585 46 338 -201365 4 47 338 -201360 5117 48 338 -201357 126258 49 338 -201359 54865 50 338 -201364 2.23 51 338 -201374 2.36 52 338 -201381 356.20 53 338 -201378 4119.05 54 338 -201380 4279.70 55 338 -201372 811.80 56 338 -201376 909485 57 338 -201379 15774 58 338 -201371 9 59 338 -201377 45260 60 338 -201373 
26117 61 338 -201375 19143 62 338 -201382 1.21 63 338 -201341 8 64 338 -201330 2022-10-Ensembl 65 338 -201351 817565 66 338 -201340 1252957 67 338 -201353 1092207 68 338 -201350 160750 69 338 -201354 8.10 70 338 -201338 ENS01 71 338 -201334 51 77 338 -201361 16087.71 78 338 -201333 homo_sapiens_core_104_38 79 338 -201384 Ensembl 84 338 -201385 https://rapid.ensembl.org/info/genome/genebuild/full_genebuild.html 85 338 -201383 ensembl 169 338 -201386 181123 170 338 -203926 GCA_018473295.1 1 347 -203941 1 2 347 -203940 435 3 347 -203943 30474809 4 347 -203925 2021-05 5 347 -203927 HG03540.pri.mat.f1_v2 6 347 -203939 40.85 7 347 -203936 scaffold 9 347 -203928 HG03540.pri.mat.f1_v2 11 347 -203944 1 14 347 -203935 NULL 15 347 -203945 435 16 347 -203937 34126066 17 347 -203942 26 18 347 -203938 3048418776 19 347 -203931 high 74 347 -203929 NCBI 75 347 -203930 INSDC Assembly ID 76 347 -203878 1164.42 21 348 -203884 7.80 22 348 -203887 7.80 23 348 -203881 149.21 24 348 -203890 247.18 25 348 -203874 66207.16 26 348 -203879 6200.13 27 348 -203882 3480.85 28 348 -203877 19790 29 348 -203883 104422 30 348 -203891 5.28 31 348 -203866 2022-08 33 348 -203871 2022-08 34 348 -203870 toplevel 35 348 -203880 2475741 36 348 -203869 projection_build 37 348 -203872 Mapping from reference 38 348 -203903 3.30 39 348 -203897 349.54 40 348 -203893 21751.23 41 348 -203905 1068.89 42 348 -203898 1375683 43 348 -203902 17074 44 348 -203904 2310 45 348 -203901 24445 46 348 -203900 4 47 348 -203895 5061 48 348 -203892 126140 49 348 -203894 54797 50 348 -203899 2.24 51 348 -203909 2.36 52 348 -203916 355.91 53 348 -203913 4157.09 54 348 -203915 4334.90 55 348 -203907 810.69 56 348 -203911 909713 57 348 -203914 15669 58 348 -203906 2 59 348 -203912 45082 60 348 -203908 26011 61 348 -203910 19071 62 348 -203917 1.22 63 348 -203876 8 64 348 -203865 2022-08-Ensembl 65 348 -203886 814831 66 348 -203875 1248838 67 348 -203888 1088642 68 348 -203885 160196 69 348 -203889 8.09 70 348 -203873 ENS01 71 348 
-203868 51 77 348 -203896 16065.48 78 348 -203867 homo_sapiens_core_104_38 79 348 -203919 Ensembl 84 348 -203920 https://rapid.ensembl.org/info/genome/genebuild/full_genebuild.html 85 348 -203918 ensembl 169 348 -203921 180937 170 348 -199109 GCA_000146045.2 1 401 -199126 16 2 401 -199125 16 3 401 -199111 2011-09 5 401 -199112 R64-1-1 6 401 -199124 38.15 7 401 -199131 1 8 401 -199119 complete genome 9 401 -199114 chromosome:R64-1-1#contig 10 401 -199115 R64-1-1 11 401 -199117 SGD 12 401 -199118 http://www.yeastgenome.org/ 13 401 -199130 16 16 401 -199122 8762001 17 401 -199123 12071326 19 401 -199120 sacCer3 20 401 -199110 high 74 401 -199113 Saccharomyces cerevisiae S288c assembly from Saccharomyces Genome Database 80 401 -199133 R64-1-1 81 401 -199116 2 83 401 -199061 1327.58 21 402 -199067 1.05 22 402 -199070 1.05 23 402 -199064 1267.47 24 402 -199073 1267.47 25 402 -199057 1344.37 26 402 -199062 354.08 27 402 -199065 1327.58 28 402 -199060 6600 29 402 -199066 6600 30 402 -199074 1.00 31 402 -199051 2018-10 33 402 -199052 2018-10 34 402 -199053 toplevel 35 402 -199063 14733 36 402 -199054 import 37 402 -199101 Import 38 402 -199086 1.16 39 402 -199080 171.64 40 402 -199076 208.59 41 402 -199088 198.77 42 402 -199081 5947 43 402 -199084 424 46 402 -199083 58 47 402 -199078 424 48 402 -199075 67 49 402 -199077 424 50 402 -199082 1.00 51 402 -199092 1.00 52 402 -199099 863.25 53 402 -199096 863.25 54 402 -199090 863.25 56 402 -199094 3147 57 402 -199097 12 58 402 -199089 228 59 402 -199095 12 60 402 -199093 12 62 402 -199100 1.00 63 402 -199059 51 64 402 -199055 2017-01-SGD 65 402 -199069 6913 66 402 -199058 6913 67 402 -199071 313 68 402 -199068 6600 69 402 -199072 1.00 70 402 -199056 EXT01 71 402 -199106 YDL168W 72 402 -199107 VII:786054-786920 73 402 -199050 1 77 402 -199079 62.18 78 402 -199103 SGD 84 402 -199104 https://yeastgenome.org/ 85 402 -199102 community 169 402 -199105 491 170 402 -205458 GCA_000002985.3 1 405 -205470 6 2 405 -205469 3267 3 405 -205475 
17493829 4 405 -205457 2012-12 5 405 -205455 WBcel235 6 405 -205468 35.44 7 405 -205476 1 8 405 -205463 complete genome 9 405 -205453 chromosome:WBcel235#chromosome:WBcel215 10 405 -205456 WBcel235 11 405 -205461 WormBase 12 405 -205462 http://www.wormbase.org 13 405 -205474 6 16 405 -205466 24569601 17 405 -205467 100272607 19 405 -205454 high 74 405 -205459 NCBI 75 405 -205460 Genome Assembly ID 76 405 -205472 WBcel235 81 405 -205405 1412.42 21 406 -205411 6.77 22 406 -205414 7.02 23 406 -205408 208.51 24 406 -205417 237.67 25 406 -205401 3224.91 26 406 -205406 397.55 27 406 -205409 1447.28 28 406 -205404 19985 29 406 -205410 31865 30 406 -205418 1.59 31 406 -205395 2014-10 33 406 -205396 2014-10 34 406 -205399 toplevel 35 406 -205407 102756 36 406 -205397 import 37 406 -205445 Import 38 406 -205430 1.02 39 406 -205424 73.00 40 406 -205420 82.61 41 406 -205432 75.03 42 406 -205425 14770 43 406 -205429 294 44 406 -205428 24813 46 406 -205427 17 47 406 -205422 24519 48 406 -205419 526 49 406 -205421 25311 50 406 -205426 1.02 51 406 -205436 4.10 52 406 -205443 228.89 53 406 -205440 1521.23 54 406 -205442 196.66 55 406 -205434 931.43 56 406 -205438 17899 57 406 -205441 2128 58 406 -205433 63 59 406 -205439 8869 60 406 -205435 6704 61 406 -205437 2165 62 406 -205444 1.02 63 406 -205403 30 64 406 -205394 2022-01-WormBase 65 406 -205413 215849 66 406 -205402 228411 67 406 -205415 195887 68 406 -205412 32524 69 406 -205416 1.63 70 406 -205400 EXT01 71 406 -205450 WBGene00004893 72 406 -205451 X:937766-957832 73 406 -205398 10 77 406 -205423 385.30 78 406 -205447 Wormbase 84 406 -205448 https://wormbase.org/ 85 406 -205446 wormbase 169 406 -205449 25837 170 406 -187207 1165.04 21 888 -187213 7.80 22 888 -187216 7.80 23 888 -187210 149.33 24 888 -187219 247.32 25 888 -187203 66396.37 26 888 -187208 6207.11 27 888 -187211 3483.73 28 888 -187206 19817 29 888 -187212 104548 30 888 -187220 5.28 31 888 -187195 2022-08 33 888 -187200 2022-08 34 888 -187199 toplevel 35 888 
-187209 2474345 36 888 -187198 projection_build 37 888 -187201 Mapping from reference 38 888 -187232 3.29 39 888 -187226 351.62 40 888 -187222 21667.77 41 888 -187234 1072.40 42 888 -187227 1375744 43 888 -187231 17147 44 888 -187233 2317 45 888 -187230 24486 46 888 -187229 4 47 888 -187224 5022 48 888 -187221 124766 49 888 -187223 54475 50 888 -187228 2.22 51 888 -187238 2.38 52 888 -187245 354.81 53 888 -187242 4206.69 54 888 -187244 4322.62 55 888 -187236 812.34 56 888 -187240 909639 57 888 -187243 15765 58 888 -187235 1 59 888 -187241 45760 60 888 -187237 26562 61 888 -187239 19198 62 888 -187246 1.22 63 888 -187205 8 64 888 -187194 2022-08-Ensembl 65 888 -187215 815604 66 888 -187204 1249855 67 888 -187217 1089520 68 888 -187214 160335 69 888 -187218 8.09 70 888 -187202 ENS01 71 888 -187197 51 77 888 -187225 16165.01 78 888 -187196 homo_sapiens_core_104_38 79 888 -187248 Ensembl 84 888 -187249 https://rapid.ensembl.org/info/genome/genebuild/full_genebuild.html 85 888 -187247 ensembl 169 888 -187250 179241 170 888 -187289 1164.93 21 890 -187295 7.80 22 890 -187298 7.79 23 890 -187292 149.32 24 890 -187301 247.26 25 890 -187285 65822.11 26 890 -187290 6207.47 27 890 -187293 3468.98 28 890 -187288 20001 29 890 -187294 104773 30 890 -187302 5.24 31 890 -187277 2022-08 33 890 -187278 2022-07 34 890 -187283 toplevel 35 890 -187291 2476271 36 890 -187281 projection_build 37 890 -187282 Mapping from reference 38 890 -187314 3.30 39 890 -187308 350.41 40 890 -187304 21716.60 41 890 -187316 1073.22 42 890 -187309 1375866 43 890 -187313 17159 44 890 -187315 2302 45 890 -187312 24520 46 890 -187311 4 47 890 -187306 5059 48 890 -187303 126332 49 890 -187305 54889 50 890 -187310 2.24 51 890 -187320 2.38 52 890 -187327 352.72 53 890 -187324 4193.05 54 890 -187326 4272.46 55 890 -187318 807.20 56 890 -187322 909729 57 890 -187325 15791 58 890 -187317 2 59 890 -187323 45913 60 890 -187319 26643 61 890 -187321 19270 62 890 -187328 1.22 63 890 -187287 8 64 890 -187276 
2022-08-Ensembl 65 890 -187297 817342 66 890 -187286 1251612 67 890 -187299 1091038 68 890 -187296 160574 69 890 -187300 8.03 70 890 -187284 ENS01 71 890 -187280 32 77 890 -187307 16089.73 78 890 -187279 homo_sapiens_core_104_38 79 890 -187330 Ensembl 84 890 -187331 https://rapid.ensembl.org/info/genome/genebuild/full_genebuild.html 85 890 -187329 ensembl 169 890 -187332 181221 170 890 -187371 1164.48 21 892 -187377 7.80 22 892 -187380 7.80 23 892 -187374 149.23 24 892 -187383 247.22 25 892 -187367 66405.72 26 892 -187372 6215.09 27 892 -187375 3481.03 28 892 -187370 19835 29 892 -187376 104562 30 892 -187384 5.27 31 892 -187359 2022-08 33 892 -187360 2022-07 34 892 -187365 toplevel 35 892 -187373 2475770 36 892 -187363 projection_build 37 892 -187364 Mapping from reference 38 892 -187396 3.30 39 892 -187390 350.51 40 892 -187386 21697.76 41 892 -187398 1069.47 42 892 -187391 1376351 43 892 -187395 17134 44 892 -187397 2291 45 892 -187394 24493 46 892 -187393 4 47 892 -187388 5068 48 892 -187385 125675 49 892 -187387 54657 50 892 -187392 2.23 51 892 -187402 2.39 52 892 -187409 352.93 53 892 -187406 4189.66 54 892 -187408 4303.41 55 892 -187400 811.77 56 892 -187404 909548 57 892 -187407 15543 58 892 -187399 1 59 892 -187405 45502 60 892 -187401 26498 61 892 -187403 19004 62 892 -187410 1.22 63 892 -187369 8 64 892 -187358 2022-08-Ensembl 65 892 -187379 815819 66 892 -187368 1250252 67 892 -187381 1089868 68 892 -187378 160384 69 892 -187382 8.09 70 892 -187366 ENS01 71 892 -187362 32 77 892 -187389 16057.02 78 892 -187361 homo_sapiens_core_104_38 79 892 -187412 Ensembl 84 892 -187413 https://rapid.ensembl.org/info/genome/genebuild/full_genebuild.html 85 892 -187411 ensembl 169 892 -187414 180332 170 892 -187613 1163.59 21 896 -187619 7.80 22 896 -187622 7.80 23 896 -187616 149.14 24 896 -187625 247.20 25 896 -187609 66345.71 26 896 -187614 6214.71 27 896 -187617 3481.97 28 896 -187612 19831 29 896 -187618 104635 30 896 -187626 5.28 31 896 -187601 2022-08 33 896 
-187602 2022-07 34 896 -187607 toplevel 35 896 -187615 2474673 36 896 -187605 projection_build 37 896 -187606 Mapping from reference 38 896 -187638 3.30 39 896 -187632 350.64 40 896 -187628 21672.94 41 896 -187640 1071.34 42 896 -187633 1374470 43 896 -187637 17139 44 896 -187639 2316 45 896 -187636 24538 46 896 -187635 4 47 896 -187630 5083 48 896 -187627 126098 49 896 -187629 54765 50 896 -187634 2.23 51 896 -187644 2.38 52 896 -187651 355.44 53 896 -187648 4139.03 54 896 -187650 4279.29 55 896 -187642 814.27 56 896 -187646 909485 57 896 -187649 15732 58 896 -187641 9 59 896 -187647 45635 60 896 -187643 26437 61 896 -187645 19198 62 896 -187652 1.22 63 896 -187611 8 64 896 -187600 2022-08-Ensembl 65 896 -187621 816271 66 896 -187610 1251076 67 896 -187623 1090597 68 896 -187620 160479 69 896 -187624 8.09 70 896 -187608 ENS01 71 896 -187604 32 77 896 -187631 16101.24 78 896 -187603 homo_sapiens_core_104_38 79 896 -187654 Ensembl 84 896 -187655 https://rapid.ensembl.org/info/genome/genebuild/full_genebuild.html 85 896 -187653 ensembl 169 896 -187656 180863 170 896 -192722 1163.54 21 1006 -192728 7.80 22 1006 -192731 7.79 23 1006 -192725 149.24 24 1006 -192734 247.19 25 1006 -192718 66258.12 26 1006 -192723 6209.25 27 1006 -192726 3474.00 28 1006 -192721 19841 29 1006 -192727 104668 30 1006 -192735 5.28 31 1006 -192710 2022-08 33 1006 -192711 2022-07 34 1006 -192716 toplevel 35 1006 -192724 2475543 36 1006 -192714 projection_build 37 1006 -192715 Mapping from reference 38 1006 -192747 3.31 39 1006 -192741 350.44 40 1006 -192737 21712.60 41 1006 -192749 1073.34 42 1006 -192742 1377539 43 1006 -192746 17143 44 1006 -192748 2293 45 1006 -192745 24483 46 1006 -192744 4 47 1006 -192739 5047 48 1006 -192736 126127 49 1006 -192738 54685 50 1006 -192743 2.23 51 1006 -192753 2.38 52 1006 -192760 354.98 53 1006 -192757 4115.89 54 1006 -192759 4263.13 55 1006 -192751 814.46 56 1006 -192755 909556 57 1006 -192758 15721 58 1006 -192750 2 59 1006 -192756 45786 60 1006 -192752 
26571 61 1006 -192754 19215 62 1006 -192761 1.22 63 1006 -192720 8 64 1006 -192709 2022-08-Ensembl 65 1006 -192730 815962 66 1006 -192719 1250738 67 1006 -192732 1090226 68 1006 -192729 160512 69 1006 -192733 8.09 70 1006 -192717 ENS01 71 1006 -192713 32 77 1006 -192740 16067.69 78 1006 -192712 homo_sapiens_core_104_38 79 1006 -192763 Ensembl 84 1006 -192764 https://rapid.ensembl.org/info/genome/genebuild/full_genebuild.html 85 1006 -192762 ensembl 169 1006 -192765 180812 170 1006 -26649 1047716160 119 1391 -26650 1:230710048:rs699 120 1391 -26651 1042639434 119 1392 -26652 1:230845794:rs699 120 1392 -26653 184169627 123 1393 -211360 23/05/26 182 1393 -211359 23/06/01 183 1393 -26655 161869381 123 1394 -211780 23/06/14 182 1394 -211779 23/06/06 183 1394 -211686 23/05/26 182 1405 -211685 23/06/12 183 1405 -211722 23/06/14 182 1414 -211721 23/06/02 183 1414 -211484 23/06/11 182 1421 -211483 23/06/10 183 1421 -26739 1056117383 119 1464 -26740 JAGYYS010000001.1:2233547:rs1453994370 120 1464 -26751 1057589994 119 1470 -26752 JAHAOO010000001.1:9172:rs765031495 120 1470 -26761 1054338571 119 1475 -26762 JAGYVY010000001.1:710982:rs1737833592 120 1475 -26793 1056666201 119 1491 -26794 JAHAON010000001.1:31416:rs1714786049 120 1491 -26799 1045449578 119 1494 -26800 JAHEPF010000001.1:187007:rs1378027620 120 1494 -26803 1053740562 119 1496 -26804 JAHEPG010000001.1:5092:rs1002974720 120 1496 -26825 1054887454 119 1507 -26826 JAGYYT010000001.1:2643538:rs1423484253 120 1507 -26829 1044482335 119 1509 -26830 JAGYVX010000001.1:28459:rs1278014195 120 1509 -26867 260526 119 1528 -26868 I:356:s01-356 120 1528 -26885 156317 119 1537 -26886 1:98866:rs3166780949 120 1537 -26899 25626842 119 1544 -26900 1A:58609:1A_58609 120 1544 -28123 91.2 161 2276 -28122 Pan troglodytes 162 2276 -28153 88.9 161 2291 -28152 Pan troglodytes 162 2291 -28209 91.3 161 2319 -28208 Pan troglodytes 162 2319 -28267 16.2 161 2348 -28266 Saccharomyces cerevisiae S288c 162 2348 -28285 27.4 161 2357 -28284 Rattus 
norvegicus 162 2357 -28339 47.3 161 2384 -28338 Triticum turgidum subsp. durum 162 2384 -28359 19.1 161 2394 -28358 Drosophila melanogaster 162 2394 -28387 90.6 161 2408 -28386 Pan troglodytes 162 2408 -28469 91.6 161 2449 -28468 Pan troglodytes 162 2449 -28479 91.5 161 2454 -28478 Pan troglodytes 162 2454 -28487 10.3 161 2458 -28486 Saccharomyces cerevisiae S288c 162 2458 -28535 91.4 161 2482 -28534 Pan troglodytes 162 2482 -28559 91.3 161 2494 -28558 Pan troglodytes 162 2494 -28609 110623 163 2518 -28610 36597 164 2518 -28611 268483 165 2518 -28612 101734 166 2518 -28613 30873 167 2518 -206984 91.4 161 6593 -206983 Pan troglodytes 162 6593 -207044 88.9 161 6623 -207043 Pan troglodytes 162 6623 -207196 22.7 161 6699 -207195 Paramecium tetraurelia 162 6699 -207496 91.0 161 6849 -207495 Pan troglodytes 162 6849 -207590 91.2 161 6896 -207589 Pan troglodytes 162 6896 -207936 29.5 161 7069 -207935 Strongyloides ratti 162 7069 -208152 91.6 161 7177 -208151 Pan troglodytes 162 7177 -208438 91.0 161 7320 -208437 Pan troglodytes 162 7320 -208868 90.6 161 7535 -208867 Pan troglodytes 162 7535 -209004 70.5 161 7603 -209003 Oryza sativa 162 7603 -209368 91.3 161 7785 -209367 Pan troglodytes 162 7785 -209438 91.2 161 7820 -209437 Pan troglodytes 162 7820 -209492 40.0 161 7847 -209491 Zymoseptoria tritici 162 7847 -210058 91.5 161 8130 -210057 Pan troglodytes 162 8130 -210582 10.3 161 8392 -210581 Saccharomyces cerevisiae S288c 162 8392 -211120 91.3 161 8661 -211119 Pan troglodytes 162 8661 -211640 23/10/13 182 8662 -211639 23/09/29 183 8662 -211608 23/09/30 182 8663 -211607 23/10/16 183 8663 -211372 23/10/06 182 8664 -211371 23/10/13 183 8664 -211852 23/10/07 182 8665 -211851 23/10/01 183 8665 -211794 23/10/01 182 8666 -211793 23/10/03 183 8666 -211374 23/09/30 182 8667 -211373 23/10/08 183 8667 -211178 23/10/03 182 8668 -211177 23/10/16 183 8668 -211636 23/10/15 182 8669 -211635 23/10/17 183 8669 -211568 23/10/02 182 8670 -211567 23/10/14 183 8670 -211310 23/10/06 182 8671 
-211309 23/10/12 183 8671 -211454 23/10/09 182 8672 -211453 23/10/17 183 8672 -211760 23/10/16 182 8673 -211759 23/10/17 183 8673 -211224 23/09/30 182 8674 -211223 23/10/07 183 8674 -211864 23/10/07 182 8675 -211863 23/10/10 183 8675 -211440 23/10/04 182 8676 -211439 23/09/30 183 8676 -211380 23/09/30 182 8677 -211379 23/10/10 183 8677 -211514 23/10/17 182 8678 -211513 23/09/29 183 8678 -211180 23/10/16 182 8679 -211179 23/09/29 183 8679 -211766 23/10/01 182 8680 -211765 23/10/02 183 8680 -211660 23/10/09 182 8681 -211659 23/10/14 183 8681 -211542 23/10/06 182 8682 -211541 23/10/06 183 8682 -211266 23/10/16 182 8683 -211265 23/09/28 183 8683 -211712 23/09/30 182 8684 -211711 23/10/17 183 8684 -211572 23/10/12 182 8685 -211571 23/09/29 183 8685 -211288 23/10/06 182 8686 -211287 23/10/14 183 8686 -211512 23/10/15 182 8687 -211511 23/10/16 183 8687 -211194 23/10/09 182 8688 -211193 23/10/10 183 8688 -211352 23/10/14 182 8689 -211351 23/10/09 183 8689 -211252 23/10/14 182 8690 -211251 23/10/08 183 8690 -211336 23/09/30 182 8691 -211335 23/10/06 183 8691 -211294 23/10/06 182 8692 -211293 23/10/10 183 8692 -211828 23/10/16 182 8693 -211827 23/10/17 183 8693 -211480 23/10/11 182 8694 -211479 23/10/13 183 8694 -211432 23/10/09 182 8695 -211431 23/10/10 183 8695 -211400 23/10/16 182 8696 -211399 23/09/28 183 8696 -211734 23/10/15 182 8697 -211733 23/10/14 183 8697 -211496 23/10/10 182 8698 -211495 23/09/28 183 8698 -211366 23/10/04 182 8699 -211365 23/10/02 183 8699 -211282 23/10/03 182 8700 -211281 23/10/18 183 8700 -211398 23/10/12 182 8701 -211397 23/10/06 183 8701 -211264 23/10/10 182 8702 -211263 23/10/14 183 8702 -211832 23/10/13 182 8703 -211831 23/10/17 183 8703 -211798 23/09/28 182 8704 -211797 23/09/30 183 8704 -211570 23/10/15 182 8705 -211569 23/10/09 183 8705 -211140 23/10/01 182 8706 -211139 23/10/02 183 8706 -211420 23/10/16 182 8707 -211419 23/10/04 183 8707 -211682 23/10/09 182 8708 -211681 23/10/08 183 8708 -211328 23/10/14 182 8709 -211327 23/10/12 183 
8709 -211460 23/09/27 182 8710 -211459 23/10/04 183 8710 -211596 23/10/16 182 8711 -211595 23/09/29 183 8711 -211758 23/09/29 182 8712 -211757 23/10/06 183 8712 -211536 23/10/08 182 8713 -211535 23/10/16 183 8713 -211786 23/10/01 182 8714 -211785 23/10/08 183 8714 -211318 23/10/06 182 8715 -211317 23/10/04 183 8715 -211698 23/10/06 182 8716 -211697 23/10/16 183 8716 -211408 23/10/16 182 8717 -211407 23/10/12 183 8717 -211688 23/10/15 182 8718 -211687 23/10/15 183 8718 -211870 23/10/06 182 8719 -211869 23/10/18 183 8719 -211662 23/10/09 182 8720 -211661 23/10/17 183 8720 -211776 23/10/15 182 8721 -211775 23/10/14 183 8721 -211446 23/10/09 182 8722 -211445 23/10/10 183 8722 -211424 23/10/03 182 8723 -211423 23/10/03 183 8723 -211168 23/10/01 182 8724 -211167 23/10/17 183 8724 -211246 23/10/02 182 8725 -211245 23/10/15 183 8725 -211410 23/10/17 182 8726 -211409 23/10/15 183 8726 -211298 23/10/05 182 8727 -211297 23/10/14 183 8727 -211482 23/10/01 182 8728 -211481 23/10/10 183 8728 -211172 23/10/12 182 8729 -211171 23/09/29 183 8729 -211436 23/10/04 182 8730 -211435 23/10/15 183 8730 -211212 24/04/21 182 8731 -211211 24/04/18 183 8731 -211748 24/04/18 182 8732 -211747 24/04/19 183 8732 -211364 24/04/22 182 8733 -211363 24/04/22 183 8733 -211478 24/04/23 182 8734 -211477 24/04/18 183 8734 -211488 24/04/20 182 8735 -211487 24/04/21 183 8735 -211506 24/04/19 182 8736 -211505 24/04/21 183 8736 -211376 24/04/23 182 8737 -211375 24/04/18 183 8737 -211236 24/04/22 182 8738 -211235 24/04/22 183 8738 -211308 24/04/21 182 8739 -211307 24/04/23 183 8739 -211718 24/04/19 182 8740 -211717 24/04/22 183 8740 -211738 24/04/22 182 8741 -211737 24/04/19 183 8741 -211554 24/04/19 182 8742 -211553 24/04/20 183 8742 -211620 24/04/20 182 8743 -211619 24/04/21 183 8743 -211610 24/04/21 182 8744 -211609 24/04/19 183 8744 -211182 24/04/19 182 8745 -211181 24/04/22 183 8745 -211528 24/04/20 182 8746 -211527 24/04/22 183 8746 -211770 24/04/22 182 8747 -211769 24/04/20 183 8747 -211486 24/04/21 
182 8748 -211485 24/04/21 183 8748 -211588 24/04/19 182 8749 -211587 24/04/18 183 8749 -211752 24/04/21 182 8750 -211751 24/04/23 183 8750 -211234 24/04/17 182 8751 -211233 24/04/21 183 8751 -211270 24/04/21 182 8752 -211269 24/04/20 183 8752 -211810 24/04/17 182 8753 -211809 24/04/20 183 8753 -211650 24/04/21 182 8754 -211649 24/04/23 183 8754 -211594 24/04/18 182 8755 -211593 24/04/23 183 8755 -211730 24/04/18 182 8756 -211729 24/04/18 183 8756 -211474 24/04/22 182 8757 -211473 24/04/21 183 8757 -211330 24/04/22 182 8758 -211329 24/04/21 183 8758 -211490 24/04/19 182 8759 -211489 24/04/20 183 8759 -211802 24/04/21 182 8760 -211801 24/04/20 183 8760 -211134 24/04/21 182 8761 -211133 24/04/21 183 8761 -211136 24/04/22 182 8762 -211135 24/04/21 183 8762 -211558 24/04/21 182 8763 -211557 24/04/22 183 8763 -211186 24/04/21 182 8764 -211185 24/04/20 183 8764 -211576 24/04/22 182 8765 -211575 24/04/19 183 8765 -211148 24/04/22 182 8766 -211147 24/04/23 183 8766 -211316 24/04/20 182 8767 -211315 24/04/21 183 8767 -211622 24/04/23 182 8768 -211621 24/04/21 183 8768 -211764 24/04/23 182 8769 -211763 24/04/23 183 8769 -211452 24/04/22 182 8770 -211451 24/04/22 183 8770 -211732 24/04/18 182 8771 -211731 24/04/21 183 8771 -211830 24/04/19 182 8772 -211829 24/04/21 183 8772 -211190 24/04/19 182 8773 -211189 24/04/20 183 8773 -211476 24/04/23 182 8774 -211475 24/04/21 183 8774 -211124 24/04/18 182 8775 -211123 24/04/19 183 8775 -211772 24/04/19 182 8776 -211771 24/04/24 183 8776 -211534 24/04/21 182 8777 -211533 24/04/21 183 8777 -211860 24/04/19 182 8778 -211859 24/04/20 183 8778 -211834 24/04/19 182 8779 -211833 24/04/21 183 8779 -211302 24/04/22 182 8780 -211301 24/04/22 183 8780 -211394 24/04/20 182 8781 -211393 24/04/23 183 8781 -211346 24/04/23 182 8782 -211345 24/04/24 183 8782 -211314 24/04/20 182 8783 -211313 24/04/23 183 8783 -211492 24/04/20 182 8784 -211491 24/04/23 183 8784 -211426 24/04/18 182 8785 -211425 24/04/21 183 8785 -211532 24/04/18 182 8786 -211531 
24/04/18 183 8786 -211634 24/04/18 182 8787 -211633 24/04/19 183 8787 -211340 24/04/18 182 8788 -211339 24/04/19 183 8788 -211564 24/04/22 182 8789 -211563 24/04/19 183 8789 -211164 24/04/19 182 8790 -211163 24/04/19 183 8790 -211784 24/04/21 182 8791 -211783 24/04/19 183 8791 -211130 24/04/18 182 8792 -211129 24/04/22 183 8792 -211324 24/04/20 182 8793 -211323 24/04/20 183 8793 -211840 24/04/22 182 8794 -211839 24/04/21 183 8794 -211284 24/04/22 182 8795 -211283 24/04/18 183 8795 -211384 24/04/18 182 8796 -211383 24/04/23 183 8796 -211156 24/04/19 182 8797 -211155 24/04/19 183 8797 -211796 24/04/17 182 8798 -211795 24/04/19 183 8798 -211150 24/04/20 182 8799 -211149 24/04/22 183 8799 -211658 24/04/19 182 8800 -211657 24/04/20 183 8800 -211342 24/04/22 182 8801 -211341 24/04/20 183 8801 -211582 24/04/21 182 8802 -211581 24/04/20 183 8802 -211204 24/04/23 182 8803 -211203 24/04/21 183 8803 -211158 24/04/18 182 8804 -211157 24/04/19 183 8804 -211304 24/04/20 182 8805 -211303 24/04/22 183 8805 -211226 24/04/17 182 8806 -211225 24/04/22 183 8806 -211222 24/04/19 182 8807 -211221 24/04/23 183 8807 -211332 24/04/18 182 8808 -211331 24/04/20 183 8808 -211208 24/04/18 182 8809 -211207 24/04/21 183 8809 -211312 24/04/19 182 8810 -211311 24/04/19 183 8810 -211404 24/04/21 182 8811 -211403 24/04/22 183 8811 -211338 24/04/22 182 8812 -211337 24/04/21 183 8812 -211598 24/04/23 182 8813 -211597 24/04/24 183 8813 -211726 24/04/22 182 8814 -211725 24/04/22 183 8814 -211538 24/04/20 182 8815 -211537 24/04/21 183 8815 -211674 24/04/20 182 8816 -211673 24/04/18 183 8816 -211854 24/04/21 182 8817 -211853 24/04/23 183 8817 -211842 24/04/22 182 8818 -211841 24/04/20 183 8818 -211368 24/04/22 182 8819 -211367 24/04/24 183 8819 -211216 24/04/21 182 8820 -211215 24/04/24 183 8820 -211152 24/04/19 182 8821 -211151 24/04/22 183 8821 -211126 24/04/20 182 8822 -211125 24/04/22 183 8822 -211720 24/04/19 182 8823 -211719 24/04/19 183 8823 -211812 24/04/18 182 8824 -211811 24/04/21 183 8824 
-211836 24/04/22 182 8825 -211835 24/04/22 183 8825 -211260 24/04/19 182 8826 -211259 24/04/20 183 8826 -211642 24/04/20 182 8827 -211641 24/04/22 183 8827 -211846 24/04/17 182 8828 -211845 24/04/18 183 8828 -211502 24/04/21 182 8829 -211501 24/04/21 183 8829 -211286 24/04/20 182 8830 -211285 24/04/20 183 8830 -211788 24/04/22 182 8831 -211787 24/04/18 183 8831 -211122 24/04/23 182 8832 -211121 24/04/21 183 8832 -211604 24/04/19 182 8833 -211603 24/04/23 183 8833 -211550 24/04/22 182 8834 -211549 24/04/20 183 8834 -211590 24/04/20 182 8835 -211589 24/04/21 183 8835 -211866 24/04/21 182 8836 -211865 24/04/19 183 8836 -211668 24/04/21 182 8837 -211667 24/04/22 183 8837 -211200 24/04/21 182 8838 -211199 24/04/22 183 8838 -211790 24/04/22 182 8839 -211789 24/04/21 183 8839 -211504 24/04/19 182 8840 -211503 24/04/18 183 8840 -211700 24/04/21 182 8841 -211699 24/04/19 183 8841 -211600 24/04/21 182 8842 -211599 24/04/21 183 8842 -211806 24/04/22 182 8843 -211805 24/04/20 183 8843 -211292 24/04/21 182 8844 -211291 24/04/19 183 8844 -211676 24/04/18 182 8845 -211675 24/04/24 183 8845 -211696 24/04/20 182 8846 -211695 24/04/21 183 8846 -211470 24/04/22 182 8847 -211469 24/04/23 183 8847 -211296 24/04/21 182 8848 -211295 24/04/21 183 8848 -211326 24/04/20 182 8849 -211325 24/04/19 183 8849 -211170 24/04/19 182 8850 -211169 24/04/22 183 8850 -211278 24/04/17 182 8851 -211277 24/04/20 183 8851 -211814 24/04/18 182 8852 -211813 24/04/23 183 8852 -211592 24/04/22 182 8853 -211591 24/04/21 183 8853 -211358 24/04/23 182 8854 -211357 24/04/23 183 8854 -211166 24/04/19 182 8855 -211165 24/04/23 183 8855 -211174 24/04/20 182 8856 -211173 24/04/18 183 8856 -211518 24/04/21 182 8857 -211517 24/04/23 183 8857 -211196 24/04/19 182 8858 -211195 24/04/24 183 8858 -211198 24/04/21 182 8859 -211197 24/04/23 183 8859 -211230 24/04/17 182 8860 -211229 24/04/20 183 8860 -211258 24/04/18 182 8861 -211257 24/04/22 183 8861 -211858 24/04/19 182 8862 -211857 24/04/22 183 8862 -211616 24/04/18 182 
8863 -211615 24/04/20 183 8863 -211472 24/04/21 182 8864 -211471 24/04/23 183 8864 -211820 24/04/17 182 8865 -211819 24/04/19 183 8865 -211192 24/04/19 182 8866 -211191 24/04/21 183 8866 -211750 24/04/21 182 8867 -211749 24/04/21 183 8867 -211666 24/04/18 182 8868 -211665 24/04/20 183 8868 -211450 24/04/19 182 8869 -211449 24/04/23 183 8869 -211250 24/04/17 182 8870 -211249 24/04/24 183 8870 -211652 24/04/20 182 8871 -211651 24/04/19 183 8871 -211188 24/04/21 182 8872 -211187 24/04/23 183 8872 -211740 24/04/20 182 8873 -211739 24/04/22 183 8873 -211176 24/04/23 182 8874 -211175 24/04/20 183 8874 -211804 24/04/20 182 8875 -211803 24/04/21 183 8875 -211464 24/04/22 182 8876 -211463 24/04/18 183 8876 -211808 24/04/21 182 8877 -211807 24/04/20 183 8877 -211456 23/06/04 182 8878 -211455 23/06/15 183 8878 -211228 23/06/12 182 8879 -211227 23/06/01 183 8879 -211614 23/06/01 182 8880 -211613 23/06/15 183 8880 -211370 23/05/30 182 8881 -211369 23/06/07 183 8881 -211526 23/05/31 182 8882 -211525 23/06/10 183 8882 -211466 23/06/02 182 8883 -211465 23/05/27 183 8883 -211354 23/06/07 182 8884 -211353 23/05/28 183 8884 -211458 23/05/30 182 8885 -211457 23/05/26 183 8885 -211422 23/05/31 182 8886 -211421 23/06/06 183 8886 -211220 23/06/03 182 8887 -211219 23/05/28 183 8887 -211272 23/06/11 182 8888 -211271 23/06/09 183 8888 -211856 23/06/13 182 8889 -211855 23/06/05 183 8889 -211248 23/06/08 182 8890 -211247 23/06/12 183 8890 -211386 23/06/10 182 8891 -211385 23/06/03 183 8891 -211382 23/05/29 182 8892 -211381 23/06/03 183 8892 -211276 23/06/05 182 8893 -211275 23/05/30 183 8893 -211444 23/06/01 182 8894 -211443 23/05/28 183 8894 -211552 23/05/27 182 8895 -211551 23/05/26 183 8895 -211238 23/05/26 182 8896 -211237 23/05/30 183 8896 -211244 23/05/29 182 8897 -211243 23/06/14 183 8897 -211690 23/05/26 182 8898 -211689 23/06/05 183 8898 -211704 23/05/26 182 8899 -211703 23/06/10 183 8899 -211214 23/06/08 182 8900 -211213 23/05/28 183 8900 -211754 23/05/29 182 8901 -211753 23/06/07 
183 8901 -211414 23/06/13 182 8902 -211413 23/06/06 183 8902 -211868 23/05/27 182 8903 -211867 23/06/03 183 8903 -211510 23/05/27 182 8904 -211509 23/06/04 183 8904 -211254 23/06/10 182 8905 -211253 23/06/03 183 8905 -211630 23/06/01 182 8906 -211629 23/06/04 183 8906 -211850 23/06/02 182 8907 -211849 23/05/29 183 8907 -211146 23/05/30 182 8908 -211145 23/06/08 183 8908 -211348 23/06/01 182 8909 -211347 23/06/15 183 8909 -211826 23/06/04 182 8910 -211825 23/06/10 183 8910 -211580 23/06/03 182 8911 -211579 23/06/01 183 8911 -211210 23/06/01 182 8912 -211209 23/06/04 183 8912 -211824 23/06/12 182 8913 -211823 23/06/04 183 8913 -211838 23/05/31 182 8914 -211837 23/06/04 183 8914 -211402 23/06/01 182 8915 -211401 23/06/08 183 8915 -211782 23/06/14 182 8916 -211781 23/06/09 183 8916 -211586 23/06/14 182 8917 -211585 23/05/30 183 8917 -211516 23/06/14 182 8918 -211515 23/06/04 183 8918 -211692 23/06/06 182 8919 -211691 23/06/07 183 8919 -211206 23/05/30 182 8920 -211205 23/06/04 183 8920 -211290 23/06/02 182 8921 -211289 23/06/05 183 8921 -211280 23/06/12 182 8922 -211279 23/05/26 183 8922 -211560 23/06/11 182 8923 -211559 23/06/06 183 8923 -211300 23/06/04 182 8924 -211299 23/06/11 183 8924 -211540 23/06/12 182 8925 -211539 23/06/05 183 8925 -211626 23/05/25 182 8926 -211625 23/06/06 183 8926 -211418 23/06/03 182 8927 -211417 23/05/26 183 8927 -211672 23/05/31 182 8928 -211671 23/05/30 183 8928 -211546 23/05/30 182 8929 -211545 23/06/05 183 8929 -211242 23/06/04 182 8930 -211241 23/05/31 183 8930 -211656 23/05/28 182 8931 -211655 23/05/29 183 8931 -211716 23/06/08 182 8932 -211715 23/06/04 183 8932 -211848 23/05/31 182 8933 -211847 23/05/31 183 8933 -211628 23/06/11 182 8934 -211627 23/06/10 183 8934 -211356 23/05/27 182 8935 -211355 23/06/13 183 8935 -211844 23/06/09 182 8936 -211843 23/05/31 183 8936 -211268 23/05/28 182 8937 -211267 23/06/02 183 8937 -211762 23/05/25 182 8938 -211761 23/05/28 183 8938 -211816 23/05/28 182 8939 -211815 23/05/29 183 8939 -211602 
23/05/30 182 8940 -211601 23/06/12 183 8940 -211710 23/05/25 182 8941 -211709 23/05/28 183 8941 -211612 23/06/10 182 8942 -211611 23/05/31 183 8942 -211202 23/05/31 182 8943 -211201 23/05/30 183 8943 -211390 23/05/31 182 8944 -211389 23/06/01 183 8944 -211396 23/05/29 182 8945 -211395 23/06/03 183 8945 -211344 23/05/29 182 8946 -211343 23/06/08 183 8946 -211508 23/05/25 182 8947 -211507 23/06/13 183 8947 -211160 23/06/04 182 8948 -211159 23/06/08 183 8948 -211742 23/06/12 182 8949 -211741 23/06/04 183 8949 -211462 23/05/29 182 8950 -211461 23/06/10 183 8950 -211434 23/06/09 182 8951 -211433 23/06/06 183 8951 -211442 23/06/03 182 8952 -211441 23/05/27 183 8952 -211406 23/06/07 182 8953 -211405 23/06/11 183 8953 -211468 23/05/28 182 8954 -211467 23/05/26 183 8954 -211378 23/06/09 182 8955 -211377 23/06/14 183 8955 -211684 23/06/06 182 8956 -211683 23/06/08 183 8956 -211618 23/06/03 182 8957 -211617 23/06/01 183 8957 -211724 23/05/28 182 8958 -211723 23/06/03 183 8958 -211274 23/06/13 182 8959 -211273 23/05/28 183 8959 -211708 23/05/31 182 8960 -211707 23/05/30 183 8960 -211184 23/06/01 182 8961 -211183 23/06/06 183 8961 -211262 23/05/28 182 8962 -211261 23/06/05 183 8962 -211430 23/06/08 182 8963 -211429 23/06/02 183 8963 -211392 23/06/09 182 8964 -211391 23/06/13 183 8964 -211494 23/06/07 182 8965 -211493 23/05/30 183 8965 -211644 23/06/03 182 8966 -211643 23/06/08 183 8966 -211412 23/05/31 182 8967 -211411 23/06/13 183 8967 -211154 23/05/30 182 8968 -211153 23/06/02 183 8968 -211500 23/06/02 182 8969 -211499 23/05/30 183 8969 -211544 23/06/04 182 8970 -211543 23/06/15 183 8970 -211548 23/06/10 182 8971 -211547 23/06/10 183 8971 -211648 23/06/05 182 8972 -211647 23/05/30 183 8972 -211334 23/05/27 182 8973 -211333 23/06/13 183 8973 -211862 23/06/01 182 8974 -211861 23/06/02 183 8974 -211350 23/05/27 182 8975 -211349 23/06/06 183 8975 -211800 23/06/02 182 8976 -211799 23/05/29 183 8976 -211256 23/05/25 182 8977 -211255 23/05/28 183 8977 -211792 23/05/30 182 8978 
-211791 23/06/12 183 8978 -211530 23/06/02 182 8979 -211529 23/06/05 183 8979 -211680 23/06/02 182 8980 -211679 23/06/09 183 8980 -211362 23/05/26 182 8981 -211361 23/06/10 183 8981 -211632 23/06/09 182 8982 -211631 23/05/26 183 8982 -211670 23/05/27 182 8983 -211669 23/05/26 183 8983 -211756 23/05/29 182 8984 -211755 23/05/31 183 8984 -211522 23/06/10 182 8985 -211521 23/06/13 183 8985 -211388 23/06/02 182 8986 -211387 23/06/01 183 8986 -211320 23/06/14 182 8987 -211319 23/06/12 183 8987 -211144 23/06/11 182 8988 -211143 23/06/12 183 8988 -211728 23/06/04 182 8989 -211727 23/06/15 183 8989 -211746 23/06/13 182 8990 -211745 23/05/31 183 8990 -211566 23/05/25 182 8991 -211565 23/05/31 183 8991 -211322 23/06/01 182 8992 -211321 23/06/08 183 8992 -211624 23/05/27 182 8993 -211623 23/06/13 183 8993 -211706 23/06/04 182 8994 -211705 23/06/01 183 8994 -211138 23/06/13 182 8995 -211137 23/05/29 183 8995 -211562 23/06/13 182 8996 -211561 23/05/26 183 8996 -211702 23/06/07 182 8997 -211701 23/05/29 183 8997 -211694 23/06/11 182 8998 -211693 23/06/13 183 8998 -211584 23/05/31 182 8999 -211583 23/05/29 183 8999 -211822 23/05/27 182 9000 -211821 23/06/10 183 9000 -211778 23/05/29 182 9001 -211777 23/06/11 183 9001 -211240 23/06/08 182 9002 -211239 23/06/11 183 9002 -211556 23/05/26 182 9003 -211555 23/06/06 183 9003 -211736 23/05/30 182 9004 -211735 23/05/27 183 9004 -211714 23/05/28 182 9005 -211713 23/05/26 183 9005 -211128 23/06/10 182 9006 -211127 23/05/29 183 9006 -211654 23/06/11 182 9007 -211653 23/05/26 183 9007 -211606 23/05/29 182 9008 -211605 23/06/01 183 9008 -211774 23/05/25 182 9009 -211773 23/06/05 183 9009 -211574 23/05/30 182 9010 -211573 23/05/26 183 9010 -211132 23/05/25 182 9011 -211131 23/05/26 183 9011 -211162 23/06/02 182 9012 -211161 23/06/13 183 9012 -211438 23/06/12 182 9013 -211437 23/06/14 183 9013 -211646 23/06/14 182 9014 -211645 23/06/09 183 9014 -211744 23/06/13 182 9015 -211743 23/05/26 183 9015 -211232 23/06/05 182 9016 -211231 23/05/28 183 
9016 -211306 23/06/02 182 9017 -211305 23/05/28 183 9017 -211218 23/06/09 182 9018 -211217 23/06/04 183 9018 -211524 23/05/29 182 9019 -211523 23/06/08 183 9019 -211416 23/05/27 182 9020 -211415 23/06/12 183 9020 -211678 23/06/10 182 9021 -211677 23/06/08 183 9021 -211520 23/06/12 182 9022 -211519 23/05/28 183 9022 -211818 23/06/13 182 9023 -211817 23/05/29 183 9023 -211448 23/05/26 182 9024 -211447 23/06/12 183 9024 -211638 23/05/29 182 9025 -211637 23/06/09 183 9025 -211768 23/06/08 182 9026 -211767 23/06/11 183 9026 -211664 23/05/29 182 9027 -211663 23/06/02 183 9027 -211428 23/05/31 182 9028 -211427 23/06/12 183 9028 -211142 23/06/04 182 9029 -211141 23/06/10 183 9029 -211578 23/05/29 182 9030 -211577 23/06/09 183 9030 -211498 23/06/06 182 9031 -211497 23/06/06 183 9031 -211871 ENS01 197 890 -211872 EXT01 197 406 diff --git a/src/tests/databases/ensembl_genome_metadata/dataset_source.txt b/src/tests/databases/ensembl_genome_metadata/dataset_source.txt deleted file mode 100644 index d481a244..00000000 --- a/src/tests/databases/ensembl_genome_metadata/dataset_source.txt +++ /dev/null @@ -1,53 +0,0 @@ -1 core bacteria_0_collection_core_57_110_1 -4 core homo_sapiens_gca018473315v1_core_110_1 -5 core homo_sapiens_gca018469415v1_core_110_1 -6 core homo_sapiens_gca018469875v1_core_110_1 -7 core homo_sapiens_gca018505825v1_core_110_1 -9 core homo_sapiens_gca018852615v1_core_110_1 -14 core plasmodium_falciparum_core_57_110_1 -18 core homo_sapiens_gca021950905v1_core_110_1 -40 core homo_sapiens_37_core_110_37 -79 core triticum_aestivum_core_57_110_4 -92 core homo_sapiens_core_110_38 -97 core homo_sapiens_gca018505865v1_core_110_1 -100 core homo_sapiens_gca018852605v1_core_110_1 -107 core homo_sapiens_gca018469925v1_core_110_1 -135 core homo_sapiens_gca018469425v1_core_110_1 -179 core homo_sapiens_gca021951015v1_core_110_1 -185 core homo_sapiens_gca018473295v1_core_110_1 -214 core saccharomyces_cerevisiae_core_57_110_4 -217 core caenorhabditis_elegans_core_57_110_282 -251 
compara homo_sapiens_gca018473315v1_compara_110 -266 compara homo_sapiens_37_compara_110 -284 compara homo_sapiens_gca018469425v1_compara_110 -294 compara homo_sapiens_gca018469925v1_compara_110 -313 compara homo_sapiens_gca018505825v1_compara_110 -323 compara plasmodium_falciparum_compara_110 -332 compara saccharomyces_cerevisiae_compara_110 -359 compara triticum_aestivum_compara_110 -368 compara homo_sapiens_gca018852605v1_compara_110 -369 compara caenorhabditis_elegans_compara_110 -383 compara homo_sapiens_gca018469415v1_compara_110 -387 compara homo_sapiens_gca018469875v1_compara_110 -423 compara homo_sapiens_gca018505865v1_compara_110 -424 compara homo_sapiens_compara_110 -429 compara homo_sapiens_gca021950905v1_compara_110 -433 compara escherichia_coli_str_k_12_substr_mg1655_gca_000005845_compara_110 -443 compara homo_sapiens_gca018852615v1_compara_110 -457 compara homo_sapiens_gca018473295v1_compara_110 -469 compara homo_sapiens_gca021951015v1_compara_110 -565 vcf /nfs/production/flicek/ensembl/variation/new_website/v110/api/114e90e0-aa35-4af6-9204-267c988328c3/variation.vcf.gz -571 vcf /nfs/production/flicek/ensembl/variation/new_website/v110/api/ef282def-9a17-4b35-a344-0f0c559e54ab/variation.vcf.gz -576 vcf /nfs/production/flicek/ensembl/variation/new_website/v110/api/9caa2cae-d1c8-4cfc-9ffd-2e13bc3e95b1/variation.vcf.gz -592 vcf /nfs/production/flicek/ensembl/variation/new_website/v110/api/75b7ac15-6373-4ad5-9fb7-23813a5355a4/variation.vcf.gz -595 vcf /nfs/production/flicek/ensembl/variation/new_website/v110/api/1220d766-6fcb-4b80-9106-121f238c0b3d/variation.vcf.gz -597 vcf /nfs/production/flicek/ensembl/variation/new_website/v110/api/8364a820-5485-42d7-a648-1a5eeb858319/variation.vcf.gz -608 vcf /nfs/production/flicek/ensembl/variation/new_website/v110/api/2020e8d5-4d87-47af-be78-0b15e48970a7/variation.vcf.gz -610 vcf /nfs/production/flicek/ensembl/variation/new_website/v110/api/65d4f21f-695a-4ed0-be67-5732a551fea4/variation.vcf.gz -644 vcf 
/nfs/production/flicek/ensembl/variation/new_website/v110/api/a733574a-93e7-11ec-a39d-005056b38ce3/variation.vcf.gz -653 vcf /nfs/production/flicek/ensembl/variation/new_website/v110/api/a73356e1-93e7-11ec-a39d-005056b38ce3/variation.vcf.gz -660 vcf /nfs/production/flicek/ensembl/variation/new_website/v110/api/a73357ab-93e7-11ec-a39d-005056b38ce3/variation.vcf.gz -670 regulation /nfs/production/flicek/ensembl/regulation/plins/110-mvp-handover/homo_sapiens/GRCh38/homo_sapiens.GRCh38.Regulatory_Build.regulatory_features.bb -673 vcf /nfs/production/flicek/ensembl/variation/new_website/v110/api/a7335667-93e7-11ec-a39d-005056b38ce3/variation.vcf.gz -674 vcf /nfs/production/flicek/ensembl/variation/new_website/v110/api/3704ceb1-948d-11ec-a39d-005056b38ce3/variation.vcf.gz -4352 compara escherichia_coli_str_k_12_substr_mg1655_compara_110 diff --git a/src/tests/databases/ensembl_genome_metadata/dataset_type.txt b/src/tests/databases/ensembl_genome_metadata/dataset_type.txt deleted file mode 100644 index 010ea40a..00000000 --- a/src/tests/databases/ensembl_genome_metadata/dataset_type.txt +++ /dev/null @@ -1,34 +0,0 @@ -1 assembly Genomic assembly assembly Compilation of sequences for a genome \N \N \N \N -2 genebuild Genomic Build genebuild_annotation Genomic annotations for an assembly \N \N \N \N -3 variation Variation Annotations variation_annotation Variation annotation set \N \N 2 \N -4 evidence Variation Evidence production_process Variation evidence annotation \N 3 \N \N -5 short_variant Short variants production_process Short variant data \N 3 \N \N -6 homologies Comparative homologies compara_annotation Comparative Genomics annotation \N \N 2 \N -7 regulatory_features Regulatory Annotation regulation_annotation Regulatory annotation for an assembly \N \N 2 \N -8 genebuild_compute External References production_process Xref genome annotation for Genebuild \N 2 \N \N -9 genebuild_files Files dumps production_process File Dumps, either internal or for public 
consumption \N 2 8 \N -11 genebuild_web Web Geneset content production_process Web Geneset related content \N 2 \N \N -12 genebuild_prep Genebuild preparation production_preparation Web Content for Geneset publication \N 2 8,9,11,12 \N -13 xrefs External References production_process External annotations linking \N 8 \N \N -14 protein_features Protein Features annotations production_process Proteins annotation \N 8 13 \N -15 alpha_fold AlphaFold computation production_process Compute Protein structure with Alphafold \N 8 13 \N -16 blast Blast tools production_process Blast Indexes files \N 9 8 \N -17 ftp_dumps Public FTP files production_process Public FTP flat files geneset dumps \N 9 8 \N -18 thoas_dumps Thoas load flat files production_process Dump flat file to load onto THOAS \N 11 \N \N -19 thoas_load Thoas MongoDB Load production_preparation Load dumped files onto THOAS \N 12 18,23 \N -20 genebuild_browser_files Genome Browser BB Geneset files production_process Production BigBed for Genome Browser \N 11 \N \N -21 genebuild_track Geneset Tracks API production_preparation Register Geneset Track API BigBed files \N 12 20 \N -23 checksums Sequences Checksums production_process Compute core sequence checksums and update metadata \N 11 \N \N -24 refget_load Refget Loading production_preparation Load sequences and their checksum onto Refget app \N 12 22 \N -25 homology_compute Homology annotation production_process Compute Genome homology analysis \N 6 \N \N -26 homology_load Homology dataload production_preparation Load homology data onto Compara Service (MongoDB) \N 6 25 \N -27 homology_ftp Homology tsv public files production_preparation Dump and sync public TSV homology files \N 6 25 \N -28 vep VEP filesets variation_annotation VCF annotation file for geneset \N \N \N \N -29 variation_ftp Public Variation files (vcf) production_preparation VCF files for public FTP \N 3 \N \N -31 variation_browser_files Variation Browser files production_process Variation track 
browser file \N 3 \N \N -32 variation_track Variation Track production_preparation Variation Track API \N 3 31 \N -33 regulation_browser_files Regulation Browser files production_process Regulation track browser file \N 7 \N \N -34 regulation_track Regulation Track production_preparation Regulation Track API \N 7 33 \N -35 regulation_ftp Regulation Public files production_preparation Regulation public files \N 7 33 \N -37 web_genesearch GeneSearch Index production_publication Gene search indexes provisioning \N \N 36 \N -38 web_genomediscovery Genome Search indexes loading to EBI search production_publication Load dumped data from genebuild_web onto EBI Search engine (SpecieSelector) \N \N 37 \N diff --git a/src/tests/databases/ensembl_genome_metadata/ensembl_release.txt b/src/tests/databases/ensembl_genome_metadata/ensembl_release.txt deleted file mode 100644 index 769d578f..00000000 --- a/src/tests/databases/ensembl_genome_metadata/ensembl_release.txt +++ /dev/null @@ -1,6 +0,0 @@ -1 110.1 2023-10-18 MVP Beta-1 1 partial 1 Released 1 -2 110.2 \N MVP Beta-2 0 partial 1 Prepared 2 -3 110.3 \N MVP Beta-3 0 partial 1 Preparing 3 -4 112.0 \N MVP Rel-1 0 partial 1 Planned 4 -5 108.0 2023-06-15 First Beta 0 partial 1 Released 5 -6 114.0 2025-06-15 dataset_test 0 partial 1 Preparing 6 diff --git a/src/tests/databases/ensembl_genome_metadata/ensembl_site.txt b/src/tests/databases/ensembl_genome_metadata/ensembl_site.txt deleted file mode 100644 index 6ef04cbf..00000000 --- a/src/tests/databases/ensembl_genome_metadata/ensembl_site.txt +++ /dev/null @@ -1 +0,0 @@ -1 Ensembl MVP Ensembl https://beta.ensembl.org diff --git a/src/tests/databases/ensembl_genome_metadata/genome.txt b/src/tests/databases/ensembl_genome_metadata/genome.txt deleted file mode 100644 index b81281f4..00000000 --- a/src/tests/databases/ensembl_genome_metadata/genome.txt +++ /dev/null @@ -1,20 +0,0 @@ -1 a73351f7-93e7-11ec-a39d-005056b38ce3 2023-09-22 15:01:44.000000 1 1 0 
escherichia_coli_str_k_12_substr_mg1655 EXT01 2018-09 -4 9caa2cae-d1c8-4cfc-9ffd-2e13bc3e95b1 2023-09-22 15:02:00.000000 4 4 0 homo_sapiens_gca018473315v1 ENS01 2022-08 -5 2020e8d5-4d87-47af-be78-0b15e48970a7 2023-09-22 15:02:01.000000 5 5 0 homo_sapiens_gca018469415v1 ENS01 2022-07 -6 75b7ac15-6373-4ad5-9fb7-23813a5355a4 2023-09-22 15:02:02.000000 6 6 0 homo_sapiens_gca018469875v1 ENS01 2022-07 -7 1220d766-6fcb-4b80-9106-121f238c0b3d 2023-09-22 15:02:04.000000 7 7 0 homo_sapiens_gca018505825v1 ENS01 2022-07 -9 c3dcaca8-aaee-479f-aad8-c7a5e17b7e10 2023-09-22 15:02:11.000000 9 9 0 homo_sapiens_gca018852615v1 ENS01 2022-07 -12 a73356e1-93e7-11ec-a39d-005056b38ce3 2023-09-22 15:03:01.000000 15 14 0 plasmodium_falciparum EXT01 2017-10 -19 56d9b469-097f-48a7-8501-c8416bcbcdfb 2023-09-22 15:03:02.000000 18 9 0 homo_sapiens_gca021950905v1 ENS01 2022-07 -31 3704ceb1-948d-11ec-a39d-005056b38ce3 2023-09-22 15:03:22.000000 40 83 0 homo_sapiens_37 GENCODE19 2013-09 -74 a73357ab-93e7-11ec-a39d-005056b38ce3 2023-09-22 15:04:29.000000 79 72 0 triticum_aestivum EXT01 2018-04 -86 a7335667-93e7-11ec-a39d-005056b38ce3 2023-09-22 15:04:45.000000 92 83 0 homo_sapiens GENCODE44 2023-03 -89 8364a820-5485-42d7-a648-1a5eeb858319 2023-09-22 15:04:50.000000 97 7 0 homo_sapiens_gca018505865v1 ENS01 2022-07 -92 63b4ffbf-0147-4aa7-b0af-7575bb822740 2023-09-22 15:04:53.000000 100 9 0 homo_sapiens_gca018852605v1 ENS01 2022-07 -99 ef282def-9a17-4b35-a344-0f0c559e54ab 2023-09-22 15:04:56.000000 107 6 0 homo_sapiens_gca018469925v1 ENS01 2022-07 -125 114e90e0-aa35-4af6-9204-267c988328c3 2023-09-22 15:05:37.000000 135 5 0 homo_sapiens_gca018469425v1 ENS01 2022-07 -169 af073c3e-d087-46b0-bb62-310e89982450 2023-09-22 15:06:39.000000 180 9 0 homo_sapiens_gca021951015v1 ENS01 2022-07 -174 65d4f21f-695a-4ed0-be67-5732a551fea4 2023-09-22 15:06:43.000000 186 4 0 homo_sapiens_gca018473295v1 ENS01 2022-08 -201 a733574a-93e7-11ec-a39d-005056b38ce3 2023-09-22 15:06:55.000000 216 172 0 saccharomyces_cerevisiae 
EXT01 2018-10 -203 a733550b-93e7-11ec-a39d-005056b38ce3 2023-09-22 15:06:58.000000 219 175 0 caenorhabditis_elegans EXT01 2014-10 -204 99999999-695a-4ed0-be67-5732a551fea4 2023-09-22 15:06:43.000000 186 4 0 homo_sapiens_gca018473295v1 ENS09 2022-08 diff --git a/src/tests/databases/ensembl_genome_metadata/genome_dataset.txt b/src/tests/databases/ensembl_genome_metadata/genome_dataset.txt deleted file mode 100644 index f7541c3f..00000000 --- a/src/tests/databases/ensembl_genome_metadata/genome_dataset.txt +++ /dev/null @@ -1,499 +0,0 @@ -1 1 1 1 5 -2 1 2 1 5 -7 1 7 4 1 -9 1 9 5 1 -11 1 11 6 2 -13 1 13 7 2 -14 1 14 7 2 -17 1 17 9 3 -23 1 23 12 5 -24 1 24 12 5 -37 0 37 19 \N -38 0 38 19 \N -61 1 61 31 5 -62 1 62 31 5 -147 1 147 74 5 -148 1 148 74 5 -171 1 171 86 5 -172 1 172 86 5 -177 1 177 89 3 -178 1 178 89 3 -183 0 183 92 \N -184 0 184 92 \N -197 1 197 99 2 -249 1 249 125 2 -250 1 250 125 2 -337 0 337 169 \N -338 0 338 169 \N -347 1 347 174 1 -348 1 348 174 1 -401 1 401 201 5 -402 1 402 201 5 -405 1 405 203 5 -406 1 406 203 5 -887 1 888 4 1 -888 1 890 5 1 -889 1 892 6 2 -891 1 896 9 3 -956 1 1006 99 2 -1332 1 1391 86 5 -1333 1 1392 31 5 -1334 1 1393 86 5 -1335 1 1394 31 5 -1346 1 1405 201 5 -1355 1 1414 12 5 -1362 1 1421 74 5 -1405 1 1464 125 2 -1411 1 1470 99 2 -1416 1 1475 4 1 -1432 1 1491 6 2 -1435 1 1494 7 2 -1437 1 1496 89 3 -1448 1 1507 5 1 -1450 1 1509 174 1 -1469 1 1528 201 5 -1478 1 1537 12 5 -1485 1 1544 74 5 -2217 1 2276 4 1 -2232 1 2291 31 5 -2260 1 2319 99 2 -2289 1 2348 12 5 -2298 1 2357 201 5 -2325 1 2384 74 5 -2335 1 2394 203 5 -2349 1 2408 5 1 -2390 1 2449 86 5 -2395 0 2454 19 \N -2399 1 2458 1 5 -2423 1 2482 174 1 -2435 0 2494 169 \N -2459 1 2518 86 5 -6538 0 6593 174 2 -6568 0 6623 31 2 -6644 0 6699 12 2 -6794 1 6849 89 \N -6841 1 6896 6 2 -7014 0 7069 203 2 -7122 1 7177 86 2 -7265 1 7320 125 2 -7480 0 7535 5 2 -7548 0 7603 74 2 -7730 1 7785 7 2 -7765 0 7820 4 2 -7792 0 7847 201 2 -8075 0 8130 92 \N -8337 0 8392 1 2 -8606 1 8661 9 \N -8607 0 8662 
174 1 -8608 0 8663 174 1 -8609 0 8664 174 1 -8610 0 8665 174 1 -8611 0 8666 174 1 -8612 0 8667 174 1 -8613 0 8668 174 1 -8614 0 8669 174 1 -8615 0 8670 174 1 -8616 0 8671 174 1 -8617 0 8672 174 1 -8618 0 8673 174 1 -8619 0 8674 174 1 -8620 0 8675 174 1 -8621 0 8676 174 1 -8622 0 8677 4 1 -8623 0 8678 4 1 -8624 0 8679 4 1 -8625 0 8680 4 1 -8626 0 8681 4 1 -8627 0 8682 4 1 -8628 0 8683 4 1 -8629 0 8684 4 1 -8630 0 8685 4 1 -8631 0 8686 4 1 -8632 0 8687 4 1 -8633 0 8688 4 1 -8634 0 8689 4 1 -8635 0 8690 4 1 -8636 0 8691 4 1 -8637 0 8692 5 1 -8638 0 8693 5 1 -8639 0 8694 5 1 -8640 0 8695 5 1 -8641 0 8696 5 1 -8642 0 8697 5 1 -8643 0 8698 5 1 -8644 0 8699 5 1 -8645 0 8700 5 1 -8646 0 8701 5 1 -8647 0 8702 5 1 -8648 0 8703 5 1 -8649 0 8704 5 1 -8650 0 8705 5 1 -8651 0 8706 5 1 -8652 0 8707 4 1 -8653 0 8708 4 1 -8654 0 8709 4 1 -8655 0 8710 4 1 -8656 0 8711 4 1 -8657 0 8712 5 1 -8658 0 8713 5 1 -8659 0 8714 5 1 -8660 0 8715 5 1 -8661 0 8716 5 1 -8662 0 8717 174 1 -8663 0 8718 174 1 -8664 0 8719 174 1 -8665 0 8720 174 1 -8666 0 8721 174 1 -8667 0 8722 4 1 -8668 0 8723 4 1 -8669 0 8724 4 1 -8670 0 8725 5 1 -8671 0 8726 5 1 -8672 0 8727 5 1 -8673 0 8728 174 1 -8674 0 8729 174 1 -8675 0 8730 174 1 -8676 0 8731 7 2 -8677 0 8732 7 2 -8678 0 8733 7 2 -8679 0 8734 7 2 -8680 0 8735 7 2 -8681 0 8736 7 2 -8682 0 8737 7 2 -8683 0 8738 7 2 -8684 0 8739 7 2 -8685 0 8740 7 2 -8686 0 8741 7 2 -8687 0 8742 7 2 -8688 0 8743 7 2 -8689 0 8744 7 2 -8690 0 8745 7 2 -8691 0 8746 125 2 -8692 0 8747 125 2 -8693 0 8748 125 2 -8694 0 8749 125 2 -8695 0 8750 125 2 -8696 0 8751 125 2 -8697 0 8752 125 2 -8698 0 8753 125 2 -8699 0 8754 125 2 -8700 0 8755 125 2 -8701 0 8756 125 2 -8702 0 8757 125 2 -8703 0 8758 125 2 -8704 0 8759 125 2 -8705 0 8760 125 2 -8706 0 8761 6 2 -8707 0 8762 6 2 -8708 0 8763 6 2 -8709 0 8764 6 2 -8710 0 8765 6 2 -8711 0 8766 6 2 -8712 0 8767 6 2 -8713 0 8768 6 2 -8714 0 8769 6 2 -8715 0 8770 6 2 -8716 0 8771 6 2 -8717 0 8772 6 2 -8718 0 8773 6 2 -8719 0 8774 6 2 -8720 0 8775 6 
2 -8721 0 8776 99 2 -8722 0 8777 99 2 -8723 0 8778 99 2 -8724 0 8779 99 2 -8725 0 8780 99 2 -8726 0 8781 99 2 -8727 0 8782 99 2 -8728 0 8783 99 2 -8729 0 8784 99 2 -8730 0 8785 99 2 -8731 0 8786 99 2 -8732 0 8787 99 2 -8733 0 8788 99 2 -8734 0 8789 99 2 -8735 0 8790 99 2 -8736 0 8791 125 2 -8737 0 8792 125 2 -8738 0 8793 125 2 -8739 0 8794 125 2 -8740 0 8795 125 2 -8741 0 8796 99 2 -8742 0 8797 99 2 -8743 0 8798 99 2 -8744 0 8799 99 2 -8745 0 8800 99 2 -8746 0 8801 6 2 -8747 0 8802 6 2 -8748 0 8803 6 2 -8749 0 8804 6 2 -8750 0 8805 6 2 -8751 0 8806 7 2 -8752 0 8807 7 2 -8753 0 8808 7 2 -8754 0 8809 7 2 -8755 0 8810 7 2 -8756 0 8811 99 2 -8757 0 8812 99 2 -8758 0 8813 99 2 -8759 0 8814 174 2 -8760 0 8815 174 2 -8761 0 8816 174 2 -8762 0 8817 31 2 -8763 0 8818 31 2 -8764 0 8819 31 2 -8765 0 8820 12 2 -8766 0 8821 12 2 -8767 0 8822 12 2 -8768 0 8823 6 2 -8769 0 8824 6 2 -8770 0 8825 6 2 -8771 0 8826 203 2 -8772 0 8827 203 2 -8773 0 8828 203 2 -8774 0 8829 86 2 -8775 0 8830 86 2 -8776 0 8831 86 2 -8777 0 8832 125 2 -8778 0 8833 125 2 -8779 0 8834 125 2 -8780 0 8835 5 2 -8781 0 8836 5 2 -8782 0 8837 5 2 -8783 0 8838 74 2 -8784 0 8839 74 2 -8785 0 8840 74 2 -8786 0 8841 7 2 -8787 0 8842 7 2 -8788 0 8843 7 2 -8789 0 8844 4 2 -8790 0 8845 4 2 -8791 0 8846 4 2 -8792 0 8847 201 2 -8793 0 8848 201 2 -8794 0 8849 201 2 -8795 0 8850 1 2 -8796 0 8851 1 2 -8797 0 8852 1 2 -8798 0 8853 89 3 -8799 0 8854 89 3 -8800 0 8855 89 3 -8801 0 8856 89 3 -8802 0 8857 89 3 -8803 0 8858 89 3 -8804 0 8859 89 3 -8805 0 8860 89 3 -8806 0 8861 89 3 -8807 0 8862 89 3 -8808 0 8863 89 3 -8809 0 8864 9 3 -8810 0 8865 9 3 -8811 0 8866 9 3 -8812 0 8867 9 3 -8813 0 8868 9 3 -8814 0 8869 9 3 -8815 0 8870 9 3 -8816 0 8871 9 3 -8817 0 8872 9 3 -8818 0 8873 9 3 -8819 0 8874 9 3 -8820 0 8875 89 3 -8821 0 8876 89 3 -8822 0 8877 89 3 -8823 0 8878 1 5 -8824 0 8879 1 5 -8825 0 8880 1 5 -8826 0 8881 1 5 -8827 0 8882 1 5 -8828 0 8883 1 5 -8829 0 8884 1 5 -8830 0 8885 1 5 -8831 0 8886 1 5 -8832 0 8887 1 5 -8833 0 
8888 1 5 -8834 0 8889 1 5 -8835 0 8890 1 5 -8836 0 8891 1 5 -8837 0 8892 1 5 -8838 0 8893 12 5 -8839 0 8894 12 5 -8840 0 8895 12 5 -8841 0 8896 12 5 -8842 0 8897 12 5 -8843 0 8898 12 5 -8844 0 8899 12 5 -8845 0 8900 12 5 -8846 0 8901 12 5 -8847 0 8902 12 5 -8848 0 8903 12 5 -8849 0 8904 12 5 -8850 0 8905 12 5 -8851 0 8906 12 5 -8852 0 8907 12 5 -8853 0 8908 31 5 -8854 0 8909 31 5 -8855 0 8910 31 5 -8856 0 8911 31 5 -8857 0 8912 31 5 -8858 0 8913 31 5 -8859 0 8914 31 5 -8860 0 8915 31 5 -8861 0 8916 31 5 -8862 0 8917 31 5 -8863 0 8918 31 5 -8864 0 8919 31 5 -8865 0 8920 31 5 -8866 0 8921 31 5 -8867 0 8922 31 5 -8868 0 8923 74 5 -8869 0 8924 74 5 -8870 0 8925 74 5 -8871 0 8926 74 5 -8872 0 8927 74 5 -8873 0 8928 74 5 -8874 0 8929 74 5 -8875 0 8930 74 5 -8876 0 8931 74 5 -8877 0 8932 74 5 -8878 0 8933 74 5 -8879 0 8934 74 5 -8880 0 8935 74 5 -8881 0 8936 74 5 -8882 0 8937 74 5 -8883 0 8938 86 5 -8884 0 8939 86 5 -8885 0 8940 86 5 -8886 0 8941 86 5 -8887 0 8942 86 5 -8888 0 8943 86 5 -8889 0 8944 86 5 -8890 0 8945 86 5 -8891 0 8946 86 5 -8892 0 8947 86 5 -8893 0 8948 86 5 -8894 0 8949 86 5 -8895 0 8950 86 5 -8896 0 8951 86 5 -8897 0 8952 86 5 -8898 0 8953 201 5 -8899 0 8954 201 5 -8900 0 8955 201 5 -8901 0 8956 201 5 -8902 0 8957 201 5 -8903 0 8958 201 5 -8904 0 8959 201 5 -8905 0 8960 201 5 -8906 0 8961 201 5 -8907 0 8962 201 5 -8908 0 8963 201 5 -8909 0 8964 201 5 -8910 0 8965 201 5 -8911 0 8966 201 5 -8912 0 8967 201 5 -8913 0 8968 203 5 -8914 0 8969 203 5 -8915 0 8970 203 5 -8916 0 8971 203 5 -8917 0 8972 203 5 -8918 0 8973 203 5 -8919 0 8974 203 5 -8920 0 8975 203 5 -8921 0 8976 203 5 -8922 0 8977 203 5 -8923 0 8978 203 5 -8924 0 8979 203 5 -8925 0 8980 203 5 -8926 0 8981 203 5 -8927 0 8982 203 5 -8928 0 8983 86 5 -8929 0 8984 86 5 -8930 0 8985 86 5 -8931 0 8986 86 5 -8932 0 8987 86 5 -8933 0 8988 31 5 -8934 0 8989 31 5 -8935 0 8990 31 5 -8936 0 8991 31 5 -8937 0 8992 31 5 -8938 0 8993 201 5 -8939 0 8994 201 5 -8940 0 8995 201 5 -8941 0 8996 201 5 -8942 0 8997 201 
5 -8943 0 8998 12 5 -8944 0 8999 12 5 -8945 0 9000 12 5 -8946 0 9001 12 5 -8947 0 9002 12 5 -8948 0 9003 74 5 -8949 0 9004 74 5 -8950 0 9005 74 5 -8951 0 9006 74 5 -8952 0 9007 74 5 -8953 0 9008 31 5 -8954 0 9009 31 5 -8955 0 9010 31 5 -8956 0 9011 12 5 -8957 0 9012 12 5 -8958 0 9013 12 5 -8959 0 9014 201 5 -8960 0 9015 201 5 -8961 0 9016 201 5 -8962 0 9017 74 5 -8963 0 9018 74 5 -8964 0 9019 74 5 -8965 0 9020 203 5 -8966 0 9021 203 5 -8967 0 9022 203 5 -8968 0 9023 86 5 -8969 0 9024 86 5 -8970 0 9025 86 5 -8971 0 9026 1 5 -8972 0 9027 1 5 -8973 0 9028 1 5 -8974 0 9029 86 5 -8975 0 9030 86 5 -8976 0 9031 86 5 -8977 0 9032 19 \N -8978 0 9033 19 \N -8979 0 9034 19 \N -8980 0 9035 19 \N -8981 0 9036 19 \N -8982 0 9037 19 \N -8983 0 9038 19 \N -8984 0 9039 19 \N -8985 0 9040 19 \N -8986 0 9041 19 \N -8987 0 9042 19 \N -8988 0 9043 92 \N -8989 0 9044 92 \N -8990 0 9045 92 \N -8991 0 9046 92 \N -8992 0 9047 92 \N -8993 0 9048 92 \N -8994 0 9049 92 \N -8995 0 9050 92 \N -8996 0 9051 92 \N -8997 0 9052 92 \N -8998 0 9053 92 \N -8999 0 9054 169 \N -9000 0 9055 169 \N -9001 0 9056 169 \N -9002 0 9057 169 \N -9003 0 9058 169 \N -9004 0 9059 169 \N -9005 0 9060 169 \N -9006 0 9061 169 \N -9007 0 9062 169 \N -9008 0 9063 169 \N -9009 0 9064 169 \N -9010 0 9065 19 \N -9011 0 9066 169 \N -9012 0 9067 89 \N -9013 0 9068 92 \N -9014 0 9069 9 \N -9015 0 9070 86 \N -9016 0 9071 86 \N -9017 0 9072 204 6 -9018 0 9073 204 6 -9019 0 9074 204 6 \ No newline at end of file diff --git a/src/tests/databases/ensembl_genome_metadata/genome_release.txt b/src/tests/databases/ensembl_genome_metadata/genome_release.txt deleted file mode 100644 index 217aa3a1..00000000 --- a/src/tests/databases/ensembl_genome_metadata/genome_release.txt +++ /dev/null @@ -1,30 +0,0 @@ -1 1 4 1 -2 1 5 1 -3 1 174 1 -4 1 6 2 -5 1 7 2 -6 1 99 2 -7 1 125 2 -8 1 9 3 -9 1 89 3 -10 1 1 5 -11 1 12 5 -12 1 31 5 -13 1 74 5 -14 1 86 5 -15 1 201 5 -16 1 203 5 -17 0 174 2 -18 0 31 2 -19 0 12 2 -20 0 203 2 -21 0 86 2 -22 0 5 2 -23 
0 74 2 -24 0 4 2 -25 0 201 2 -26 0 1 2 -27 0 19 4 -28 0 92 4 -29 0 169 4 -30 0 204 6 diff --git a/src/tests/databases/ensembl_genome_metadata/ncbi_taxa_name.txt b/src/tests/databases/ensembl_genome_metadata/ncbi_taxa_name.txt deleted file mode 100644 index 5555cf0c..00000000 --- a/src/tests/databases/ensembl_genome_metadata/ncbi_taxa_name.txt +++ /dev/null @@ -1,469 +0,0 @@ -562 Achromobacter sp. ATCC 35328 includes -562 ATCC 11775 type material -562 "Bacillus coli" Migula 1895 authority -562 Bacillus coli synonym -562 bacterium 10a includes -562 "Bacterium coli commune" Escherich 1885 authority -562 Bacterium coli commune synonym -562 "Bacterium coli" (Migula 1895) Lehmann and Neumann 1896 authority -562 Bacterium coli synonym -562 bacterium E3 includes -562 CCUG 24 type material -562 CCUG 29300 type material -562 CIP 54.8 type material -562 DSM 30083 type material -562 E. coli common name -562 Enterococcus coli synonym -562 Escherichia coli (Migula 1895) Castellani and Chalmers 1919 authority -562 Escherichia coli scientific name -562 Escherichia/Shigella coli equivalent name -562 Escherichia sp. 3_2_53FAA includes -562 Escherichia sp. MAR includes -562 IAM 12119 type material -562 JCM 1649 type material -562 LMG 2092 type material -562 LMG:2092 type material -562 NBRC 102203 type material -562 NCCB 54008 type material -562 NCTC 9001 type material -562 strain U5/41 type material -3702 Arabidopsis thaliana (L.) 
Heynh., 1842 authority -3702 Arabidopsis thaliana scientific name -3702 Arabis thaliana L., 1753 authority -3702 Arabis thaliana synonym -3702 mouse-ear cress common name -3702 thale-cress common name -3702 thale cress genbank common name -3708 Brassica napus L., 1753 authority -3708 Brassica napus scientific name -3708 oilseed rape common name -3708 rape genbank common name -3708 rapeseeds common name -3711 Brassica rapa L., 1753 authority -3711 Brassica rapa scientific name -3711 field mustard genbank common name -3712 Brassica oleracea L., 1753 authority -3712 Brassica oleracea scientific name -3712 wild cabbage genbank common name -3847 Glycine max (L.) Merr., 1917 authority -3847 Glycine max scientific name -3847 Phaseolus max L., 1753 authority -3847 Phaseolus max synonym -3847 soybean genbank common name -3847 soybeans common name -3880 barrel medic genbank common name -3880 Medicago truncatula Gaertn., 1790 authority -3880 Medicago truncatula scientific name -4081 Lycopersicon esculentum Mill. authority -4081 Lycopersicon esculentum synonym -4081 Lycopersicon esculentum var. esculentum synonym -4081 Solanum esculentum Dunal authority -4081 Solanum esculentum synonym -4081 Solanum lycopersicum L., 1753 authority -4081 Solanum lycopersicum scientific name -4081 Solanum lycopersicum var. humboldtii synonym -4081 tomato genbank common name -4113 potatoes common name -4113 potato genbank common name -4113 Solanum tuberosum L., 1753 authority -4113 Solanum tuberosum scientific name -4113 Solanum tuberosum subsp. tuberosum includes -4513 barley common name -4513 Hordeum vulgare L., 1753 authority -4513 Hordeum vulgare scientific name -4530 Asian cultivated rice genbank common name -4530 Oryza sativa L., 1753 authority -4530 Oryza sativa scientific name -4530 red rice common name -4530 rice common name -4558 Andropogon sorghum (L.) Brot. authority -4558 Andropogon sorghum synonym -4558 broomcorn common name -4558 milo common name -4558 Sorghum bicolor (L.) 
Moench, 1794 authority -4558 Sorghum bicolor scientific name -4558 Sorghum bicolor subsp. bicolor synonym -4558 sorghum genbank common name -4558 Sorghum nervosum Besser ex Schult. authority -4558 Sorghum nervosum synonym -4558 Sorghum saccharatum (L.) Moench authority -4558 Sorghum saccharatum synonym -4558 Sorghum vulgare Pers. authority -4558 Sorghum vulgare synonym -4565 bread wheat genbank common name -4565 Canadian hard winter wheat common name -4565 common wheat common name -4565 Triticum aestivum L., 1753 authority -4565 Triticum aestivum scientific name -4565 Triticum aestivum subsp. aestivum synonym -4565 Triticum vulgare synonym -4565 Triticum vulgare Vill., 1787 authority -4565 wheat common name -4567 durum wheat genbank common name -4567 Triticum durum Desf. authority -4567 Triticum durum ssp. durum synonym -4567 Triticum durum subsp. durum synonym -4567 Triticum durum synonym -4567 Triticum rigidum conv. durum synonym -4567 Triticum rigidum ssp. durum synonym -4567 Triticum rigidum var. durum synonym -4567 Triticum turgidum Durum Group synonym -4567 Triticum turgidum subsp. durum (Desf.) Husn., 1899 authority -4567 Triticum turgidum subsp. durum scientific name -4571 cone wheat common name -4571 English wheat common name -4571 poulard wheat common name -4571 rivet wheat common name -4571 Triticum aethiopicum Jakubz., 1947 authority -4571 Triticum aethiopicum synonym -4571 Triticum durum subsp. abyssinicum synonym -4571 Triticum durum subsp. abyssinicum Vavilov, 1931 authority -4571 Triticum turgidum L., 1753 authority -4571 Triticum turgidum scientific name -4577 maize common name -4577 Zea mays L., 1753 authority -4577 Zea mays scientific name -4577 Zea mays var. 
japonica synonym -4932 ATCC 18824 type material -4932 baker's yeast common name -4932 brewer's yeast genbank common name -4932 Candida robusta synonym -4932 CBS 1171 type material -4932 Mycoderma cerevisiae Desm., 1827 authority -4932 Mycoderma cerevisiae synonym -4932 NRRL Y-12632 type material -4932 Saccharomyces capensis synonym -4932 Saccharomyces cerevisiae (Desm.) Meyen, 1838 authority -4932 Saccharomyces cerevisiae scientific name -4932 Saccharomyces cerevisiae 'var. diastaticus' equivalent name -4932 Saccharomyces diastaticus J. Andrews & R.B. Gilliland ex Van der Walt, 1965 authority -4932 Saccharomyces diastaticus synonym -4932 Saccharomyces italicus synonym -4932 Saccharomyces oviformis synonym -4932 Saccharomyces uvarum var. melibiosus synonym -4932 specimen-voucher:NRRL:Y:12632 type material -5833 malaria parasite P. falciparum genbank common name -5833 Plasmodium falciparum scientific name -5833 Plasmodium (Laverania) falciparum synonym -6239 Caenorhabditis elegans (Maupas, 1900) authority -6239 Caenorhabditis elegans scientific name -6239 Rhabditis elegans Maupas, 1900 authority -6239 Rhabditis elegans synonym -7227 Diptera sp. 
DNAS-2A9-224646 includes -7227 Drosophila melanogaster Meigen, 1830 authority -7227 Drosophila melanogaster scientific name -7227 fruit fly genbank common name -7227 Sophophora melanogaster (Meigen, 1830) authority -7227 Sophophora melanogaster synonym -7955 Brachydanio rerio frankei synonym -7955 Brachydanio rerio synonym -7955 Cyprinus rerio Hamilton, 1822 authority -7955 Cyprinus rerio synonym -7955 Danio frankei synonym -7955 Danio rerio frankei synonym -7955 Danio rerio (Hamilton, 1822) authority -7955 Danio rerio scientific name -7955 leopard danio common name -7955 zebra danio common name -7955 zebrafish genbank common name -7955 zebra fish common name -7994 Astyanax mexicanus (De Filippi, 1853) authority -7994 Astyanax mexicanus scientific name -7994 blind cave fish common name -7994 Mexican tetra genbank common name -7994 Tetragonopterus mexicanus De Filippi, 1853 authority -7994 Tetragonopterus mexicanus synonym -8030 Atlantic salmon genbank common name -8030 Salmo salar Linnaeus, 1758 authority -8030 Salmo salar scientific name -8090 Japanese medaka genbank common name -8090 Japanese rice fish common name -8090 medaka common name -8090 Oryzias latipes scientific name -8090 Oryzias latipes (Temminck & Schlegel, 1846) authority -8090 Poecilia latipes synonym -8090 Poecilia latipes Temminck & Schlegel, 1846 authority -8128 Nile tilapia genbank common name -8128 Oreochromis nilotica synonym -8128 Oreochromis niloticus (Linnaeus, 1758) authority -8128 Oreochromis niloticus scientific name -8128 Perca nilotica Linnaeus, 1758 authority -8128 Perca nilotica synonym -8128 Tilapia nilotica synonym -8364 Silurana tropicalis Gray, 1864 authority -8364 Silurana tropicalis synonym -8364 tropical clawed frog genbank common name -8364 western clawed frog common name -8364 Xenopus laevis tropicalis synonym -8364 Xenopus (Silurana) tropicalis synonym -8364 Xenopus tropicalis (Gray, 1864) authority -8364 Xenopus tropicalis scientific name -9031 bantam common name -9031 
chicken genbank common name -9031 chickens common name -9031 dwarf Leghorn chickens includes -9031 Gallus domesticus equivalent name -9031 Gallus gallus domesticus synonym -9031 Gallus gallus scientific name -9031 Phasianus gallus Linnaeus, 1758 authority -9031 Phasianus gallus synonym -9031 red junglefowl includes -9413 greater false vampire bat common name -9413 Indian false vampire genbank common name -9413 Megaderma lyra Saint-Hilaire, 1810 authority -9413 Megaderma lyra scientific name -9544 Cercopithecus mulatta synonym -9544 Cercopithecus mulatta Zimmermann, 1780 authority -9544 Macaca mulatta scientific name -9544 Macaca mulatta (Zimmermann, 1780) authority -9544 rhesus macaque common name -9544 rhesus macaques common name -9544 Rhesus monkey genbank common name -9544 rhesus monkeys common name -9597 bonobo common name -9597 Pan paniscus Schwarz, 1929 authority -9597 Pan paniscus scientific name -9597 pygmy chimpanzee genbank common name -9598 chimpanzee genbank common name -9598 Pan troglodytes scientific name -9598 Simia troglodytes Linnaeus, 1758 authority -9598 Simia troglodytes synonym -9606 Homo sapiens Linnaeus, 1758 authority -9606 Homo sapiens scientific name -9606 human genbank common name -9612 Canis lupus Linnaeus, 1758 authority -9612 Canis lupus scientific name -9612 gray wolf genbank common name -9612 grey wolf common name -9615 beagle dog includes -9615 beagle dogs includes -9615 Canis canis synonym -9615 Canis domesticus synonym -9615 Canis familiaris Linnaeus, 1758 authority -9615 Canis familiaris synonym -9615 Canis lupus familiaris Linnaeus, 1758 authority -9615 Canis lupus familiaris scientific name -9615 dog genbank common name -9615 dogs common name -9685 cat common name -9685 cats common name -9685 domestic cat genbank common name -9685 Felis catus Linnaeus, 1758 authority -9685 Felis catus scientific name -9685 Felis domesticus synonym -9685 Felis silvestris catus synonym -9685 Korat cats includes -9685 Korat cats L. 
authority -9796 domestic horse common name -9796 equine common name -9796 Equus caballus Linnaeus, 1758 authority -9796 Equus caballus scientific name -9796 Equus przewalskii f. caballus synonym -9796 Equus przewalskii forma caballus synonym -9796 horse genbank common name -9823 pig genbank common name -9823 pigs common name -9823 Sus scrofa Linnaeus, 1758 authority -9823 Sus scrofa scientific name -9823 swine common name -9823 wild boar common name -9913 Bos bovis synonym -9913 Bos primigenius taurus synonym -9913 Bos taurus Linnaeus, 1758 authority -9913 Bos taurus scientific name -9913 Bovidae sp. Adi Nefas includes -9913 bovine common name -9913 cattle genbank common name -9913 cow common name -9913 dairy cow common name -9913 domestic cattle common name -9913 domestic cow common name -9913 ox common name -9925 African dwarf goat includes -9925 African dwarf goats includes -9925 Capra aegagrus hircus synonym -9925 Capra hircus Linnaeus, 1758 authority -9925 Capra hircus scientific name -9925 domestic goat common name -9925 goat genbank common name -9925 goats common name -9925 Naine d'Afrique de l'Ouest includes -9940 domestic sheep common name -9940 lambs common name -9940 Ovis ammon aries synonym -9940 Ovis aries Linnaeus, 1758 authority -9940 Ovis aries scientific name -9940 Ovis orientalis aries synonym -9940 Ovis ovis synonym -9940 sheep genbank common name -9940 wild sheep common name -9986 domestic rabbit common name -9986 European rabbit common name -9986 Japanese white rabbit common name -9986 Lepus cuniculus Linnaeus, 1758 authority -9986 Lepus cuniculus synonym -9986 New Zealand rabbit includes -9986 Oryctolagus cuniculus scientific name -9986 rabbit genbank common name -9986 rabbits common name -10029 Chinese hamster genbank common name -10029 Chinese hamsters common name -10029 CHO cell lines includes -10029 Cricetulus aureus equivalent name -10029 Cricetulus barabensis griseus synonym -10029 Cricetulus griseus Milne-Edwards, 1867 authority -10029 
Cricetulus griseus scientific name -10089 Mus caroli Bonhote, 1902 authority -10089 Mus caroli scientific name -10089 Mus formosanus Kuroda, 1925 authority -10089 Mus formosanus synonym -10089 ricefield mouse common name -10089 Ryukyu mouse genbank common name -10090 house mouse genbank common name -10090 LK3 transgenic mice includes -10090 mouse common name -10090 Mus musculus Linnaeus, 1758 authority -10090 Mus musculus scientific name -10090 Mus sp. 129SV includes -10090 nude mice includes -10090 transgenic mice includes -10091 Mus castaneus synonym -10091 Mus musculus castaneus scientific name -10091 Mus musculus castaneus Waterhouse, 1843 authority -10091 southeastern Asian house mouse genbank common name -10092 Mus domesticus synonym -10092 Mus musculus domesticus Schwarz & Scharz 1943 authority -10092 Mus musculus domesticus scientific name -10092 Mus musculus praetextus synonym -10092 Mus praetextus synonym -10092 western European house mouse genbank common name -10093 Coelomys parahi synonym -10093 Gairdner's shrew-mouse common name -10093 Gairdner's shrewmouse common name -10093 Mus pahari scientific name -10093 Mus pahari Thomas, 1916 authority -10093 shrew mouse genbank common name -10096 Algerian mouse common name -10096 Mus musculus spretus synonym -10096 Mus spretus Lataste, 1883 authority -10096 Mus spretus scientific name -10096 western wild mouse genbank common name -10116 brown rat common name -10116 Buffalo rat includes -10116 laboratory rat includes -10116 Mus norvegicus Berkenhout, 1769 authority -10116 Mus norvegicus synonym -10116 Norway rat genbank common name -10116 rat common name -10116 rats common name -10116 Rattus norvegicus scientific name -10116 Rattus PC12 clone IS includes -10116 Rattus sp. 
strain Wistar includes -10116 Sprague-Dawley rat includes -10116 Wistar rats includes -10116 zitter rats includes -13616 Didelphys domestica synonym -13616 Didelphys domestica Wagner, 1842 authority -13616 gray short-tailed opossum genbank common name -13616 Monodelphis domestica scientific name -29760 Vitis vinifera L., 1753 authority -29760 Vitis vinifera scientific name -29760 Vitis vinifera subsp. vinifera synonym -29760 wine grape genbank common name -36329 Plasmodium falciparum 3D7 scientific name -36329 Plasmodium falciparum (isolate 3D7) synonym -37682 Aegilops squarrosa subsp. squarrosa synonym -37682 Aegilops squarrosa synonym -37682 Aegilops tauschii Coss., 1849 authority -37682 Aegilops tauschii scientific name -37682 Patropyrum tauschii (Coss.) A.Love authority -37682 Patropyrum tauschii subsp. tauschii synonym -37682 Patropyrum tauschii synonym -37682 Triticum aegilops P.Beauv. ex Roem. & Schult. authority -37682 Triticum aegilops synonym -37682 Triticum tauschii (Coss.) Schmalh. authority -37682 Triticum tauschii synonym -39442 eastern European house mouse genbank common name -39442 Mus musculus hortulanus synonym -39442 Mus musculus musculus scientific name -39946 Indian rice common name -39946 Indica rice common name -39946 long-grained rice genbank common name -39946 Oryza sativa (indica cultivar-group) synonym -39946 Oryza sativa Indica Group scientific name -39946 Oryza sativa (indica group) synonym -39946 Oryza sativa subsp. indica Kato authority -39946 Oryza sativa subsp. indica synonym -39946 Oryza sp. Poi-6 includes -39947 Japanese rice genbank common name -39947 Japonica rice common name -39947 Oryza sativa (japonica cultivar-group) synonym -39947 Oryza sativa Japonica Group scientific name -39947 Oryza sativa subsp. japonica synonym -109376 Brassica oleracea subsp. oleracea synonym -109376 Brassica oleracea var. oleracea scientific name -112509 domesticated barley genbank common name -112509 Hordeum sativum Jess. 
authority -112509 Hordeum sativum synonym -112509 Hordeum vulgare subsp. vulgare scientific name -112509 Hordeum vulgare subsp. vulgare Spenn. authority -112509 Hordeum vulgare var. nudum Spenn. authority -112509 Hordeum vulgare var. nudum synonym -112509 Hordeum vulgare var. vulgare synonym -112509 two-rowed barley common name -200361 Aegilops tauschii subsp. strangulata (Eig) Tzvelev, 1973 authority -200361 Aegilops tauschii subsp. strangulata scientific name -511145 Escherichia coli MG1655 synonym -511145 Escherichia coli strain MG1655 equivalent name -511145 Escherichia coli str. K12 substr. MG1655 equivalent name -511145 Escherichia coli str. K-12 substr. MG1655 scientific name -511145 Escherichia coli str. MG1655 equivalent name -559292 Saccharomyces cerevisiae S288C scientific name -1736656 Oryza sativa (javanica cultivar-group) synonym -1736656 Oryza sativa tropical japonica cultivar-group synonym -1736656 Oryza sativa tropical japonica group synonym -1736656 Oryza sativa tropical japonica subgroup scientific name -1736656 Oryza sativa var. javanica Koern. authority -1736656 Oryza sativa var. 
javanica synonym -1736658 Oryza sativa Aromatic Japonica Group synonym -1736658 Oryza sativa aromatic subgroup scientific name -1736658 Oryza sativa Group V synonym -1736659 Oryza sativa aus cultivar-group synonym -1736659 Oryza sativa aus group synonym -1736659 Oryza sativa aus subgroup scientific name -1736659 Oryza sativa aus synonym -3711 3706 merged_taxon_id -37682 4482 merged_taxon_id -112509 4514 merged_taxon_id -4577 4578 merged_taxon_id -9940 9936 merged_taxon_id -9986 9985 merged_taxon_id -7955 27702 merged_taxon_id -9986 34833 merged_taxon_id -10116 36465 merged_taxon_id -9685 36475 merged_taxon_id -9544 36502 merged_taxon_id -3708 36503 merged_taxon_id -4513 36528 merged_taxon_id -7955 37966 merged_taxon_id -4565 39424 merged_taxon_id -37682 40669 merged_taxon_id -4932 41870 merged_taxon_id -9925 57076 merged_taxon_id -8128 61227 merged_taxon_id -37682 70688 merged_taxon_id -4571 77607 merged_taxon_id -10090 85055 merged_taxon_id -4113 90692 merged_taxon_id -10029 143285 merged_taxon_id -4081 195582 merged_taxon_id -10092 210727 merged_taxon_id -4565 235075 merged_taxon_id -9913 272461 merged_taxon_id -4530 389215 merged_taxon_id -562 469598 merged_taxon_id -562 662101 merged_taxon_id -562 662104 merged_taxon_id -562 1637691 merged_taxon_id -562 1806490 merged_taxon_id -7227 2267365 merged_taxon_id diff --git a/src/tests/databases/ensembl_genome_metadata/ncbi_taxa_node.txt b/src/tests/databases/ensembl_genome_metadata/ncbi_taxa_node.txt deleted file mode 100644 index 90b0b774..00000000 --- a/src/tests/databases/ensembl_genome_metadata/ncbi_taxa_node.txt +++ /dev/null @@ -1,64 +0,0 @@ -562 561 species 1 576069 582930 1 -3702 3701 species 1 2083678 2083679 1 -3708 3705 species 1 2087484 2087491 1 -3711 3705 species 1 2087500 2087531 1 -3712 3705 species 1 2087532 2087565 1 -3847 1462606 species 1 2065183 2065184 1 -3880 3877 species 1 2056937 2056944 1 -4081 49274 species 1 1940956 1940959 1 -4113 4107 species 1 1940913 1940916 1 -4513 4512 species 1 
1729674 1729689 1 -4530 4527 species 0 1724403 1724424 1 -4558 4557 species 1 1712790 1712793 1 -4565 4564 species 1 1730800 1730831 1 -4567 4571 subspecies 1 1730853 1730854 1 -4571 4564 species 1 1730852 1730867 1 -4577 4575 species 1 1712922 1712933 1 -4932 4930 species 1 2220338 2220971 1 -5833 418107 species 1 4998850 4999195 1 -6239 6237 species 1 2641603 2641604 1 -7227 32351 species 1 3734717 3734718 1 -7955 7954 species 1 4736348 4736349 1 -7994 7993 species 1 4744616 4744617 1 -8030 8028 species 1 4766921 4766922 1 -8090 8089 species 1 4802895 4802896 1 -8128 8139 species 1 4796928 4796935 1 -8364 8363 species 1 4865014 4865015 1 -9031 9030 species 1 4926358 4926369 1 -9413 9412 species 1 4936848 4936849 1 -9544 9539 species 1 4948106 4948113 1 -9597 9596 species 1 4948688 4948689 1 -9598 9596 species 1 4948690 4948701 1 -9606 9605 species 1 4948704 4948709 1 -9612 9611 species 1 4942982 4943023 1 -9615 9612 subspecies 1 4942983 4942984 1 -9685 9682 species 1 4942587 4942588 1 -9796 9789 species 1 4941821 4941822 1 -9823 9822 species 1 4945820 4945851 1 -9913 9903 species 1 4945434 4945435 1 -9925 9922 species 1 4945068 4945071 1 -9940 9935 species 1 4945118 4945123 1 -9986 9984 species 1 4949258 4949263 1 -10029 10028 species 1 4956857 4956858 1 -10089 862507 species 1 4953572 4953573 1 -10090 862507 species 1 4953574 4953607 1 -10091 10090 subspecies 1 4953575 4953576 1 -10092 10090 subspecies 1 4953577 4953578 1 -10093 862508 species 1 4953660 4953661 1 -10096 862507 species 1 4953608 4953609 1 -10116 10114 species 1 4953915 4953918 1 -13616 13615 species 1 4932488 4932489 1 -29760 3603 species 1 1989578 1989583 1 -36329 5833 isolate 1 4998881 4998882 1 -37682 4480 species 1 1730742 1730751 1 -39442 10090 subspecies 1 4953581 4953582 1 -39946 4530 no rank 1 1724404 1724409 1 -39947 4530 no rank 1 1724410 1724417 1 -109376 3712 varietas 1 2087543 2087544 1 -112509 4513 subspecies 1 1729677 1729686 1 -200361 37682 subspecies 1 1730749 1730750 1 -511145 
83333 no rank 1 576077 576078 1 -559292 4932 strain 1 2220367 2220368 1 -1736656 39947 no rank 1 1724411 1724412 1 -1736658 39947 no rank 1 1724415 1724416 1 -1736659 39946 no rank 1 1724405 1724406 1 diff --git a/src/tests/databases/ensembl_genome_metadata/organism.txt b/src/tests/databases/ensembl_genome_metadata/organism.txt deleted file mode 100644 index c0bdff55..00000000 --- a/src/tests/databases/ensembl_genome_metadata/organism.txt +++ /dev/null @@ -1,11 +0,0 @@ -1 511145 562 Escherichia coli K-12 K-12 substr. MG1655 Escherichia coli str. K-12 substr. MG1655 str. K12 SAMN02604091 E coli K 12 1e579f8d-3880-424e-9b4f-190eb69280d9 strain 0 -4 9606 9606 human Gambian in Western Division Homo sapiens SAMN17861670 Human 18bd7042-d861-4a10-b5d0-68c8bccfc87e population 0 -5 9606 9606 human Esan in Nigeria Homo sapiens SAMN17861241 Human a3352834-cea1-40aa-9dad-98581620c36b population 0 -6 9606 9606 human Gambian in Western Division Homo sapiens SAMN17861664 Human 87fb40f8-563b-4095-9fce-2bafa77ffba1 population 0 -7 9606 9606 human African from Barbados Homo sapiens SAMN13958415 Human 7f1653e1-9be5-4313-9fe9-800ae18d87b4 population 0 -9 9606 9606 human European Homo sapiens SAMN03283347 Human b0e689ba-889b-40af-8ab9-7675f9df79b6 population 0 -14 36329 5833 Malaria parasite \N Plasmodium falciparum 3D7 SAMN00102897 \N c867d142-85c4-4a5d-8361-b3f7f5fa3544 \N 0 -72 4565 4565 Bread wheat Chinese Spring Triticum aestivum SAMEA4791365 Wheat 86dd50f1-421e-4829-aca5-13ccc9a459f6 cultivar 0 -83 9606 9606 Human \N Homo sapiens SAMN12121739 Human 1d336185-affe-4a91-85bb-04ebd73cbb56 \N 999 -172 559292 4932 Baker's yeast S288C Saccharomyces cerevisiae S288c SAMEA3184125 Bakers yeast ae962453-0287-4201-83b8-3847c7d8027d strain 0 -175 6239 6239 Roundworm N2 Caenorhabditis elegans SAMN04256190 Roundworm b181947a-a725-4866-ada4-5433e5dfdcac strain 0 diff --git a/src/tests/databases/ensembl_genome_metadata/organism_group.txt 
b/src/tests/databases/ensembl_genome_metadata/organism_group.txt deleted file mode 100644 index 37527c46..00000000 --- a/src/tests/databases/ensembl_genome_metadata/organism_group.txt +++ /dev/null @@ -1,8 +0,0 @@ -1 Division EnsemblBacteria bacteria -2 Division EnsemblVertebrates vertebrates -3 Division EnsemblPlants plants -5 Division EnsemblProtists protists -9 Division EnsemblMetazoa metazoa -12 Division EnsemblFungi fungi -13 Internal Populars popular -14 Test EnsemblTest TestDivision diff --git a/src/tests/databases/ensembl_genome_metadata/organism_group_member.txt b/src/tests/databases/ensembl_genome_metadata/organism_group_member.txt deleted file mode 100644 index ddf7fa31..00000000 --- a/src/tests/databases/ensembl_genome_metadata/organism_group_member.txt +++ /dev/null @@ -1,17 +0,0 @@ -1 0 1 1 \N -4 0 4 2 \N -5 0 5 2 \N -6 0 6 2 \N -7 0 7 2 \N -9 0 9 2 \N -12 0 14 5 \N -67 0 72 3 \N -77 0 83 2 \N -159 0 172 12 \N -161 0 175 9 \N -193 0 83 13 1 -196 0 72 13 4 -215 0 172 13 23 -216 0 175 13 24 -233 0 1 13 41 -234 0 14 13 42 diff --git a/src/tests/databases/ensembl_genome_metadata/table.sql b/src/tests/databases/ensembl_genome_metadata/table.sql deleted file mode 100644 index 61540e45..00000000 --- a/src/tests/databases/ensembl_genome_metadata/table.sql +++ /dev/null @@ -1,283 +0,0 @@ -CREATE TABLE assembly -( - assembly_id int auto_increment primary key, - ucsc_name varchar(16) null, - accession varchar(16) not null, - level varchar(32) not null, - name varchar(128) not null, - accession_body varchar(32) null, - assembly_default varchar(128) null, - tol_id varchar(32) null, - created datetime(6) null, - ensembl_name varchar(255) null, - alt_accession varchar(16) null, - assembly_uuid char(36) not null, - is_reference tinyint(1) not null, - url_name varchar(128) null, - constraint accession - unique (accession), - constraint assembly_uuid - unique (assembly_uuid), - constraint ensembl_name - unique (ensembl_name) -); - -CREATE TABLE assembly_sequence -( - 
assembly_sequence_id int auto_increment primary key, - name varchar(128) null, - accession varchar(128) not null, - chromosomal tinyint(1) not null, - length int not null, - sequence_location varchar(10) null, - md5 varchar(32) null, - assembly_id int not null, - chromosome_rank int null, - sha512t24u varchar(128) null, - is_circular tinyint(1) not null, - type varchar(26) not null, - constraint assembly_sequence_assembly_id_accession_5f3e5119_uniq - unique (assembly_id, accession), - constraint assembly_sequence_assembly_id_2a84ddcb_fk_assembly_assembly_id - foreign key (assembly_id) references assembly (assembly_id) - on delete cascade -); - -create index assembly_sequence_assembly_id_chromosomal_index - on assembly_sequence (assembly_id, chromosomal); - -create index assembly_sequence_name_assembly_id_index - on assembly_sequence (name, assembly_id); - -CREATE TABLE attribute -( - attribute_id int auto_increment primary key, - name varchar(128) not null, - label varchar(128) not null, - description varchar(255) null, - type enum ('integer', 'float', 'percent', 'string', 'bp') default 'string' null, - required tinyint(1) DEFAULT '0' not null, - constraint name - unique (name), - constraint name_2 - unique (name), - constraint name_3 - unique (name) -); - -CREATE TABLE dataset_source -( - dataset_source_id int auto_increment primary key, - type varchar(32) not null, - name varchar(255) not null, - constraint name - unique (name) -); - -CREATE TABLE dataset_type -( - dataset_type_id int auto_increment primary key, - name varchar(32) not null, - label varchar(128) not null, - topic varchar(32) not null, - description varchar(255) null, - details_uri varchar(255) null, - parent_id int default null, - depends_on varchar(128) null, - filter_on longtext null, - constraint dataset_type_parent_id_fk - foreign key (parent_id) references dataset_type (dataset_type_id) - on delete set null -); - -CREATE TABLE dataset -( - dataset_id int auto_increment primary key, - 
dataset_uuid char(36) not null, - name varchar(128) not null, - version varchar(128) null, - created datetime(6) not null, - label varchar(128) not null, - dataset_source_id int not null, - dataset_type_id int not null, - status varchar(12) not null, - parent_id int default null, - constraint dataset_dataset_source_id_fd96f115_fk_dataset_s - foreign key (dataset_source_id) references dataset_source (dataset_source_id) - on delete cascade, - constraint dataset_dataset_type_id_47284562_fk_dataset_type_dataset_type_id - foreign key (dataset_type_id) references dataset_type (dataset_type_id), - constraint dataset_parent_id_fk - foreign key (parent_id) references dataset (dataset_id) - on delete cascade -); - -CREATE TABLE dataset_attribute -( - dataset_attribute_id int auto_increment primary key, - value varchar(128) null, - attribute_id int not null, - dataset_id int not null, - constraint dataset_attribute_dataset_id_attribute_id_value_4d1ddfaf_uniq - unique (dataset_id, attribute_id, value), - constraint dataset_attribute_attribute_id_55c51407_fk_attribute - foreign key (attribute_id) references attribute (attribute_id) - on delete cascade, - constraint dataset_attribute_dataset_id_2e2afe19_fk_dataset_dataset_id - foreign key (dataset_id) references dataset (dataset_id) - on delete cascade -); - -create index dataset_attribute_dataset_id_2e2afe19 - on dataset_attribute (dataset_id); - -CREATE TABLE ensembl_site -( - site_id int auto_increment primary key, - name varchar(64) not null, - label varchar(64) not null, - uri varchar(64) not null -); - -CREATE TABLE ensembl_release -( - release_id int auto_increment primary key, - version decimal(10, 1) not null, - release_date date null, - label varchar(64) null, - is_current tinyint(1) not null, - release_type varchar(16) not null, - site_id int null, - status varchar(12) not null, - name varchar(3) null, - constraint ensembl_release_version_site_id_b743399a_uniq - unique (version, site_id), - constraint 
ensembl_release_site_id_7c2f537a_fk_ensembl_site_site_id - foreign key (site_id) references ensembl_site (site_id) -); - -CREATE TABLE organism -( - organism_id int auto_increment primary key, - taxonomy_id int not null, - species_taxonomy_id int null, - common_name varchar(128) not null, - strain varchar(128) null, - scientific_name varchar(128) null, - biosample_id varchar(128) not null, - scientific_parlance_name varchar(255) null, - organism_uuid char(36) not null, - strain_type varchar(128) null, - `rank` int default 0 null, - constraint ensembl_name - unique (biosample_id), - constraint organism_uuid - unique (organism_uuid) -); -CREATE TABLE genome -( - genome_id int auto_increment - primary key, - genome_uuid char(36) not null, - created datetime(6) not null, - assembly_id int not null, - organism_id int not null, - is_best tinyint(1) default 0 not null, - production_name varchar(255) not null, - genebuild_version varchar(20) null, - genebuild_date varchar(20) null, - constraint genome_genome_uuid_6b62d0ad_uniq - unique (genome_uuid), - constraint genome_assembly_id_0a748388_fk_assembly_assembly_id - foreign key (assembly_id) references assembly (assembly_id) - on delete cascade, - constraint genome_organism_id_99ad7f35_fk_organism_organism_id - foreign key (organism_id) references organism (organism_id) - on delete cascade -); - - -CREATE TABLE genome_dataset -( - genome_dataset_id int auto_increment primary key, - is_current tinyint(1) not null, - dataset_id int not null, - genome_id int not null, - release_id int null, - constraint uk_genome_dataset UNIQUE KEY (dataset_id, genome_id), - constraint genome_dataset_dataset_id_0e9b7c99_fk_dataset_dataset_id - foreign key (dataset_id) references dataset (dataset_id) - on delete cascade, - constraint genome_dataset_genome_id_21d55a50_fk_genome_genome_id - foreign key (genome_id) references genome (genome_id) - on delete cascade, - constraint genome_dataset_release_id_1903f87c_fk_ensembl_release_release_id - 
foreign key (release_id) references ensembl_release (release_id) - on delete set null -); - -CREATE TABLE genome_release -( - genome_release_id int auto_increment primary key, - is_current tinyint(1) not null, - genome_id int not null, - release_id int not null, - constraint uk_genome_dataset UNIQUE KEY (release_id, genome_id), - constraint genome_release_genome_id_3e45dc04_fk_genome_genome_id - foreign key (genome_id) references genome (genome_id), - constraint genome_release_release_id_bca7e1e5_fk_ensembl_release_release_id - foreign key (release_id) references ensembl_release (release_id) -); - -CREATE TABLE organism_group -( - organism_group_id int auto_increment primary key, - type varchar(32) null, - name varchar(255) not null, - code varchar(48) null, - constraint code - unique (code), - constraint organism_group_type_name_170b6dae_uniq - unique (type, name) -); - -CREATE TABLE organism_group_member -( - organism_group_member_id int auto_increment primary key, - is_reference tinyint(1) null, - organism_id int not null, - organism_group_id int not null, - `order` int null, - constraint organism_group_member_organism_id_organism_gro_fe8f49ac_uniq - unique (organism_id, organism_group_id), - constraint organism_group_membe_organism_group_id_533ca128_fk_organism_ - foreign key (organism_group_id) references organism_group (organism_group_id) - on delete cascade, - constraint organism_group_membe_organism_id_2808252e_fk_organism_ - foreign key (organism_id) references organism (organism_id) - on delete cascade -); - -CREATE TABLE `ncbi_taxa_name` ( - `taxon_id` int(10) unsigned NOT NULL, - `name` varchar(500) NOT NULL, - `name_class` varchar(50) NOT NULL, - KEY `taxon_id` (`taxon_id`), - KEY `name` (`name`), - KEY `name_class` (`name_class`) -) ENGINE=InnoDB DEFAULT CHARSET=latin1; - -CREATE TABLE `ncbi_taxa_node` ( - `taxon_id` int(10) unsigned NOT NULL, - `parent_id` int(10) unsigned NOT NULL, - `rank` char(32) NOT NULL DEFAULT '', - `genbank_hidden_flag` 
tinyint(1) NOT NULL DEFAULT '0', - `left_index` int(10) NOT NULL DEFAULT '0', - `right_index` int(10) NOT NULL DEFAULT '0', - `root_id` int(10) NOT NULL DEFAULT '1', - PRIMARY KEY (`taxon_id`), - KEY `parent_id` (`parent_id`), - KEY `rank` (`rank`), - KEY `left_index` (`left_index`), - KEY `right_index` (`right_index`) -) ENGINE=InnoDB DEFAULT CHARSET=latin1; \ No newline at end of file diff --git a/src/tests/databases/ncbi_taxonomy.db b/src/tests/databases/ncbi_taxonomy.db new file mode 100644 index 00000000..b829fb4d Binary files /dev/null and b/src/tests/databases/ncbi_taxonomy.db differ diff --git a/src/tests/databases/ncbi_taxonomy/ncbi_taxa_name.txt b/src/tests/databases/ncbi_taxonomy/ncbi_taxa_name.txt deleted file mode 100644 index 5555cf0c..00000000 --- a/src/tests/databases/ncbi_taxonomy/ncbi_taxa_name.txt +++ /dev/null @@ -1,469 +0,0 @@ -562 Achromobacter sp. ATCC 35328 includes -562 ATCC 11775 type material -562 "Bacillus coli" Migula 1895 authority -562 Bacillus coli synonym -562 bacterium 10a includes -562 "Bacterium coli commune" Escherich 1885 authority -562 Bacterium coli commune synonym -562 "Bacterium coli" (Migula 1895) Lehmann and Neumann 1896 authority -562 Bacterium coli synonym -562 bacterium E3 includes -562 CCUG 24 type material -562 CCUG 29300 type material -562 CIP 54.8 type material -562 DSM 30083 type material -562 E. coli common name -562 Enterococcus coli synonym -562 Escherichia coli (Migula 1895) Castellani and Chalmers 1919 authority -562 Escherichia coli scientific name -562 Escherichia/Shigella coli equivalent name -562 Escherichia sp. 3_2_53FAA includes -562 Escherichia sp. MAR includes -562 IAM 12119 type material -562 JCM 1649 type material -562 LMG 2092 type material -562 LMG:2092 type material -562 NBRC 102203 type material -562 NCCB 54008 type material -562 NCTC 9001 type material -562 strain U5/41 type material -3702 Arabidopsis thaliana (L.) 
Heynh., 1842 authority -3702 Arabidopsis thaliana scientific name -3702 Arabis thaliana L., 1753 authority -3702 Arabis thaliana synonym -3702 mouse-ear cress common name -3702 thale-cress common name -3702 thale cress genbank common name -3708 Brassica napus L., 1753 authority -3708 Brassica napus scientific name -3708 oilseed rape common name -3708 rape genbank common name -3708 rapeseeds common name -3711 Brassica rapa L., 1753 authority -3711 Brassica rapa scientific name -3711 field mustard genbank common name -3712 Brassica oleracea L., 1753 authority -3712 Brassica oleracea scientific name -3712 wild cabbage genbank common name -3847 Glycine max (L.) Merr., 1917 authority -3847 Glycine max scientific name -3847 Phaseolus max L., 1753 authority -3847 Phaseolus max synonym -3847 soybean genbank common name -3847 soybeans common name -3880 barrel medic genbank common name -3880 Medicago truncatula Gaertn., 1790 authority -3880 Medicago truncatula scientific name -4081 Lycopersicon esculentum Mill. authority -4081 Lycopersicon esculentum synonym -4081 Lycopersicon esculentum var. esculentum synonym -4081 Solanum esculentum Dunal authority -4081 Solanum esculentum synonym -4081 Solanum lycopersicum L., 1753 authority -4081 Solanum lycopersicum scientific name -4081 Solanum lycopersicum var. humboldtii synonym -4081 tomato genbank common name -4113 potatoes common name -4113 potato genbank common name -4113 Solanum tuberosum L., 1753 authority -4113 Solanum tuberosum scientific name -4113 Solanum tuberosum subsp. tuberosum includes -4513 barley common name -4513 Hordeum vulgare L., 1753 authority -4513 Hordeum vulgare scientific name -4530 Asian cultivated rice genbank common name -4530 Oryza sativa L., 1753 authority -4530 Oryza sativa scientific name -4530 red rice common name -4530 rice common name -4558 Andropogon sorghum (L.) Brot. authority -4558 Andropogon sorghum synonym -4558 broomcorn common name -4558 milo common name -4558 Sorghum bicolor (L.) 
Moench, 1794 authority -4558 Sorghum bicolor scientific name -4558 Sorghum bicolor subsp. bicolor synonym -4558 sorghum genbank common name -4558 Sorghum nervosum Besser ex Schult. authority -4558 Sorghum nervosum synonym -4558 Sorghum saccharatum (L.) Moench authority -4558 Sorghum saccharatum synonym -4558 Sorghum vulgare Pers. authority -4558 Sorghum vulgare synonym -4565 bread wheat genbank common name -4565 Canadian hard winter wheat common name -4565 common wheat common name -4565 Triticum aestivum L., 1753 authority -4565 Triticum aestivum scientific name -4565 Triticum aestivum subsp. aestivum synonym -4565 Triticum vulgare synonym -4565 Triticum vulgare Vill., 1787 authority -4565 wheat common name -4567 durum wheat genbank common name -4567 Triticum durum Desf. authority -4567 Triticum durum ssp. durum synonym -4567 Triticum durum subsp. durum synonym -4567 Triticum durum synonym -4567 Triticum rigidum conv. durum synonym -4567 Triticum rigidum ssp. durum synonym -4567 Triticum rigidum var. durum synonym -4567 Triticum turgidum Durum Group synonym -4567 Triticum turgidum subsp. durum (Desf.) Husn., 1899 authority -4567 Triticum turgidum subsp. durum scientific name -4571 cone wheat common name -4571 English wheat common name -4571 poulard wheat common name -4571 rivet wheat common name -4571 Triticum aethiopicum Jakubz., 1947 authority -4571 Triticum aethiopicum synonym -4571 Triticum durum subsp. abyssinicum synonym -4571 Triticum durum subsp. abyssinicum Vavilov, 1931 authority -4571 Triticum turgidum L., 1753 authority -4571 Triticum turgidum scientific name -4577 maize common name -4577 Zea mays L., 1753 authority -4577 Zea mays scientific name -4577 Zea mays var. 
japonica synonym -4932 ATCC 18824 type material -4932 baker's yeast common name -4932 brewer's yeast genbank common name -4932 Candida robusta synonym -4932 CBS 1171 type material -4932 Mycoderma cerevisiae Desm., 1827 authority -4932 Mycoderma cerevisiae synonym -4932 NRRL Y-12632 type material -4932 Saccharomyces capensis synonym -4932 Saccharomyces cerevisiae (Desm.) Meyen, 1838 authority -4932 Saccharomyces cerevisiae scientific name -4932 Saccharomyces cerevisiae 'var. diastaticus' equivalent name -4932 Saccharomyces diastaticus J. Andrews & R.B. Gilliland ex Van der Walt, 1965 authority -4932 Saccharomyces diastaticus synonym -4932 Saccharomyces italicus synonym -4932 Saccharomyces oviformis synonym -4932 Saccharomyces uvarum var. melibiosus synonym -4932 specimen-voucher:NRRL:Y:12632 type material -5833 malaria parasite P. falciparum genbank common name -5833 Plasmodium falciparum scientific name -5833 Plasmodium (Laverania) falciparum synonym -6239 Caenorhabditis elegans (Maupas, 1900) authority -6239 Caenorhabditis elegans scientific name -6239 Rhabditis elegans Maupas, 1900 authority -6239 Rhabditis elegans synonym -7227 Diptera sp. 
DNAS-2A9-224646 includes -7227 Drosophila melanogaster Meigen, 1830 authority -7227 Drosophila melanogaster scientific name -7227 fruit fly genbank common name -7227 Sophophora melanogaster (Meigen, 1830) authority -7227 Sophophora melanogaster synonym -7955 Brachydanio rerio frankei synonym -7955 Brachydanio rerio synonym -7955 Cyprinus rerio Hamilton, 1822 authority -7955 Cyprinus rerio synonym -7955 Danio frankei synonym -7955 Danio rerio frankei synonym -7955 Danio rerio (Hamilton, 1822) authority -7955 Danio rerio scientific name -7955 leopard danio common name -7955 zebra danio common name -7955 zebrafish genbank common name -7955 zebra fish common name -7994 Astyanax mexicanus (De Filippi, 1853) authority -7994 Astyanax mexicanus scientific name -7994 blind cave fish common name -7994 Mexican tetra genbank common name -7994 Tetragonopterus mexicanus De Filippi, 1853 authority -7994 Tetragonopterus mexicanus synonym -8030 Atlantic salmon genbank common name -8030 Salmo salar Linnaeus, 1758 authority -8030 Salmo salar scientific name -8090 Japanese medaka genbank common name -8090 Japanese rice fish common name -8090 medaka common name -8090 Oryzias latipes scientific name -8090 Oryzias latipes (Temminck & Schlegel, 1846) authority -8090 Poecilia latipes synonym -8090 Poecilia latipes Temminck & Schlegel, 1846 authority -8128 Nile tilapia genbank common name -8128 Oreochromis nilotica synonym -8128 Oreochromis niloticus (Linnaeus, 1758) authority -8128 Oreochromis niloticus scientific name -8128 Perca nilotica Linnaeus, 1758 authority -8128 Perca nilotica synonym -8128 Tilapia nilotica synonym -8364 Silurana tropicalis Gray, 1864 authority -8364 Silurana tropicalis synonym -8364 tropical clawed frog genbank common name -8364 western clawed frog common name -8364 Xenopus laevis tropicalis synonym -8364 Xenopus (Silurana) tropicalis synonym -8364 Xenopus tropicalis (Gray, 1864) authority -8364 Xenopus tropicalis scientific name -9031 bantam common name -9031 
chicken genbank common name -9031 chickens common name -9031 dwarf Leghorn chickens includes -9031 Gallus domesticus equivalent name -9031 Gallus gallus domesticus synonym -9031 Gallus gallus scientific name -9031 Phasianus gallus Linnaeus, 1758 authority -9031 Phasianus gallus synonym -9031 red junglefowl includes -9413 greater false vampire bat common name -9413 Indian false vampire genbank common name -9413 Megaderma lyra Saint-Hilaire, 1810 authority -9413 Megaderma lyra scientific name -9544 Cercopithecus mulatta synonym -9544 Cercopithecus mulatta Zimmermann, 1780 authority -9544 Macaca mulatta scientific name -9544 Macaca mulatta (Zimmermann, 1780) authority -9544 rhesus macaque common name -9544 rhesus macaques common name -9544 Rhesus monkey genbank common name -9544 rhesus monkeys common name -9597 bonobo common name -9597 Pan paniscus Schwarz, 1929 authority -9597 Pan paniscus scientific name -9597 pygmy chimpanzee genbank common name -9598 chimpanzee genbank common name -9598 Pan troglodytes scientific name -9598 Simia troglodytes Linnaeus, 1758 authority -9598 Simia troglodytes synonym -9606 Homo sapiens Linnaeus, 1758 authority -9606 Homo sapiens scientific name -9606 human genbank common name -9612 Canis lupus Linnaeus, 1758 authority -9612 Canis lupus scientific name -9612 gray wolf genbank common name -9612 grey wolf common name -9615 beagle dog includes -9615 beagle dogs includes -9615 Canis canis synonym -9615 Canis domesticus synonym -9615 Canis familiaris Linnaeus, 1758 authority -9615 Canis familiaris synonym -9615 Canis lupus familiaris Linnaeus, 1758 authority -9615 Canis lupus familiaris scientific name -9615 dog genbank common name -9615 dogs common name -9685 cat common name -9685 cats common name -9685 domestic cat genbank common name -9685 Felis catus Linnaeus, 1758 authority -9685 Felis catus scientific name -9685 Felis domesticus synonym -9685 Felis silvestris catus synonym -9685 Korat cats includes -9685 Korat cats L. 
authority -9796 domestic horse common name -9796 equine common name -9796 Equus caballus Linnaeus, 1758 authority -9796 Equus caballus scientific name -9796 Equus przewalskii f. caballus synonym -9796 Equus przewalskii forma caballus synonym -9796 horse genbank common name -9823 pig genbank common name -9823 pigs common name -9823 Sus scrofa Linnaeus, 1758 authority -9823 Sus scrofa scientific name -9823 swine common name -9823 wild boar common name -9913 Bos bovis synonym -9913 Bos primigenius taurus synonym -9913 Bos taurus Linnaeus, 1758 authority -9913 Bos taurus scientific name -9913 Bovidae sp. Adi Nefas includes -9913 bovine common name -9913 cattle genbank common name -9913 cow common name -9913 dairy cow common name -9913 domestic cattle common name -9913 domestic cow common name -9913 ox common name -9925 African dwarf goat includes -9925 African dwarf goats includes -9925 Capra aegagrus hircus synonym -9925 Capra hircus Linnaeus, 1758 authority -9925 Capra hircus scientific name -9925 domestic goat common name -9925 goat genbank common name -9925 goats common name -9925 Naine d'Afrique de l'Ouest includes -9940 domestic sheep common name -9940 lambs common name -9940 Ovis ammon aries synonym -9940 Ovis aries Linnaeus, 1758 authority -9940 Ovis aries scientific name -9940 Ovis orientalis aries synonym -9940 Ovis ovis synonym -9940 sheep genbank common name -9940 wild sheep common name -9986 domestic rabbit common name -9986 European rabbit common name -9986 Japanese white rabbit common name -9986 Lepus cuniculus Linnaeus, 1758 authority -9986 Lepus cuniculus synonym -9986 New Zealand rabbit includes -9986 Oryctolagus cuniculus scientific name -9986 rabbit genbank common name -9986 rabbits common name -10029 Chinese hamster genbank common name -10029 Chinese hamsters common name -10029 CHO cell lines includes -10029 Cricetulus aureus equivalent name -10029 Cricetulus barabensis griseus synonym -10029 Cricetulus griseus Milne-Edwards, 1867 authority -10029 
Cricetulus griseus scientific name -10089 Mus caroli Bonhote, 1902 authority -10089 Mus caroli scientific name -10089 Mus formosanus Kuroda, 1925 authority -10089 Mus formosanus synonym -10089 ricefield mouse common name -10089 Ryukyu mouse genbank common name -10090 house mouse genbank common name -10090 LK3 transgenic mice includes -10090 mouse common name -10090 Mus musculus Linnaeus, 1758 authority -10090 Mus musculus scientific name -10090 Mus sp. 129SV includes -10090 nude mice includes -10090 transgenic mice includes -10091 Mus castaneus synonym -10091 Mus musculus castaneus scientific name -10091 Mus musculus castaneus Waterhouse, 1843 authority -10091 southeastern Asian house mouse genbank common name -10092 Mus domesticus synonym -10092 Mus musculus domesticus Schwarz & Scharz 1943 authority -10092 Mus musculus domesticus scientific name -10092 Mus musculus praetextus synonym -10092 Mus praetextus synonym -10092 western European house mouse genbank common name -10093 Coelomys parahi synonym -10093 Gairdner's shrew-mouse common name -10093 Gairdner's shrewmouse common name -10093 Mus pahari scientific name -10093 Mus pahari Thomas, 1916 authority -10093 shrew mouse genbank common name -10096 Algerian mouse common name -10096 Mus musculus spretus synonym -10096 Mus spretus Lataste, 1883 authority -10096 Mus spretus scientific name -10096 western wild mouse genbank common name -10116 brown rat common name -10116 Buffalo rat includes -10116 laboratory rat includes -10116 Mus norvegicus Berkenhout, 1769 authority -10116 Mus norvegicus synonym -10116 Norway rat genbank common name -10116 rat common name -10116 rats common name -10116 Rattus norvegicus scientific name -10116 Rattus PC12 clone IS includes -10116 Rattus sp. 
strain Wistar includes -10116 Sprague-Dawley rat includes -10116 Wistar rats includes -10116 zitter rats includes -13616 Didelphys domestica synonym -13616 Didelphys domestica Wagner, 1842 authority -13616 gray short-tailed opossum genbank common name -13616 Monodelphis domestica scientific name -29760 Vitis vinifera L., 1753 authority -29760 Vitis vinifera scientific name -29760 Vitis vinifera subsp. vinifera synonym -29760 wine grape genbank common name -36329 Plasmodium falciparum 3D7 scientific name -36329 Plasmodium falciparum (isolate 3D7) synonym -37682 Aegilops squarrosa subsp. squarrosa synonym -37682 Aegilops squarrosa synonym -37682 Aegilops tauschii Coss., 1849 authority -37682 Aegilops tauschii scientific name -37682 Patropyrum tauschii (Coss.) A.Love authority -37682 Patropyrum tauschii subsp. tauschii synonym -37682 Patropyrum tauschii synonym -37682 Triticum aegilops P.Beauv. ex Roem. & Schult. authority -37682 Triticum aegilops synonym -37682 Triticum tauschii (Coss.) Schmalh. authority -37682 Triticum tauschii synonym -39442 eastern European house mouse genbank common name -39442 Mus musculus hortulanus synonym -39442 Mus musculus musculus scientific name -39946 Indian rice common name -39946 Indica rice common name -39946 long-grained rice genbank common name -39946 Oryza sativa (indica cultivar-group) synonym -39946 Oryza sativa Indica Group scientific name -39946 Oryza sativa (indica group) synonym -39946 Oryza sativa subsp. indica Kato authority -39946 Oryza sativa subsp. indica synonym -39946 Oryza sp. Poi-6 includes -39947 Japanese rice genbank common name -39947 Japonica rice common name -39947 Oryza sativa (japonica cultivar-group) synonym -39947 Oryza sativa Japonica Group scientific name -39947 Oryza sativa subsp. japonica synonym -109376 Brassica oleracea subsp. oleracea synonym -109376 Brassica oleracea var. oleracea scientific name -112509 domesticated barley genbank common name -112509 Hordeum sativum Jess. 
authority -112509 Hordeum sativum synonym -112509 Hordeum vulgare subsp. vulgare scientific name -112509 Hordeum vulgare subsp. vulgare Spenn. authority -112509 Hordeum vulgare var. nudum Spenn. authority -112509 Hordeum vulgare var. nudum synonym -112509 Hordeum vulgare var. vulgare synonym -112509 two-rowed barley common name -200361 Aegilops tauschii subsp. strangulata (Eig) Tzvelev, 1973 authority -200361 Aegilops tauschii subsp. strangulata scientific name -511145 Escherichia coli MG1655 synonym -511145 Escherichia coli strain MG1655 equivalent name -511145 Escherichia coli str. K12 substr. MG1655 equivalent name -511145 Escherichia coli str. K-12 substr. MG1655 scientific name -511145 Escherichia coli str. MG1655 equivalent name -559292 Saccharomyces cerevisiae S288C scientific name -1736656 Oryza sativa (javanica cultivar-group) synonym -1736656 Oryza sativa tropical japonica cultivar-group synonym -1736656 Oryza sativa tropical japonica group synonym -1736656 Oryza sativa tropical japonica subgroup scientific name -1736656 Oryza sativa var. javanica Koern. authority -1736656 Oryza sativa var. 
javanica synonym -1736658 Oryza sativa Aromatic Japonica Group synonym -1736658 Oryza sativa aromatic subgroup scientific name -1736658 Oryza sativa Group V synonym -1736659 Oryza sativa aus cultivar-group synonym -1736659 Oryza sativa aus group synonym -1736659 Oryza sativa aus subgroup scientific name -1736659 Oryza sativa aus synonym -3711 3706 merged_taxon_id -37682 4482 merged_taxon_id -112509 4514 merged_taxon_id -4577 4578 merged_taxon_id -9940 9936 merged_taxon_id -9986 9985 merged_taxon_id -7955 27702 merged_taxon_id -9986 34833 merged_taxon_id -10116 36465 merged_taxon_id -9685 36475 merged_taxon_id -9544 36502 merged_taxon_id -3708 36503 merged_taxon_id -4513 36528 merged_taxon_id -7955 37966 merged_taxon_id -4565 39424 merged_taxon_id -37682 40669 merged_taxon_id -4932 41870 merged_taxon_id -9925 57076 merged_taxon_id -8128 61227 merged_taxon_id -37682 70688 merged_taxon_id -4571 77607 merged_taxon_id -10090 85055 merged_taxon_id -4113 90692 merged_taxon_id -10029 143285 merged_taxon_id -4081 195582 merged_taxon_id -10092 210727 merged_taxon_id -4565 235075 merged_taxon_id -9913 272461 merged_taxon_id -4530 389215 merged_taxon_id -562 469598 merged_taxon_id -562 662101 merged_taxon_id -562 662104 merged_taxon_id -562 1637691 merged_taxon_id -562 1806490 merged_taxon_id -7227 2267365 merged_taxon_id diff --git a/src/tests/databases/ncbi_taxonomy/ncbi_taxa_node.txt b/src/tests/databases/ncbi_taxonomy/ncbi_taxa_node.txt deleted file mode 100644 index 90b0b774..00000000 --- a/src/tests/databases/ncbi_taxonomy/ncbi_taxa_node.txt +++ /dev/null @@ -1,64 +0,0 @@ -562 561 species 1 576069 582930 1 -3702 3701 species 1 2083678 2083679 1 -3708 3705 species 1 2087484 2087491 1 -3711 3705 species 1 2087500 2087531 1 -3712 3705 species 1 2087532 2087565 1 -3847 1462606 species 1 2065183 2065184 1 -3880 3877 species 1 2056937 2056944 1 -4081 49274 species 1 1940956 1940959 1 -4113 4107 species 1 1940913 1940916 1 -4513 4512 species 1 1729674 1729689 1 -4530 4527 
species 0 1724403 1724424 1 -4558 4557 species 1 1712790 1712793 1 -4565 4564 species 1 1730800 1730831 1 -4567 4571 subspecies 1 1730853 1730854 1 -4571 4564 species 1 1730852 1730867 1 -4577 4575 species 1 1712922 1712933 1 -4932 4930 species 1 2220338 2220971 1 -5833 418107 species 1 4998850 4999195 1 -6239 6237 species 1 2641603 2641604 1 -7227 32351 species 1 3734717 3734718 1 -7955 7954 species 1 4736348 4736349 1 -7994 7993 species 1 4744616 4744617 1 -8030 8028 species 1 4766921 4766922 1 -8090 8089 species 1 4802895 4802896 1 -8128 8139 species 1 4796928 4796935 1 -8364 8363 species 1 4865014 4865015 1 -9031 9030 species 1 4926358 4926369 1 -9413 9412 species 1 4936848 4936849 1 -9544 9539 species 1 4948106 4948113 1 -9597 9596 species 1 4948688 4948689 1 -9598 9596 species 1 4948690 4948701 1 -9606 9605 species 1 4948704 4948709 1 -9612 9611 species 1 4942982 4943023 1 -9615 9612 subspecies 1 4942983 4942984 1 -9685 9682 species 1 4942587 4942588 1 -9796 9789 species 1 4941821 4941822 1 -9823 9822 species 1 4945820 4945851 1 -9913 9903 species 1 4945434 4945435 1 -9925 9922 species 1 4945068 4945071 1 -9940 9935 species 1 4945118 4945123 1 -9986 9984 species 1 4949258 4949263 1 -10029 10028 species 1 4956857 4956858 1 -10089 862507 species 1 4953572 4953573 1 -10090 862507 species 1 4953574 4953607 1 -10091 10090 subspecies 1 4953575 4953576 1 -10092 10090 subspecies 1 4953577 4953578 1 -10093 862508 species 1 4953660 4953661 1 -10096 862507 species 1 4953608 4953609 1 -10116 10114 species 1 4953915 4953918 1 -13616 13615 species 1 4932488 4932489 1 -29760 3603 species 1 1989578 1989583 1 -36329 5833 isolate 1 4998881 4998882 1 -37682 4480 species 1 1730742 1730751 1 -39442 10090 subspecies 1 4953581 4953582 1 -39946 4530 no rank 1 1724404 1724409 1 -39947 4530 no rank 1 1724410 1724417 1 -109376 3712 varietas 1 2087543 2087544 1 -112509 4513 subspecies 1 1729677 1729686 1 -200361 37682 subspecies 1 1730749 1730750 1 -511145 83333 no rank 1 576077 576078 
1 -559292 4932 strain 1 2220367 2220368 1 -1736656 39947 no rank 1 1724411 1724412 1 -1736658 39947 no rank 1 1724415 1724416 1 -1736659 39946 no rank 1 1724405 1724406 1 diff --git a/src/tests/databases/ncbi_taxonomy/table.sql b/src/tests/databases/ncbi_taxonomy/table.sql deleted file mode 100644 index 78777a3c..00000000 --- a/src/tests/databases/ncbi_taxonomy/table.sql +++ /dev/null @@ -1,23 +0,0 @@ -CREATE TABLE `ncbi_taxa_name` ( - `taxon_id` int(10) unsigned NOT NULL, - `name` varchar(500) NOT NULL, - `name_class` varchar(50) NOT NULL, - KEY `taxon_id` (`taxon_id`), - KEY `name` (`name`), - KEY `name_class` (`name_class`) -) ENGINE=InnoDB DEFAULT CHARSET=latin1; - -CREATE TABLE `ncbi_taxa_node` ( - `taxon_id` int(10) unsigned NOT NULL, - `parent_id` int(10) unsigned NOT NULL, - `rank` char(32) NOT NULL DEFAULT '', - `genbank_hidden_flag` tinyint(1) NOT NULL DEFAULT '0', - `left_index` int(10) NOT NULL DEFAULT '0', - `right_index` int(10) NOT NULL DEFAULT '0', - `root_id` int(10) NOT NULL DEFAULT '1', - PRIMARY KEY (`taxon_id`), - KEY `parent_id` (`parent_id`), - KEY `rank` (`rank`), - KEY `left_index` (`left_index`), - KEY `right_index` (`right_index`) -) ENGINE=InnoDB DEFAULT CHARSET=latin1; \ No newline at end of file diff --git a/src/tests/test_dataset_factory.py b/src/tests/test_dataset_factory.py index 82cdf350..b04eed88 100644 --- a/src/tests/test_dataset_factory.py +++ b/src/tests/test_dataset_factory.py @@ -101,11 +101,15 @@ def test_genebuild_workflow(self, test_dbs, dataset_factory): genebuild_uuid = 'a3352834-cea1-40aa-9dad-99981620c36b' # Test children creation with metadata_db.test_session_scope() as session: - genome = Genome(genebuild_version="1.0", - production_name="new_grch37", - assembly_id=40, - created=func.now(), - organism_id=9) + genome = Genome( + production_name="new_grch37", + assembly_id=40, + created=func.now(), + organism_id=9, + annotation_source="test", + genebuild_date="2026-04", + provider_name="test" + ) session.add(genome) 
genebuild = Dataset( dataset_type_id=2, @@ -185,9 +189,6 @@ def test_genebuild_workflow(self, test_dbs, dataset_factory): succeed_status_check = session.query(Dataset.status).filter(Dataset.dataset_uuid == xref_uuid).one() assert succeed_status == succeed_status_check[0] - failed_status_check = session.query(Dataset.status).filter(Dataset.dataset_uuid == protfeat_uuid).one() - assert failed_status == DatasetStatus.SUBMITTED # "Submitted" - assert failed_status_check[0] == DatasetStatus.SUBMITTED # "Submitted" # succeed on xref temp, succeed_status = dataset_factory.update_dataset_status(xref_uuid, DatasetStatus.PROCESSING, session=session) diff --git a/src/tests/test_exports_changelog.py b/src/tests/test_exports_changelog.py new file mode 100644 index 00000000..dd9f1d1b --- /dev/null +++ b/src/tests/test_exports_changelog.py @@ -0,0 +1,489 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import csv +from pathlib import Path + +import pytest + +from ensembl.production.metadata.api.exports.changelog_generator import ChangelogGenerator +from ensembl.production.metadata.api.models import EnsemblRelease + +db_directory = Path(__file__).parent / 'databases' +db_directory = db_directory.resolve() + + +@pytest.mark.parametrize( + "test_dbs", + [[{'src': db_directory / "ensembl_genome_metadata"}, {'src': db_directory / "ncbi_taxonomy"}]], + indirect=True, +) +class TestChangelogGenerator: + """Test suite for ChangelogGenerator class.""" + + def test_init_valid_parameters(self, test_dbs): + """Test ChangelogGenerator initialization with valid parameters.""" + metadata_uri = test_dbs['ensembl_genome_metadata'].dbc.url + generator = ChangelogGenerator( + metadata_uri=metadata_uri, + release_label="2024-01" + ) + assert generator.metadata_db is not None + assert generator.release_label == "2024-01" + assert generator.output_path is None + + def test_init_with_output_path(self, test_dbs): + """Test ChangelogGenerator initialization with custom output path.""" + metadata_uri = test_dbs['ensembl_genome_metadata'].dbc.url + output_path = "/tmp/test_changelog.csv" + + generator = ChangelogGenerator( + metadata_uri=metadata_uri, + release_label="2024-01", + output_path=output_path + ) + assert generator.metadata_db is not None + assert generator.release_label == "2024-01" + assert generator.output_path == output_path + + def test_init_invalid_metadata_uri_empty(self, test_dbs): + """Test initialization fails with empty metadata URI.""" + with pytest.raises(ValueError) as excinfo: + ChangelogGenerator( + metadata_uri="", + release_label="2024-01" + ) + assert "metadata_uri must be a non-empty string" in str(excinfo.value) + + def test_init_invalid_metadata_uri_none(self, test_dbs): + """Test initialization fails with None metadata URI.""" + with pytest.raises(ValueError) as excinfo: + ChangelogGenerator( + metadata_uri=None, + release_label="2024-01" + ) + assert 
"metadata_uri must be a non-empty string" in str(excinfo.value) + + def test_init_invalid_metadata_uri_not_string(self, test_dbs): + """Test initialization fails with non-string metadata URI.""" + with pytest.raises(ValueError) as excinfo: + ChangelogGenerator( + metadata_uri=123, + release_label="2024-01" + ) + assert "metadata_uri must be a non-empty string" in str(excinfo.value) + + def test_init_invalid_release_label_empty(self, test_dbs): + """Test initialization fails with empty release label.""" + metadata_uri = test_dbs['ensembl_genome_metadata'].dbc.url + + with pytest.raises(ValueError) as excinfo: + ChangelogGenerator( + metadata_uri=metadata_uri, + release_label="" + ) + assert "release_label must be a non-empty string" in str(excinfo.value) + + def test_init_invalid_release_label_none(self, test_dbs): + """Test initialization fails with None release label.""" + metadata_uri = test_dbs['ensembl_genome_metadata'].dbc.url + + with pytest.raises(ValueError) as excinfo: + ChangelogGenerator( + metadata_uri=metadata_uri, + release_label=None + ) + assert "release_label must be a non-empty string" in str(excinfo.value) + + def test_init_invalid_release_label_not_string(self, test_dbs): + """Test initialization fails with non-string release label.""" + metadata_uri = test_dbs['ensembl_genome_metadata'].dbc.url + with pytest.raises(ValueError) as excinfo: + ChangelogGenerator( + metadata_uri=metadata_uri, + release_label=123 + ) + assert "release_label must be a non-empty string" in str(excinfo.value) + + def test_verify_release_exists(self, test_dbs): + """Test verify_release returns correct type for existing release.""" + metadata_uri = test_dbs['ensembl_genome_metadata'].dbc.url + with test_dbs['ensembl_genome_metadata'].dbc.session_scope() as session: + release = session.query(EnsemblRelease).first() + if release: + generator = ChangelogGenerator( + metadata_uri=metadata_uri, + release_label=release.label + ) + release_type = generator.verify_release() + 
assert release_type in ['partial', 'integrated'] + assert release_type == release.release_type + + def test_verify_release_not_found(self, test_dbs): + """Test verify_release raises error for non-existent release.""" + metadata_uri = test_dbs['ensembl_genome_metadata'].dbc.url + generator = ChangelogGenerator( + metadata_uri=metadata_uri, + release_label="nonexistent-release-99999" + ) + with pytest.raises(ValueError) as excinfo: + generator.verify_release() + assert "Release not found" in str(excinfo.value) + assert "nonexistent-release-99999" in str(excinfo.value) + + def test_gather_partial_data_structure(self, test_dbs): + """Test gather_partial_data returns correct structure.""" + metadata_uri = test_dbs['ensembl_genome_metadata'].dbc.url + + with test_dbs['ensembl_genome_metadata'].dbc.session_scope() as session: + partial_release = session.query(EnsemblRelease).filter( + EnsemblRelease.release_type == 'partial' + ).first() + + generator = ChangelogGenerator( + metadata_uri=metadata_uri, + release_label=partial_release.label + ) + + data = generator.gather_partial_data() + + assert isinstance(data, list) + first_entry = data[0] + required_keys = [ + 'scientific_name', 'common_name', 'assembly_name', + 'assembly_accession', 'annotation_provider', + 'geneset_updated', 'variation_updated', 'regulation_updated' + ] + for key in required_keys: + assert key in first_entry, f"Missing key: {key}" + assert isinstance(first_entry['scientific_name'], str) + assert first_entry['common_name'] is None or isinstance(first_entry['common_name'], str) + assert isinstance(first_entry['assembly_name'], str) + assert isinstance(first_entry['assembly_accession'], str) + assert first_entry['annotation_provider'] is None or isinstance(first_entry['annotation_provider'], + str) + assert isinstance(first_entry['geneset_updated'], int) + assert isinstance(first_entry['variation_updated'], int) + assert isinstance(first_entry['regulation_updated'], int) + assert 
first_entry['geneset_updated'] in [0, 1] + assert first_entry['variation_updated'] in [0, 1] + assert first_entry['regulation_updated'] in [0, 1] + + def test_gather_integrated_data_structure(self, test_dbs): + """Test gather_integrated_data returns correct structure.""" + metadata_uri = test_dbs['ensembl_genome_metadata'].dbc.url + with test_dbs['ensembl_genome_metadata'].dbc.session_scope() as session: + integrated_release = session.query(EnsemblRelease).filter( + EnsemblRelease.release_type == 'integrated' + ).first() + if integrated_release: + generator = ChangelogGenerator( + metadata_uri=metadata_uri, + release_label=integrated_release.label + ) + data = generator.gather_integrated_data() + assert isinstance(data, list) + if len(data) > 0: + first_entry = data[0] + required_keys = [ + 'scientific_name', 'common_name', 'assembly_name', + 'assembly_accession', 'annotation_provider', + 'geneset_updated', 'variation_updated', 'regulation_updated', + 'status' + ] + for key in required_keys: + assert key in first_entry, f"Missing key: {key}" + assert first_entry['status'] in ['New', 'Removed', 'Updated', 'Unchanged'] + + def test_get_annotation_sources_bulk(self, test_dbs): + """Test _get_annotation_sources_bulk retrieves annotation sources.""" + metadata_uri = test_dbs['ensembl_genome_metadata'].dbc.url + with test_dbs['ensembl_genome_metadata'].dbc.session_scope() as session: + partial_release = session.query(EnsemblRelease).filter( + EnsemblRelease.release_type == 'partial' + ).first() + generator = ChangelogGenerator( + metadata_uri=metadata_uri, + release_label=partial_release.label + ) + from ensembl.production.metadata.api.models import GenomeDataset + genome_ids = [gr.genome_id for gr in session.query(GenomeDataset.genome_id).filter( + GenomeDataset.release_id == partial_release.release_id + ).distinct().limit(5).all()] + if genome_ids: + annotation_sources = generator._get_annotation_sources_bulk( + session, genome_ids + ) + assert 
isinstance(annotation_sources, dict) + for genome_id in annotation_sources.keys(): + assert isinstance(genome_id, int) + for source in annotation_sources.values(): + assert source is None or isinstance(source, str) + + def test_get_annotation_sources_bulk_empty_list(self, test_dbs): + """Test _get_annotation_sources_bulk handles empty genome list.""" + metadata_uri = test_dbs['ensembl_genome_metadata'].dbc.url + with test_dbs['ensembl_genome_metadata'].dbc.session_scope() as session: + partial_release = session.query(EnsemblRelease).filter( + EnsemblRelease.release_type == 'partial' + ).first() + if partial_release: + generator = ChangelogGenerator( + metadata_uri=metadata_uri, + release_label=partial_release.label + ) + annotation_sources = generator._get_annotation_sources_bulk(session, []) + assert isinstance(annotation_sources, dict) + assert len(annotation_sources) == 0 + + def test_export_to_csv_partial_release(self, test_dbs, tmp_path): + """Test export_to_csv creates file with correct structure for partial release.""" + metadata_uri = test_dbs['ensembl_genome_metadata'].dbc.url + output_file = tmp_path / "test_changelog.csv" + generator = ChangelogGenerator( + metadata_uri=metadata_uri, + release_label="2024-01", + output_path=str(output_file) + ) + + sample_data = [ + { + 'scientific_name': 'homo sapiens', + 'common_name': 'human', + 'assembly_name': 'GRCh38', + 'assembly_accession': 'GCA_000001405.15', + 'annotation_provider': 'Ensembl', + 'geneset_updated': 1, + 'variation_updated': 0, + 'regulation_updated': 1 + } + ] + generator.export_to_csv(sample_data) + + assert output_file.exists() + with open(output_file, 'r') as f: + lines = f.readlines() + assert lines[0].startswith('# Changelog for release') + assert '2024-01' in lines[0] + reader = csv.DictReader(lines[1:]) + rows = list(reader) + assert len(rows) == 1 + assert rows[0]['scientific_name'] == 'homo sapiens' + assert rows[0]['geneset_updated'] == '1' + assert rows[0]['variation_updated'] == '0' 
+ assert 'status' not in rows[0] # Partial releases don't have status + + def test_export_to_csv_integrated_release(self, test_dbs, tmp_path): + """Test export_to_csv creates file with correct structure for integrated release.""" + metadata_uri = test_dbs['ensembl_genome_metadata'].dbc.url + output_file = tmp_path / "test_changelog_integrated.csv" + generator = ChangelogGenerator( + metadata_uri=metadata_uri, + release_label="112", + output_path=str(output_file) + ) + sample_data = [ + { + 'scientific_name': 'homo sapiens', + 'common_name': 'human', + 'assembly_name': 'GRCh38', + 'assembly_accession': 'GCA_000001405.15', + 'annotation_provider': 'Ensembl', + 'geneset_updated': '2024-01', + 'variation_updated': None, + 'regulation_updated': '2024-01', + 'status': 'Updated' + } + ] + generator.export_to_csv(sample_data) + assert output_file.exists() + with open(output_file, 'r') as f: + lines = f.readlines() + assert lines[0].startswith('# Changelog for release') + reader = csv.DictReader(lines[1:]) + rows = list(reader) + assert len(rows) == 1 + assert rows[0]['scientific_name'] == 'homo sapiens' + assert rows[0]['status'] == 'Updated' # Integrated releases have status + + def test_export_to_csv_default_output_path(self, test_dbs, tmp_path, monkeypatch): + """Test export_to_csv uses default output path when none specified.""" + metadata_uri = test_dbs['ensembl_genome_metadata'].dbc.url + monkeypatch.chdir(tmp_path) + generator = ChangelogGenerator( + metadata_uri=metadata_uri, + release_label="2024-01" + ) + sample_data = [ + { + 'scientific_name': 'test species', + 'common_name': 'test', + 'assembly_name': 'test', + 'assembly_accession': 'test', + 'annotation_provider': 'test', + 'geneset_updated': 0, + 'variation_updated': 0, + 'regulation_updated': 0 + } + ] + generator.export_to_csv(sample_data) + default_file = tmp_path / "2024-01.csv" + assert default_file.exists() + + def test_export_to_csv_empty_data(self, test_dbs, tmp_path): + """Test export_to_csv handles 
empty data correctly.""" + metadata_uri = test_dbs['ensembl_genome_metadata'].dbc.url + output_file = tmp_path / "test_empty.csv" + generator = ChangelogGenerator( + metadata_uri=metadata_uri, + release_label="2024-01", + output_path=str(output_file) + ) + generator.export_to_csv([]) + assert output_file.exists() + with open(output_file, 'r') as f: + lines = f.readlines() + assert lines[0].startswith('# Changelog for release') + assert len(lines) >= 2 + + def test_export_to_csv_creates_parent_directory(self, test_dbs, tmp_path): + """Test export_to_csv creates parent directories if they don't exist.""" + metadata_uri = test_dbs['ensembl_genome_metadata'].dbc.url + output_file = tmp_path / "nested" / "directories" / "changelog.csv" + generator = ChangelogGenerator( + metadata_uri=metadata_uri, + release_label="2024-01", + output_path=str(output_file) + ) + sample_data = [ + { + 'scientific_name': 'test', + 'common_name': 'test', + 'assembly_name': 'test', + 'assembly_accession': 'test', + 'annotation_provider': 'test', + 'geneset_updated': 0, + 'variation_updated': 0, + 'regulation_updated': 0 + } + ] + generator.export_to_csv(sample_data) + assert output_file.exists() + assert output_file.parent.exists() + + def test_generate_partial_release(self, test_dbs, tmp_path): + """Test generate method works end-to-end for partial release.""" + metadata_uri = test_dbs['ensembl_genome_metadata'].dbc.url + with test_dbs['ensembl_genome_metadata'].dbc.session_scope() as session: + partial_release = session.query(EnsemblRelease).filter( + EnsemblRelease.release_type == 'partial' + ).first() + output_file = tmp_path / "test_generate.csv" + generator = ChangelogGenerator( + metadata_uri=metadata_uri, + release_label=partial_release.label, + output_path=str(output_file) + ) + generator.generate() + assert output_file.exists() + + def test_generate_integrated_release(self, test_dbs, tmp_path): + """Test generate method works end-to-end for integrated release.""" + metadata_uri = 
test_dbs['ensembl_genome_metadata'].dbc.url + + with test_dbs['ensembl_genome_metadata'].dbc.session_scope() as session: + integrated_release = session.query(EnsemblRelease).filter( + EnsemblRelease.release_type == 'integrated' + ).first() + if integrated_release: + output_file = tmp_path / "test_generate_integrated.csv" + generator = ChangelogGenerator( + metadata_uri=metadata_uri, + release_label=integrated_release.label, + output_path=str(output_file) + ) + generator.generate() + assert output_file.exists() + + def test_generate_invalid_release(self, test_dbs, tmp_path): + """Test generate method raises error for invalid release.""" + metadata_uri = test_dbs['ensembl_genome_metadata'].dbc.url + output_file = tmp_path / "test_invalid.csv" + generator = ChangelogGenerator( + metadata_uri=metadata_uri, + release_label="invalid-release-999", + output_path=str(output_file) + ) + with pytest.raises(ValueError) as excinfo: + generator.generate() + assert "Release not found" in str(excinfo.value) + + def test_gather_partial_data_no_genomes(self, test_dbs): + """Test gather_partial_data returns empty list when no genomes found.""" + metadata_uri = test_dbs['ensembl_genome_metadata'].dbc.url + with test_dbs['ensembl_genome_metadata'].dbc.session_scope() as session: + partial_release = session.query(EnsemblRelease).filter( + EnsemblRelease.release_type == 'partial' + ).first() + generator = ChangelogGenerator( + metadata_uri=metadata_uri, + release_label=partial_release.label + ) + + data = generator.gather_partial_data() + assert isinstance(data, list) + + def test_csv_fieldnames_partial_vs_integrated(self, test_dbs, tmp_path): + """Test that CSV has different fieldnames for partial vs integrated releases.""" + metadata_uri = test_dbs['ensembl_genome_metadata'].dbc.url + + # Partial release data + partial_file = tmp_path / "partial.csv" + generator_partial = ChangelogGenerator( + metadata_uri=metadata_uri, + release_label="2024-01", + output_path=str(partial_file) + ) + + 
partial_data = [{ + 'scientific_name': 'test', 'common_name': 'test', 'assembly_name': 'test', + 'assembly_accession': 'test', 'annotation_provider': 'test', + 'geneset_updated': 0, 'variation_updated': 0, 'regulation_updated': 0 + }] + + generator_partial.export_to_csv(partial_data) + + integrated_file = tmp_path / "integrated.csv" + generator_integrated = ChangelogGenerator( + metadata_uri=metadata_uri, + release_label="112", + output_path=str(integrated_file) + ) + + integrated_data = [{ + 'scientific_name': 'test', 'common_name': 'test', 'assembly_name': 'test', + 'assembly_accession': 'test', 'annotation_provider': 'test', + 'geneset_updated': '2024-01', 'variation_updated': None, 'regulation_updated': None, + 'status': 'New' + }] + + generator_integrated.export_to_csv(integrated_data) + + with open(partial_file, 'r') as f: + lines = f.readlines() + header = lines[1].strip() # Skip comment line + assert 'status' not in header + + with open(integrated_file, 'r') as f: + lines = f.readlines() + header = lines[1].strip() + assert 'status' in header diff --git a/src/tests/test_exports_json.py b/src/tests/test_exports_json.py new file mode 100644 index 00000000..5b65a859 --- /dev/null +++ b/src/tests/test_exports_json.py @@ -0,0 +1,321 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import json +from datetime import datetime +from pathlib import Path + +import pytest + +from ensembl.production.metadata.api.exports.ftp_index import FTPMetadataExporter +from ensembl.production.metadata.api.models import Genome, ReleaseStatus + +db_directory = Path(__file__).parent / 'databases' +db_directory = db_directory.resolve() + + +@pytest.mark.parametrize( + "test_dbs", + [[{'src': db_directory / "ensembl_genome_metadata"}, {'src': db_directory / "ncbi_taxonomy"}]], + indirect=True, +) +class TestFTPMetadataExporter: + """Test suite for FTPMetadataExporter class.""" + + def test_init_valid_uri(self, test_dbs): + """Test FTPMetadataExporter initialization with valid metadata URI.""" + metadata_uri = test_dbs['ensembl_genome_metadata'].dbc.url + exporter = FTPMetadataExporter(metadata_uri) + assert exporter.metadata_db is not None + + def test_export_to_json_returns_dict(self, test_dbs): + """Test export_to_json returns dictionary when no output file specified.""" + metadata_uri = test_dbs['ensembl_genome_metadata'].dbc.url + exporter = FTPMetadataExporter(metadata_uri) + result = exporter.export_to_json() + assert isinstance(result, dict) + assert 'last_updated' in result + assert 'species' in result + assert isinstance(result['species'], dict) + + def test_export_to_json_creates_file(self, test_dbs, tmp_path): + """Test export_to_json creates file when output_file is specified.""" + metadata_uri = test_dbs['ensembl_genome_metadata'].dbc.url + exporter = FTPMetadataExporter(metadata_uri) + output_file = tmp_path / "ftp_metadata.json" + result = exporter.export_to_json(str(output_file)) + assert result is None + assert output_file.exists() + with open(output_file, 'r') as f: + data = json.load(f) + assert 'last_updated' in data + assert 'species' in data + + def test_build_ftp_metadata_json_structure(self, test_dbs): + """Test build_ftp_metadata_json returns correct structure.""" + metadata_uri = test_dbs['ensembl_genome_metadata'].dbc.url + exporter = 
FTPMetadataExporter(metadata_uri) + metadata = exporter.build_ftp_metadata_json() + assert isinstance(metadata, dict) + assert 'last_updated' in metadata + assert 'species' in metadata + assert isinstance(metadata['species'], dict) + first_species = next(iter(metadata['species'].values())) + assert 'assemblies' in first_species + assert isinstance(first_species['assemblies'], dict) + + def test_load_all_genome_data(self, test_dbs): + """Test _load_all_genome_data returns correct structure.""" + metadata_uri = test_dbs['ensembl_genome_metadata'].dbc.url + exporter = FTPMetadataExporter(metadata_uri) + with exporter.metadata_db.session_scope() as session: + genome_data = exporter._load_all_genome_data(session) + assert isinstance(genome_data, dict) + + first_genome_uuid = next(iter(genome_data.keys())) + first_genome_data = genome_data[first_genome_uuid] + assert 'genome' in first_genome_data + assert 'datasets' in first_genome_data + assert 'attributes' in first_genome_data + assert 'genebuild_metadata' in first_genome_data + assert isinstance(first_genome_data['datasets'], list) + assert isinstance(first_genome_data['attributes'], dict) + + @pytest.mark.parametrize( + ("input_name", "expected_name"), + [ + ("homo sapiens", "homo_sapiens"), + ("species.name", "species_name"), + ("species__name", "species_name"), + ("species___name", "species_name"), + ("homo. sapiens", "homo_sapiens"), + ("homo sapiens", "homo_sapiens"), + ("", ""), + ("homo_sapiens", "homo_sapiens"), + ("Homo. 
Sapiens", "Homo_Sapiens"), + ("homo sapiens", "homo_sapiens"), + (" homo sapiens ", "_homo_sapiens_"), + ], + ) + def test_normalize_species_name(self, test_dbs, input_name, expected_name): + """Test _normalize_species_name correctly normalizes species names.""" + metadata_uri = test_dbs["ensembl_genome_metadata"].dbc.url + exporter = FTPMetadataExporter(metadata_uri) + assert exporter._normalize_species_name(input_name) == expected_name + + def test_extract_provider_from_path(self, test_dbs): + """Test _extract_provider_from_path extracts provider correctly.""" + metadata_uri = test_dbs['ensembl_genome_metadata'].dbc.url + exporter = FTPMetadataExporter(metadata_uri) + genebuild_metadata = { + 'genebuild_source_name': 'Ensembl' + } + assert exporter._extract_provider_from_path(genebuild_metadata) == 'ensembl' + genebuild_metadata = { + 'genebuild_source_name': 'REFSEQ' + } + assert exporter._extract_provider_from_path(genebuild_metadata) == 'refseq' + assert exporter._extract_provider_from_path(None) == 'unknown' + assert exporter._extract_provider_from_path({}) == 'unknown' + + def test_extract_genebuild_release_info(self, test_dbs): + """Test _extract_genebuild_release_info extracts release correctly.""" + metadata_uri = test_dbs['ensembl_genome_metadata'].dbc.url + exporter = FTPMetadataExporter(metadata_uri) + genebuild_metadata = { + 'last_geneset_update': '2024-01-01' + } + result = exporter._extract_genebuild_release_info(genebuild_metadata) + assert result['release'] == '2024_01' + genebuild_metadata = { + 'last_geneset_update': '2023-12-15' + } + result = exporter._extract_genebuild_release_info(genebuild_metadata) + assert result['release'] == '2023_12' + result = exporter._extract_genebuild_release_info(None) + assert result['release'] == 'unknown' + result = exporter._extract_genebuild_release_info({}) + assert result['release'] == 'unknown' + genebuild_metadata = { + 'last_geneset_update': 'invalid-date' + } + result = 
exporter._extract_genebuild_release_info(genebuild_metadata) + assert result['release'] == 'unknown' + + def test_extract_release_info_from_ensembl_release(self, test_dbs): + """Test _extract_release_info_from_ensembl_release extracts release correctly.""" + metadata_uri = test_dbs['ensembl_genome_metadata'].dbc.url + exporter = FTPMetadataExporter(metadata_uri) + with exporter.metadata_db.session_scope() as session: + genome = session.query(Genome).first() + + if genome: + result = exporter._extract_release_info_from_ensembl_release(genome) + + assert isinstance(result, dict) + assert 'release' in result + has_released = any( + gr.ensembl_release and gr.ensembl_release.status == ReleaseStatus.RELEASED + for gr in genome.genome_releases + ) + if has_released: + assert result['release'] != 'unknown' + + def test_has_released_dataset_bulk(self, test_dbs): + """Test _has_released_dataset_bulk correctly identifies dataset types.""" + metadata_uri = test_dbs['ensembl_genome_metadata'].dbc.url + exporter = FTPMetadataExporter(metadata_uri) + + datasets = [ + {'dataset_type_name': 'genebuild'}, + {'dataset_type_name': 'assembly'} + ] + assert exporter._has_released_dataset_bulk(datasets, 'genebuild') is True + assert exporter._has_released_dataset_bulk(datasets, 'assembly') is True + assert exporter._has_released_dataset_bulk(datasets, 'variation') is False + datasets = [ + {'dataset_type_name': 'regulatory_features'} + ] + assert exporter._has_released_dataset_bulk(datasets, 'regulation') is True + assert exporter._has_released_dataset_bulk([], 'genebuild') is False + + def test_get_dataset_file_paths_genebuild(self, test_dbs): + """Test _get_dataset_file_paths generates correct file paths for genebuild.""" + metadata_uri = test_dbs['ensembl_genome_metadata'].dbc.url + exporter = FTPMetadataExporter(metadata_uri) + base_path = "homo_sapiens/GRCh38/ensembl/geneset/2024_01" + with exporter.metadata_db.session_scope() as session: + genome = session.query(Genome).first() + 
assembly_data = {'accession': 'GRCh38'} if genome else {} + file_paths = exporter._get_dataset_file_paths( + base_path, 'genebuild', genome, assembly_data + ) + + assert 'annotations' in file_paths + assert 'cdna.fa.gz' in file_paths['annotations'] + assert 'genes.gff3.gz' in file_paths['annotations'] + assert 'genes.gtf.gz' in file_paths['annotations'] + assert 'pep.fa.gz' in file_paths['annotations'] + assert 'vep' in file_paths + assert 'genes.gff3.bgz' in file_paths['vep'] + + def test_get_dataset_file_paths_assembly(self, test_dbs): + """Test _get_dataset_file_paths generates correct file paths for assembly.""" + metadata_uri = test_dbs['ensembl_genome_metadata'].dbc.url + exporter = FTPMetadataExporter(metadata_uri) + base_path = "homo_sapiens/GRCh38/genome" + with exporter.metadata_db.session_scope() as session: + genome = session.query(Genome).first() + assembly_data = {'accession': 'GRCh38'} if genome else {} + file_paths = exporter._get_dataset_file_paths( + base_path, 'assembly', genome, assembly_data + ) + assert 'genome_sequences' in file_paths + assert 'chromosomes.tsv.gz' in file_paths['genome_sequences'] + assert 'hardmasked.fa.gz' in file_paths['genome_sequences'] + assert 'softmasked.fa.gz' in file_paths['genome_sequences'] + assert 'unmasked.fa.gz' in file_paths['genome_sequences'] + assert 'vep' in file_paths + assert 'softmasked.fa.bgz' in file_paths['vep'] + + def test_get_dataset_file_paths_variation(self, test_dbs): + """Test _get_dataset_file_paths generates correct file paths for variation.""" + metadata_uri = test_dbs['ensembl_genome_metadata'].dbc.url + exporter = FTPMetadataExporter(metadata_uri) + base_path = "homo_sapiens/GRCh38/ensembl/variation/2024_01" + with exporter.metadata_db.session_scope() as session: + genome = session.query(Genome).first() + assembly_data = {'accession': 'GRCh38'} if genome else {} + file_paths = exporter._get_dataset_file_paths( + base_path, 'variation', genome, assembly_data + ) + assert 'variation_data' 
in file_paths + assert 'variation.vcf.gz' in file_paths['variation_data'] + + def test_get_dataset_file_paths_regulation(self, test_dbs): + """Test _get_dataset_file_paths generates correct file paths for regulation.""" + metadata_uri = test_dbs['ensembl_genome_metadata'].dbc.url + exporter = FTPMetadataExporter(metadata_uri) + base_path = "homo_sapiens/GRCh38/ensembl/regulation" + with exporter.metadata_db.session_scope() as session: + genome = session.query(Genome).first() + assembly_data = {'accession': 'GRCh38'} if genome else {} + file_paths = exporter._get_dataset_file_paths( + base_path, 'regulation', genome, assembly_data + ) + + assert 'regulatory_features' in file_paths + assert 'regulation.gff' in file_paths['regulatory_features'] + + def test_get_dataset_file_paths_homologies(self, test_dbs): + """Test _get_dataset_file_paths generates correct file paths for homologies.""" + metadata_uri = test_dbs['ensembl_genome_metadata'].dbc.url + exporter = FTPMetadataExporter(metadata_uri) + base_path = "homo_sapiens/GRCh38/ensembl/homology/2024_01" + with exporter.metadata_db.session_scope() as session: + genome = session.query(Genome).first() + if genome: + assembly_data = {'accession': genome.assembly.accession} + file_paths = exporter._get_dataset_file_paths( + base_path, 'homologies', genome, assembly_data + ) + assert 'homology_data' in file_paths + homology_files = file_paths['homology_data'] + assert len(homology_files) > 0 + first_file = next(iter(homology_files.keys())) + assert 'homology.tsv.gz' in first_file + + def test_export_json_with_actual_data(self, test_dbs): + """Test export generates valid JSON structure with actual database data.""" + metadata_uri = test_dbs['ensembl_genome_metadata'].dbc.url + exporter = FTPMetadataExporter(metadata_uri) + metadata = exporter.export_to_json() + assert metadata is not None + assert 'last_updated' in metadata + assert 'species' in metadata + + for species_name, species_data in metadata['species'].items(): + 
assert isinstance(species_name, str) + assert 'assemblies' in species_data + for assembly_name, assembly_data in species_data['assemblies'].items(): + assert isinstance(assembly_name, str) + if 'providers' in assembly_data: + for provider_name, provider_data in assembly_data['providers'].items(): + assert isinstance(provider_name, str) + if 'releases' in provider_data: + for release_name, release_data in provider_data['releases'].items(): + assert isinstance(release_name, str) + if 'datasets' in release_data: + assert isinstance(release_data['datasets'], dict) + + def test_export_handles_empty_database(self, test_dbs): + """Test export handles database with no released genomes gracefully.""" + metadata_uri = test_dbs['ensembl_genome_metadata'].dbc.url + exporter = FTPMetadataExporter(metadata_uri) + metadata = exporter.export_to_json() + assert metadata is not None + assert 'last_updated' in metadata + assert 'species' in metadata + assert isinstance(metadata['species'], dict) + + def test_json_file_is_valid_json(self, test_dbs, tmp_path): + """Test that exported JSON file can be read back and is valid.""" + metadata_uri = test_dbs['ensembl_genome_metadata'].dbc.url + exporter = FTPMetadataExporter(metadata_uri) + output_file = tmp_path / "test_output.json" + exporter.export_to_json(str(output_file)) + + with open(output_file, 'r') as f: + data = json.load(f) + assert 'last_updated' in data + assert 'species' in data + datetime.fromisoformat(data['last_updated']) diff --git a/src/tests/test_exports_stats.py b/src/tests/test_exports_stats.py new file mode 100644 index 00000000..d249555f --- /dev/null +++ b/src/tests/test_exports_stats.py @@ -0,0 +1,340 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
@pytest.mark.parametrize(
    "test_dbs",
    [[{'src': db_directory / "ensembl_genome_metadata"}, {'src': db_directory / "ncbi_taxonomy"}]],
    indirect=True,
)
class TestStatsGenerator:
    """Test suite for StatsGenerator class."""

    # Column order expected in stats.partial.csv and in get_partial_data() rows.
    PARTIAL_FIELDS = [
        'release', 'new_genomes', 'total_genomes',
        'new_assemblies', 'total_assemblies',
        'new_variation_datasets', 'total_variation_datasets',
        'new_regulation_datasets', 'total_regulation_datasets'
    ]
    # Column order expected in stats.integrated.csv and in get_integrated_data() rows.
    INTEGRATED_FIELDS = [
        'release', 'genomes', 'assemblies',
        'variation_datasets', 'regulation_datasets'
    ]
    URI_ERROR = "metadata_uri must be a non-empty string"

    @staticmethod
    def _read_csv(path):
        """Return ``(fieldnames, rows)`` parsed from the CSV file at *path*."""
        # newline='' is what the csv module documents for files passed to a reader.
        with open(path, 'r', newline='') as handle:
            reader = csv.DictReader(handle)
            rows = list(reader)
            return reader.fieldnames, rows

    def test_init_valid_uri(self, test_dbs):
        """Test StatsGenerator initialization with valid metadata URI."""
        metadata_uri = test_dbs['ensembl_genome_metadata'].dbc.url
        generator = StatsGenerator(metadata_uri)
        assert generator.metadata_db is not None
        assert generator.output_path == Path.cwd()

    def test_init_with_output_path(self, test_dbs, tmp_path):
        """Test StatsGenerator initialization with custom output path."""
        metadata_uri = test_dbs['ensembl_genome_metadata'].dbc.url
        output_path = tmp_path / "test_output"
        generator = StatsGenerator(metadata_uri, output_path=str(output_path))
        assert generator.metadata_db is not None
        assert generator.output_path == output_path
        assert output_path.exists()

    def test_init_invalid_uri_empty(self, test_dbs):
        """Test StatsGenerator initialization fails with empty URI."""
        with pytest.raises(ValueError, match=self.URI_ERROR):
            StatsGenerator("")

    def test_init_invalid_uri_none(self, test_dbs):
        """Test StatsGenerator initialization fails with None URI."""
        with pytest.raises(ValueError, match=self.URI_ERROR):
            StatsGenerator(None)

    def test_init_invalid_uri_not_string(self, test_dbs):
        """Test StatsGenerator initialization fails with non-string URI."""
        with pytest.raises(ValueError, match=self.URI_ERROR):
            StatsGenerator(123)

    def test_get_partial_data(self, test_dbs):
        """Test get_partial_data returns correct structure and values."""
        metadata_uri = test_dbs['ensembl_genome_metadata'].dbc.url
        generator = StatsGenerator(metadata_uri)
        partial_data = generator.get_partial_data()
        assert isinstance(partial_data, list)
        # Check the size first: the structural checks below index into the list
        # and must not be skipped (or raise IndexError) on an empty result.
        assert len(partial_data) == 2

        first_release = partial_data[0]
        for key in self.PARTIAL_FIELDS:
            assert key in first_release, f"Missing key: {key}"
        assert isinstance(first_release['release'], str)
        for key in self.PARTIAL_FIELDS[1:]:
            assert isinstance(first_release[key], int)

        # Verify cumulative totals are non-decreasing
        cumulative_keys = [key for key in self.PARTIAL_FIELDS if key.startswith('total_')]
        for previous, current in zip(partial_data, partial_data[1:]):
            for key in cumulative_keys:
                assert current[key] >= previous[key]

        assert first_release['release'] == '2020-10-18'
        assert first_release['new_genomes'] == 3

    def test_get_partial_data_specific_values(self, test_dbs):
        """Test get_partial_data returns specific expected values from test database."""
        metadata_uri = test_dbs['ensembl_genome_metadata'].dbc.url
        generator = StatsGenerator(metadata_uri)
        partial_data = generator.get_partial_data()

        assert len(partial_data) == 2
        assert partial_data[0]['release'] == '2020-10-18'
        assert partial_data[0]['new_genomes'] == 3
        assert partial_data[0]['total_genomes'] == 3

    def test_get_integrated_data(self, test_dbs):
        """Test get_integrated_data returns correct structure and values."""
        metadata_uri = test_dbs['ensembl_genome_metadata'].dbc.url
        generator = StatsGenerator(metadata_uri)
        integrated_data = generator.get_integrated_data()

        assert isinstance(integrated_data, list)
        # Size first, so the indexing below is always valid.
        assert len(integrated_data) == 1

        first_release = integrated_data[0]
        for key in self.INTEGRATED_FIELDS:
            assert key in first_release, f"Missing key: {key}"
        assert isinstance(first_release['release'], str)
        for key in self.INTEGRATED_FIELDS[1:]:
            assert isinstance(first_release[key], int)

        assert first_release['release'] == '2025-07'
        assert first_release['genomes'] == 10

    def test_get_integrated_data_specific_values(self, test_dbs):
        """Test get_integrated_data returns specific expected values from test database."""
        metadata_uri = test_dbs['ensembl_genome_metadata'].dbc.url
        generator = StatsGenerator(metadata_uri)

        integrated_data = generator.get_integrated_data()

        assert len(integrated_data) == 1
        assert integrated_data[0]['release'] == '2025-07'
        assert integrated_data[0]['genomes'] == 10
        assert integrated_data[0]['assemblies'] == 10

    def test_count_datasets(self, test_dbs):
        """Test _count_datasets returns correct count for a specific release and dataset type."""
        metadata_uri = test_dbs['ensembl_genome_metadata'].dbc.url
        generator = StatsGenerator(metadata_uri)
        with generator.metadata_db.session_scope() as session:
            release_id = 1
            variation_count = generator._count_datasets(session, release_id, 'variation')
            assert variation_count == 3

            regulation_count = generator._count_datasets(session, release_id, 'regulatory_features')
            assert regulation_count == 0

    def test_count_and_get_dataset_ids(self, test_dbs):
        """Test _count_and_get_dataset_ids returns correct count and IDs."""
        metadata_uri = test_dbs['ensembl_genome_metadata'].dbc.url
        generator = StatsGenerator(metadata_uri)

        with generator.metadata_db.session_scope() as session:
            release_id = 1
            count, dataset_ids = generator._count_and_get_dataset_ids(
                session, release_id, 'variation'
            )

            assert isinstance(count, int)
            assert isinstance(dataset_ids, set)
            assert count == len(dataset_ids)
            assert count == 3

    def test_export_to_csv(self, test_dbs, tmp_path):
        """Test export_to_csv creates files with correct structure."""
        metadata_uri = test_dbs['ensembl_genome_metadata'].dbc.url
        output_path = tmp_path / "csv_output"
        generator = StatsGenerator(metadata_uri, output_path=str(output_path))

        # Create sample data
        partial_data = [
            {
                'release': 'R1',
                'new_genomes': 10,
                'total_genomes': 10,
                'new_assemblies': 8,
                'total_assemblies': 8,
                'new_variation_datasets': 5,
                'total_variation_datasets': 5,
                'new_regulation_datasets': 3,
                'total_regulation_datasets': 3,
            }
        ]

        integrated_data = [
            {
                'release': 'R1',
                'genomes': 10,
                'assemblies': 8,
                'variation_datasets': 5,
                'regulation_datasets': 3,
            }
        ]

        generator.export_to_csv(partial_data, integrated_data)

        partial_file = output_path / 'stats.partial.csv'
        integrated_file = output_path / 'stats.integrated.csv'
        assert partial_file.exists()
        assert integrated_file.exists()

        # csv returns every cell as text, hence the string comparisons.
        _, rows = self._read_csv(partial_file)
        assert len(rows) == 1
        assert rows[0]['release'] == 'R1'
        assert rows[0]['new_genomes'] == '10'
        assert rows[0]['total_genomes'] == '10'

        _, rows = self._read_csv(integrated_file)
        assert len(rows) == 1
        assert rows[0]['release'] == 'R1'
        assert rows[0]['genomes'] == '10'
        assert rows[0]['assemblies'] == '8'

    def test_export_to_csv_sorting(self, test_dbs, tmp_path):
        """Test export_to_csv sorts data by release label."""
        metadata_uri = test_dbs['ensembl_genome_metadata'].dbc.url
        output_path = tmp_path / "csv_output_sorted"
        generator = StatsGenerator(metadata_uri, output_path=str(output_path))
        # Deliberately out of order: R3, R1, R2.
        partial_data = [
            {'release': 'R3', 'new_genomes': 30, 'total_genomes': 60,
             'new_assemblies': 20, 'total_assemblies': 50,
             'new_variation_datasets': 10, 'total_variation_datasets': 30,
             'new_regulation_datasets': 5, 'total_regulation_datasets': 15},
            {'release': 'R1', 'new_genomes': 10, 'total_genomes': 10,
             'new_assemblies': 8, 'total_assemblies': 8,
             'new_variation_datasets': 5, 'total_variation_datasets': 5,
             'new_regulation_datasets': 3, 'total_regulation_datasets': 3},
            {'release': 'R2', 'new_genomes': 20, 'total_genomes': 30,
             'new_assemblies': 12, 'total_assemblies': 20,
             'new_variation_datasets': 5, 'total_variation_datasets': 10,
             'new_regulation_datasets': 2, 'total_regulation_datasets': 5},
        ]

        generator.export_to_csv(partial_data, [])
        partial_file = output_path / 'stats.partial.csv'
        assert partial_file.exists()

        _, rows = self._read_csv(partial_file)
        assert len(rows) == 3
        assert [row['release'] for row in rows] == ['R1', 'R2', 'R3']

    def test_export_to_csv_empty_data(self, test_dbs, tmp_path):
        """Test export_to_csv handles empty data correctly."""
        metadata_uri = test_dbs['ensembl_genome_metadata'].dbc.url
        output_path = tmp_path / "csv_output_empty"
        generator = StatsGenerator(metadata_uri, output_path=str(output_path))
        generator.export_to_csv([], [])

        partial_file = output_path / 'stats.partial.csv'
        integrated_file = output_path / 'stats.integrated.csv'

        assert partial_file.exists()
        assert integrated_file.exists()

        _, rows = self._read_csv(partial_file)
        assert len(rows) == 0

        _, rows = self._read_csv(integrated_file)
        assert len(rows) == 0

    def test_generate_integration(self, test_dbs, tmp_path):
        """Test generate method integrates all components correctly."""
        metadata_uri = test_dbs['ensembl_genome_metadata'].dbc.url
        output_path = tmp_path / "generate_output"
        generator = StatsGenerator(metadata_uri, output_path=str(output_path))
        generator.generate()
        partial_file = output_path / 'stats.partial.csv'
        integrated_file = output_path / 'stats.integrated.csv'

        assert partial_file.exists()
        assert integrated_file.exists()

        fieldnames, _ = self._read_csv(partial_file)
        assert fieldnames is not None
        assert fieldnames == self.PARTIAL_FIELDS

        fieldnames, _ = self._read_csv(integrated_file)
        assert fieldnames is not None
        assert fieldnames == self.INTEGRATED_FIELDS

    def test_partial_data_ordering(self, test_dbs):
        """Test that partial data is returned in correct order by release label."""
        metadata_uri = test_dbs['ensembl_genome_metadata'].dbc.url
        generator = StatsGenerator(metadata_uri)
        partial_data = generator.get_partial_data()
        # Assert that partial data exists and is not empty *before* using it.
        assert partial_data is not None, "Partial data should not be None"
        assert len(partial_data) >= 1, "Partial data should contain at least one item"
        release_labels = [item['release'] for item in partial_data]
        assert release_labels == sorted(release_labels)

    def test_integrated_data_ordering(self, test_dbs):
        """Test that integrated data is returned in correct order by release label."""
        metadata_uri = test_dbs['ensembl_genome_metadata'].dbc.url
        generator = StatsGenerator(metadata_uri)
        integrated_data = generator.get_integrated_data()
        # The comparison holds trivially for zero or one item, so no length
        # guard or special empty-list branch is needed.
        release_labels = [item['release'] for item in integrated_data]
        assert release_labels == sorted(release_labels)
genome_filters['genome_uuid'][0] assert genome.genome_uuid == genome_factory_result['genome_uuid'] @@ -112,7 +112,7 @@ def test_fetch_genomes_by_dataset_uuid(self, test_dbs, genome_factory, genome_fi assert genome_factory_result is not None metadata_db = DBConnection(test_dbs['ensembl_genome_metadata'].dbc.url) with metadata_db.session_scope() as session: - dataset = session.query(Dataset).filter(Dataset.dataset_uuid == genome_filters['dataset_uuid']).one() + dataset = session.query(Dataset).filter(Dataset.dataset_uuid == genome_filters['dataset_uuid'][0]).one() assert genome_factory_result['dataset_uuid'] == genome_filters['dataset_uuid'][0] assert dataset.dataset_uuid == genome_filters['dataset_uuid'][0] @@ -124,7 +124,7 @@ def test_fetch_genomes_by_default_status_submitted(self, test_dbs, genome_factor metadata_db = DBConnection(test_dbs['ensembl_genome_metadata'].dbc.url) with metadata_db.session_scope() as session: dataset: Dataset = session.query(Dataset).filter( - Dataset.dataset_uuid == genome_filters['dataset_uuid']).one() + Dataset.dataset_uuid == genome_filters['dataset_uuid'][0]).one() assert genome_factory_result['dataset_uuid'] == genome_filters['dataset_uuid'][0] assert dataset.dataset_uuid == genome_filters['dataset_uuid'][0] assert dataset.status.value == genome_factory_result['dataset_status'] @@ -143,15 +143,15 @@ def test_update_dataset_status_submitted_processing_processed_released(self, tes # fetch genomes by status submitted and update to processing genome_factory_result = [genome for genome in genome_factory.get_genomes(**genome_filters)][0] - logger.debug(f"Factory Results 1 {genome_factory_result}") + # logger.debug(f"Factory Results 1 {genome_factory_result}") metadata_db = DBConnection(test_dbs['ensembl_genome_metadata'].dbc.url) with metadata_db.session_scope() as session: # check genebuild one has been updated to Processing as well dataset: Dataset = session.query(Dataset).filter(Dataset.dataset_uuid == genebuild_uuid).one() - 
logger.debug(f"Dataset 1 {dataset}") + # logger.debug(f"Dataset 1 {dataset}") assert genome_factory_result['updated_dataset_status'] == dataset.status.value dataset: Dataset = session.query(Dataset).filter(Dataset.dataset_uuid == leaf_uuid).one() - logger.debug(f"Dataset 1 {dataset}") + # logger.debug(f"Dataset 1 {dataset}") assert genome_factory_result['updated_dataset_status'] == dataset.status.value # update dataset status to processed @@ -160,13 +160,13 @@ def test_update_dataset_status_submitted_processing_processed_released(self, tes # fetch genomes by status processing and update to processed genome_factory_result = [genome for genome in genome_factory.get_genomes(**genome_filters)][0] - logger.debug(f"Factory Results 2 {genome_factory_result}") + # logger.debug(f"Factory Results 2 {genome_factory_result}") with metadata_db.session_scope() as session: dataset = session.query(Dataset).filter(Dataset.dataset_uuid == genebuild_uuid).one() - logger.debug(f"Dataset 2 {dataset}") + # logger.debug(f"Dataset 2 {dataset}") assert 'Processing' == dataset.status.value dataset = session.query(Dataset).filter(Dataset.dataset_uuid == leaf_uuid).one() - logger.debug(f"Dataset 2b {dataset}") + # logger.debug(f"Dataset 2b {dataset}") assert genome_factory_result['updated_dataset_status'] == dataset.status.value # update dataset status to processed @@ -176,11 +176,11 @@ def test_update_dataset_status_submitted_processing_processed_released(self, tes # fetch genomes by status processed and update to released with pytest.raises(DatasetFactoryException): genome_factory_result = [genome for genome in genome_factory.get_genomes(**genome_filters)][0] - logger.debug(f"Factory Results 3 {genome_factory_result}") + # logger.debug(f"Factory Results 3 {genome_factory_result}") # assert nothing happened in DB with metadata_db.session_scope() as session: dataset = session.query(Dataset).filter(Dataset.dataset_uuid == leaf_uuid).one() - logger.debug(f"Dataset 3 {dataset}") + # 
logger.debug(f"Dataset 3 {dataset}") assert 'Processed' == dataset.status.value # TODO complete the test with all sub datasets updated to processed before moving leaf to # release then asses that genebuild is now released @@ -201,3 +201,4 @@ def test_expected_columns_on_update_status(self, genome_factory, expected_column expected_columns.append('updated_dataset_status') returned_columns = list(next(genome_factory.get_genomes(**genome_filters)).keys()) assert returned_columns.sort() == expected_columns.sort() + diff --git a/src/tests/test_grpc_release.py b/src/tests/test_grpc_release.py index 74a8660d..fe0b191b 100644 --- a/src/tests/test_grpc_release.py +++ b/src/tests/test_grpc_release.py @@ -60,13 +60,13 @@ def test_fetch_all_releases(self, release_conn, allow_unreleased, expected_count logger.debug("Results: %s", releases) assert len(releases) == expected_count assert [release.EnsemblSite.name == 'Ensembl' for release in releases] - assert releases[1].EnsemblRelease.label == 'MVP Beta-1' + assert releases[1].EnsemblRelease.label == '2020-10-18' @pytest.mark.parametrize( "allow_unreleased, genome_uuid, release_name", [ - (False, 'a73351f7-93e7-11ec-a39d-005056b38ce3', 'First Beta'), - (True, '75b7ac15-6373-4ad5-9fb7-23813a5355a4', 'MVP Beta-2') + (False, 'a73351f7-93e7-11ec-a39d-005056b38ce3', '2023-06-15'), + (True, '75b7ac15-6373-4ad5-9fb7-23813a5355a4', '2021-10-18') ], indirect=['allow_unreleased'] ) @@ -84,9 +84,10 @@ def test_fetch_releases_for_genome(self, release_conn, allow_unreleased, genome_ "allow_unreleased, dataset_uuid, release_name, release_status", [ (False, '8801edaf-86ec-4799-8fd4-a59077f04c05', None, None), # No release returned is not allowed - (False, '08543d8d-2110-46f3-a9b6-ac58c4af8202', 'MVP Beta-1', 'Released'), # No release returned is not allowed - (True, 'd57040b6-0ef5-4e6b-97ef-be0ad94d3a61', 'MVP Beta-2', 'Prepared'), # Processed Beta-2 - (True, 'd641779c-2add-46ce-acf4-a2b6f15274b1', 'MVP Beta-3', 'Preparing'), # Processed Beta-2 
+ (False, '08543d8d-2110-46f3-a9b6-ac58c4af8202', '2020-10-18', 'Released'), + # No release returned is not allowed + (True, 'd57040b6-0ef5-4e6b-97ef-be0ad94d3a61', '2021-10-18', 'Prepared'), # Processed Beta-2 + (True, 'd641779c-2add-46ce-acf4-a2b6f15274b1', '2022-10-18', 'Preparing'), # Processed Beta-2 ], indirect=['allow_unreleased'] ) diff --git a/src/tests/test_organism_to_organismgroup.py b/src/tests/test_organism_to_organismgroup.py deleted file mode 100644 index 102276bf..00000000 --- a/src/tests/test_organism_to_organismgroup.py +++ /dev/null @@ -1,90 +0,0 @@ -# See the NOTICE file distributed with this work for additional information -# regarding copyright ownership. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-import re -from pathlib import Path -import pytest -from collections import namedtuple -from ensembl.production.metadata.api.models import OrganismGroup, Genome, Organism, OrganismGroupMember -from ensembl.production.metadata.scripts.organism_to_organismgroup import process_genomes, \ - create_or_remove_organism_group -from ensembl.utils.database import UnitTestDB, DBConnection - -db_directory = Path(__file__).parent / 'databases' -db_directory = db_directory.resolve() - -# Define a named tuple for script args -Args = namedtuple('Args', [ - 'metadata_db_uri', 'core_server_uri', 'organism_group_type', - 'organism_group_name', 'genome_uuid', 'release_id', 'remove', 'raise_error' -]) - - -@pytest.mark.parametrize("test_dbs", [[{'src': Path(__file__).parent / "databases/ensembl_genome_metadata"}, - {'src': Path(__file__).parent / "databases/ncbi_taxonomy"}, - {'src': Path(__file__).parent / "databases/core_1"}, - ]], - indirect=True) -class TestAddOrRemoveOrganismGroup: - dbc = None - - @pytest.mark.parametrize( - "genome_uuids, organism_group_type, organism_group_name, release_id, remove", - [ - ('a7335667-93e7-11ec-a39d-005056b38ce3', 'Test', 'EnsemblTest', '', False), - ('a7335667-93e7-11ec-a39d-005056b38ce3', 'Test', 'EnsemblTest', '', True), - - ] - ) - def test_add_organismgroup(self, test_dbs, genome_uuids, organism_group_type, organism_group_name, release_id, - remove): - metadata_db = DBConnection(test_dbs['ensembl_genome_metadata'].dbc.url) - args = Args( - metadata_db_uri=test_dbs['ensembl_genome_metadata'].dbc.url, - core_server_uri=None, - organism_group_type='Test', - organism_group_name='EnsemblTest', - genome_uuid=['a7335667-93e7-11ec-a39d-005056b38ce3'], - release_id=[], - remove=remove, - raise_error=False - ) - - # Mock the database connection - with metadata_db.session_scope() as session: - organism_group = session.query(OrganismGroup).filter( - OrganismGroup.name == args.organism_group_name, - OrganismGroup.type == args.organism_group_type - 
).one_or_none() - - organism_group_id = organism_group.organism_group_id if organism_group else None - assert organism_group_id is not None - process_genomes(session, args, organism_group_id=organism_group_id) - session.commit() - # Check if the organism group was added - query = ( - session.query(Genome, Organism, OrganismGroup).join(Organism, Organism.organism_id == Genome.organism_id - ).join(OrganismGroupMember, - OrganismGroupMember.organism_id == Organism.organism_id - ).join(OrganismGroup, - OrganismGroup.organism_group_id == OrganismGroupMember.organism_group_id - ).filter( - Genome.genome_uuid.in_([args.genome_uuid]), - OrganismGroup.name == args.organism_group_name, - ) - ) - if remove: - assert query.count() == 0, "Organism group member should be removed" - else: - assert query.count() > 0 - for genome, organism, organism_group in query.all(): - assert organism_group.name == args.organism_group_name, f"Expected {args.organism_group_name}, got {organism_group.name}" - assert organism_group.type == args.organism_group_type, f"Expected {args.organism_group_type}, got {organism_group.type}" diff --git a/src/tests/test_protobuf_msg_factory.py b/src/tests/test_protobuf_msg_factory.py index 1244b6da..f2c3836f 100644 --- a/src/tests/test_protobuf_msg_factory.py +++ b/src/tests/test_protobuf_msg_factory.py @@ -106,44 +106,25 @@ def test_create_stats_by_organism_uuid(self, genome_conn): organism_uuid = "1e579f8d-3880-424e-9b4f-190eb69280d9" input_data = genome_conn.fetch_genome_datasets(organism_uuid=organism_uuid, dataset_type_name="all") - first_expected_stat = { - 'label': 'assembly.accession', - 'name': 'assembly.accession', - 'statisticType': 'string', - 'statisticValue': 'GCA_000005845.2' - } output = json_format.MessageToJson(msg_factory.create_stats_by_genome_uuid(input_data)[0]) - assert json.loads(output)['genomeUuid'] == "a73351f7-93e7-11ec-a39d-005056b38ce3" - # check the first stat info of the first genome_uuid - # 
print(json.loads(output)['statistics']) - assert json.loads(output)['statistics'][0] == first_expected_stat + output_dict = json.loads(output) - def test_create_top_level_statistics(self, genome_conn): - # ecoli - organism_uuid = "1e579f8d-3880-424e-9b4f-190eb69280d9" - input_data = genome_conn.fetch_genome_datasets(organism_uuid=organism_uuid, dataset_type_name="all") + assert output_dict["genomeUuid"] == "a73351f7-93e7-11ec-a39d-005056b38ce3" + + # Don't assume order - search for the specific statistic + stats = output_dict["statistics"] + assembly_accession_stat = next( + (s for s in stats if s["name"] == "assembly.accession"), + None + ) - first_expected_stat = { + assert assembly_accession_stat is not None, "assembly.accession statistic not found" + assert assembly_accession_stat == { 'label': 'assembly.accession', 'name': 'assembly.accession', 'statisticType': 'string', 'statisticValue': 'GCA_000005845.2' } - stats_by_genome_uuid = msg_factory.create_stats_by_genome_uuid(input_data) - - output = json_format.MessageToJson( - msg_factory.create_top_level_statistics({ - 'organism_uuid': organism_uuid, - 'stats_by_genome_uuid': stats_by_genome_uuid - }) - ) - output_dict = json.loads(output) - assert 'organismUuid' in output_dict.keys() and 'statsByGenomeUuid' in output_dict.keys() - # These tests are pain in the back - # TODO: find a way to improve this spaghetti - assert output_dict["organismUuid"] == "1e579f8d-3880-424e-9b4f-190eb69280d9" - assert output_dict['statsByGenomeUuid'][0]['genomeUuid'] == "a73351f7-93e7-11ec-a39d-005056b38ce3" - assert output_dict['statsByGenomeUuid'][0]['statistics'][0] == first_expected_stat def test_create_genome_sequence(self, genome_conn): input_data = genome_conn.fetch_sequences(genome_uuid="a7335667-93e7-11ec-a39d-005056b38ce3") @@ -196,7 +177,7 @@ def test_create_genome_assembly_sequence_region(self, genome_conn): (False, 108.0, { "releaseVersion": 108.0, "releaseDate": "2023-06-15", - "releaseLabel": "First Beta", + 
"releaseLabel": "2023-06-15", "releaseType": "partial", "isCurrent": False, "siteName": "Ensembl", @@ -205,8 +186,8 @@ def test_create_genome_assembly_sequence_region(self, genome_conn): }), (False, 110.1, { "releaseVersion": 110.1, - "releaseDate": "2023-10-18", - "releaseLabel": "MVP Beta-1", + "releaseDate": "2020-10-18", + "releaseLabel": "2020-10-18", "releaseType": "partial", "isCurrent": True, "siteName": "Ensembl", @@ -215,8 +196,8 @@ def test_create_genome_assembly_sequence_region(self, genome_conn): }), (True, 110.3, { "releaseVersion": 110.3, - "releaseDate": "Unreleased", - "releaseLabel": "MVP Beta-3", + "releaseDate": "2022-10-18", + "releaseLabel": "2022-10-18", "releaseType": "partial", "isCurrent": False, "siteName": "Ensembl", @@ -267,16 +248,9 @@ def test_create_organisms_group_count(self, genome_conn, expected_count, allow_u "genome_tag, current_only, expected_output", [ # url_name = GRCh38 => homo_sapien 38 - ("GRCh38", True, {'genomeUuid': 'a7335667-93e7-11ec-a39d-005056b38ce3'}), - #Todo: Need to review how genomes are fetched from release version (minor revision) - #genome_select = genome_select.filter(EnsemblRelease.version <= release_version) - #if a genome is assigned to 110.1 & 108.0 and current release version is 110.3 - #the return should be ordered to its genome last release version 110.1 - ("GRCh38", False, {"genomeUuid": "a7335667-93e7-11ec-a39d-005056b38ce3"}), - # tol_id = mHomSap1 => homo_sapien 37 - # I randomly picked up this tol_id, probably wrong (biologically speaking) - ("GRCh37", False, {"genomeUuid": "3704ceb1-948d-11ec-a39d-005056b38ce3"}), - # Null + ("grch38", True, {'genomeUuid': 'a7335667-93e7-11ec-a39d-005056b38ce3'}), + ("grch38", False, {"genomeUuid": "a7335667-93e7-11ec-a39d-005056b38ce3"}), + ("grch37", False, {"genomeUuid": "3704ceb1-948d-11ec-a39d-005056b38ce3"}), ("iDontExist", False, {}), ] ) diff --git a/src/tests/test_release_factory.py b/src/tests/test_release_factory.py index 56444403..5ef93665 100644 
--- a/src/tests/test_release_factory.py +++ b/src/tests/test_release_factory.py @@ -10,6 +10,7 @@ # See the License for the specific language governing permissions and # limitations under the License. import logging +from datetime import datetime from decimal import Decimal from pathlib import Path @@ -19,6 +20,7 @@ from ensembl.production.metadata.api.exceptions import MissingMetaException from ensembl.production.metadata.api.factories.genomes import GenomeFactory from ensembl.production.metadata.api.factories.release import ReleaseFactory +from ensembl.production.metadata.api.factories.utils import get_genome_sets_by_assembly_and_provider from ensembl.production.metadata.api.models import * logger = logging.getLogger(__name__) @@ -41,21 +43,18 @@ def test_init_release_default(self, test_dbs) -> None: with metadata_db.session_scope() as session: last_release = session.query(EnsemblRelease).order_by(EnsemblRelease.version.desc()).first() expected_version = Decimal("1.0") if last_release is None else last_release.version + Decimal("0.1") + label = "2028-09-11" + date = datetime.strptime(label, "%Y-%m-%d").date() + + factory.init_release(label=label) - try: - # Call init_release but don't assert on the returned object - factory.init_release(label=str(expected_version)) - except Exception as e: - pytest.fail(f"Unexpected exception: {e}") - # ✅ Re-fetch in a new session with metadata_db.session_scope() as session: release = session.query(EnsemblRelease).filter(EnsemblRelease.version == expected_version).one_or_none() - assert release is not None, "Release was not inserted into the database" assert release.version == expected_version - assert release.release_date is None # Should allow NULL - assert release.label == str(expected_version) # Default label behavior + assert release.release_date == date + assert release.label == label assert release.release_type == "partial" assert release.status == ReleaseStatus.PLANNED @@ -200,3 +199,21 @@ def 
test_pre_release_check_processed_alternative(self, test_dbs): factory = ReleaseFactory(test_dbs['ensembl_genome_metadata'].dbc.url) errors = factory.pre_release_check("4") assert not errors, f"Unexpected errors found: {errors}" + + +@pytest.mark.parametrize("test_dbs", [[{'src': Path(__file__).parent / "databases/ensembl_genome_metadata"}, + {'src': Path(__file__).parent / "databases/ncbi_taxonomy"}, + ]], indirect=True) +class TestFactoryUtils: + dbc: UnitTestDB = None + + def test_get_genome_sets_by_assembly_and_provider(self, test_dbs) -> None: + """ + Test 'get_genome_sets_by_assembly_and_provider'. + Pretty bad test. We haven't populated the metadata here with an updated genome so it just returns an empty set. + """ + metadata_db = DBConnection(test_dbs['ensembl_genome_metadata'].dbc.url) + + with metadata_db.session_scope() as session: + genome_sets = get_genome_sets_by_assembly_and_provider(session) + assert genome_sets == {} diff --git a/src/tests/test_scripts.py b/src/tests/test_scripts.py new file mode 100644 index 00000000..206da357 --- /dev/null +++ b/src/tests/test_scripts.py @@ -0,0 +1,334 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from collections import namedtuple +from unittest.mock import patch +from urllib.parse import urlparse + +import pytest + +from ensembl.production.metadata.api.models import Assembly +from ensembl.production.metadata.api.models import OrganismGroup +from ensembl.production.metadata.scripts.copy_handover_files import * +from ensembl.production.metadata.scripts.create_datasets_json import * +from ensembl.production.metadata.scripts.delete_ftp_by_uuid import * +from ensembl.production.metadata.scripts.organism_to_organismgroup import * + +db_directory = Path(__file__).parent / 'databases' +db_directory = db_directory.resolve() + +# Define a named tuple for script args +Args = namedtuple('Args', [ + 'metadata_db_uri', 'core_server_uri', 'organism_group_type', + 'organism_group_name', 'genome_uuid', 'release_id', 'remove', 'raise_error' +]) + + +@pytest.mark.parametrize( + "test_dbs", + [[ + {"src": db_directory / "ensembl_genome_metadata"}, + {'src': db_directory / "ncbi_taxonomy"}, + {'src': db_directory / "core_1"}, + ]], + indirect=True, +) +class TestScripts: + """Test suite for various metadata scripts.""" + + def test_check_directory_single_path_valid(self, test_dbs, tmp_path): + """Test check_directory function with single valid directory (returns string).""" + test_dir = tmp_path / "test_dir" + test_dir.mkdir() + result = check_directory(str(test_dir)) + assert result == str(test_dir) + + def test_check_directory_invalid(self, test_dbs): + """Test check_directory function with invalid directory.""" + with pytest.raises(argparse.ArgumentTypeError) as excinfo: + check_directory("/nonexistent/directory/path") + assert "does not exist" in str(excinfo.value) + + def test_generate_full_paths(self, test_dbs): + """Test generate_full_paths creates correct FTP and NFS paths.""" + relative_paths = ["species1/assembly1", "species2/assembly2"] + ftp_root = "/ftp/root/" + nfs_root = "/nfs/root/" + + result = generate_full_paths(relative_paths, ftp_root, nfs_root) + + 
assert len(result) == 4 # 2 relative paths * 2 roots + assert "/ftp/root/species1/assembly1" in result + assert "/nfs/root/species1/assembly1" in result + assert "/ftp/root/species2/assembly2" in result + assert "/nfs/root/species2/assembly2" in result + + def test_generate_full_paths_empty(self, test_dbs): + """Test generate_full_paths with empty input.""" + result = generate_full_paths([], "/ftp/", "/nfs/") + assert result == [] + + def test_submit_slurm_job_test_mode(self, test_dbs, capsys): + """Test submit_slurm_job in test mode (no actual submission).""" + paths = ["/path1", "/path2"] + submit_slurm_job(paths, test=True) + + captured = capsys.readouterr() + assert "[TEST MODE]" in captured.out + assert "/path1" in captured.out + assert "/path2" in captured.out + + def test_submit_slurm_job_empty_paths(self, test_dbs, capsys): + """Test submit_slurm_job with empty paths list.""" + submit_slurm_job([], test=False) + + captured = capsys.readouterr() + assert "No paths to delete" in captured.out + + @patch('subprocess.run') + def test_submit_slurm_job_actual_submission(self, mock_subprocess, test_dbs): + """Test submit_slurm_job makes correct subprocess call.""" + paths = ["/path1", "/path2"] + submit_slurm_job(paths, test=False) + + # Verify subprocess.run was called + mock_subprocess.assert_called_once() + call_args = mock_subprocess.call_args[0][0] + assert "sbatch" in call_args + assert "--wrap" in call_args + + def test_variation_tracks_json_parsing(self, test_dbs, tmp_path): + """Test variation_tracks function parses JSON correctly.""" + # Create test JSON file + test_data = { + "genome-uuid-1": { + "datafiles": { + "file1": str(tmp_path / "source1.vcf"), + "file2": str(tmp_path / "source2.vcf") + } + } + } + + # Create source files + (tmp_path / "source1.vcf").touch() + (tmp_path / "source2.vcf").touch() + + json_file = tmp_path / "test.json" + with open(json_file, 'w') as f: + json.dump(test_data, f) + + dest_dir = tmp_path / "destination" + 
dest_dir.mkdir() + + # Run the function + variation_tracks(str(json_file), "release_1", [str(dest_dir) + "/"]) + + # Verify files were copied + genome_dir = dest_dir / "genome-uuid-1" + assert genome_dir.exists() + assert (genome_dir / "source1.vcf").exists() + assert (genome_dir / "source2.vcf").exists() + + def test_variation_tracks_invalid_json(self, test_dbs, tmp_path): + """Test variation_tracks handles invalid JSON gracefully.""" + json_file = tmp_path / "invalid.json" + with open(json_file, 'w') as f: + f.write("not valid json{") + + with pytest.raises(Exception): + variation_tracks(str(json_file), "release_1", ["/tmp/"]) + + def test_regulation_copy_creates_directory(self, test_dbs, tmp_path): + """Test regulation_copy creates destination directories.""" + source_file = tmp_path / "source.bb" + source_file.touch() + + test_data = [ + { + "genome_uuid": "test-genome-uuid", + "dataset_source": {"name": str(source_file), "type": "bigbed"}, + "dataset_type": "regulation", + "dataset_attribute": [], + "name": "test_regulation", + "label": "test_label", + "version": "1.0", + } + ] + + json_file = tmp_path / "regulation.json" + with open(json_file, 'w') as f: + json.dump(test_data, f) + dest_base = tmp_path / "destination" + dest_base.mkdir() + regulation_copy(str(json_file), "release_1", [str(dest_base) + "/"]) + expected_dir = dest_base / "test-genome-uuid" + assert expected_dir.exists() + expected_file = expected_dir / f"regulatory-features{source_file.suffix}" + assert expected_file.exists() + + def test_fetch_division_name(self, test_dbs): + """Test fetch_division_name retrieves division from core database.""" + core_uri = test_dbs.get('core_1') + if core_uri: + with DBConnection(core_uri.dbc.url).session_scope() as session: + division = session.query(Meta).filter( + Meta.meta_key == 'species.division' + ).first() + result = fetch_division_name(core_uri.dbc.url) + if division: + assert result == division.meta_value + else: + assert result is None + + def 
test_create_organism_group_member(self, test_dbs): + """Test create_or_remove_organism_group creates new member.""" + metadata_uri = test_dbs['ensembl_genome_metadata'].dbc.url + + with DBConnection(metadata_uri).session_scope() as session: + organism = session.query(Organism).first() + org_group = session.query(OrganismGroup).first() + if organism and org_group: + existing = session.query(OrganismGroupMember).filter( + OrganismGroupMember.organism_id == organism.organism_id, + OrganismGroupMember.organism_group_id == org_group.organism_group_id + ).first() + if not existing: + msg = create_or_remove_organism_group( + session, organism.organism_id, org_group.organism_group_id, remove=False + ) + assert "created successfully" in msg or "already exists" in msg + member = session.query(OrganismGroupMember).filter( + OrganismGroupMember.organism_id == organism.organism_id, + OrganismGroupMember.organism_group_id == org_group.organism_group_id + ).first() + assert member is not None + + def test_remove_organism_group_member(self, test_dbs): + """Test create_or_remove_organism_group removes member.""" + metadata_uri = test_dbs['ensembl_genome_metadata'].dbc.url + with DBConnection(metadata_uri).session_scope() as session: + member = session.query(OrganismGroupMember).first() + if member: + organism_id = member.organism_id + group_id = member.organism_group_id + msg = create_or_remove_organism_group( + session, organism_id, group_id, remove=True + ) + assert "removed successfully" in msg or "not found" in msg + + def test_json_file_structure_for_ftp_copy(self, test_dbs, tmp_path): + """Test that ftp_copy can parse expected JSON structure.""" + test_data = [ + { + "genome_uuid": "test-uuid", + "dataset_source": { + "name": str(tmp_path / "test.file"), + "type": "vep" + }, + "dataset_type": "vep", + "name": "test_vep", + "label": "test_label", + "version": "1.0" + } + ] + json_file = tmp_path / "test_ftp.json" + with open(json_file, 'w') as f: + json.dump(test_data, f) + 
with open(json_file, 'r') as f: + loaded_data = json.load(f) + assert len(loaded_data) == 1 + assert loaded_data[0]['genome_uuid'] == "test-uuid" + assert loaded_data[0]['dataset_type'] == "vep" + + def test_duckdb_script_environment_variable(self, test_dbs, monkeypatch): + """Test that DuckDB script reads from environment variable.""" + test_uri = "mysql://testuser:testpass@testhost:3306/testdb" + monkeypatch.setenv('METADATA_DB', test_uri) + db = urlparse(os.environ.get('METADATA_DB')) + assert db.hostname == "testhost" + assert db.port == 3306 + assert db.username == "testuser" + assert db.path[1:] == "testdb" + + def test_ftp_metadata_paths_structure(self, test_dbs): + """Test that genome public path structure is correct for FTP metadata.""" + metadata_uri = test_dbs['ensembl_genome_metadata'].dbc.url + with DBConnection(metadata_uri).session_scope() as session: + genome = session.query(Genome).first() + if genome and hasattr(genome, 'get_public_path'): + paths = genome.get_public_path(dataset_type='genebuild') + assert isinstance(paths, list) + if len(paths) > 0: + first_path = paths[0] + assert 'dataset_type' in first_path or 'path' in first_path + + def test_genome_public_path_all_types(self, test_dbs): + """Test genome.get_public_path with 'all' dataset type.""" + metadata_uri = test_dbs['ensembl_genome_metadata'].dbc.url + with DBConnection(metadata_uri).session_scope() as session: + genome = session.query(Genome).first() + if genome and hasattr(genome, 'get_public_path'): + paths = genome.get_public_path(dataset_type='all') + assert isinstance(paths, list) + if len(paths) > 1: + dataset_types = {p.get('dataset_type') for p in paths if 'dataset_type' in p} + assert len(dataset_types) > 1 + + def test_ftp_delete_checks_shared_organism(self, test_dbs): + """Test that FTP delete logic checks for shared organisms.""" + metadata_uri = test_dbs['ensembl_genome_metadata'].dbc.url + + with DBConnection(metadata_uri).session_scope() as session: + genome = 
session.query(Genome).first() + if genome: + other_genomes_count = session.query(Genome).filter( + Genome.organism_id == genome.organism_id, + Genome.genome_uuid != genome.genome_uuid + ).count() + assert isinstance(other_genomes_count, int) + assert other_genomes_count >= 0 + + def test_ftp_delete_checks_shared_assembly(self, test_dbs): + """Test that FTP delete logic checks for shared assemblies.""" + metadata_uri = test_dbs['ensembl_genome_metadata'].dbc.url + + with DBConnection(metadata_uri).session_scope() as session: + genome = session.query(Genome).first() + if genome: + other_assemblies_count = session.query(Genome).filter( + Genome.assembly_id == genome.assembly_id, + Genome.genome_uuid != genome.genome_uuid + ).count() + assert isinstance(other_assemblies_count, int) + assert other_assemblies_count >= 0 + + def test_organism_scientific_name_formatting(self, test_dbs): + """Test that organism scientific names are formatted correctly for paths.""" + metadata_uri = test_dbs['ensembl_genome_metadata'].dbc.url + + with DBConnection(metadata_uri).session_scope() as session: + organism = session.query(Organism).first() + if organism: + scientific_name = organism.scientific_name + formatted_name = scientific_name.replace(" ", "_") + assert " " not in formatted_name + assert "_" in formatted_name or len(scientific_name.split()) == 1 + + def test_assembly_accession_in_paths(self, test_dbs): + """Test that assembly accessions are available for path construction.""" + metadata_uri = test_dbs['ensembl_genome_metadata'].dbc.url + + with DBConnection(metadata_uri).session_scope() as session: + assembly = session.query(Assembly).first() + if assembly: + assert assembly.accession is not None + assert len(assembly.accession) > 0 + assert " " not in assembly.accession diff --git a/src/tests/test_updater.py b/src/tests/test_updater.py index 823ef75b..e5100d6a 100644 --- a/src/tests/test_updater.py +++ b/src/tests/test_updater.py @@ -34,7 +34,6 @@ {'src': 
Path(__file__).parent / "databases/core_6"}, {'src': Path(__file__).parent / "databases/core_7"}, {'src': Path(__file__).parent / "databases/core_8"}, - {'src': Path(__file__).parent / "databases/core_9"} ]], indirect=True) class TestUpdater: @@ -42,7 +41,8 @@ class TestUpdater: def test_new_organism(self, test_dbs): test = meta_factory(test_dbs['core_1'].dbc.url, - test_dbs['ensembl_genome_metadata'].dbc.url) + test_dbs['ensembl_genome_metadata'].dbc.url, + test_dbs['ncbi_taxonomy'].dbc.url) test.process_core() # Check for insertion of genome_uuid @@ -64,11 +64,9 @@ def test_new_organism(self, test_dbs): organism = session.query(Organism).where(Organism.biosample_id == 'Jabberwocky').first() assembly = session.query(Assembly).where(Assembly.name == 'jaber01').first() assert organism.scientific_name == 'carol_jabberwocky' - assert organism.genomes[0].genebuild_version == 'ENS01' assert organism.genomes[0].genebuild_date == '2023-01' # Test the Assembly assert assembly.accession == 'GCF_1111111123.3' - assert assembly.alt_accession == 'GCA_0000012345.3' # select * from genebuild where version = 999 and name = 'genebuild and label =01 dataset = session.query(Dataset).where( (Dataset.version == 'ENS01') & (Dataset.name == 'genebuild') @@ -79,17 +77,17 @@ def test_new_organism(self, test_dbs): assert dataset.dataset_type.name == "genebuild" # Testing assembly sequence is circular sequence = session.query(AssemblySequence).where( - (AssemblySequence.is_circular == 1) & (AssemblySequence.name == 'TEST1_seqA') + (AssemblySequence.is_circular == 1) & (AssemblySequence.name == 'AA123456.1') ).first() assert sequence is not None assert sequence.type == "primary_assembly" # Testing assembly_sequence.type sequence2 = session.query(AssemblySequence).where( - (AssemblySequence.is_circular == 0) & (AssemblySequence.name == 'TEST2_seqB') + (AssemblySequence.is_circular == 0) & (AssemblySequence.name == 'AA123456.2') ).first() assert sequence2 is not None assert sequence.type == 
"primary_assembly" sequence3 = session.query(AssemblySequence).where( - (AssemblySequence.is_circular == 0) & (AssemblySequence.name == 'TEST3_seqC') + (AssemblySequence.is_circular == 0) & (AssemblySequence.name == 'AA123456.3') ).first() assert sequence3 is not None count = session.query(Dataset).join(DatasetSource).join(DatasetType) \ @@ -101,7 +99,9 @@ def test_new_organism(self, test_dbs): assert count == 1 def test_fail_existing_genome_uuid_no_data(self, test_dbs): - test = meta_factory(test_dbs['core_2'].dbc.url, test_dbs['ensembl_genome_metadata'].dbc.url) + test = meta_factory(test_dbs['core_2'].dbc.url, + test_dbs['ensembl_genome_metadata'].dbc.url, + test_dbs['ncbi_taxonomy'].dbc.url) with pytest.raises(MetadataUpdateException) as exif: test.process_core() assert ("Database contains a Genome.genome_uuid, " @@ -109,35 +109,68 @@ def test_fail_existing_genome_uuid_no_data(self, test_dbs): "Please remove it from the meta key and resubmit" in str(exif.value)) def test_update_assembly(self, test_dbs): - test = meta_factory(test_dbs['core_3'].dbc.url, test_dbs['ensembl_genome_metadata'].dbc.url) + test = meta_factory(test_dbs['core_3'].dbc.url, + test_dbs['ensembl_genome_metadata'].dbc.url, + test_dbs['ncbi_taxonomy'].dbc.url) test.process_core() + + core_3_db = DBConnection(test_dbs['core_3'].dbc.url) + with core_3_db.session_scope() as core_session: + inserted_meta = core_session.query(Meta).filter( + Meta.species_id == "1", + Meta.meta_key == 'genome.genome_uuid' + ).first() + inserted_genome_uuid = inserted_meta.meta_value + metadata_db = DBConnection(test_dbs['ensembl_genome_metadata'].dbc.url) with metadata_db.session_scope() as session: - organism = session.query(Organism).where(Organism.biosample_id == 'Jabberwocky').first() - assert organism.scientific_name == 'carol_jabberwocky' - assert organism.genomes[1].assembly.accession == 'weird02' - assert organism.genomes[1].genebuild_version == 'ENS01' - assert organism.genomes[1].genebuild_date == 
'2024-02' + genome = session.query(Genome).filter( + Genome.genome_uuid == inserted_genome_uuid + ).one() + organism = genome.organism + assert organism.scientific_name == 'carol_jabberwocky' + assert genome.assembly.accession == 'weird02' + assert genome.genebuild_date == '2024-02' # def test_update_geneset(self, test_dbs): - test = meta_factory(test_dbs['core_4'].dbc.url, test_dbs['ensembl_genome_metadata'].dbc.url) + # Run the update process + test = meta_factory(test_dbs['core_4'].dbc.url, + test_dbs['ensembl_genome_metadata'].dbc.url, + test_dbs['ncbi_taxonomy'].dbc.url) test.process_core() + + # Get the genome_uuid that was just inserted into core_4 by the process + core_4_db = DBConnection(test_dbs['core_4'].dbc.url) + with core_4_db.session_scope() as core_session: + inserted_meta = core_session.query(Meta).filter( + Meta.species_id == "1", + Meta.meta_key == 'genome.genome_uuid' + ).first() + inserted_genome_uuid = inserted_meta.meta_value + metadata_db = DBConnection(test_dbs['ensembl_genome_metadata'].dbc.url) with metadata_db.session_scope() as session: - dataset = session.query(Dataset).where( - (Dataset.version == "ENS02") & (Dataset.name == 'genebuild') - ).first() - assert dataset is not None - assert re.match(".*_core_4", dataset.dataset_source.name) - assert dataset.dataset_source.type == "core" - assert dataset.dataset_type.name == "genebuild" - assert dataset.genome_datasets[0].genome.genebuild_version == 'ENS02' - assert dataset.genome_datasets[0].genome.genebuild_date == '2023-01' - assert dataset.genome_datasets[0].genome.genome_releases is not None + genome = session.query(Genome).filter( + Genome.genome_uuid == inserted_genome_uuid + ).one() + genebuild_dataset = session.query(Dataset).join(GenomeDataset).join(Genome).filter( + Genome.genome_uuid == inserted_genome_uuid, + Dataset.name == "genebuild" + ).one() + + assert genebuild_dataset is not None + + assert re.match(".*_core_4", genebuild_dataset.dataset_source.name) + assert 
genebuild_dataset.dataset_source.type == "core" + assert genebuild_dataset.dataset_type.name == "genebuild" + assert genome.genebuild_date == '2023-01' # From core_4 meta table + assert len(genome.genome_releases) > 0 def test_taxonomy_common_name(self, test_dbs): - test = meta_factory(test_dbs['core_5'].dbc.url, test_dbs['ensembl_genome_metadata'].dbc.url) + test = meta_factory(test_dbs['core_5'].dbc.url, + test_dbs['ensembl_genome_metadata'].dbc.url, + test_dbs['ncbi_taxonomy'].dbc.url) test.process_core() metadata_db = DBConnection(test_dbs['ensembl_genome_metadata'].dbc.url) with metadata_db.session_scope() as session: @@ -145,55 +178,18 @@ def test_taxonomy_common_name(self, test_dbs): assert organism.common_name == 'Sheep' def test_fail_existing_genome_uuid_data_not_match(self, test_dbs): - test = meta_factory(test_dbs['core_6'].dbc.url, test_dbs['ensembl_genome_metadata'].dbc.url) + test = meta_factory(test_dbs['core_6'].dbc.url, + test_dbs['ensembl_genome_metadata'].dbc.url, + test_dbs['ncbi_taxonomy'].dbc.url) with pytest.raises(MetadataUpdateException) as exif: test.process_core() assert ("Core database contains a genome.genome_uuid which matches an entry in the meta table. " "The force flag was not specified so the core was not updated." 
in str(exif.value)) - # def test_update_unreleased_no_force(self, test_dbs): - # test = meta_factory(test_dbs['core_7'].dbc.url, test_dbs['ensembl_genome_metadata'].dbc.url) - # test.process_core() - # metadata_db = DBConnection(test_dbs['ensembl_genome_metadata'].dbc.url) - # with metadata_db.session_scope() as session: - # # Check that the old datasets have been removed - # genebuild_test = session.query(Dataset).join(DatasetSource).join(DatasetType).filter( - # DatasetSource.name.like('%core_5'), - # ).filter(DatasetType.name == "genebuild").one_or_none() - # assert genebuild_test is None - # - # count = session.query(DatasetAttribute).join(Attribute).filter( - # Attribute.name == 'genebuild.provider_name', - # DatasetAttribute.value == 'removed_for_test' - # ).count() - # assert count == 0 - # - # # Check that the new dataset are present and not duplicated - # count = session.query(Dataset).join(DatasetSource).join(DatasetType).filter( - # DatasetSource.name.like('%core_7'), - # DatasetType.name == 'assembly' - # ).count() - # assert count == 0 - # count = session.query(Dataset).join(DatasetSource).join(DatasetType).filter( - # DatasetSource.name.like('%core_7'), - # DatasetType.name == 'genebuild' - # ).count() - # assert count == 1 - # # Check that new assembly attribute values are not present - # count = session.query(DatasetAttribute).join(Attribute).filter( - # Attribute.name == 'assembly.ucsc_alias', - # DatasetAttribute.value == 'test_alias' - # ).count() - # assert count == 0 - # # Check that new genebuild attribute values are present - # count = session.query(DatasetAttribute).join(Attribute).filter( - # Attribute.name == 'genebuild.havana_datafreeze_date', - # DatasetAttribute.value == 'test2' - # ).count() - # assert count > 0 - def test_update_released(self, test_dbs): - test = meta_factory(test_dbs['core_8'].dbc.url, test_dbs['ensembl_genome_metadata'].dbc.url) + test = meta_factory(test_dbs['core_8'].dbc.url, + 
test_dbs['ensembl_genome_metadata'].dbc.url, + test_dbs['ncbi_taxonomy'].dbc.url) with pytest.raises(Exception) as exif: test.process_core() assert ("Existing Organism, Assembly, and Datasets within a release. ") diff --git a/src/tests/test_utils.py b/src/tests/test_utils.py index caaa13ae..98dab17d 100644 --- a/src/tests/test_utils.py +++ b/src/tests/test_utils.py @@ -90,11 +90,11 @@ def test_get_genomes_from_assembly_accession_iterator(self, genome_conn, allow_u output = [ json.loads(json_format.MessageToJson(response)) for response in utils.get_genomes_from_assembly_accession_iterator( - db_conn=genome_conn, assembly_accession="GCA_000005845.2", release_version=None + db_conn=genome_conn, assembly_accession="GCA_000005845.2" ) ] - assert len(output) == expected_count + assert len(output) == 1 @pytest.mark.parametrize( "assembly_accession, release_version", @@ -523,8 +523,8 @@ def test_get_genomes_by_name(self, genome_conn): }, 'release': { 'isCurrent': True, - 'releaseDate': '2023-10-18', - 'releaseLabel': 'MVP Beta-1', + 'releaseDate': '2020-10-18', + 'releaseLabel': '2020-10-18', 'releaseType': 'partial', 'releaseVersion': 110.1, 'siteLabel': 'MVP Ensembl', @@ -562,8 +562,7 @@ def test_get_genomes_by_name_release_unspecified(self, genome_conn): 'ensemblName': 'WBcel235', 'isReference': True, 'level': 'chromosome', - 'name': 'WBcel235', - 'urlName': 'wbcel235' + 'name': 'WBcel235' }, 'attributesInfo': { 'assemblyDate': '2012-12', @@ -594,7 +593,7 @@ def test_get_genomes_by_name_release_unspecified(self, genome_conn): }, 'release': { 'releaseDate': '2023-06-15', - 'releaseLabel': 'First Beta', + 'releaseLabel': '2023-06-15', 'releaseType': 'partial', 'releaseVersion': 108.0, 'siteLabel': 'MVP Ensembl', @@ -642,7 +641,7 @@ def test_get_genomes_by_name_release_unspecified(self, genome_conn): "genome_tag, expected_output", [ # url_name = GRCh38 => homo_sapien 38 - ("GRCh38", {"genomeUuid": "a7335667-93e7-11ec-a39d-005056b38ce3"}), + ("grch38", {"genomeUuid": 
"a7335667-93e7-11ec-a39d-005056b38ce3"}), # Null ("iDontExist", {}), ] diff --git a/src/tests/tests_exports.py b/src/tests/tests_exports.py deleted file mode 100644 index 536aec6f..00000000 --- a/src/tests/tests_exports.py +++ /dev/null @@ -1,20 +0,0 @@ -# See the NOTICE file distributed with this work for additional information -# regarding copyright ownership. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -Unit tests for utils.py -""" -import logging - -logger = logging.getLogger(__name__) - -# TODO create tests for the stats generator and the changlog generator. Wait for the new schema as this will be -# useless to do now.