Merged
34 commits
d041692
Removed production name attribute
dpopleton Feb 7, 2024
2ec24ca
Initial function definitions
dpopleton Feb 8, 2024
b5143fb
Added session
dpopleton Feb 8, 2024
713e0d2
update_dataset_status finished and dataset.status definitions improved
dpopleton Feb 8, 2024
16e7f60
added tests for update status and improved session logic
dpopleton Feb 8, 2024
a408ff6
update_dataset_attributes implemented with tests
dpopleton Feb 8, 2024
f69cd91
Merge branch 'main' into feature/dataset_factory
dpopleton Feb 13, 2024
2bc6058
added tests for update status and improved session logic
dpopleton Feb 14, 2024
110e5d7
added get_parent_dataset to dataset_factory.py
dpopleton Feb 14, 2024
4ce9dac
Added child update and tests to dataset_factory.py
dpopleton Feb 15, 2024
264ec1b
Minor code cleanup
dpopleton Feb 15, 2024
42f2f13
Dataset Status enum added
dpopleton Feb 20, 2024
c072960
Updated Dataset.py
marcoooo Feb 20, 2024
92eae49
Refactored dataset factory to handle new logic
dpopleton Feb 23, 2024
1163525
Refactored dataset factory to handle new logic
dpopleton Feb 26, 2024
46836f9
Refactored dataset factory and cleaned methods
dpopleton Feb 27, 2024
97d8d1d
Updated DatasetStatus Enum to string.
dpopleton Feb 27, 2024
1dadb45
Updated version
dpopleton Feb 27, 2024
b201dad
Fixed dataset factory to use names rather than int
dpopleton Feb 27, 2024
91b18d9
Forced urllib3 to downgrade to 1.26.*
marcoooo Feb 27, 2024
128f87d
Fixed dataset_factory.py tests
dpopleton Feb 27, 2024
6569cc6
Merge branch 'main' of github.com:Ensembl/ensembl-metadata-api into f…
dpopleton Feb 27, 2024
89c3a84
Fixed dataset_factory.py tests
dpopleton Feb 28, 2024
e984c14
Merge remote-tracking branch 'origin/feature/dataset_factory' into fe…
dpopleton Feb 28, 2024
b84dc67
Minor updates of dataset_factory.py
dpopleton Feb 28, 2024
2d51ac4
Renamed and moved files
dpopleton Feb 28, 2024
8c5fe45
Fixed Reference
dpopleton Feb 29, 2024
71e11c2
Fixed logic error
dpopleton Feb 29, 2024
35e469b
Reverted complex logic in status.
dpopleton Feb 29, 2024
c99a8f7
Revert to string enum.
dpopleton Feb 29, 2024
5a3beb8
Minor fixes
dpopleton Mar 1, 2024
0614c2a
genome factory for new metadata db and test cases adde
vinay-ebi Mar 3, 2024
cb82986
fix: ONLY_FULL_GROUP_BY wic work for any mysql client version
vinay-ebi Mar 3, 2024
7f62c83
Merge pull request #75 from Ensembl/feature/genome_factory_v1
vinay-ebi Mar 4, 2024
2 changes: 1 addition & 1 deletion VERSION
@@ -1 +1 @@
-2.0.0
+2.0.1
7 changes: 4 additions & 3 deletions requirements.in
@@ -1,6 +1,7 @@
 ensembl-py@git+https://github.com/Ensembl/ensembl-py.git@1.2.2
 grpcio
 grpcio-tools
 grpcio-reflection
-sqlalchemy
-types-pymysql
+sqlalchemy<=2.0
+types-pymysql
+urllib3~=1.26.15

25 changes: 11 additions & 14 deletions requirements.txt
@@ -4,7 +4,7 @@
 #
 # pip-compile requirements.in
 #
-certifi==2024.2.2
+certifi==2023.11.17
 # via requests
 charset-normalizer==3.3.2
 # via requests
@@ -16,14 +16,11 @@ exceptiongroup==1.2.0
 # via pytest
 greenlet==3.0.3
 # via sqlalchemy
-grpcio==1.62.0
+grpcio==1.60.0
 # via
 #   -r requirements.in
-#   grpcio-reflection
 #   grpcio-tools
-grpcio-reflection==1.62.0
-# via -r requirements.in
-grpcio-tools==1.62.0
+grpcio-tools==1.60.0
 # via -r requirements.in
 idna==3.6
 # via requests
@@ -33,13 +30,11 @@ mysqlclient==2.1.1
 # via ensembl-py
 packaging==23.2
 # via pytest
-pluggy==1.4.0
+pluggy==1.3.0
 # via pytest
-protobuf==4.25.3
-# via
-#   grpcio-reflection
-#   grpcio-tools
-pytest==8.0.2
+protobuf==4.25.2
+# via grpcio-tools
+pytest==7.4.4
 # via
 #   ensembl-py
 #   pytest-dependency
@@ -62,8 +57,10 @@ tomli==2.0.1
 # via pytest
 types-pymysql==1.1.0.1
 # via -r requirements.in
-urllib3==2.2.1
-# via requests
+urllib3==1.26.18
+# via
+#   -r requirements.in
+#   requests

# The following packages are considered to be unsafe in a requirements file:
# setuptools
6 changes: 5 additions & 1 deletion src/ensembl/production/metadata/api/exceptions.py
@@ -47,4 +47,8 @@ class UpdateBackCoreException(UpdaterException, RuntimeError):
 
 class TypeNotFoundException(UpdaterException, RuntimeError):
     """Dataset Type not found"""
-    pass
+    pass
+
+class DatasetFactoryException(Exception):
+    """An error occurred while using dataset factory"""
+    pass
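The hunk above adds `DatasetFactoryException` alongside the existing updater exceptions. Unlike `TypeNotFoundException`, which mixes `UpdaterException` with `RuntimeError`, the new class derives from `Exception` only, so handlers written for `RuntimeError` will not catch it. A minimal sketch of that distinction (classes redeclared locally for illustration, not imported from the package):

```python
class UpdaterException(Exception):
    """Base class for updater errors (local stand-in for illustration)."""

class TypeNotFoundException(UpdaterException, RuntimeError):
    """Dataset Type not found"""

class DatasetFactoryException(Exception):
    """An error occurred while using dataset factory"""

def classify(exc):
    # Report which base classes a handler could use to catch this exception.
    bases = []
    if isinstance(exc, UpdaterException):
        bases.append("UpdaterException")
    if isinstance(exc, RuntimeError):
        bases.append("RuntimeError")
    return bases

print(classify(TypeNotFoundException()))   # ['UpdaterException', 'RuntimeError']
print(classify(DatasetFactoryException()))  # []
```

Callers of the new factory therefore need an explicit `except DatasetFactoryException` clause; a broad `except RuntimeError` used for the updater paths will let it propagate.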
338 changes: 338 additions & 0 deletions src/ensembl/production/metadata/api/factories/datasets.py

Large diffs are not rendered by default.

237 changes: 237 additions & 0 deletions src/ensembl/production/metadata/api/factories/genome.py
@@ -0,0 +1,237 @@
#!/usr/bin/env python
# See the NOTICE file distributed with this work for additional information
# regarding copyright ownership.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

'''
Fetch Genome Info From New Metadata Database
'''

import argparse
import json
import logging
import re
from dataclasses import dataclass, field
from ensembl.database import DBConnection
from ensembl.production.metadata.api.factories.datasets import DatasetFactory
from ensembl.production.metadata.api.models.dataset import DatasetType, Dataset, DatasetSource
from ensembl.production.metadata.api.models.genome import Genome, GenomeDataset
from ensembl.production.metadata.api.models.organism import Organism, OrganismGroup, OrganismGroupMember
from sqlalchemy import select, text
from typing import List

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)


@dataclass
class GenomeInputFilters:

    metadata_db_uri: str
    genome_uuid: List[str] = field(default_factory=list)
    dataset_uuid: List[str] = field(default_factory=list)
    division: List[str] = field(default_factory=list)
    dataset_type: str = "assembly"
    species: List[str] = field(default_factory=list)
    antispecies: List[str] = field(default_factory=list)
    dataset_status: List[str] = field(default_factory=lambda: ["Submitted"])
    batch_size: int = 50
    page: int = 1
    organism_group_type: str = "DIVISION"
    run_all: int = 0
    update_dataset_status: str = ""
    update_dataset_attribute: dict = field(default_factory=lambda: {})
    columns: List = field(default_factory=lambda: [Genome.genome_uuid,
                                                   Genome.production_name.label('species'),
                                                   Dataset.dataset_uuid,
                                                   Dataset.status.label('dataset_status'),
                                                   DatasetSource.name.label('dataset_source'),
                                                   DatasetType.name.label('dataset_type'),
                                                   ])


@dataclass
class GenomeFactory:
    @staticmethod
    def _apply_filters(query, filters):

        query = query.filter(OrganismGroup.type == filters.organism_group_type)

        if filters.run_all:
            filters.division = [
                'EnsemblBacteria',
                'EnsemblVertebrates',
                'EnsemblPlants',
                'EnsemblProtists',
                'EnsemblMetazoa',
                'EnsemblFungi',
            ]

        if filters.genome_uuid:
            query = query.filter(Genome.genome_uuid.in_(filters.genome_uuid))

        if filters.dataset_uuid:
            query = query.filter(Dataset.dataset_uuid.in_(filters.dataset_uuid))

        if filters.division:
            ensembl_divisions = filters.division

            if filters.organism_group_type == 'DIVISION':
                pattern = re.compile(r'^(ensembl)?', re.IGNORECASE)
                ensembl_divisions = ['Ensembl' + pattern.sub('', d).capitalize() for d in ensembl_divisions if d]

            query = query.filter(OrganismGroup.name.in_(ensembl_divisions))

        if filters.species:
            # exclude requested antispecies from the species list
            species = set(filters.species) - set(filters.antispecies)

            if species:
                query = query.filter(Genome.production_name.in_(species))
            else:
                query = query.filter(~Genome.production_name.in_(filters.antispecies))

        elif filters.antispecies:
            query = query.filter(~Genome.production_name.in_(filters.antispecies))

        if filters.dataset_type:
            query = query.filter(Genome.genome_datasets.any(DatasetType.name.in_([filters.dataset_type])))

        if filters.dataset_status:
            query = query.filter(Dataset.status.in_(filters.dataset_status))

        if filters.batch_size:
            filters.page = filters.page if filters.page > 0 else 1
            query = query.offset((filters.page - 1) * filters.batch_size).limit(filters.batch_size)

        return query

    def _build_query(self, filters):
        query = select(filters.columns) \
            .select_from(Genome) \
            .join(Genome.organism) \
            .join(Organism.organism_group_members) \
            .join(OrganismGroupMember.organism_group) \
            .outerjoin(Genome.genome_datasets) \
            .join(GenomeDataset.dataset) \
            .join(Dataset.dataset_source) \
            .join(Dataset.dataset_type) \
            .group_by(Genome.genome_id, Dataset.dataset_id) \
            .order_by(Genome.genome_uuid)

        return self._apply_filters(query, filters)

    def get_genomes(self, **filters):

        filters = GenomeInputFilters(**filters)
        logger.info(f'Get Genomes with filters {filters}')

        with DBConnection(filters.metadata_db_uri).session_scope() as session:
            query = self._build_query(filters)
            logger.info(f'Executing SQL query: {query}')
            for genome in session.execute(query).fetchall():
                genome_info = genome._asdict()
                dataset_uuid = genome_info.get('dataset_uuid', None)

                # TODO: below code required with implementation of DatasetStatus enum class in dataset models
                # # convert status enum object to string value
                # dataset_status = genome_info.get('dataset_status', None)
                # if dataset_status and isinstance(dataset_status, DatasetStatus):
                #     genome_info['dataset_status'] = dataset_status.value

                if not dataset_uuid:
                    logger.warning(
                        f"No dataset uuid found for genome {genome_info}; skipping this genome"
                    )
                    continue

                if filters.update_dataset_status:
                    _, status = DatasetFactory().update_dataset_status(dataset_uuid, filters.update_dataset_status,
                                                                       session=session)
                    if filters.update_dataset_status == status:
                        logger.info(
                            f"Updated dataset status to {status} for dataset uuid {dataset_uuid} "
                            f"(genome {genome_info['genome_uuid']})"
                        )
                        genome_info['updated_dataset_status'] = status
                    else:
                        logger.warning(
                            f"Cannot update status to {filters.update_dataset_status} for dataset uuid "
                            f"{dataset_uuid} (genome {genome_info['genome_uuid']}); current status is {status}"
                        )
                        genome_info['updated_dataset_status'] = None

                yield genome_info



def main():
    parser = argparse.ArgumentParser(
        prog='genome.py',
        description='Fetch Ensembl genome info from the new metadata database'
    )
    parser.add_argument('--genome_uuid', type=str, nargs='*', default=[], required=False,
                        help='List of genome UUIDs to filter the query. Default is an empty list.')
    parser.add_argument('--dataset_uuid', type=str, nargs='*', default=[], required=False,
                        help='List of dataset UUIDs to filter the query. Default is an empty list.')
    parser.add_argument('--organism_group_type', type=str, default='DIVISION', required=False,
                        help='Organism group type to filter the query. Default is "DIVISION".')
    parser.add_argument('--division', type=str, nargs='*', default=[], required=False,
                        help='List of organism group names to filter the query. Default is an empty list.')
    parser.add_argument('--dataset_type', type=str, default="assembly", required=False,
                        help='Dataset type to filter the query. Default is "assembly".')
    parser.add_argument('--species', type=str, nargs='*', default=[], required=False,
                        help='List of species production names to filter the query. Default is an empty list.')
    parser.add_argument('--antispecies', type=str, nargs='*', default=[], required=False,
                        help='List of species production names to exclude from the query. Default is an empty list.')
    parser.add_argument('--dataset_status', nargs='*', default=["Submitted"],
                        choices=['Submitted', 'Processing', 'Processed', 'Released'], required=False,
                        help='List of dataset statuses to filter the query. Default is ["Submitted"].')
    parser.add_argument('--update_dataset_status', type=str, default="", required=False,
                        choices=['Submitted', 'Processing', 'Processed', 'Released', ''],
                        help='Update the status of the selected datasets to the specified value.')
    parser.add_argument('--batch_size', type=int, default=50, required=False,
                        help='Number of results to retrieve per batch. Default is 50.')
    parser.add_argument('--page', type=int, default=1, required=False,
                        help='The page number for pagination (positive integer; values below 1 are treated as 1). Default is 1.')
    parser.add_argument('--metadata_db_uri', type=str, required=True,
                        help='metadata db mysql uri, ex: mysql://ensro@localhost:3366/ensembl_genome_metadata')
    parser.add_argument('--output', type=str, required=True, help='output file ex: genome_info.json')

    args = parser.parse_args()

    meta_details = re.match(r"mysql:\/\/.*:?(.*?)@(.*?):\d+\/(.*)", args.metadata_db_uri)
    with open(args.output, 'w') as json_output:
        logger.info(f'Connecting Metadata Database with host:{meta_details.group(2)} & dbname:{meta_details.group(3)}')

        genome_fetcher = GenomeFactory()

        logger.info(f'Writing Results to {args.output}')
        for genome in genome_fetcher.get_genomes(
                metadata_db_uri=args.metadata_db_uri,
                update_dataset_status=args.update_dataset_status,
                genome_uuid=args.genome_uuid,
                dataset_uuid=args.dataset_uuid,
                organism_group_type=args.organism_group_type,
                division=args.division,
                dataset_type=args.dataset_type,
                species=args.species,
                antispecies=args.antispecies,
                batch_size=args.batch_size,
                dataset_status=args.dataset_status,
        ) or []:
            json.dump(genome, json_output)
            json_output.write("\n")

    logger.info('Completed!')


if __name__ == "__main__":
    logger.info('Fetching Genome Information From New Metadata Database')
    main()
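Two pieces of pure logic in the `GenomeFactory` above are easy to check in isolation: the division-name normalisation and the offset/limit pagination guard. A standalone sketch, re-implemented here for illustration (the real methods apply these as SQLAlchemy filters on the query):

```python
import re
from typing import List, Tuple

def normalize_divisions(divisions: List[str]) -> List[str]:
    # Mirrors _apply_filters: strip a leading 'ensembl' (case-insensitive),
    # capitalise the remainder, then re-prefix with 'Ensembl'.
    pattern = re.compile(r'^(ensembl)?', re.IGNORECASE)
    return ['Ensembl' + pattern.sub('', d).capitalize() for d in divisions if d]

def page_window(page: int, batch_size: int) -> Tuple[int, int]:
    # Mirrors the pagination guard: pages below 1 are clamped to 1,
    # giving the (offset, limit) pair applied to the query.
    page = page if page > 0 else 1
    return (page - 1) * batch_size, batch_size

print(normalize_divisions(['plants', 'EnsemblFungi', 'METAZOA']))
# ['EnsemblPlants', 'EnsemblFungi', 'EnsemblMetazoa']
print(page_window(3, 50))  # (100, 50) -> OFFSET 100 LIMIT 50
```

The `^(ensembl)?` anchor means the substitution only ever removes an `ensembl` prefix, so already-normalised names such as `EnsemblFungi` pass through unchanged.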
15 changes: 9 additions & 6 deletions src/ensembl/production/metadata/api/models/dataset.py
@@ -9,13 +9,14 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from sqlalchemy import Column, Integer, String, Enum, text, ForeignKey, Index
+import datetime
+import logging
+import uuid
+
+from sqlalchemy import Column, Integer, String, Enum, text, ForeignKey, Index, JSON
 from sqlalchemy.dialects.mysql import DATETIME
 from sqlalchemy.orm import relationship
 from sqlalchemy.sql import func
-import datetime
-import uuid
-import logging
 
 from ensembl.production.metadata.api.exceptions import MissingMetaException
 from ensembl.production.metadata.api.models.base import Base, LoadAble
@@ -48,8 +49,7 @@ class Dataset(LoadAble, Base):
     created = Column(DATETIME(fsp=6), server_default=func.now(), default=datetime.datetime.utcnow)
     dataset_source_id = Column(ForeignKey('dataset_source.dataset_source_id'), nullable=False, index=True)
     label = Column(String(128), nullable=False)
-    status = Column(Enum('Submitted', 'Progressing', 'Processed', 'Released'),
-                    server_default=text("'Submitted'"))
+    status = Column(Enum('Submitted', 'Processing', 'Processed', 'Released'), server_default=text('Submitted'))
 
     # One to many relationships
     # dataset_id to dataset attribute and genome dataset
@@ -126,6 +126,9 @@ class DatasetType(LoadAble, Base):
     topic = Column(String(32), nullable=False)
     description = Column(String(255))
     details_uri = Column(String(255))
+    parent = Column(String(128), default=None)
+    depends_on = Column(String(128), default=None)
+    filter_on = Column(JSON, default=None)
     # One to many relationships
     # dataset_type_id to dataset
     datasets = relationship('Dataset', back_populates='dataset_type')
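The commit history shows a `DatasetStatus` enum being introduced, converted to strings, and reverted; the merged `Dataset.status` column ends up as a SQLAlchemy `Enum` over the four status strings. A hedged sketch of a `str`-backed enum for those values (hypothetical, not the repository's actual class):

```python
from enum import Enum

class DatasetStatus(str, Enum):
    # Hypothetical str-backed enum matching the four values accepted by
    # the Dataset.status column; the merged code stores plain strings.
    SUBMITTED = "Submitted"
    PROCESSING = "Processing"
    PROCESSED = "Processed"
    RELEASED = "Released"

# Because it subclasses str, members compare equal to their string value,
# so they can be handed to a string-typed Enum column unchanged.
print(DatasetStatus.PROCESSING == "Processing")  # True
print(DatasetStatus("Released").name)            # RELEASED
```

This is the pattern the commented-out TODO in `genome.py` anticipates: `dataset_status.value` would recover the plain string when such an enum lands in the models.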
@@ -1,7 +1,22 @@
-1 assembly Genomic assembly Core Annotation Compilation of sequences for a genome \N
-2 genebuild Genomic Build Core Annotation Genomic annotations for an assembly \N
-3 variation mRatBN7.2 Variation Annotation Short variant data for rattus_norvegicus \N
-4 evidence Variation Evidence Variation Annotation \N \N
-5 regulation_build Regulations Regulatory Annotation \N \N
-6 homologies Comparative homologies Comparative Annotation \N \N
-7 regulatory_features Regulatory Annotation Regulatory Annotation Regulatory annotation for an assembly \N
+1 assembly Genomic assembly Core Annotation Compilation of sequences for a genome \N \N \N \N
+2 genebuild Genomic Build Core Annotation Genomic annotations for an assembly \N \N \N \N
+3 variation mRatBN7.2 Variation Annotation Short variant data for rattus_norvegicus \N \N \N \N
+4 evidence Variation Evidence Variation Annotation \N \N \N \N \N
+5 regulation_build Regulations Regulatory Annotation \N \N \N \N \N
+6 homologies Comparative homologies Comparative Annotation \N \N \N \N \N
+7 regulatory_features Regulatory Annotation Regulatory Annotation Regulatory annotation for an assembly \N \N \N \N
+8 xrefs External References Production Compute Xref genome annotation for Genebuild \N 2 \N \N
+9 protein_features Protein Features Production Compute Interpro scan run against proteins \N 2 8 \N
+10 alpha_fold AlphaFold Production Compute AlphaFold compute against proteins \N 2 9 \N
+11 checksums Checkums compute Production Compute Compute DNA sequences checksums \N 2 \N \N
+12 refget_load Refget Loading Production Compute Refeget database provisioning \N 2 11 \N
+13 compara_load Compara Data Loading Production Release Preparation Load MongoDB homologies \N 6 15 \N
+14 search_dumps Data dumps for THOAS Production Release Preparation Dumps flat file for THOAS loading \N 2 1,8,9 \N
+15 compara_compute Compute homologie database Production Compute Compute genome homologies database \N 6 \N \N
+16 ftp_dumps FTP File dumps Production Release Preparation Dumps all FTP File format from genebuild \N 2 1,8,9 \N
+17 compara_dumps Homologies file dumps Production Compute Dumped homologies tsv files \N 6 15 \N
+18 blast Blast file dumps Production Compute Dumps blast indexed files \N 2 \N \N
+20 variation_track Variation Track API update Production Release Preparation Load Variation Track API \N 3 \N \N
+21 genome_browser_track Genebuild Track API update Production Release Preparation Load Genebuild track API \N 2 \N \N
+22 regulation_track Regulation Track API update Production Release Preparation Load Regulation Track API \N 7 \N \N
+23 thoas_load Thoas Loading Production Release Preparation Load MongoDB THOAS collection \N 2 11,12,14 \N
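The new `parent` and `depends_on` columns in the rows above turn dataset types into a dependency graph: for example `search_dumps` (id 14) depends on types 1, 8 and 9. A hypothetical sketch of reading `depends_on` values out of such rows, where `\N` is the dump's NULL marker and the ids, names, and dependencies are copied from the table:

```python
# A few (name, parent, depends_on) rows keyed by dataset_type_id, copied
# from the table; r"\N" is how the dump encodes NULL.
rows = {
    2: ("genebuild", None, r"\N"),
    8: ("xrefs", 2, r"\N"),
    9: ("protein_features", 2, "8"),
    14: ("search_dumps", 2, "1,8,9"),
}

def dependencies(dataset_type_id: int) -> list:
    # Parse the comma-separated depends_on field into a list of type ids.
    depends_on = rows[dataset_type_id][2]
    if depends_on in (None, r"\N"):
        return []
    return [int(i) for i in depends_on.split(",")]

print(dependencies(14))  # [1, 8, 9]
print(dependencies(2))   # []
```

A production scheduler could topologically sort on these lists to decide when each dataset type becomes runnable.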