diff --git a/src/ensembl/production/metadata/api/exceptions.py b/src/ensembl/production/metadata/api/exceptions.py index 1803005c..4cff5b80 100644 --- a/src/ensembl/production/metadata/api/exceptions.py +++ b/src/ensembl/production/metadata/api/exceptions.py @@ -10,6 +10,7 @@ # See the License for the specific language governing permissions and # limitations under the License. + class MetaException(Exception): """Base Metadata API Exception class""" pass @@ -43,3 +44,7 @@ class MissingMetaException(MetaException, RuntimeError): class UpdateBackCoreException(UpdaterException, RuntimeError): """An error occurred while updating back the core database""" pass + +class TypeNotFoundException(UpdaterException, RuntimeError): + """Dataset Type not found""" + pass \ No newline at end of file diff --git a/src/ensembl/production/metadata/api/models/genome.py b/src/ensembl/production/metadata/api/models/genome.py index 4c52cf7e..4e839445 100644 --- a/src/ensembl/production/metadata/api/models/genome.py +++ b/src/ensembl/production/metadata/api/models/genome.py @@ -14,7 +14,7 @@ from sqlalchemy import Column, Integer, String, ForeignKey from sqlalchemy.dialects.mysql import DATETIME, TINYINT from sqlalchemy.orm import relationship - +from ensembl.production.metadata.api.exceptions import * from ensembl.production.metadata.api.models.base import Base, LoadAble @@ -54,20 +54,41 @@ def get_public_path(self, type='all', release=None): genebuild_source_name = genebuild_annotation_source_attribute.value common_path = f"{self.organism.scientific_name.replace(' ', '_')}/{self.assembly.accession}/{genebuild_source_name}" + unique_dataset_types = {gd.dataset.dataset_type.name for gd in self.genome_datasets} + + if 'regulatory_features' in unique_dataset_types or 'regulation_build' in unique_dataset_types: + unique_dataset_types.discard('regulatory_features') + unique_dataset_types.discard('regulation_build') + unique_dataset_types.add('regulation') + if 'regulatory_features' == type or 'regulation_build' == type: + type = 'regulation' - if type in ['genebuild', 'assembly', 'homology', 'regulation', 'variation', 'all']: + if type in unique_dataset_types or type == 'all': if type == 'genebuild': paths.append(f"{common_path}/genebuild/{genebuild_dataset.dataset.version}") elif type == 'assembly': paths.append(f"{common_path}/genome") - elif type in ['homology', 'regulation', 'variation']: - paths.append(f"{common_path}/{type}") + elif type == 'homologies': + paths.append(f"{common_path}/homology") + elif type == 'regulation': + paths.append(f"{common_path}/regulation") + elif type == 'variation': + paths.append(f"{common_path}/variation") elif type == 'all': - # Add paths for all types - for t in ['genebuild', 'assembly', 'homology', 'regulation', 'variation']: - paths.extend(self.get_public_path(type=t)) - return paths - + for t in unique_dataset_types: + if t == 'genebuild': + paths.append(f"{common_path}/genebuild/{genebuild_dataset.dataset.version}") + elif t == 'assembly': + paths.append(f"{common_path}/genome") + elif t == 'homologies': + paths.append(f"{common_path}/homology") + elif t in ['regulation', 'variation']: + paths.append(f"{common_path}/{t}") + else: + raise TypeNotFoundException(f"Dataset Type : {type} has no associated path. ") + return paths + else: + raise TypeNotFoundException(f"Dataset Type : {type} not found in metadata. ") class GenomeDataset(LoadAble, Base): diff --git a/src/ensembl/production/metadata/api/sample/ensembl_metadata/dataset.txt b/src/ensembl/production/metadata/api/sample/ensembl_metadata/dataset.txt index e4e6c24e..f4794aee 100644 --- a/src/ensembl/production/metadata/api/sample/ensembl_metadata/dataset.txt +++ b/src/ensembl/production/metadata/api/sample/ensembl_metadata/dataset.txt @@ -28,4 +28,5 @@ 36 1068ba70-0088-4927-98bd-8fabcfb9a384 4 evidence \N 2023-06-02 13:32:52 10 Manual Add Submitted 38 47d54c33-80d6-4174-8620-52b6c8506db2 6 homologies \N 2023-06-02 13:32:52 11 Manual Add Submitted 42 ea044d8e-33f1-4c9f-9b9f-8c0bd1dcf642 6 homologies \N 2023-06-02 13:32:52 11 Manual Add Submitted -46 385f1ec2-bd06-40ce-873a-98e199f10505 1 asssembly \N 2023-08-18 12:22:34 13 GCA_000001735.1 Submitted \ No newline at end of file +46 385f1ec2-bd06-40ce-873a-98e199f10505 1 asssembly \N 2023-08-18 12:22:34 13 GCA_000001735.1 Submitted +47 385f1ec2-bd06-40ce-873a-98e199f10534 5 regulation_build \N 2023-08-18 12:22:34 13 GCA_000001735.1 Submitted \ No newline at end of file diff --git a/src/ensembl/production/metadata/api/sample/ensembl_metadata/genome_dataset.txt b/src/ensembl/production/metadata/api/sample/ensembl_metadata/genome_dataset.txt index 49ae11da..b7bd924e 100644 --- a/src/ensembl/production/metadata/api/sample/ensembl_metadata/genome_dataset.txt +++ b/src/ensembl/production/metadata/api/sample/ensembl_metadata/genome_dataset.txt @@ -31,3 +31,4 @@ 57 44 7 \N 0 58 45 7 \N 0 59 46 9 \N 0 +60 47 6 1 1 diff --git a/src/ensembl/production/metadata/scripts/genome_uuid_manager.py b/src/ensembl/production/metadata/scripts/genome_uuid_manager.py new file mode 100644 index 00000000..bc89b675 --- /dev/null +++ b/src/ensembl/production/metadata/scripts/genome_uuid_manager.py @@ -0,0 +1,139 @@ +import argparse +import logging +import mysql.connector + +logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') +def parse_arguments(): + parser = argparse.ArgumentParser(description='Database Patch Script') + # Arguments for the core databases server + parser.add_argument('--host', type=str, required=True, help='Core databases server host') + parser.add_argument('--port', type=int, default=3306, help='Core databases server port (default 3306)') + parser.add_argument('--user', type=str, required=True, help='Core databases server user') + parser.add_argument('--password', type=str, required=True, help='Core databases server password') + parser.add_argument('--mode', type=str, choices=['update', 'check', 'delete', 'full'], required=True, + help='Script mode') + # Arguments for the metadata database server + parser.add_argument('--meta_host', type=str, required=True, help='Metadata database server host') + parser.add_argument('--meta_port', type=int, default=3306, help='Metadata database server port (default 3306)') + parser.add_argument('--meta_user', type=str, required=True, help='Metadata database server user') + parser.add_argument('--meta_password', type=str, required=True, help='Metadata database server password') + parser.add_argument('--meta_database', type=str, required=True, help='Metadata database name') + + # Additional arguments for update mode + parser.add_argument('--patch_file', type=str, help='Output file for SQL patches (required for update mode)') + parser.add_argument('--database', type=str, required=False, help='Database name (required for update mode)') + + return parser.parse_args() + + +def generate_update_patch(host, port, user, password, database, patch_file): + try: + connection = mysql.connector.connect(host=host, port=port, user=user, password=password, database=database) + cursor = connection.cursor() + query = """SELECT DISTINCT ds.name, g.genome_uuid +FROM dataset_source ds +JOIN dataset d ON ds.dataset_source_id = d.dataset_source_id +JOIN genome_dataset gd ON d.dataset_id = gd.dataset_id +JOIN genome g ON gd.genome_id = g.genome_id +WHERE ds.type = 'core'""" + cursor.execute(query) + rows = cursor.fetchall() + + with open(patch_file, 'w') as file: + for row in rows: + source_name, genome_uuid = row + # Write SQL commands to the patch file instead of executing them + file.write(f"USE {source_name};\n") + file.write("DELETE FROM meta WHERE meta_key = 'genome.genome_uuid' AND species_id = 1;\n") + file.write( + f"INSERT INTO meta (species_id, meta_key, meta_value) VALUES (1, 'genome.genome_uuid', '{genome_uuid}');\n\n") + + cursor.close() + except mysql.connector.Error as e: + logging.error(f"Error: {e}") + finally: + connection.close() + + +def generate_delete_patch(core_host, core_port, core_user, core_password, meta_host, meta_port, meta_user, meta_password, metadata_database, patch_file): + mismatches = check_databases(core_host, core_port, core_user, core_password, meta_host, meta_port, meta_user, meta_password, metadata_database) + mismatches = [db_name for db_name, status in mismatches if status == 'mismatch'] + + try: + with open(patch_file, 'a') as file: + for db_name in mismatches: + # Write SQL commands to the patch file to delete mismatched entries + file.write(f"USE {db_name};\n") + file.write("DELETE FROM meta WHERE meta_key = 'genome.genome_uuid' AND species_id = 1;\n\n") + except Exception as e: + logging.error(f"Error while writing delete patch: {e}") + +def check_databases(core_host, core_port, core_user, core_password, meta_host, meta_port, meta_user, meta_password, metadata_database): + try: + core_conn = mysql.connector.connect(host=core_host, port=core_port, user=core_user, password=core_password) + core_cursor = core_conn.cursor() + core_cursor.execute("SHOW DATABASES LIKE '%core%'") + core_databases = core_cursor.fetchall() + meta_conn = mysql.connector.connect(host=meta_host, port=meta_port, user=meta_user, password=meta_password, database=metadata_database) + meta_cursor = meta_conn.cursor() + + results = [] + for (db_name,) in core_databases: + core_cursor.execute(f"USE {db_name};") + core_cursor.execute("SELECT meta_value FROM meta WHERE meta_key = 'genome.genome_uuid' AND species_id = 1") + core_uuid = core_cursor.fetchone() + + if core_uuid: + core_uuid = core_uuid[0] + meta_cursor.execute(f"SELECT genome_uuid FROM genome WHERE genome_uuid = '{core_uuid}'") + metadata_uuid = meta_cursor.fetchone() + + if metadata_uuid and metadata_uuid[0] == core_uuid: + results.append((db_name, 'match')) + else: + results.append((db_name, 'mismatch')) + else: + results.append((db_name, 'absent')) + + return results + + except mysql.connector.Error as e: + logging.error(f"Error: {e}") + finally: + if core_cursor and core_conn: + core_cursor.close() + core_conn.close() + if meta_cursor and meta_conn: + meta_cursor.close() + meta_conn.close() + return [] + +def main(): + args = parse_arguments() + + if args.mode == 'update': + if not args.patch_file or not args.database: + raise ValueError("Patch file name and database name are required for update mode") + generate_update_patch(args.meta_host, args.meta_port, args.meta_user, args.meta_password, args.meta_database, args.patch_file) + elif args.mode == 'check': + results = check_databases(args.host, args.port, args.user, args.password, args.meta_host, args.meta_port, args.meta_user, + args.meta_password, args.meta_database) + for db_name, status in results: + print(f"{db_name}: {status}") + elif args.mode == 'delete': + generate_delete_patch(args.host, args.port, args.user, args.password, args.meta_host, args.meta_port, args.meta_user, + args.meta_password, args.meta_database, args.patch_file) + elif args.mode == 'full': + print ("generating update patch") + generate_update_patch(args.meta_host, args.meta_port, args.meta_user, args.meta_password, args.meta_database, args.patch_file) + print ("generating delete patch") + generate_delete_patch(args.host, args.port, args.user, args.password, args.meta_host, args.meta_port, args.meta_user, + args.meta_password, args.meta_database, args.patch_file) + print("checking results") + results = check_databases(args.host, args.port, args.user, args.password, args.meta_host, args.meta_port, args.meta_user, + args.meta_password, args.meta_database) + absent_cores = [db_name for db_name, status in results if status == 'absent'] + print("Cores without genome_uuids:", absent_cores) + +if __name__ == "__main__": + main() diff --git a/src/ensembl/production/metadata/updater/core.py b/src/ensembl/production/metadata/updater/core.py index f431038b..6573a348 100644 --- a/src/ensembl/production/metadata/updater/core.py +++ b/src/ensembl/production/metadata/updater/core.py @@ -461,7 +461,6 @@ def get_or_new_assembly(self, species_id, meta_session, source=None, existing=No assembly_default=self.get_meta_single_meta_key(species_id, "assembly.default"), tol_id=tol_id, created=func.now(), - ensembl_name=self.get_meta_single_meta_key(species_id, "assembly.name"), assembly_uuid=str(uuid.uuid4()), url_name=self.get_meta_single_meta_key(species_id, "assembly.url_name"), is_reference=is_reference @@ -540,7 +539,31 @@ def get_or_new_genebuild(self, species_id, meta_session, source=None, existing=F dataset_source = source dataset_type = meta_session.query(DatasetType).filter(DatasetType.name == "genebuild").first() + + genebuild_start_date = self.get_meta_single_meta_key(species_id, "genebuild.start_date") + genebuild_provider_name = self.get_meta_single_meta_key(species_id, "genebuild.provider_name") + test_status = meta_session.query(Dataset).filter(Dataset.label == genebuild_accession).one_or_none() + if test_status: + # Check for genebuild.provider_name + provider_name_check = meta_session.query(DatasetAttribute).join(Attribute).filter( + DatasetAttribute.dataset_id == test_status.dataset_id, + Attribute.name == "genebuild.provider_name", + DatasetAttribute.value == genebuild_provider_name + ).one_or_none() + + if provider_name_check: + # Check for genebuild.start_date + start_date_check = meta_session.query(DatasetAttribute).join(Attribute).filter( + DatasetAttribute.dataset_id == test_status.dataset_id, + Attribute.name == "genebuild.start_date", + DatasetAttribute.value == genebuild_start_date + ).one_or_none() + + if start_date_check is None: + test_status = None + + if test_status is not None and existing is False: genebuild_dataset = test_status genebuild_dataset_attributes = genebuild_dataset.dataset_attributes diff --git a/src/tests/databases/core_1/meta.txt b/src/tests/databases/core_1/meta.txt index b2afd577..d79088ef 100644 --- a/src/tests/databases/core_1/meta.txt +++ b/src/tests/databases/core_1/meta.txt @@ -19,4 +19,6 @@ 19 1 sample.location_param KB871578.1:9766653-9817473 20 1 strain.type test 21 1 assembly.test_value test -22 1 genebuild.test_value test \ No newline at end of file +22 1 genebuild.test_value test +23 1 genebuild.provider_name test +24 1 genebuild.start_date test \ No newline at end of file diff --git a/src/tests/databases/core_2/meta.txt b/src/tests/databases/core_2/meta.txt index d00958db..deadc76d 100644 --- a/src/tests/databases/core_2/meta.txt +++ b/src/tests/databases/core_2/meta.txt @@ -18,4 +18,6 @@ 18 1 sample.gene_param ENSAMXG00005000318 19 1 sample.location_param KB871578.1:9766653-9817473 20 1 strain.type test -21 1 genome.genome_uuid test \ No newline at end of file +21 1 genome.genome_uuid test +23 1 genebuild.provider_name test +24 1 genebuild.start_date test \ No newline at end of file diff --git a/src/tests/databases/core_3/meta.txt b/src/tests/databases/core_3/meta.txt index fd2c682e..2ddb2633 100644 --- a/src/tests/databases/core_3/meta.txt +++ b/src/tests/databases/core_3/meta.txt @@ -16,4 +16,6 @@ 17 1 genebuild.version 1 18 1 sample.gene_param ENSAMXG00005000318 19 1 sample.location_param KB871578.1:9766653-9817473 -20 1 strain.type test \ No newline at end of file +20 1 strain.type test +23 1 genebuild.provider_name test +24 1 genebuild.start_date test \ No newline at end of file diff --git a/src/tests/databases/core_4/meta.txt b/src/tests/databases/core_4/meta.txt index 990e4b74..75ed9796 100644 --- a/src/tests/databases/core_4/meta.txt +++ b/src/tests/databases/core_4/meta.txt @@ -17,4 +17,6 @@ 17 1 genebuild.version 2 18 1 sample.gene_param ENSAMXG00005000318 19 1 sample.location_param KB871578.1:9766653-9817473 -20 1 strain.type test \ No newline at end of file +20 1 strain.type test +23 1 genebuild.provider_name test +24 1 genebuild.start_date test \ No newline at end of file diff --git a/src/tests/databases/core_5/meta.txt b/src/tests/databases/core_5/meta.txt index 768e54bc..f7f399ce 100644 --- a/src/tests/databases/core_5/meta.txt +++ b/src/tests/databases/core_5/meta.txt @@ -14,3 +14,5 @@ 18 1 sample.gene_param ENSAMXG00005000318 19 1 sample.location_param KB871578.1:9766653-9817473 20 1 strain.type test +23 1 genebuild.provider_name test +24 1 genebuild.start_date test \ No newline at end of file diff --git a/src/tests/databases/core_6/meta.txt b/src/tests/databases/core_6/meta.txt index a8a42b8a..76ac4c94 100644 --- a/src/tests/databases/core_6/meta.txt +++ b/src/tests/databases/core_6/meta.txt @@ -18,4 +18,6 @@ 18 1 sample.gene_param ENSAMXG00005000318 19 1 sample.location_param KB871578.1:9766653-9817473 20 1 strain.type test -21 1 genome.genome_uuid 90720316-006c-470b-a7dd-82d28f952264 \ No newline at end of file +21 1 genome.genome_uuid 90720316-006c-470b-a7dd-82d28f952264 +23 1 genebuild.provider_name test +24 1 genebuild.start_date test \ No newline at end of file diff --git a/src/tests/databases/core_7/meta.txt b/src/tests/databases/core_7/meta.txt index b3733d86..ccbdeddc 100644 --- a/src/tests/databases/core_7/meta.txt +++ b/src/tests/databases/core_7/meta.txt @@ -19,4 +19,6 @@ 19 1 sample.location_param KB871578.1:9766653-9817473 20 1 strain.type test 21 1 assembly.test_value test2 -22 1 genebuild.test_value test2 \ No newline at end of file +22 1 genebuild.test_value test2 +23 1 genebuild.provider_name test +24 1 genebuild.start_date test diff --git a/src/tests/databases/core_8/meta.txt b/src/tests/databases/core_8/meta.txt index 3955b30e..9be1fae6 100644 --- a/src/tests/databases/core_8/meta.txt +++ b/src/tests/databases/core_8/meta.txt @@ -19,4 +19,6 @@ 19 1 sample.location_param KB871578.1:9766653-9817473 20 1 strain.type test 21 1 assembly.test_value test -22 1 genebuild.test_value test \ No newline at end of file +22 1 genebuild.test_value test +23 1 genebuild.provider_name test +24 1 genebuild.start_date test \ No newline at end of file diff --git a/src/tests/databases/core_9/meta.txt b/src/tests/databases/core_9/meta.txt index 10235dab..7d38e021 100644 --- a/src/tests/databases/core_9/meta.txt +++ b/src/tests/databases/core_9/meta.txt @@ -20,4 +20,6 @@ 20 1 strain.type test 21 1 assembly.test_value test3 22 1 genebuild.test_value test3 -23 1 genome.genome_uuid a733550b-93e7-11ec-a39d-005056b38ce3 \ No newline at end of file +23 1 genome.genome_uuid a733550b-93e7-11ec-a39d-005056b38ce3 +24 1 genebuild.provider_name test +24 1 genebuild.start_date test \ No newline at end of file diff --git a/src/tests/test_api.py b/src/tests/test_api.py index ce50c1a3..1b3b2404 100644 --- a/src/tests/test_api.py +++ b/src/tests/test_api.py @@ -48,9 +48,9 @@ def test_get_public_path_genebuild(self, multi_dbs): assert path[0] == 'Saccharomyces_cerevisiae_S288c/GCA_000146045.2/test_anno_source/genome' path = genome.get_public_path(type='variation') assert path[0] == 'Saccharomyces_cerevisiae_S288c/GCA_000146045.2/test_anno_source/variation' - path = genome.get_public_path(type='homology') + path = genome.get_public_path(type='homologies') assert path[0] == 'Saccharomyces_cerevisiae_S288c/GCA_000146045.2/test_anno_source/homology' - path = genome.get_public_path(type='regulation') + path = genome.get_public_path(type='regulatory_features') assert path[0] == 'Saccharomyces_cerevisiae_S288c/GCA_000146045.2/test_anno_source/regulation' def test_organism_ensembl_name_compat(self, multi_dbs):