5 changes: 5 additions & 0 deletions src/ensembl/production/metadata/api/exceptions.py
@@ -10,6 +10,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.


class MetaException(Exception):
"""Base Metadata API Exception class"""
pass
@@ -43,3 +44,7 @@ class MissingMetaException(MetaException, RuntimeError):
class UpdateBackCoreException(UpdaterException, RuntimeError):
"""An error occurred while updating back the core database"""
pass

class TypeNotFoundException(UpdaterException, RuntimeError):
"""Dataset Type not found"""
pass
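
For orientation, a minimal caller-side sketch of the new exception (the resolve_paths helper and its arguments are hypothetical, not part of this PR):

from ensembl.production.metadata.api.exceptions import TypeNotFoundException

def resolve_paths(genome, dataset_type):
    # Hypothetical helper: delegate to the Genome model and translate the new
    # TypeNotFoundException into a caller-friendly error.
    try:
        return genome.get_public_path(type=dataset_type)
    except TypeNotFoundException as err:
        raise ValueError(f"Unsupported dataset type '{dataset_type}': {err}") from err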
39 changes: 30 additions & 9 deletions src/ensembl/production/metadata/api/models/genome.py
@@ -14,7 +14,7 @@
from sqlalchemy import Column, Integer, String, ForeignKey
from sqlalchemy.dialects.mysql import DATETIME, TINYINT
from sqlalchemy.orm import relationship

from ensembl.production.metadata.api.exceptions import *
from ensembl.production.metadata.api.models.base import Base, LoadAble


@@ -54,20 +54,41 @@ def get_public_path(self, type='all', release=None):

genebuild_source_name = genebuild_annotation_source_attribute.value
common_path = f"{self.organism.scientific_name.replace(' ', '_')}/{self.assembly.accession}/{genebuild_source_name}"
unique_dataset_types = {gd.dataset.dataset_type.name for gd in self.genome_datasets}

if 'regulatory_features' in unique_dataset_types or 'regulation_build' in unique_dataset_types:
unique_dataset_types.discard('regulatory_features')
unique_dataset_types.discard('regulation_build')
unique_dataset_types.add('regulation')
if 'regulatory_features' == type or 'regulation_build' == type:
type = 'regulation'

if type in ['genebuild', 'assembly', 'homology', 'regulation', 'variation', 'all']:
if type in unique_dataset_types or type == 'all':
if type == 'genebuild':
paths.append(f"{common_path}/genebuild/{genebuild_dataset.dataset.version}")
elif type == 'assembly':
paths.append(f"{common_path}/genome")
elif type in ['homology', 'regulation', 'variation']:
paths.append(f"{common_path}/{type}")
elif type == 'homologies':
paths.append(f"{common_path}/homology")
elif type == 'regulation':
paths.append(f"{common_path}/regulation")
elif type == 'variation':
paths.append(f"{common_path}/variation")
elif type == 'all':
# Add paths for all types
for t in ['genebuild', 'assembly', 'homology', 'regulation', 'variation']:
paths.extend(self.get_public_path(type=t))
return paths

for t in unique_dataset_types:
if t == 'genebuild':
paths.append(f"{common_path}/genebuild/{genebuild_dataset.dataset.version}")
elif t == 'assembly':
paths.append(f"{common_path}/genome")
elif t == 'homologies':
paths.append(f"{common_path}/homology")
elif t in ['regulation', 'variation']:
paths.append(f"{common_path}/{t}")
else:
raise TypeNotFoundException(f"Dataset Type : {type} has no associated path. ")
return paths
else:
raise TypeNotFoundException(f"Dataset Type : {type} not found in metadata. ")


class GenomeDataset(LoadAble, Base):
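
Net effect of the rewritten branch: legacy type names are normalised before path resolution, and an unrecognised type now raises TypeNotFoundException. A hedged sketch of the expected behaviour, reusing the genome fixture and expected strings from src/tests/test_api.py further down in this diff:

# Sketch only; the genome object and expected paths mirror the updated tests.
paths = genome.get_public_path(type='regulatory_features')  # folded into 'regulation'
# ['Saccharomyces_cerevisiae_S288c/GCA_000146045.2/test_anno_source/regulation']

paths = genome.get_public_path(type='homologies')  # maps to the 'homology' directory
# ['Saccharomyces_cerevisiae_S288c/GCA_000146045.2/test_anno_source/homology']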
@@ -28,4 +28,5 @@
36 1068ba70-0088-4927-98bd-8fabcfb9a384 4 evidence \N 2023-06-02 13:32:52 10 Manual Add Submitted
38 47d54c33-80d6-4174-8620-52b6c8506db2 6 homologies \N 2023-06-02 13:32:52 11 Manual Add Submitted
42 ea044d8e-33f1-4c9f-9b9f-8c0bd1dcf642 6 homologies \N 2023-06-02 13:32:52 11 Manual Add Submitted
46 385f1ec2-bd06-40ce-873a-98e199f10505 1 asssembly \N 2023-08-18 12:22:34 13 GCA_000001735.1 Submitted
46 385f1ec2-bd06-40ce-873a-98e199f10505 1 asssembly \N 2023-08-18 12:22:34 13 GCA_000001735.1 Submitted
47 385f1ec2-bd06-40ce-873a-98e199f10534 5 regulation_build \N 2023-08-18 12:22:34 13 GCA_000001735.1 Submitted
@@ -31,3 +31,4 @@
57 44 7 \N 0
58 45 7 \N 0
59 46 9 \N 0
60 47 6 1 1
139 changes: 139 additions & 0 deletions src/ensembl/production/metadata/scripts/genome_uuid_manager.py
@@ -0,0 +1,139 @@
import argparse
import logging
import mysql.connector

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
def parse_arguments():
parser = argparse.ArgumentParser(description='Database Patch Script')
# Arguments for the core databases server
parser.add_argument('--host', type=str, required=True, help='Core databases server host')
parser.add_argument('--port', type=int, default=3306, help='Core databases server port (default 3306)')
parser.add_argument('--user', type=str, required=True, help='Core databases server user')
parser.add_argument('--password', type=str, required=True, help='Core databases server password')
parser.add_argument('--mode', type=str, choices=['update', 'check', 'delete', 'full'], required=True,
help='Script mode')
# Arguments for the metadata database server
parser.add_argument('--meta_host', type=str, required=True, help='Metadata database server host')
parser.add_argument('--meta_port', type=int, default=3306, help='Metadata database server port (default 3306)')
parser.add_argument('--meta_user', type=str, required=True, help='Metadata database server user')
parser.add_argument('--meta_password', type=str, required=True, help='Metadata database server password')
parser.add_argument('--meta_database', type=str, required=True, help='Metadata database name')

# Additional arguments for update mode
parser.add_argument('--patch_file', type=str, help='Output file for SQL patches (required for update mode)')
parser.add_argument('--database', type=str, required=False, help='Database name (required for update mode)')

return parser.parse_args()


def generate_update_patch(host, port, user, password, database, patch_file):
try:
connection = mysql.connector.connect(host=host, port=port, user=user, password=password, database=database)
cursor = connection.cursor()
query = """SELECT DISTINCT ds.name, g.genome_uuid
FROM dataset_source ds
JOIN dataset d ON ds.dataset_source_id = d.dataset_source_id
JOIN genome_dataset gd ON d.dataset_id = gd.dataset_id
JOIN genome g ON gd.genome_id = g.genome_id
WHERE ds.type = 'core'"""
cursor.execute(query)
rows = cursor.fetchall()

with open(patch_file, 'w') as file:
for row in rows:
source_name, genome_uuid = row
# Write SQL commands to the patch file instead of executing them
file.write(f"USE {source_name};\n")
file.write("DELETE FROM meta WHERE meta_key = 'genome.genome_uuid' AND species_id = 1;\n")
file.write(
f"INSERT INTO meta (species_id, meta_key, meta_value) VALUES (1, 'genome.genome_uuid', '{genome_uuid}');\n\n")

cursor.close()
except mysql.connector.Error as e:
logging.error(f"Error: {e}")
finally:
connection.close()


def generate_delete_patch(core_host, core_port, core_user, core_password, meta_host, meta_port, meta_user, meta_password, metadata_database, patch_file):
mismatches = check_databases(core_host, core_port, core_user, core_password, meta_host, meta_port, meta_user, meta_password, metadata_database)
mismatches = [db_name for db_name, status in mismatches if status == 'mismatch']

try:
with open(patch_file, 'a') as file:
for db_name in mismatches:
# Write SQL commands to the patch file to delete mismatched entries
file.write(f"USE {db_name};\n")
file.write("DELETE FROM meta WHERE meta_key = 'genome.genome_uuid' AND species_id = 1;\n\n")
except Exception as e:
logging.error(f"Error while writing delete patch: {e}")

def check_databases(core_host, core_port, core_user, core_password, meta_host, meta_port, meta_user, meta_password, metadata_database):
try:
core_conn = mysql.connector.connect(host=core_host, port=core_port, user=core_user, password=core_password)
core_cursor = core_conn.cursor()
core_cursor.execute("SHOW DATABASES LIKE '%core%'")
core_databases = core_cursor.fetchall()
meta_conn = mysql.connector.connect(host=meta_host, port=meta_port, user=meta_user, password=meta_password, database=metadata_database)
meta_cursor = meta_conn.cursor()

results = []
for (db_name,) in core_databases:
core_cursor.execute(f"USE {db_name};")
core_cursor.execute("SELECT meta_value FROM meta WHERE meta_key = 'genome.genome_uuid' AND species_id = 1")
core_uuid = core_cursor.fetchone()

if core_uuid:
core_uuid = core_uuid[0]
meta_cursor.execute(f"SELECT genome_uuid FROM genome WHERE genome_uuid = '{core_uuid}'")
metadata_uuid = meta_cursor.fetchone()

if metadata_uuid and metadata_uuid[0] == core_uuid:
results.append((db_name, 'match'))
else:
results.append((db_name, 'mismatch'))
else:
results.append((db_name, 'absent'))

return results

except mysql.connector.Error as e:
logging.error(f"Error: {e}")
finally:
if core_cursor and core_conn:
core_cursor.close()
core_conn.close()
if meta_cursor and meta_conn:
meta_cursor.close()
meta_conn.close()
return []

def main():
args = parse_arguments()

if args.mode == 'update':
if not args.patch_file or not args.database:
raise ValueError("Patch file name and database name are required for update mode")
generate_update_patch(args.meta_host, args.meta_port, args.meta_user, args.meta_password, args.meta_database, args.patch_file)
elif args.mode == 'check':
results = check_databases(args.host, args.port, args.user, args.password, args.meta_host, args.meta_port, args.meta_user,
args.meta_password, args.meta_database)
for db_name, status in results:
print(f"{db_name}: {status}")
elif args.mode == 'delete':
generate_delete_patch(args.host, args.port, args.user, args.password, args.meta_host, args.meta_port, args.meta_user,
args.meta_password, args.meta_database, args.patch_file)
elif args.mode == 'full':
print ("generating update patch")
generate_update_patch(args.meta_host, args.meta_port, args.meta_user, args.meta_password, args.meta_database, args.patch_file)
print ("generating delete patch")
generate_delete_patch(args.host, args.port, args.user, args.password, args.meta_host, args.meta_port, args.meta_user,
args.meta_password, args.meta_database, args.patch_file)
print("checking results")
results = check_databases(args.host, args.port, args.user, args.password, args.meta_host, args.meta_port, args.meta_user,
args.meta_password, args.meta_database)
absent_cores = [db_name for db_name, status in results if status == 'absent']
print("Cores without genome_uuids:", absent_cores)

if __name__ == "__main__":
main()
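
A hedged sketch of calling the new script's check routine programmatically (hosts, credentials and the metadata database name below are placeholders, not values from this PR):

# Sketch only; connection details are placeholders.
from ensembl.production.metadata.scripts.genome_uuid_manager import check_databases

results = check_databases(
    core_host="core-db.example.org", core_port=3306,
    core_user="reader", core_password="secret",
    meta_host="meta-db.example.org", meta_port=3306,
    meta_user="reader", meta_password="secret",
    metadata_database="ensembl_genome_metadata",
)
for db_name, status in results:
    # status is 'match', 'mismatch' or 'absent', as returned by check_databases()
    print(db_name, status)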
25 changes: 24 additions & 1 deletion src/ensembl/production/metadata/updater/core.py
@@ -461,7 +461,6 @@ def get_or_new_assembly(self, species_id, meta_session, source=None, existing=No
assembly_default=self.get_meta_single_meta_key(species_id, "assembly.default"),
tol_id=tol_id,
created=func.now(),
ensembl_name=self.get_meta_single_meta_key(species_id, "assembly.name"),
assembly_uuid=str(uuid.uuid4()),
url_name=self.get_meta_single_meta_key(species_id, "assembly.url_name"),
is_reference=is_reference
@@ -540,7 +539,31 @@ def get_or_new_genebuild(self, species_id, meta_session, source=None, existing=F
dataset_source = source

dataset_type = meta_session.query(DatasetType).filter(DatasetType.name == "genebuild").first()

genebuild_start_date = self.get_meta_single_meta_key(species_id, "genebuild.start_date")
genebuild_provider_name = self.get_meta_single_meta_key(species_id, "genebuild.provider_name")

test_status = meta_session.query(Dataset).filter(Dataset.label == genebuild_accession).one_or_none()
if test_status:
# Check for genebuild.provider_name
provider_name_check = meta_session.query(DatasetAttribute).join(Attribute).filter(
DatasetAttribute.dataset_id == test_status.dataset_id,
Attribute.name == "genebuild.provider_name",
DatasetAttribute.value == genebuild_provider_name
).one_or_none()

if provider_name_check:
# Check for genebuild.start_date
start_date_check = meta_session.query(DatasetAttribute).join(Attribute).filter(
DatasetAttribute.dataset_id == test_status.dataset_id,
Attribute.name == "genebuild.start_date",
DatasetAttribute.value == genebuild_start_date
).one_or_none()

if start_date_check is None:
test_status = None


if test_status is not None and existing is False:
genebuild_dataset = test_status
genebuild_dataset_attributes = genebuild_dataset.dataset_attributes
4 changes: 3 additions & 1 deletion src/tests/databases/core_1/meta.txt
@@ -19,4 +19,6 @@
19 1 sample.location_param KB871578.1:9766653-9817473
20 1 strain.type test
21 1 assembly.test_value test
22 1 genebuild.test_value test
22 1 genebuild.test_value test
23 1 genebuild.provider_name test
24 1 genebuild.start_date test
4 changes: 3 additions & 1 deletion src/tests/databases/core_2/meta.txt
@@ -18,4 +18,6 @@
18 1 sample.gene_param ENSAMXG00005000318
19 1 sample.location_param KB871578.1:9766653-9817473
20 1 strain.type test
21 1 genome.genome_uuid test
21 1 genome.genome_uuid test
23 1 genebuild.provider_name test
24 1 genebuild.start_date test
4 changes: 3 additions & 1 deletion src/tests/databases/core_3/meta.txt
@@ -16,4 +16,6 @@
17 1 genebuild.version 1
18 1 sample.gene_param ENSAMXG00005000318
19 1 sample.location_param KB871578.1:9766653-9817473
20 1 strain.type test
20 1 strain.type test
23 1 genebuild.provider_name test
24 1 genebuild.start_date test
4 changes: 3 additions & 1 deletion src/tests/databases/core_4/meta.txt
@@ -17,4 +17,6 @@
17 1 genebuild.version 2
18 1 sample.gene_param ENSAMXG00005000318
19 1 sample.location_param KB871578.1:9766653-9817473
20 1 strain.type test
20 1 strain.type test
23 1 genebuild.provider_name test
24 1 genebuild.start_date test
2 changes: 2 additions & 0 deletions src/tests/databases/core_5/meta.txt
@@ -14,3 +14,5 @@
18 1 sample.gene_param ENSAMXG00005000318
19 1 sample.location_param KB871578.1:9766653-9817473
20 1 strain.type test
23 1 genebuild.provider_name test
24 1 genebuild.start_date test
4 changes: 3 additions & 1 deletion src/tests/databases/core_6/meta.txt
@@ -18,4 +18,6 @@
18 1 sample.gene_param ENSAMXG00005000318
19 1 sample.location_param KB871578.1:9766653-9817473
20 1 strain.type test
21 1 genome.genome_uuid 90720316-006c-470b-a7dd-82d28f952264
21 1 genome.genome_uuid 90720316-006c-470b-a7dd-82d28f952264
23 1 genebuild.provider_name test
24 1 genebuild.start_date test
4 changes: 3 additions & 1 deletion src/tests/databases/core_7/meta.txt
@@ -19,4 +19,6 @@
19 1 sample.location_param KB871578.1:9766653-9817473
20 1 strain.type test
21 1 assembly.test_value test2
22 1 genebuild.test_value test2
22 1 genebuild.test_value test2
23 1 genebuild.provider_name test
24 1 genebuild.start_date test
4 changes: 3 additions & 1 deletion src/tests/databases/core_8/meta.txt
@@ -19,4 +19,6 @@
19 1 sample.location_param KB871578.1:9766653-9817473
20 1 strain.type test
21 1 assembly.test_value test
22 1 genebuild.test_value test
22 1 genebuild.test_value test
23 1 genebuild.provider_name test
24 1 genebuild.start_date test
4 changes: 3 additions & 1 deletion src/tests/databases/core_9/meta.txt
@@ -20,4 +20,6 @@
20 1 strain.type test
21 1 assembly.test_value test3
22 1 genebuild.test_value test3
23 1 genome.genome_uuid a733550b-93e7-11ec-a39d-005056b38ce3
23 1 genome.genome_uuid a733550b-93e7-11ec-a39d-005056b38ce3
24 1 genebuild.provider_name test
24 1 genebuild.start_date test
4 changes: 2 additions & 2 deletions src/tests/test_api.py
@@ -48,9 +48,9 @@ def test_get_public_path_genebuild(self, multi_dbs):
assert path[0] == 'Saccharomyces_cerevisiae_S288c/GCA_000146045.2/test_anno_source/genome'
path = genome.get_public_path(type='variation')
assert path[0] == 'Saccharomyces_cerevisiae_S288c/GCA_000146045.2/test_anno_source/variation'
path = genome.get_public_path(type='homology')
path = genome.get_public_path(type='homologies')
assert path[0] == 'Saccharomyces_cerevisiae_S288c/GCA_000146045.2/test_anno_source/homology'
path = genome.get_public_path(type='regulation')
path = genome.get_public_path(type='regulatory_features')
assert path[0] == 'Saccharomyces_cerevisiae_S288c/GCA_000146045.2/test_anno_source/regulation'

def test_organism_ensembl_name_compat(self, multi_dbs):