Ensembl · dpopleton · Mar 12, 2024 · Mar 6, 2024 · Mar 12, 2024 · Mar 12, 2024
diff --git a/src/ensembl/production/metadata/api/factories/datasets.py b/src/ensembl/production/metadata/api/factories/datasets.py
@@ -30,8 +30,18 @@ def create_all_child_datasets(self, session, dataset_uuid):
         top_level_dataset = self.__get_dataset(session, dataset_uuid)
         self.__create_child_datasets_recursive(session, top_level_dataset)
 
-    def create_dataset(self, session, genome_uuid, dataset_source, dataset_type, dataset_attributes, name, label,
-                       version):
+    def create_dataset(self, session, genome_input, dataset_source, dataset_type, dataset_attributes, name, label,
+                       version, status="Submitted"):
+        # Resolve genome_input: a UUID string (looked up in the session), a Genome object, or None (no genome link)
+        if isinstance(genome_input, str):
+            genome = session.query(Genome).filter(Genome.genome_uuid == genome_input).one()
+        elif isinstance(genome_input, Genome):
+            genome = genome_input
+        elif genome_input is None:
+            genome = None
+        else:
+            raise ValueError("Invalid genome input. Must be either a UUID string or a Genome object.")
+
         new_dataset = Dataset(
             dataset_uuid=str(uuid.uuid4()),
             dataset_type=dataset_type,  # Must be an object returned from the current session
@@ -40,18 +50,24 @@ def create_dataset(self, session, genome_uuid, dataset_source, dataset_type, dat
             label=label,
             created=func.now(),
             dataset_source=dataset_source,  # Must
-            status="Submitted",
+            status=status,
         )
-        genome = session.query(Genome).filter(Genome.genome_uuid == genome_uuid).one()
-        new_genome_dataset = GenomeDataset(
-            genome=genome,
-            dataset=new_dataset,
-            is_current=False,
-        )
-        new_dataset_attributes = update_attributes(new_dataset, dataset_attributes, session)
-        session.add(new_genome_dataset)
+        if dataset_attributes is not None:
+            new_dataset_attributes = update_attributes(new_dataset, dataset_attributes, session)
+        else:
+            new_dataset_attributes = None
         dataset_uuid = new_dataset.dataset_uuid
-        return dataset_uuid, new_dataset_attributes, new_genome_dataset
+
+        if genome is not None:
+            new_genome_dataset = GenomeDataset(
+                genome=genome,
+                dataset=new_dataset,
+                is_current=False,
+            )
+            session.add(new_genome_dataset)
+            return dataset_uuid, new_dataset, new_dataset_attributes, new_genome_dataset
+        else:
+            return dataset_uuid, new_dataset, new_dataset_attributes, None
 
     def get_parent_datasets(self, dataset_uuid, **kwargs):
         session = kwargs.get('session')
@@ -122,7 +138,14 @@ def __create_child_datasets_recursive(self, session, parent_dataset):
             DatasetType.parent == parent_dataset_type.dataset_type_id).all()
 
         for child_type in child_dataset_types:
-            # Example placeholders for dataset properties
+            # Check if a dataset with the same type and genome exists
+            existing_datasets = session.query(Dataset).join(GenomeDataset).filter(
+                Dataset.dataset_type_id == child_type.dataset_type_id,
+                GenomeDataset.genome_id.in_([gd.genome_id for gd in parent_dataset.genome_datasets])
+            ).all()
+            if any(d.status in ['Submitted', 'Processing'] for d in existing_datasets):
+                continue  # Skip creation if a dataset of this type is already Submitted or Processing
+
             if len(parent_dataset.genome_datasets) > 1:
                 raise ValueError("More than one genome linked to a genome_dataset")
 
@@ -136,7 +159,8 @@ def __create_child_datasets_recursive(self, session, parent_dataset):
             version = None
 
             # Create the child dataset
-            child_dataset_uuid, new_dataset_attributes, new_genome_dataset = self.create_dataset(session, genome_uuid,
+            child_dataset_uuid, new_dataset, new_dataset_attributes, new_genome_dataset = self.create_dataset(session,
+                                                                                                              genome_uuid,
                                                                                                  dataset_source,
                                                                                                  dataset_type,
                                                                                                  dataset_attributes,

diff --git a/src/ensembl/production/metadata/scripts/genome_uuid_manager.py b/src/ensembl/production/metadata/scripts/genome_uuid_manager.py
@@ -1,5 +1,18 @@
+# See the NOTICE file distributed with this work for additional information
+#   regarding copyright ownership.
+#   Licensed under the Apache License, Version 2.0 (the "License");
+#   you may not use this file except in compliance with the License.
+#   You may obtain a copy of the License at
+#       http://www.apache.org/licenses/LICENSE-2.0
+#   Unless required by applicable law or agreed to in writing, software
+#   distributed under the License is distributed on an "AS IS" BASIS,
+#   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#   See the License for the specific language governing permissions and
+#   limitations under the License.
+
 import argparse
 import logging
+
 import mysql.connector
 
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

diff --git a/src/ensembl/production/metadata/updater/base.py b/src/ensembl/production/metadata/updater/base.py
@@ -9,14 +9,11 @@
 #   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 #   See the License for the specific language governing permissions and
 #   limitations under the License.
-import sqlalchemy as db
+from ensembl.database import DBConnection
 from sqlalchemy import inspect
 from sqlalchemy.engine import make_url
 
-from ensembl.core.models import Meta
-from ensembl.production.metadata.api.exceptions import UpdaterException
-from ensembl.production.metadata.api.models import DatasetSource, Attribute, DatasetAttribute, Dataset
-from ensembl.database import DBConnection
+from ensembl.production.metadata.api.models import DatasetSource
 from ensembl.production.metadata.api.models import EnsemblRelease
 
 
@@ -44,9 +41,11 @@ def is_object_new(self, obj):
         insp = inspect(obj)
         return insp.transient or insp.pending
 
-    def get_or_new_source(self, meta_session, db_type):
+    def get_or_new_source(self, meta_session, db_type, name=None):
         db_uri = self.db_uri
-        name = make_url(db_uri).database
+        if name is None:
+            # For core databases
+            name = make_url(db_uri).database
         dataset_source = meta_session.query(DatasetSource).filter(DatasetSource.name == name).one_or_none()
         if dataset_source is None:
             dataset_source = DatasetSource(