Ensembl · dpopleton · Aug 9, 2023 · Jul 11, 2023 · Jul 12, 2023 · Jul 13, 2023
diff --git a/sql/assembly_sequence.sql b/sql/assembly_sequence.sql
@@ -1,11 +1,11 @@
--- Because the attrib_type and external_db tables are identical
+-- Because the attrib_type and external_db tables are identical
 -- across all dbs, and in sync with the production master copy,
 -- we can use IDs directly, and avoid some complicated outer
 -- join statements...
 -- external_db.external_db_id 50710 = INSDC
--- attrib_type.attrib_type_id 6 = toplevel
--- attrib_type.attrib_type_id 367 = karyotype_rank
--- attrib_type.attrib_type_id 547 = sequence_location
+-- attrib_type.attrib_type_id 6 = toplevel
+-- attrib_type.attrib_type_id 367 = karyotype_rank
+-- attrib_type.attrib_type_id 547 = sequence_location
 
 -- Unfortunately, the sequence_location attribute in the core dbs
 -- isn't set with the values you might expect; it has '*_chromosome'

diff --git a/src/ensembl/production/metadata/api/base.py b/src/ensembl/production/metadata/api/base.py
@@ -12,12 +12,19 @@
 from ensembl.database import DBConnection
 
 
+# TODO: Add an OrganismAdaptor whose sub-function fetches all organisms in the popular group and the number of genomes from distinct assemblies.
+# TODO: Add best-genome selection (see doc).
+# TODO: Add more functions for related genomes.
+
+
 class BaseAdaptor:
     def __init__(self, metadata_uri):
         self.metadata_db = DBConnection(metadata_uri)
 
 
 def check_parameter(param):
+    if  isinstance(param, tuple):
+        param = param[0]
     if param is not None and not isinstance(param, list):
         param = [param]
     return param
diff --git a/src/ensembl/production/metadata/api/dataset.py b/src/ensembl/production/metadata/api/dataset.py
@@ -0,0 +1,38 @@
+# See the NOTICE file distributed with this work for additional information
+#   regarding copyright ownership.
+#   Licensed under the Apache License, Version 2.0 (the "License");
+#   you may not use this file except in compliance with the License.
+#   You may obtain a copy of the License at
+#       http://www.apache.org/licenses/LICENSE-2.0
+#   Unless required by applicable law or agreed to in writing, software
+#   distributed under the License is distributed on an "AS IS" BASIS,
+#   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#   See the License for the specific language governing permissions and
+#   limitations under the License.
+import sqlalchemy as db
+from sqlalchemy.engine import make_url
+
+from ensembl.production.metadata.api.base import BaseAdaptor
+from ensembl.production.metadata.api.models import GenomeDataset, Dataset
+import logging
+
+logger = logging.getLogger(__name__)
+
+
+class DatasetAdaptor(BaseAdaptor):
+    def __init__(self, metadata_uri):
+        super().__init__(metadata_uri)
+
+    def check_release_status(self, dataset_uuid):
+        with self.metadata_db.session_scope() as session:
+            # Look up the internal dataset_id for the given dataset_uuid
+            dataset_id = session.query(Dataset.dataset_id).filter(Dataset.dataset_uuid == dataset_uuid).scalar()
+            if dataset_id is None:
+                return "UUID not found"
+
+            # Now we check if there exists a genome dataset with the corresponding dataset_id and a non-null release_id
+            result = session.query(
+                session.query(GenomeDataset).filter(GenomeDataset.dataset_id == dataset_id,
+                                                    GenomeDataset.release_id.isnot(None)).exists()
+            ).scalar()
+            return result
diff --git a/src/ensembl/production/metadata/api/genome.py b/src/ensembl/production/metadata/api/genome.py
@@ -30,11 +30,11 @@ def __init__(self, metadata_uri, taxonomy_uri=None):
 
     def fetch_taxonomy_names(self, taxonomy_ids):
 
+        taxonomy_ids = check_parameter(taxonomy_ids)
         taxons = {}
         for tid in taxonomy_ids:
             names = {"scientific_name": None, "synonym": []}
             taxons[tid] = names
-
         for taxon in taxons:
             sci_name_select = db.select(
                 NCBITaxaName.name

diff --git a/src/ensembl/production/metadata/api/models/assembly.py b/src/ensembl/production/metadata/api/models/assembly.py
@@ -35,9 +35,9 @@ class Assembly(Base):
     alt_accession = Column(String(16), nullable=True)
     # One to many relationships
     # assembly_id within assembly_sequence
-    assembly_sequences = relationship("AssemblySequence", back_populates="assembly")
+    assembly_sequences = relationship("AssemblySequence", back_populates="assembly", cascade="all, delete, delete-orphan")
     # assembly_id within genome
-    genomes = relationship("Genome", back_populates="assembly")
+    genomes = relationship("Genome", back_populates="assembly", cascade="all, delete, delete-orphan")
 
 
 class AssemblySequence(Base):

diff --git a/src/ensembl/production/metadata/api/models/dataset.py b/src/ensembl/production/metadata/api/models/dataset.py
@@ -47,8 +47,8 @@ class Dataset(Base):
 
     # One to many relationships
     # dataset_id to dataset attribute and genome dataset
-    dataset_attributes = relationship("DatasetAttribute", back_populates='dataset')
-    genome_datasets = relationship("GenomeDataset", back_populates='dataset')
+    dataset_attributes = relationship("DatasetAttribute", back_populates='dataset', cascade="all, delete, delete-orphan")
+    genome_datasets = relationship("GenomeDataset", back_populates='dataset', cascade="all, delete, delete-orphan")
     # many to one relationships
     # dataset_type_id to dataset_type
     dataset_type = relationship('DatasetType', back_populates="datasets")

diff --git a/src/ensembl/production/metadata/api/models/genome.py b/src/ensembl/production/metadata/api/models/genome.py
@@ -28,8 +28,8 @@ class Genome(Base):
     created = Column(DATETIME(fsp=6), nullable=False)
     # One to many relationships
     # genome_id to genome_dataset and genome release
-    genome_datasets = relationship("GenomeDataset", back_populates="genome")
-    genome_releases = relationship("GenomeRelease", back_populates="genome")
+    genome_datasets = relationship("GenomeDataset", back_populates="genome", cascade="all, delete, delete-orphan")
+    genome_releases = relationship("GenomeRelease", back_populates="genome", cascade="all, delete, delete-orphan")
     # many to one relationships
     # assembly_id to assembly
     assembly = relationship("Assembly", back_populates="genomes")

diff --git a/src/ensembl/production/metadata/api/models/organism.py b/src/ensembl/production/metadata/api/models/organism.py
@@ -33,7 +33,7 @@ class Organism(Base):
     scientific_parlance_name = Column(String(255))
     # One to many relationships
     # Organism_id to organism_group_member and genome
-    genomes = relationship("Genome", back_populates="organism")
+    genomes = relationship("Genome", back_populates="organism", cascade="all, delete, delete-orphan")
     organism_group_members = relationship("OrganismGroupMember", back_populates="organism")
 
     # many to one relationships

diff --git a/src/ensembl/production/metadata/api/sample/ensembl_metadata/attribute.txt b/src/ensembl/production/metadata/api/sample/ensembl_metadata/attribute.txt
@@ -56,13 +56,15 @@
 56	homology_coverage	Coverage	Percent of genome which is homologous to another species	percent
 57	short_variants	Short variants	Small-scale genetic variations	integer
 58	structural_variants	Structural variants	Large-scale genetic variations	integer
-59	short_variants_with_phenotype_assertions	"Short variants
-With phenotype assertions"	Short variants with phenotypic evidence	integer
-60	short_variants_with_publications	"Short variants
-With publications"	Short variants published in literature	integer
-61	short_variants_frequency_studies	"Short variants
-Frequency studies"	Short variants studied for frequency	integer
-62	structural_variants_with_phenotype_assertions	"Structural variants
-With phenotype assertions"	Structural variants with phenotypic evidence	integer
+59	short_variants_with_phenotype_assertions	"Short variants With phenotype assertions"	Short variants with phenotypic evidence	integer
+60	short_variants_with_publications	"Short variants With publications"	Short variants published in literature	integer
+61	short_variants_frequency_studies	"Short variants Frequency studies"	Short variants studied for frequency	integer
+62	structural_variants_with_phenotype_assertions	"Structural variants With phenotype assertions"	Structural variants with phenotypic evidence	integer
 63	enhancers	Enhancers	DNA sequences that increase gene expression	integer
 64	promoters	Promoters	DNA sequences initiating transcription	integer
+65	assembly.accession	accession	accession	string
+66	assembly.default	default	default	string
+67	assembly.name	name	name	string
+68	assembly.ucsc_alias	ucsc_alias	ucsc_alias	string
+69	genebuild.last_geneset_update	last_geneset_update	last_geneset_update	string
+70	genebuild.version	version	version	string
diff --git a/src/ensembl/production/metadata/api/sample/ncbi_taxonomy/ncbi_taxa_name.txt b/src/ensembl/production/metadata/api/sample/ncbi_taxonomy/ncbi_taxa_name.txt
@@ -321,6 +321,10 @@
 2301119	Rhabditomorpha	scientific name
 2698737	Sar Burki et al. 2008	authority
 2698737	Sar	scientific name
+666668	jabberwocky	synonym
+666668	carol_jabberwocky3	equivalent name
+666668	carol_jabberwocky2	equivalent name
+666668	carol_jabberwocky	scientific name
 2698737	SAR supergroup	synonym
 38820	4478	merged_taxon_id
 38820	4727	merged_taxon_id

diff --git a/src/ensembl/production/metadata/updater/base.py b/src/ensembl/production/metadata/updater/base.py
@@ -13,6 +13,7 @@
 from sqlalchemy.engine import make_url
 
 from ensembl.core.models import Meta
+from ensembl.production.metadata.api.models import DatasetSource
 from ensembl.database import DBConnection
 from ensembl.production.metadata.api.models import EnsemblRelease
 
@@ -21,17 +22,15 @@ class BaseMetaUpdater:
     def __init__(self, db_uri, metadata_uri, taxonomy_uri, release=None):
         self.db_uri = db_uri
         self.db = DBConnection(self.db_uri)
-        self.species = None
-        self.db_type = None
+        self.metadata_db = DBConnection(metadata_uri)
         # We will add a release later. For now, the release must be specified for it to be used.
         if release is None:
             self.listed_release = None
             self.listed_release_is_current = None
         else:
             self.listed_release = release
             self.listed_release_is_current = EnsemblRelease.is_current
-        self.metadata_db = DBConnection(metadata_uri)
-        self.taxonomy_uri = taxonomy_uri
+
 
     # Basic API for the meta table in the submission database.
     def get_meta_single_meta_key(self, species_id, parameter):
@@ -43,4 +42,29 @@ def get_meta_single_meta_key(self, species_id, parameter):
             else:
                 return result[0]
 
+    def get_meta_list_from_prefix_meta_key(self, species_id, prefix):
+        with self.db.session_scope() as session:
+            query = session.query(Meta.meta_key, Meta.meta_value).filter(
+                Meta.meta_key.like(f'{prefix}%'),
+                Meta.species_id == species_id
+            )
+            result = query.all()
+            if not result:
+                return {}
+            else:
+                # Build a dictionary out of the results.
+                result_dict = {key: value for key, value in result}
+                return result_dict
 
+    def get_or_new_source(self, meta_session, db_uri, db_type):
+        name = make_url(db_uri).database
+        dataset_source = meta_session.query(DatasetSource).filter(DatasetSource.name == name).one_or_none()
+        if dataset_source is None:
+            dataset_source = DatasetSource(
+                type=db_type,  # core/fungen etc
+                name=name  # dbname
+            )
+            meta_session.add(dataset_source)  # Only add a new DatasetSource to the session if it doesn't exist
+            return dataset_source, "new"
-            return dataset_source, "new"
+            return dataset_source, "New"
-            return dataset_source, "new"
+            return dataset_source, "New"
+        else:
+            return dataset_source, "existing"
-            return dataset_source, "existing"
+            return dataset_source, "Existing"
-            return dataset_source, "existing"
+            return dataset_source, "Existing"