Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions sql/assembly_sequence.sql
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
-- Because the attrib_type and external_db tables are identical
-- Because the attrib_type.txt and external_db tables are identical
-- across all dbs, and in sync with the production master copy,
-- we can use IDs directly, and avoid some complicated outer
-- join statements...
-- external_db.external_db_id 50710 = INSDC
-- attrib_type.attrib_type_id 6 = toplevel
-- attrib_type.attrib_type_id 367 = karyotype_rank
-- attrib_type.attrib_type_id 547 = sequence_location
-- attrib_type.txt.attrib_type_id 6 = toplevel
-- attrib_type.txt.attrib_type_id 367 = karyotype_rank
-- attrib_type.txt.attrib_type_id 547 = sequence_location

-- Unfortunately, the sequence_location attribute in the core dbs
-- isn't set with the values you might expect; it has '*_chromosome'
Expand Down
7 changes: 7 additions & 0 deletions src/ensembl/production/metadata/api/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,19 @@
from ensembl.database import DBConnection


# TODO: Add an OrganismAdaptor. A subfunction should fetch all organisms in the popular group, and the number of genomes from distinct assemblies.
# Add in best genome (see doc).
# Add more functions for related genomes.


class BaseAdaptor:
    """Base class for adaptors that work against the metadata database."""

    def __init__(self, metadata_uri):
        """Create a ``DBConnection`` from ``metadata_uri`` and store it as ``self.metadata_db``."""
        connection = DBConnection(metadata_uri)
        self.metadata_db = connection


def check_parameter(param):
    """Normalise a query parameter to ``None`` or a list.

    * A tuple is reduced to its first element before any other check.
    * ``None`` and lists are returned unchanged (the same list object).
    * Any other value is wrapped in a single-element list.
    """
    value = param[0] if isinstance(param, tuple) else param
    if value is None or isinstance(value, list):
        return value
    return [value]
38 changes: 38 additions & 0 deletions src/ensembl/production/metadata/api/dataset.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
# See the NOTICE file distributed with this work for additional information
# regarding copyright ownership.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sqlalchemy as db
from sqlalchemy.engine import make_url

from ensembl.production.metadata.api.base import BaseAdaptor
from ensembl.production.metadata.api.models import GenomeDataset, Dataset
import logging

logger = logging.getLogger(__name__)


class DatasetAdaptor(BaseAdaptor):
    """Adaptor for dataset-level queries against the metadata database."""

    def __init__(self, metadata_uri):
        super().__init__(metadata_uri)

    def check_release_status(self, dataset_uuid):
        """Report whether the dataset identified by ``dataset_uuid`` is attached to a release.

        Returns the string ``"UUID not found"`` when no ``Dataset`` row has the
        given ``dataset_uuid``. Otherwise returns the result of an EXISTS query:
        true when at least one ``GenomeDataset`` row references the dataset and
        has a non-null ``release_id``, false when none does.
        """
        with self.metadata_db.session_scope() as session:
            # Resolve the public UUID to the internal dataset_id first.
            dataset_id = (
                session.query(Dataset.dataset_id)
                .filter(Dataset.dataset_uuid == dataset_uuid)
                .scalar()
            )
            if dataset_id is None:
                return "UUID not found"

            # Genome/dataset links for this dataset that carry a release_id.
            released_links = session.query(GenomeDataset).filter(
                GenomeDataset.dataset_id == dataset_id,
                GenomeDataset.release_id.isnot(None),
            )
            return session.query(released_links.exists()).scalar()
2 changes: 1 addition & 1 deletion src/ensembl/production/metadata/api/genome.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,11 +30,11 @@ def __init__(self, metadata_uri, taxonomy_uri=None):

def fetch_taxonomy_names(self, taxonomy_ids):

taxonomy_ids = check_parameter(taxonomy_ids)
taxons = {}
for tid in taxonomy_ids:
names = {"scientific_name": None, "synonym": []}
taxons[tid] = names

for taxon in taxons:
sci_name_select = db.select(
NCBITaxaName.name
Expand Down
4 changes: 2 additions & 2 deletions src/ensembl/production/metadata/api/models/assembly.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,9 +35,9 @@ class Assembly(Base):
alt_accession = Column(String(16), nullable=True)
# One to many relationships
# assembly_id within assembly_sequence
assembly_sequences = relationship("AssemblySequence", back_populates="assembly")
assembly_sequences = relationship("AssemblySequence", back_populates="assembly", cascade="all, delete, delete-orphan")
# assembly_id within genome
genomes = relationship("Genome", back_populates="assembly")
genomes = relationship("Genome", back_populates="assembly", cascade="all, delete, delete-orphan")


class AssemblySequence(Base):
Expand Down
4 changes: 2 additions & 2 deletions src/ensembl/production/metadata/api/models/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,8 +47,8 @@ class Dataset(Base):

# One to many relationships
# dataset_id to dataset attribute and genome dataset
dataset_attributes = relationship("DatasetAttribute", back_populates='dataset')
genome_datasets = relationship("GenomeDataset", back_populates='dataset')
dataset_attributes = relationship("DatasetAttribute", back_populates='dataset', cascade="all, delete, delete-orphan")
genome_datasets = relationship("GenomeDataset", back_populates='dataset', cascade="all, delete, delete-orphan")
# many to one relationships
# dataset_type_id to dataset_type
dataset_type = relationship('DatasetType', back_populates="datasets")
Expand Down
4 changes: 2 additions & 2 deletions src/ensembl/production/metadata/api/models/genome.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,8 @@ class Genome(Base):
created = Column(DATETIME(fsp=6), nullable=False)
# One to many relationships
# genome_id to genome_dataset and genome release
genome_datasets = relationship("GenomeDataset", back_populates="genome")
genome_releases = relationship("GenomeRelease", back_populates="genome")
genome_datasets = relationship("GenomeDataset", back_populates="genome", cascade="all, delete, delete-orphan")
genome_releases = relationship("GenomeRelease", back_populates="genome", cascade="all, delete, delete-orphan")
# many to one relationships
# assembly_id to assembly
assembly = relationship("Assembly", back_populates="genomes")
Expand Down
2 changes: 1 addition & 1 deletion src/ensembl/production/metadata/api/models/organism.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ class Organism(Base):
scientific_parlance_name = Column(String(255))
# One to many relationships
# Organism_id to organism_group_member and genome
genomes = relationship("Genome", back_populates="organism")
genomes = relationship("Genome", back_populates="organism", cascade="all, delete, delete-orphan")
organism_group_members = relationship("OrganismGroupMember", back_populates="organism")

# many to one relationships
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -56,13 +56,15 @@
56 homology_coverage Coverage Percent of genome which is homologous to another species percent
57 short_variants Short variants Small-scale genetic variations integer
58 structural_variants Structural variants Large-scale genetic variations integer
59 short_variants_with_phenotype_assertions "Short variants
With phenotype assertions" Short variants with phenotypic evidence integer
60 short_variants_with_publications "Short variants
With publications" Short variants published in literature integer
61 short_variants_frequency_studies "Short variants
Frequency studies" Short variants studied for frequency integer
62 structural_variants_with_phenotype_assertions "Structural variants
With phenotype assertions" Structural variants with phenotypic evidence integer
59 short_variants_with_phenotype_assertions "Short variants With phenotype assertions" Short variants with phenotypic evidence integer
60 short_variants_with_publications "Short variants With publications" Short variants published in literature integer
61 short_variants_frequency_studies "Short variants Frequency studies" Short variants studied for frequency integer
62 structural_variants_with_phenotype_assertions "Structural variants With phenotype assertions" Structural variants with phenotypic evidence integer
63 enhancers Enhancers DNA sequences that increase gene expression integer
64 promoters Promoters DNA sequences initiating transcription integer
65 assembly.accession accession accession string
66 assembly.default default default string
67 assembly.name name name string
68 assembly.ucsc_alias ucsc_alias ucsc_alias string
69 genebuild.last_geneset_update last_geneset_update last_geneset_update string
70 genebuild.version version version string
Original file line number Diff line number Diff line change
Expand Up @@ -321,6 +321,10 @@
2301119 Rhabditomorpha scientific name
2698737 Sar Burki et al. 2008 authority
2698737 Sar scientific name
666668 jabberwocky synonym
666668 carol_jabberwocky3 equivalent name
666668 carol_jabberwocky2 equivalent name
666668 carol_jabberwocky scientific name
2698737 SAR supergroup synonym
38820 4478 merged_taxon_id
38820 4727 merged_taxon_id
Expand Down
32 changes: 28 additions & 4 deletions src/ensembl/production/metadata/updater/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from sqlalchemy.engine import make_url

from ensembl.core.models import Meta
from ensembl.production.metadata.api.models import DatasetSource
from ensembl.database import DBConnection
from ensembl.production.metadata.api.models import EnsemblRelease

Expand All @@ -21,17 +22,15 @@ class BaseMetaUpdater:
def __init__(self, db_uri, metadata_uri, taxonomy_uri, release=None):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Loaded genomes are not assigned to any release, so an extra step is needed to assign the genomes to a release. Will this be handled in future developments? Also, taxonomy_uri is not used; is it required for future purposes?

self.db_uri = db_uri
self.db = DBConnection(self.db_uri)
self.species = None
self.db_type = None
self.metadata_db = DBConnection(metadata_uri)
# We will add a release later. For now, the release must be specified for it to be used.
if release is None:
self.listed_release = None
self.listed_release_is_current = None
else:
self.listed_release = release
self.listed_release_is_current = EnsemblRelease.is_current
self.metadata_db = DBConnection(metadata_uri)
self.taxonomy_uri = taxonomy_uri


# Basic API for the meta table in the submission database.
def get_meta_single_meta_key(self, species_id, parameter):
Expand All @@ -43,4 +42,29 @@ def get_meta_single_meta_key(self, species_id, parameter):
else:
return result[0]

def get_meta_list_from_prefix_meta_key(self, species_id, prefix):
    """Return ``{meta_key: meta_value}`` for meta rows whose key starts with ``prefix``.

    Only rows whose ``species_id`` equals the given one are considered.
    ``prefix`` is interpolated directly into a SQL ``LIKE`` pattern
    (``prefix`` followed by ``%``), so any ``%`` or ``_`` it contains acts as
    a wildcard. If several rows share a key, the last one returned by the
    query wins. An empty dict is returned when nothing matches.
    """
    with self.db.session_scope() as session:
        rows = (
            session.query(Meta.meta_key, Meta.meta_value)
            .filter(
                Meta.meta_key.like(f'{prefix}%'),
                Meta.species_id == species_id,
            )
            .all()
        )
        # No rows naturally produces an empty dict, so no special case is needed.
        return {meta_key: meta_value for meta_key, meta_value in rows}

def get_or_new_source(self, meta_session, db_uri, db_type):
name = make_url(db_uri).database
dataset_source = meta_session.query(DatasetSource).filter(DatasetSource.name == name).one_or_none()
if dataset_source is None:
dataset_source = DatasetSource(
type=db_type, # core/fungen etc
name=name # dbname
)
meta_session.add(dataset_source) # Only add a new DatasetSource to the session if it doesn't exist
return dataset_source, "new"
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Status

Suggested change
return dataset_source, "new"
return dataset_source, "New"

else:
return dataset_source, "existing"
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
return dataset_source, "existing"
return dataset_source, "Existing"

Alternatively, defining an Enum class for these status values would be more standard.

Loading