Skip to content

Commit

Permalink
Merge pull request #400 from Ensembl/release/105
Browse files Browse the repository at this point in the history
Release/105
  • Loading branch information
marcoooo committed Sep 8, 2021
2 parents 713bf05 + 8e9fd09 commit 46896ca
Show file tree
Hide file tree
Showing 23 changed files with 621 additions and 123 deletions.
26 changes: 16 additions & 10 deletions lib/Bio/EnsEMBL/DataCheck/Checks/CheckHomology.pm
Original file line number Diff line number Diff line change
Expand Up @@ -44,16 +44,22 @@ sub tests {
is_one_to_many($dbc, "homology_member", "homology_id", $desc_1);
### Hoping for a better idea than this query (below)
my $hideous_sql = q/
SELECT hm1.gene_member_id gene_member_id1, hm2.gene_member_id gene_member_id2, COUNT(*) num,
GROUP_CONCAT(h1.description order by h1.description) descs
FROM homology h1
CROSS JOIN homology_member hm1
USING (homology_id)
CROSS JOIN homology_member hm2
USING (homology_id)
WHERE hm1.gene_member_id < hm2.gene_member_id
GROUP BY hm1.gene_member_id, hm2.gene_member_id
HAVING COUNT(*) > 1
SELECT
hm1.gene_member_id gene_member_id1,
hm2.gene_member_id gene_member_id2,
COUNT(*) num,
GROUP_CONCAT(h1.description
ORDER BY h1.description) descs
FROM
homology h1
CROSS JOIN
homology_member hm1 USING (homology_id)
CROSS JOIN
homology_member hm2 USING (homology_id)
WHERE
hm1.gene_member_id < hm2.gene_member_id
GROUP BY h1.gene_tree_root_id, hm1.gene_member_id, hm2.gene_member_id
HAVING COUNT(*) > 1
/;

my $desc_2 = "There is no redundancy in homology";
Expand Down
14 changes: 6 additions & 8 deletions lib/Bio/EnsEMBL/DataCheck/Checks/CheckOntologyTerm.pm
Original file line number Diff line number Diff line change
Expand Up @@ -39,15 +39,13 @@ use constant {
# Check that every critical ontology term is present in the term table.
#
# One test is emitted per accession, so a failure names the exact term that
# is missing. Each test is delegated to is_rows (imported, not defined in
# this file), which is given the query, the expected value 1 and a
# description.
sub tests {
  my ($self) = @_;

  # Accessions are listed bare; the quotes are added where the SQL is built.
  my @critical_terms = qw/EFO:0003900/;

  foreach my $accession (@critical_terms) {
    my $desc = "Critical term $accession is present in term table";
    my $sql  = "SELECT accession FROM term WHERE accession = '$accession'";
    is_rows($self->dba, $sql, 1, $desc);
  }
}

1;
Expand Down
58 changes: 25 additions & 33 deletions lib/Bio/EnsEMBL/DataCheck/Checks/ChromosomesAnnotated.pm
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ use strict;

use Moose;
use Test::More;
use Bio::EnsEMBL::DataCheck::Test::DataCheck;
use Bio::EnsEMBL::DataCheck::Utils qw/sql_count/;

extends 'Bio::EnsEMBL::DataCheck::DbCheck';
Expand All @@ -35,7 +36,7 @@ use constant {
TABLES => ['attrib_type', 'coord_system', 'seq_region', 'seq_region_attrib']
};

sub skip_tests {
sub tests {
my ($self) = @_;

my $sa = $self->dba->get_adaptor('Slice');
Expand All @@ -45,53 +46,44 @@ sub skip_tests {

my @chromosomal = ('chromosome', 'chromosome_group', 'plasmid');

my $chr_count = 0;
foreach my $cs_name (@chromosomal) {
my $slices = $sa->fetch_all($cs_name, $cs_version);
foreach (@$slices) {
# seq_regions that are not genuine biological chromosomes,
# but are instead collections of unmapped sequence,
# have a 'chromosome' attribute - these regions do not
# necessarily need a karyotype_rank attribute.
my @non_bio_chr = @{$_->get_all_Attributes('chromosome')};
if (! scalar(@non_bio_chr)) {
$chr_count++;
}
}
}

if ( $chr_count <= 1 ) {
return (1, 'Zero or one chromosomal seq_regions.');
}
}

sub tests {
my ($self) = @_;

my $sa = $self->dba->get_adaptor('Slice');

my $mca = $self->dba->get_adaptor('MetaContainer');
my $cs_version = $mca->single_value_by_key('assembly.default');

my @chromosomal = ('chromosome', 'chromosome_group', 'plasmid');

foreach my $cs_name (@chromosomal) {
my $slices = $sa->fetch_all($cs_name, $cs_version);
foreach (@$slices) {
my @non_bio_chr = @{$_->get_all_Attributes('chromosome')};
next if scalar(@non_bio_chr);

my $sr_name = $_->seq_region_name;
my $desc = "$cs_name $sr_name has 'karyotype_rank' attribute";
ok($_->has_karyotype, $desc);

if ($sr_name =~ /^(chrM|chrMT|MT|Mito|mitochondrion_genome)$/) {
my $desc_mt = "$cs_name $sr_name has mitochondrial 'sequence_location' attribute";
my %seq_locs = map { $_->value => 1 } @{$_->get_all_Attributes('sequence_location')};
ok(exists $seq_locs{'mitochondrial_chromosome'}, $desc_mt);
}
}
}

$self->karyotype_rank_cardinality();
}

# Assert that no region carries more than one 'karyotype_rank' attribute.
#
# This is a separate check because 'primary_assembly' regions need to be
# tested as well as those marked as 'chromosomes'.
sub karyotype_rank_cardinality {
  my ($self) = @_;

  # Selects each seq_region_id that has two or more 'karyotype_rank' rows
  # in seq_region_attrib.
  my $duplicates_sql = q{
SELECT seq_region_id, COUNT(*) FROM
seq_region_attrib sra INNER JOIN
attrib_type at USING (attrib_type_id)
WHERE
at.code = 'karyotype_rank'
GROUP BY
sra.seq_region_id
HAVING COUNT(*) > 1;
};

  my $test_name  = "Regions have only one 'karyotype_rank' attribute";
  my $diag_label = "Regions with multiple 'karyotype_rank' attributes";

  is_rows_zero($self->dba, $duplicates_sql, $test_name, $diag_label);
}

1;
12 changes: 12 additions & 0 deletions lib/Bio/EnsEMBL/DataCheck/Checks/CompareGOXref.pm
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ use strict;
use Moose;
use Test::More;
use Bio::EnsEMBL::DataCheck::Test::DataCheck;
use Bio::EnsEMBL::DataCheck::Utils qw(same_assembly same_geneset);

extends 'Bio::EnsEMBL::DataCheck::DbCheck';

Expand All @@ -42,6 +43,17 @@ sub tests {
my $old_dba = $self->get_old_dba();

skip 'No old version of database', 1 unless defined $old_dba;

my $mca = $self->dba->get_adaptor('MetaContainer');
my $old_mca = $old_dba->get_adaptor('MetaContainer');

if (!same_assembly($mca, $old_mca)) {
skip 'Current DB has different assembly', 1;
}

if (!same_geneset($mca, $old_mca)) {
skip 'Current DB has different geneset', 1;
}

$self->go_xref_counts($old_dba);
}
Expand Down
4 changes: 2 additions & 2 deletions lib/Bio/EnsEMBL/DataCheck/Checks/CompareOntologyTerm.pm
Original file line number Diff line number Diff line change
Expand Up @@ -29,9 +29,9 @@ extends 'Bio::EnsEMBL::DataCheck::DbCheck';

# Datacheck metadata, declared as constants.
# Each key is listed exactly once: a repeated key in this hash would be
# silently overridden by its last occurrence.
use constant {
  NAME           => 'CompareOntologyTerm',
  DESCRIPTION    => 'Compare namespace counts between current and previous ontology database',
  GROUPS         => ['ontologies'],
  DATACHECK_TYPE => 'advisory',
  DB_TYPES       => ['ontology'],
  TABLES         => ['ontology', 'term']
};
Expand Down
58 changes: 58 additions & 0 deletions lib/Bio/EnsEMBL/DataCheck/Checks/CompareOntologyTotal.pm
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
=head1 LICENSE
Copyright [2018-2021] EMBL-European Bioinformatics Institute
Licensed under the Apache License, Version 2.0 (the 'License');
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an 'AS IS' BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
=cut

package Bio::EnsEMBL::DataCheck::Checks::CompareOntologyTotal;

use warnings;
use strict;

use Moose;
use Test::More;
use Bio::EnsEMBL::DataCheck::Test::DataCheck;

extends 'Bio::EnsEMBL::DataCheck::DbCheck';

# Datacheck metadata for CompareOntologyTotal, declared as constants.
use constant {
  NAME           => 'CompareOntologyTotal',
  DESCRIPTION    => 'Compare total counts between current and previous ontology database',
  GROUPS         => [qw(ontologies)],
  DATACHECK_TYPE => 'critical',
  DB_TYPES       => [qw(ontology)],
  TABLES         => [qw(ontology term)],
};

# Compare per-ontology term counts in the current database with those in
# the previous release's database.
#
# The counting query groups terms by ontology name; the comparison itself
# is delegated to row_subtotals (imported, not defined in this file).
# NOTE(review): the 1.00 argument looks like a minimum proportion of the
# old count that must be retained - confirm against row_subtotals.
sub tests {
  my ($self) = @_;

  # Inherited code from DbCheck will always fail if the previous
  # release's database cannot be found, so there is no need to test
  # for a missing old database here.
  my $previous_dba = $self->get_old_dba();

  my $current_name  = $self->dba->dbc->dbname;
  my $previous_name = $previous_dba->dbc->dbname;
  my $test_name =
    "Ontology term totals have not decreased in $current_name compared to $previous_name";

  my $totals_sql = q{
SELECT ontology.name, COUNT(*) FROM
term INNER JOIN
ontology USING (ontology_id)
GROUP BY ontology.name
};

  row_subtotals($self->dba, $previous_dba, $totals_sql, undef, 1.00, $test_name);
}

1;
12 changes: 12 additions & 0 deletions lib/Bio/EnsEMBL/DataCheck/Checks/CompareProjectedGOXrefs.pm
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ use strict;
use Moose;
use Test::More;
use Bio::EnsEMBL::DataCheck::Test::DataCheck;
use Bio::EnsEMBL::DataCheck::Utils qw(same_assembly same_geneset);

extends 'Bio::EnsEMBL::DataCheck::DbCheck';

Expand All @@ -44,6 +45,17 @@ sub tests {

skip 'No old version of database', 1 unless defined $old_dba;

my $mca = $self->dba->get_adaptor('MetaContainer');
my $old_mca = $old_dba->get_adaptor('MetaContainer');

if (!same_assembly($mca, $old_mca)) {
skip 'Current DB has different assembly', 1;
}

if (!same_geneset($mca, $old_mca)) {
skip 'Current DB has different geneset', 1;
}

$self->go_xref_counts($old_dba);
}
}
Expand Down
12 changes: 12 additions & 0 deletions lib/Bio/EnsEMBL/DataCheck/Checks/CompareProjectedGeneNames.pm
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ use strict;
use Moose;
use Test::More;
use Bio::EnsEMBL::DataCheck::Test::DataCheck;
use Bio::EnsEMBL::DataCheck::Utils qw(same_assembly same_geneset);

extends 'Bio::EnsEMBL::DataCheck::DbCheck';

Expand All @@ -44,6 +45,17 @@ sub tests {

skip 'No old version of database', 1 unless defined $old_dba;

my $mca = $self->dba->get_adaptor('MetaContainer');
my $old_mca = $old_dba->get_adaptor('MetaContainer');

if (!same_assembly($mca, $old_mca)) {
skip 'Current DB has different assembly', 1;
}

if (!same_geneset($mca, $old_mca)) {
skip 'Current DB has different geneset', 1;
}

$self->projected_gene_name_counts($old_dba);
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -197,7 +197,7 @@ sub taxonomy_tables {
my ($self, $helper, $tables) = @_;

my $desc_1 = "Taxonomy database found";
my $taxonomy_dba = ($self->registry->alias_exists('multi')) ? $self->get_dba('multi', 'taxonomy') : $self->get_dba('ncbi_taxonomy', 'taxonomy');
my $taxonomy_dba = $self->get_dba('multi', 'taxonomy') ? $self->get_dba('multi', 'taxonomy') : $self->get_dba('ncbi_taxonomy', 'taxonomy');

if (ok(defined $taxonomy_dba, $desc_1)) {
my $taxonomy_helper = $taxonomy_dba->dbc->sql_helper;
Expand Down
50 changes: 50 additions & 0 deletions lib/Bio/EnsEMBL/DataCheck/Checks/DisplayNameFormat.pm
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
=head1 LICENSE
Copyright [2018-2021] EMBL-European Bioinformatics Institute
Licensed under the Apache License, Version 2.0 (the 'License');
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an 'AS IS' BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
=cut

package Bio::EnsEMBL::DataCheck::Checks::DisplayNameFormat;

use warnings;
use strict;

use Moose;
use Test::More;

extends 'Bio::EnsEMBL::DataCheck::DbCheck';

# Datacheck metadata for DisplayNameFormat, declared as constants.
use constant {
  NAME        => 'DisplayNameFormat',
  DESCRIPTION => 'For Rapid Release, the display name must be a specific format',
  GROUPS      => [qw(rapid_release)],
  DB_TYPES    => [qw(core)],
  TABLES      => [qw(meta)],
};

# Check that the 'species.display_name' meta value has the expected format:
#
#   <alphanumerics/spaces> (<alphanumerics/spaces>) - GCA_<digits>.<digits>
#
# The whole value must match; nothing may precede or follow the pattern.
sub tests {
  my ($self) = @_;

  my $mca = $self->dba->get_adaptor("MetaContainer");

  # Check that the format of the display name conforms to expectations.
  my $format = '[A-Za-z0-9 ]+ \([A-Za-z0-9 ]+\) \- GCA_\d+\.\d+';

  my $desc = "Display name has correct format";
  my $display_name = $mca->single_value_by_key('species.display_name');

  # Anchor with \A and \z rather than ^ and $: '$' also matches just before
  # a trailing newline, which would let a value ending in "\n" pass.
  like($display_name, qr/\A$format\z/, $desc);
}

1;
3 changes: 2 additions & 1 deletion lib/Bio/EnsEMBL/DataCheck/Checks/GeneBounds.pm
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,8 @@ sub bounds_check {
t.seq_region_start = 0 OR
t.seq_region_end > sr.length
) AND
cs.species_id = $species_id
cs.species_id = $species_id AND
at.code IS NULL
/;
is_rows_zero($self->dba, $sql, $desc, $diag);
}
Expand Down
8 changes: 5 additions & 3 deletions lib/Bio/EnsEMBL/DataCheck/Checks/Karyotype.pm
Original file line number Diff line number Diff line change
Expand Up @@ -57,15 +57,17 @@ sub tests {

my $sa = $self->dba->get_adaptor('Slice');

my $cs_name = 'chromosome';

my $slices = $sa->fetch_all($cs_name, undef, undef, 1);
my $slices = $sa->fetch_all('toplevel', undef, undef, 1);
foreach my $slice (@$slices) {
next unless $slice->karyotype_rank;

my $sr_name = $slice->seq_region_name;
next if $sr_name eq 'MT';

my $bands = $slice->get_all_KaryotypeBands;

my $cs_name = $slice->coord_system_name;

my $desc_1 = "$cs_name $sr_name has karyotype bands";
ok(scalar(@$bands), $desc_1);

Expand Down

0 comments on commit 46896ca

Please sign in to comment.