Skip to content

Commit

Permalink
Merge pull request #283 from CristiGuijarro/fix/compara_fks
Browse files Browse the repository at this point in the history
Remove duplicate fk tests and fix constraint issues
  • Loading branch information
james-monkeyshines committed Aug 3, 2020
2 parents e989e1a + 0105f52 commit 3a4cd63
Show file tree
Hide file tree
Showing 5 changed files with 169 additions and 72 deletions.
44 changes: 41 additions & 3 deletions lib/Bio/EnsEMBL/DataCheck/Checks/CheckConstrainedElementTable.pm
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ use strict;
use Moose;
use Test::More;
use Bio::EnsEMBL::DataCheck::Test::DataCheck;
use Bio::EnsEMBL::Utils::SqlHelper;

extends 'Bio::EnsEMBL::DataCheck::DbCheck';

Expand All @@ -33,17 +34,54 @@ use constant {
GROUPS => ['compara', 'compara_genome_alignments'],
DATACHECK_TYPE => 'critical',
DB_TYPES => ['compara'],
TABLES => ['constrained_elements']
TABLES => ['constrained_elements', 'method_link_species_set']
};

# Decide whether this datacheck should be skipped for the current database.
#
# Returns the list (1, $reason) when the database contains no
# GERP_CONSTRAINED_ELEMENT method_link_species_sets. Otherwise there is no
# explicit return: the sub ends on the conditional, so the caller receives
# the (false) result of the comparison.
sub skip_tests {
  my ($self) = @_;

  my $gerp_sets = $self->dba
                       ->get_MethodLinkSpeciesSetAdaptor
                       ->fetch_all_by_method_link_type('GERP_CONSTRAINED_ELEMENT');
  my $database  = $self->dba->dbc->dbname;

  # The numeric comparison is deliberately the last expression evaluated, so
  # the "do not skip" path yields a false value.
  if ( scalar( @{$gerp_sets} ) == 0 ) {
    return( 1, "There are no GERP_CONSTRAINED_ELEMENT MLSS in $database" );
  }
}

# Run the constrained element checks:
#   1. constrained_element_id is one-to-many within constrained_element;
#   2. every GERP_CONSTRAINED_ELEMENT MLSS has at least one row in
#      constrained_element;
#   3. the total number of rows in constrained_element equals the sum of the
#      per-MLSS counts, i.e. no row belongs to any other MLSS.
sub tests {
  my ($self) = @_;
  my $dba    = $self->dba;
  my $helper = $dba->dbc->sql_helper;

  my $desc = "All the rows in constrained_element have a one-to-many relationship for constrained_element_id";
  is_one_to_many($dba->dbc, "constrained_element", "constrained_element_id", $desc);

  my $mlsss = $dba->get_MethodLinkSpeciesSetAdaptor->fetch_all_by_method_link_type('GERP_CONSTRAINED_ELEMENT');

  # Start at 0, not undef: if the MLSS list is ever empty the final is_rows
  # check must still compare against a real number (an empty table is then
  # the expected state) instead of being handed an undefined expectation.
  my $expected_ce_count = 0;

  foreach my $mlss ( @$mlsss ) {
    my $mlss_id   = $mlss->dbID;
    my $mlss_name = $mlss->name;

    my $sql = qq/
SELECT COUNT(*)
FROM constrained_element
WHERE method_link_species_set_id = $mlss_id
/;

    # Accumulate the per-MLSS count for the table-wide comparison below.
    $expected_ce_count += $helper->execute_single_result(-SQL => $sql);

    my $desc_1 = "The constrained elements for $mlss_id ($mlss_name) are present as expected";
    is_rows_nonzero($dba, $sql, $desc_1);
  }

  # Any surplus row in the table belongs to an MLSS that is not a
  # GERP_CONSTRAINED_ELEMENT one, which makes the totals disagree.
  my $desc_2 = "All the constrained elements with corresponding method_link_species_sets are expected";
  my $row_count_sql = "SELECT COUNT(*) FROM constrained_element";
  is_rows($dba, $row_count_sql, $expected_ce_count, $desc_2);
}

1;
Expand Down
98 changes: 98 additions & 0 deletions lib/Bio/EnsEMBL/DataCheck/Checks/CheckGenomicAlignments.pm
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
=head1 LICENSE
Copyright [2018-2020] EMBL-European Bioinformatics Institute
Licensed under the Apache License, Version 2.0 (the 'License');
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an 'AS IS' BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
=cut

package Bio::EnsEMBL::DataCheck::Checks::CheckGenomicAlignments;

use warnings;
use strict;

use Moose;
use Test::More;
use Bio::EnsEMBL::DataCheck::Test::DataCheck;
use Bio::EnsEMBL::Utils::SqlHelper;

extends 'Bio::EnsEMBL::DataCheck::DbCheck';

# Metadata constants for the CheckGenomicAlignments datacheck.
# NAME, DESCRIPTION, GROUPS and DATACHECK_TYPE carry the same values as the
# "CheckGenomicAlignments" entry in lib/Bio/EnsEMBL/DataCheck/index.json.
# NOTE(review): DB_TYPES and TABLES are presumably consumed by the DbCheck
# base class (database type filtering / table dependency tracking) - confirm.
use constant {
NAME => 'CheckGenomicAlignments',
DESCRIPTION => 'The expected number of genomic alignments have been merged',
GROUPS => ['compara', 'compara_genome_alignments'],
DATACHECK_TYPE => 'critical',
DB_TYPES => ['compara'],
TABLES => ['method_link_species_set', 'genomic_align', 'genomic_align_block']
};

# Decide whether this datacheck should be skipped for the current database.
#
# Collects the method_link_species_sets of every genomic alignment method
# type (LASTZ_NET, LASTZ_PATCH, EPO, EPO_EXTENDED, PECAN) and returns the list
# (1, $reason) when none exist. Otherwise there is no explicit return: the sub
# ends on the conditional, so the caller receives the (false) result of the
# comparison.
sub skip_tests {
  my ($self) = @_;

  my $adaptor = $self->dba->get_MethodLinkSpeciesSetAdaptor;

  # Flatten the per-method array references into a single list.
  my @alignment_sets =
    map { @{ $adaptor->fetch_all_by_method_link_type($_) } }
    qw(LASTZ_NET LASTZ_PATCH EPO EPO_EXTENDED PECAN);

  my $database = $self->dba->dbc->dbname;

  # The numeric comparison is deliberately the last expression evaluated, so
  # the "do not skip" path yields a false value.
  if ( scalar(@alignment_sets) == 0 ) {
    return( 1, "There are no genomic alignment MLSS in $database" );
  }
}

# Run the genomic alignment checks:
#   1. for each of genomic_align and genomic_align_block, every MLSS of a
#      genomic alignment method type (LASTZ_NET, LASTZ_PATCH, EPO,
#      EPO_EXTENDED, PECAN) has at least one row;
#   2. the total number of rows in genomic_align equals the sum of the
#      per-MLSS genomic_align counts, i.e. no row belongs to any other MLSS.
sub tests {
  my ($self) = @_;
  my $dba    = $self->dba;
  my $helper = $dba->dbc->sql_helper;

  my @method_links = qw(LASTZ_NET LASTZ_PATCH EPO EPO_EXTENDED PECAN);
  my @tables       = qw(genomic_align genomic_align_block);

  # Look the MLSSs up once, in method type order, instead of once per table.
  # Method types without any MLSS simply contribute nothing here; whether the
  # MLSSs themselves are correct is covered by other datachecks.
  my $mlss_adaptor    = $dba->get_MethodLinkSpeciesSetAdaptor;
  my @alignment_mlsss =
    map { @{ $mlss_adaptor->fetch_all_by_method_link_type($_) } } @method_links;

  # Start at 0, not undef, so the final is_rows check always compares against
  # a real number even when no MLSS was found.
  my $expected_align_count = 0;

  foreach my $table (@tables) {
    foreach my $mlss (@alignment_mlsss) {
      my $mlss_id   = $mlss->dbID;
      my $mlss_name = $mlss->name;

      my $sql = qq/
SELECT COUNT(*)
FROM $table
WHERE method_link_species_set_id = $mlss_id
/;

      # Only genomic_align rows feed the table-wide comparison below.
      $expected_align_count += $helper->execute_single_result(-SQL => $sql) if $table eq "genomic_align";

      my $desc_1 = "The $table for $mlss_id ($mlss_name) has rows as expected";
      is_rows_nonzero($dba, $sql, $desc_1);
    }
  }

  # Check that all the genomic_aligns correspond to a method_link_species_set
  # that should have an alignment.
  my $desc_2 = "All the genomic_align rows with corresponding method_link_species_sets are expected";
  my $row_count_sql = "SELECT COUNT(*) FROM genomic_align";
  is_rows($dba, $row_count_sql, $expected_align_count, $desc_2);
}

1;
20 changes: 19 additions & 1 deletion lib/Bio/EnsEMBL/DataCheck/Checks/CheckSynteny.pm
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ extends 'Bio::EnsEMBL::DataCheck::DbCheck';

use constant {
NAME => 'CheckSynteny',
DESCRIPTION => 'Every synteny_region_id should be seen more than once',
DESCRIPTION => 'Every synteny_region_id should be seen more than once and correspond to an mlss',
GROUPS => ['compara', 'compara_syntenies'],
DATACHECK_TYPE => 'critical',
DB_TYPES => ['compara'],
Expand Down Expand Up @@ -64,6 +64,24 @@ sub tests {

my $desc_2 = "All synteny_region_ids have been seen more than once";
is_one_to_many( $dbc, "dnafrag_region", "synteny_region_id", $desc_2 );

my $mlss_adap = $self->dba->get_MethodLinkSpeciesSetAdaptor;
my $mlsss = $mlss_adap->fetch_all_by_method_link_type('SYNTENY');

foreach my $mlss ( @$mlsss ) {

my $mlss_id = $mlss->dbID;
my $mlss_name = $mlss->name;

my $sql = qq/
SELECT COUNT(*)
FROM synteny_region
WHERE method_link_species_set_id = $mlss_id
/;

my $desc_3 = "The syntenies for $mlss_id ($mlss_name) are present as expected";
is_rows_nonzero($dbc, $sql, $desc_3);
}
}

1;
Expand Down
67 changes: 0 additions & 67 deletions lib/Bio/EnsEMBL/DataCheck/Checks/ForeignKeysCompara.pm
Original file line number Diff line number Diff line change
Expand Up @@ -95,14 +95,9 @@ sub compara_fk {
# in the "table.sql" file.

# Standard FK constraints that are missing from "table.sql".
fk($self->dba, 'species_tree_node', 'parent_id', 'species_tree_node', 'node_id');
fk($self->dba, 'species_tree_node', 'root_id', 'species_tree_node', 'node_id');
fk($self->dba, 'species_tree_node', 'root_id', 'species_tree_root');

fk($self->dba, 'genomic_align_tree', 'parent_id', 'genomic_align_tree', 'node_id');
fk($self->dba, 'genomic_align_tree', 'root_id', 'genomic_align_tree', 'node_id');
fk($self->dba, 'genomic_align_tree', 'left_node_id', 'genomic_align_tree', 'node_id');
fk($self->dba, 'genomic_align_tree', 'right_node_id', 'genomic_align_tree', 'node_id');

# Cases in which we want to check for the reverse direction of the FK constraint
fk($self->dba, 'family', 'family_id', 'family_member');
Expand All @@ -118,68 +113,6 @@ sub compara_fk {
}

# Cases in which we need to restrict to a subset of rows, using a constraint
my $genomic_align_constraint = q/
method_link_id IN (
SELECT method_link_id FROM method_link
WHERE
method_link_id < 100 AND
class LIKE "GenomicAlign%" AND
type NOT LIKE "CACTUS_HAL%"
)
/;
fk($self->dba, 'genomic_align', 'method_link_species_set_id', 'method_link_species_set', 'method_link_species_set_id', $genomic_align_constraint);
fk($self->dba, 'genomic_align_block', 'method_link_species_set_id', 'method_link_species_set', 'method_link_species_set_id', $genomic_align_constraint);

my $constrained_element_constraint = q/
method_link_id IN (
SELECT method_link_id FROM method_link
WHERE
method_link_id < 100 AND
class LIKE "ConstrainedElement.%"
)
/;
fk($self->dba, 'constrained_element', 'method_link_species_set_id', 'method_link_species_set', 'method_link_species_set_id', $constrained_element_constraint);

my $synteny_region_constraint = q/
method_link_id IN (
SELECT method_link_id FROM method_link
WHERE
method_link_id > 100 AND
method_link_id < 200
)
/;
fk($self->dba, 'synteny_region', 'method_link_species_set_id', 'method_link_species_set', 'method_link_species_set_id', $synteny_region_constraint);

my $homology_constraint = q/
method_link_id IN (
SELECT method_link_id FROM method_link
WHERE
method_link_id > 200 AND
method_link_id < 300
)
/;
fk($self->dba, 'homology', 'method_link_species_set_id', 'method_link_species_set', 'method_link_species_set_id', $homology_constraint);

my $family_constraint = q/
method_link_id IN (
SELECT method_link_id FROM method_link
WHERE
method_link_id > 300 AND
method_link_id < 400
)
/;
fk($self->dba, 'family', 'method_link_species_set_id', 'method_link_species_set', 'method_link_species_set_id', $family_constraint);

my $tree_constraint = q/
method_link_id IN (
SELECT method_link_id FROM method_link
WHERE
method_link_id > 400 AND
method_link_id < 500
)
/;
fk($self->dba, 'gene_tree_root', 'method_link_species_set_id', 'method_link_species_set', 'method_link_species_set_id', $tree_constraint);
fk($self->dba, 'species_tree_root', 'method_link_species_set_id', 'method_link_species_set', 'method_link_species_set_id', $tree_constraint);

my $hom_stats_constraint = q/
tree_type = 'tree' AND
Expand Down
12 changes: 11 additions & 1 deletion lib/Bio/EnsEMBL/DataCheck/index.json
Original file line number Diff line number Diff line change
Expand Up @@ -343,6 +343,16 @@
"name" : "CheckGenomicAlignTreeTable",
"package_name" : "Bio::EnsEMBL::DataCheck::Checks::CheckGenomicAlignTreeTable"
},
"CheckGenomicAlignments" : {
"datacheck_type" : "critical",
"description" : "The expected number of genomic alignments have been merged",
"groups" : [
"compara",
"compara_genome_alignments"
],
"name" : "CheckGenomicAlignments",
"package_name" : "Bio::EnsEMBL::DataCheck::Checks::CheckGenomicAlignments"
},
"CheckHomology" : {
"datacheck_type" : "critical",
"description" : "Check homology_id are all one-to-many for homology_members",
Expand Down Expand Up @@ -519,7 +529,7 @@
},
"CheckSynteny" : {
"datacheck_type" : "critical",
"description" : "Every synteny_region_id should be seen more than once",
"description" : "Every synteny_region_id should be seen more than once and correspond to an mlss",
"groups" : [
"compara",
"compara_syntenies"
Expand Down

0 comments on commit 3a4cd63

Please sign in to comment.