Skip to content

Commit

Permalink
Merge c7e5504 into f3af6c6
Browse files Browse the repository at this point in the history
  • Loading branch information
james-monkeyshines committed Jan 29, 2019
2 parents f3af6c6 + c7e5504 commit a69b0d6
Show file tree
Hide file tree
Showing 13 changed files with 457 additions and 39 deletions.
1 change: 1 addition & 0 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ before_install:
- git clone --branch master --depth 1 https://github.com/Ensembl/ensembl.git
- git clone --branch master --depth 1 https://github.com/Ensembl/ensembl-funcgen.git
- git clone --branch version/2.5 --depth 1 https://github.com/Ensembl/ensembl-hive.git
- git clone --branch master --depth 1 https://github.com/Ensembl/ensembl-metadata.git
- git clone --branch master --depth 1 https://github.com/Ensembl/ensembl-test.git
- git clone --branch master --depth 1 https://github.com/Ensembl/ensembl-variation.git
- wget https://github.com/bioperl/bioperl-live/archive/release-1-6-924.zip
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ limitations under the License.
=cut

package Bio::EnsEMBL::DataCheck::Checks::CompareBiotype;
package Bio::EnsEMBL::DataCheck::Checks::CompareBiotypeGroup;

use warnings;
use strict;
Expand All @@ -28,7 +28,7 @@ use Bio::EnsEMBL::DataCheck::Test::DataCheck;
extends 'Bio::EnsEMBL::DataCheck::DbCheck';

use constant {
NAME => 'CompareBiotype',
NAME => 'CompareBiotypeGroup',
DESCRIPTION => 'Check for more than 25% difference between the number of genes '.
'in two databases, broken down by biotype.',
GROUPS => ['core_compare'],
Expand All @@ -45,21 +45,19 @@ sub tests {

skip 'No old version of database', 1 unless defined $old_dba;

diag('Comparing '.$self->dba->dbc->dbname.' and '.$old_dba->dbc->dbname);
diag('Species '.$self->species.', '.$self->dba->species);
my $desc = 'Consistent gene counts';
my $desc = 'Consistent gene counts between '.
$self->dba->dbc->dbname.' and '.$old_dba->dbc->dbname;
my $sql = q/
SELECT biotype, COUNT(*) FROM
gene INNER JOIN
SELECT biotype_group, COUNT(*) FROM
biotype INNER JOIN
gene ON biotype.name = gene.biotype INNER JOIN
seq_region USING (seq_region_id) INNER JOIN
coord_system USING (coord_system_id)
WHERE species_id = %d
GROUP BY biotype
GROUP BY biotype_group
/;
my $sql1 = sprintf($sql, $self->dba->species_id);
my $sql2 = sprintf($sql, $old_dba->species_id);
diag($sql1);
diag($sql2);
row_subtotals($self->dba, $old_dba, $sql1, $sql2, 0.75, $desc);
}
}
Expand Down
82 changes: 57 additions & 25 deletions lib/Bio/EnsEMBL/DataCheck/DbCheck.pm
Original file line number Diff line number Diff line change
Expand Up @@ -312,6 +312,9 @@ sub get_old_dba {
my $self = shift;
my ($species, $group) = @_;

$species = $self->species unless defined $species;
$group = $self->dba->group unless defined $group;

if (!defined $self->old_server_uri) {
die "Old server details must be set as 'old_server_uri' attribute";
}
Expand All @@ -335,10 +338,11 @@ sub get_old_dba {
$db_version = ($mca->schema_version) - 1;
}

if (! exists $params{'-DBNAME'}) {
$species = $self->species unless defined $species;
$group = $self->dba->group unless defined $group;

my $dbh;
if (exists $params{'-DBNAME'}) {
my $message = 'Specified database does not exist';
$dbh = $self->test_db_connection($uri, $params{'-DBNAME'}, $message);
} else {
my $meta_dba = $self->registry->get_DBAdaptor("multi", "metadata");
die "No metadata database found in the registry" unless defined $meta_dba;

Expand All @@ -356,41 +360,66 @@ sub get_old_dba {
my @dbnames = @{$helper->execute_simple(-SQL => $sql, -PARAMS => $params)};

if (scalar(@dbnames) == 1) {
# We need to suffix the species name to comply with uniqueness rules
# (whenvever you create a dba, it adds itself to the registry...)
# This subsequently means that add_species_id functionality doesn't
# work, so we'll need to work out the species_id ourselves.
my $species_id = 'xxx';

$params{'-SPECIES'} = $species.'_old';
$params{'-GROUP'} = $group;
$params{'-DBNAME'} = $dbnames[0];
if ($self->dba->is_multispecies) {
$params{'-SPECIES_ID'} = $species_id;
$params{'-MULTISPECIES_DB'} = 1;
}
} elsif (scalar(@dbnames) == 0) {
warn "No release $db_version $group database for $species";
} else {
$params{'-DBNAME'} = $dbnames[0];
my $message = 'Database in metadata database does not exist';
$dbh = $self->test_db_connection($uri, $params{'-DBNAME'}, $message);
} elsif (scalar(@dbnames) > 1) {
die "Multiple release $db_version $group databases for $species";
}
}

# We allow $old_dba to be undefined if there is no entry in the metadata db;
# $old_dba can be undefined if there is no entry in the metadata db;
# a datacheck could use the undefined-ness to skip tests in this case.
my $old_dba;
if (exists $params{'-DBNAME'}) {
$old_dba = Bio::EnsEMBL::DBSQL::DBAdaptor->new(%params);
unless (defined $old_dba) {
die "Release $db_version of $species $group database not found";
# We need to suffix '_old' to the species name to comply
# with uniqueness rules, and ensure we can distinguish between the
# two databases in the registry.
$params{'-SPECIES'} = $species.'_old';
$params{'-GROUP'} = $group;

# Because we have added a suffix to the species name,
# the DBAdaptor code can't work out the correct species_id,
# so we need to work out the species_id and pass it explicitly.
my $sql = qq/
SELECT species_id FROM meta
WHERE
meta_key = 'species.production_name' AND
meta_value = '$species'
/;
my $vals = $dbh->selectcol_arrayref($sql);
my $species_id = $vals->[0];
$params{'-SPECIES_ID'} = $species_id;

# We assume that if the new db is multispecies,
# the old one will be too.
if ($self->dba->is_multispecies) {
$params{'-MULTISPECIES_DB'} = 1;
}

$old_dba = Bio::EnsEMBL::DBSQL::DBAdaptor->new(%params);

push @{$self->dba_list}, $old_dba;
}

return $old_dba;
}

# Check that a database can be reached, and hand back the open handle.
#
# Arguments:
#   $uri     - object providing host, port, user and pass accessors for
#              the server to connect to
#   $dbname  - name of the database on that server
#   $message - text used as the prefix of the error if the connection fails
#
# Returns the DBI database handle on success.
# Dies with "$message: <dsn>" followed by the DBI error string on failure.
sub test_db_connection {
  my ($self, $uri, $dbname, $message) = @_;

  my $dsn = sprintf(
    'DBI:mysql:database=%s;host=%s;port=%s',
    $dbname, $uri->host, $uri->port
  );

  # PrintError is switched off so that a failed connection is reported
  # only through the exception raised below.
  my $dbh = DBI->connect($dsn, $uri->user, $uri->pass, { PrintError => 0 });
  return $dbh if defined $dbh;

  my $err = $DBI::errstr;
  die "$message: $dsn\n$err";
}

sub run_datacheck {
my $self = shift;

Expand Down Expand Up @@ -434,7 +463,10 @@ sub run_datacheck {
$self->dba($original_dba);

} else {
my $label = $self->per_db ? 'all species in collection' : $self->species;
my $label = $self->species;
if ($self->per_db && $self->dba->is_multispecies) {
$label = 'all species in collection';
}

subtest $label => sub {
SKIP: {
Expand Down
1 change: 1 addition & 0 deletions lib/Bio/EnsEMBL/DataCheck/Pipeline/EmailSummary.pm
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ sub fetch_input {
my $sql = q/
SELECT dbname, passed, failed, skipped FROM datacheck_results
WHERE submission_job_id = ?
ORDER BY dbname
/;
my $sth = $self->dbc->prepare($sql);
$sth->execute($submission_job_id);
Expand Down
6 changes: 3 additions & 3 deletions lib/Bio/EnsEMBL/DataCheck/index.json
Original file line number Diff line number Diff line change
Expand Up @@ -107,14 +107,14 @@
"name" : "ChromosomesAnnotated",
"package_name" : "Bio::EnsEMBL::DataCheck::Checks::ChromosomesAnnotated"
},
"CompareBiotype" : {
"CompareBiotypeGroup" : {
"datacheck_type" : "advisory",
"description" : "Check for more than 25% difference between the number of genes in two databases, broken down by biotype.",
"groups" : [
"core_compare"
],
"name" : "CompareBiotype",
"package_name" : "Bio::EnsEMBL::DataCheck::Checks::CompareBiotype"
"name" : "CompareBiotypeGroup",
"package_name" : "Bio::EnsEMBL::DataCheck::Checks::CompareBiotypeGroup"
},
"CompareSchema" : {
"datacheck_type" : "critical",
Expand Down
140 changes: 140 additions & 0 deletions t/DbCheck_old.t
Original file line number Diff line number Diff line change
@@ -0,0 +1,140 @@
# Copyright [2018-2019] EMBL-European Bioinformatics Institute
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

use strict;
use warnings;

use Bio::EnsEMBL::DataCheck::DbCheck;
use Bio::EnsEMBL::Test::MultiTestDB;

use FindBin; FindBin::again();
use Path::Tiny;
use Test::Exception;
use Test::More;

use lib "$FindBin::Bin/TestChecks";
use DbCheck_1;
use DbCheck_2;
use DbCheck_3;
use DbCheck_4;
use DbCheck_5;

# Directory holding this test script; handed to MultiTestDB as the
# location of the test database configuration.
my $test_db_dir = $FindBin::Bin;
# Class name that the adaptor returned by get_old_dba is checked against.
my $dba_type = 'Bio::EnsEMBL::DBSQL::DBAdaptor';

# Species and database group of the test database used by every scenario.
my $species = 'drosophila_melanogaster';
my $db_type = 'core';
my $testdb = Bio::EnsEMBL::Test::MultiTestDB->new($species, $test_db_dir);

# Adaptor for the 'new' database; passed as 'dba' to each datacheck below.
my $dba = $testdb->get_DBAdaptor($db_type);

# Note that you cannot, by design, create a DbCheck object; datachecks
# must inherit from it and define mandatory, read-only parameters that
# are specific to that particular datacheck. So there's a limited amount
# of testing that we can do on the base class, the functionality is
# tested on a subclass.

# NOTE(review): $module is not referenced anywhere else in the visible
# part of this script - confirm whether it is still needed.
my $module = 'Bio::EnsEMBL::DataCheck::DbCheck';

# Exercises get_old_dba on a DbCheck subclass: one successful lookup,
# followed by each of the expected failure modes, and finally the case
# in which undef is returned.
subtest 'Fetch old DBA', sub {
# Getting a proper 'old' server is a pain,
# we just pretend that the test server is it.
# The connection settings are read from the 'conf' entry of the
# MultiTestDB object for the core database.
my %conf = %{$$testdb{conf}{$db_type}};
my $driver = $conf{driver};
my $host = $conf{host};
my $port = $conf{port};
my $user = $conf{user};
my $pass = $conf{pass};

# Server URI with a trailing slash, so that a database name (or other
# path component) can be appended directly in the scenarios below.
my $server_uri = "$driver://$user:$pass\@$host:$port/";

# Need a test metadata db for retrieving the name of
# a previous release's database.
my $multi = Bio::EnsEMBL::Test::MultiTestDB->new('multi', $test_db_dir);
my $metadata_dba = $multi->get_DBAdaptor('metadata');

# Scenario 1: old_server_uri names an existing database explicitly
# (the test database itself), so an adaptor is expected back.
my $check = TestChecks::DbCheck_1->new(
dba => $dba,
server_uri => $server_uri,
old_server_uri => $server_uri.$dba->dbc->dbname,
);

# The test databases are added to the registry via MultiTestDB; but
# the datacheck code removes them as part of its standard monkeying
# around, and their names are such that they are not picked up when
# the registry is subsequently loaded. So, we need to pre-load the
# registry, then add the metadata DBA back.
$check->load_registry();
$check->registry->add_DBAdaptor('multi', 'metadata', $metadata_dba);

my $old_dba = $check->get_old_dba();

isa_ok($old_dba, $dba_type, 'Return value of "get_old_dba"');
is($old_dba->species, "${species}_old", 'Species has "_old" suffix');

# Scenario 2: no old_server_uri at all; get_old_dba is expected to die.
$check = TestChecks::DbCheck_1->new(
dba => $dba,
server_uri => $server_uri,
);
throws_ok(
sub { $check->get_old_dba },
qr/Old server details must be set/,
'Fail if old_server_uri is not set');

# Scenario 3: old_server_uri has no database name, and the metadata
# adaptor has not been added to this check's registry, so the lookup
# of the old database name is expected to fail.
$check = TestChecks::DbCheck_1->new(
dba => $dba,
server_uri => $server_uri,
old_server_uri => $server_uri,
);

throws_ok(
sub { $check->get_old_dba },
qr/No metadata database found in the registry/,
'Fail if metadata database does not exist');

# Scenario 4: old_server_uri names a database that is not on the server.
$check = TestChecks::DbCheck_1->new(
dba => $dba,
server_uri => $server_uri,
old_server_uri => $server_uri.'rhubarb_and_custard',
);

throws_ok(
sub { $check->get_old_dba },
qr/Specified database does not exist/,
'Fail if specified database does not exist');

# Scenario 5: old_server_uri ends in '95' rather than a database name.
# NOTE(review): presumably this is interpreted as a release number, with
# the database name then taken from the metadata database - confirm
# against get_old_dba. The test expects the databases named there to be
# absent from the test server.
$check = TestChecks::DbCheck_1->new(
dba => $dba,
server_uri => $server_uri,
old_server_uri => $server_uri.'95',
);

# As in scenario 1, the metadata adaptor has to be added back to the
# registry after it has been loaded.
$check->load_registry();
$check->registry->add_DBAdaptor('multi', 'metadata', $metadata_dba);

throws_ok(
sub { $check->get_old_dba },
qr/Database in metadata database does not exist/,
'Fail if database from metadata database does not exist (1/2)');

# Same failure, but with the species and group passed explicitly.
throws_ok(
sub { $check->get_old_dba('strigamia_maritima', 'core') },
qr/Database in metadata database does not exist/,
'Fail if database from metadata database does not exist (2/2)');

# Scenario 6: a species the test expects to have no entry in the
# metadata database; get_old_dba should return undef rather than die.
$old_dba = $check->get_old_dba('dinanthropoides_nivalis', 'core');
ok(! defined $old_dba, 'undef if no information in metadata database');
};

done_testing();
3 changes: 3 additions & 0 deletions t/MultiTestDB.conf.default
Original file line number Diff line number Diff line change
Expand Up @@ -15,5 +15,8 @@
'funcgen' => 'Bio::EnsEMBL::Funcgen::DBSQL::DBAdaptor',
'variation' => 'Bio::EnsEMBL::Variation::DBSQL::DBAdaptor',
},
'multi' => {
'metadata' => 'Bio::EnsEMBL::MetaData::DBSQL::MetaDataDBAdaptor',
},
},
}
1 change: 1 addition & 0 deletions t/test-genome-DBs/multi/metadata/data_release.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
1 95 \N 2019-01-01 0
2 changes: 2 additions & 0 deletions t/test-genome-DBs/multi/metadata/genome.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
1 1 1 1 dmel_r6.17_FB2017_04 1 1 0 1 1 0 1
2 1 2 2 Ensembl Genomes v1.0 1 1 0 1 0 0 1
2 changes: 2 additions & 0 deletions t/test-genome-DBs/multi/metadata/genome_database.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
1 1 drosophila_melanogaster_core_95_6 1 core
2 2 strigamia_maritima_core_42_95_1 1 core
2 changes: 2 additions & 0 deletions t/test-genome-DBs/multi/metadata/organism.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
1 7227 0 7227 drosophila_melanogaster Drosophila_melanogaster Drosophila melanogaster Drosophila melanogaster \N \N \N \N
2 126957 0 126957 strigamia_maritima Strigamia_maritima Strigamia maritima Strigamia maritima \N \N \N \N

0 comments on commit a69b0d6

Please sign in to comment.