Skip to content

Commit

Permalink
Merge f4f0635 into 5723f6b
Browse files Browse the repository at this point in the history
  • Loading branch information
james-monkeyshines committed Feb 1, 2019
2 parents 5723f6b + f4f0635 commit cc20ebf
Show file tree
Hide file tree
Showing 9 changed files with 423 additions and 113 deletions.
60 changes: 60 additions & 0 deletions lib/Bio/EnsEMBL/DataCheck/Checks/AnalysisDescription.pm
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
=head1 LICENSE
Copyright [2018-2019] EMBL-European Bioinformatics Institute
Licensed under the Apache License, Version 2.0 (the 'License');
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an 'AS IS' BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
=cut

package Bio::EnsEMBL::DataCheck::Checks::AnalysisDescription;

use warnings;
use strict;

use Moose;
use Test::More;
use Bio::EnsEMBL::DataCheck::Test::DataCheck;

extends 'Bio::EnsEMBL::DataCheck::DbCheck';

# Static metadata for this datacheck: its name, a one-line description,
# the groups and database types it is registered under, and the tables
# it lists as relevant.
use constant {
    NAME        => 'AnalysisDescription',
    DESCRIPTION => 'Check that certain analyses have descriptions',
    GROUPS      => [qw/core/],
    DB_TYPES    => [qw/core/],
    TABLES      => [qw/analysis analysis_description gene prediction_transcript transcript/],
};

# For each of the gene, transcript and prediction_transcript tables, count
# the rows belonging to this species (matched through seq_region and
# coord_system) whose analysis_id has no analysis_description row, and
# assert via is_rows_zero that the count is zero.
sub tests {
    my $self = shift;

    my $species_id = $self->dba->species_id;

    for my $table (qw/gene transcript prediction_transcript/) {
        # The LEFT OUTER JOIN leaves ad.analysis_id NULL exactly when the
        # description is missing.
        my $query = qq{
SELECT COUNT(*) FROM
$table LEFT OUTER JOIN
analysis_description ad USING (analysis_id) INNER JOIN
seq_region USING (seq_region_id) INNER JOIN
coord_system USING (coord_system_id)
WHERE
ad.analysis_id IS NULL AND
species_id = $species_id
};
        is_rows_zero(
            $self->dba,
            $query,
            "Analysis descriptions for all ${table}s",
        );
    }
}

1;
145 changes: 145 additions & 0 deletions lib/Bio/EnsEMBL/DataCheck/Checks/DataFilesExist.pm
Original file line number Diff line number Diff line change
@@ -0,0 +1,145 @@
=head1 LICENSE
Copyright [2018-2019] EMBL-European Bioinformatics Institute
Licensed under the Apache License, Version 2.0 (the 'License');
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an 'AS IS' BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
=cut

package Bio::EnsEMBL::DataCheck::Checks::DataFilesExist;

use warnings;
use strict;

use File::Spec::Functions qw/catdir/;
use Moose;
use Test::More;
use Bio::EnsEMBL::DataCheck::Test::DataCheck;

extends 'Bio::EnsEMBL::DataCheck::DbCheck';

# Static metadata for this datacheck: its name, a one-line description,
# the groups and database types it is registered under, and the FORCE flag.
use constant {
    NAME        => 'DataFilesExist',
    DESCRIPTION => 'Check that data files are defined in the database, and that the files exist',
    GROUPS      => [qw/funcgen/],
    DB_TYPES    => [qw/funcgen/],
    FORCE       => 1,
};

# Entry point: run the three data-file checks of this datacheck, in order.
sub tests {
    my $self = shift;

    my @checks = qw/
        alignment_has_bigwig
        segmentation_file_has_bigbed
        data_files_exist
    /;

    for my $check (@checks) {
        $self->$check();
    }
}

# Assert that every alignment referenced by a peak_calling row (as either
# the signal or the control alignment) points, through bigwig_file_id, at a
# data_file row of type BIGWIG registered for the alignment table.
# Offending alignments (id and name) are the rows returned by the query.
sub alignment_has_bigwig {
    my $self = shift;

    # The LEFT OUTER JOIN leaves df.data_file_id NULL when no matching
    # BIGWIG data_file row exists.
    my $query = q{
SELECT
a.alignment_id,
a.name
FROM
alignment a INNER JOIN
peak_calling pc ON (
pc.signal_alignment_id = a.alignment_id OR
pc.control_alignment_id = a.alignment_id
) LEFT OUTER JOIN
(
SELECT data_file_id FROM data_file
WHERE
table_name = 'alignment' AND
file_type = 'BIGWIG'
) df ON a.bigwig_file_id = df.data_file_id
WHERE
df.data_file_id IS NULL
};

    is_rows_zero(
        $self->dba,
        $query,
        'Peak-calling alignment files are defined',
        'Missing BIGWIG file',
    );
}

# Assert that every segmentation_file belonging to the current regulatory
# build has a data_file row of type BIGBED registered against it.
# Offending segmentation files (id and name) are the rows returned by the
# query.
sub segmentation_file_has_bigbed {
    my $self = shift;

    # The LEFT OUTER JOIN leaves df.table_id NULL when no matching BIGBED
    # data_file row exists.
    my $query = q{
SELECT
sf.segmentation_file_id,
sf.name
FROM
segmentation_file sf INNER JOIN
regulatory_build rb USING (regulatory_build_id) LEFT OUTER JOIN
(
SELECT table_id FROM data_file
WHERE
table_name = 'segmentation_file' AND
file_type = 'BIGBED'
) df ON sf.segmentation_file_id = df.table_id
WHERE
rb.is_current = 1 AND
df.table_id IS NULL
};

    is_rows_zero(
        $self->dba,
        $query,
        'Segmentation files are defined',
        'Missing BIGBED file',
    );
}

# Check that every BIGWIG/BIGBED file listed in the data_file table exists
# on disk beneath the species/assembly directory. One test result is
# emitted per distinct data_file.table_name value: it passes only if none
# of that table's files are missing.
sub data_files_exist {
    my $self = shift;

    # TODO: the root directory is hard-coded here; it should be moved into
    # a config file.
    my $root_dir = $self->species_assembly_path(
        '/nfs/panda/ensembl/production/ensemblftp/data_files/'
    );

    my $file_rows = $self->dba->dbc->sql_helper->execute(
        -SQL => q{
SELECT table_name, path FROM data_file
WHERE file_type IN ('BIGWIG', 'BIGBED')
},
    );

    my (%seen_tables, %missing_by_table);
    for my $row (@$file_rows) {
        # The path column needs no undef check: the db schema does not
        # allow it to be NULL.
        my ($table_name, $relative_path) = @$row;
        $seen_tables{$table_name}++;

        my $full_path = catdir($root_dir, $relative_path);
        unless (-e $full_path) {
            push @{ $missing_by_table{$table_name} }, $full_path;
        }
    }

    for my $table_name (keys %seen_tables) {
        # Listing the missing paths as diagnostics is currently disabled:
        #   diag explain $missing_by_table{$table_name};
        ok(!exists($missing_by_table{$table_name}), "All $table_name data files exist");
    }
}

# Build the directory path <data_file_path>/<species>/<assembly.default>.
#
# Args:    $data_file_path - root directory to build the path under.
# Returns: the joined path. The assembly name is the 'assembly.default'
#          value read from the meta container of the DNA database adaptor.
sub species_assembly_path {
    my ($self, $data_file_path) = @_;

    my $species_name = $self->species;
    my $assembly     = $self->get_dna_dba
                            ->get_MetaContainer
                            ->single_value_by_key('assembly.default');

    return catdir($data_file_path, $species_name, $assembly);
}

1;
12 changes: 10 additions & 2 deletions lib/Bio/EnsEMBL/DataCheck/Checks/DisplayableGenes.pm
Original file line number Diff line number Diff line change
Expand Up @@ -39,14 +39,19 @@ use constant {
sub tests {
my ($self) = @_;

my $species_id = $self->dba->species_id;

my $desc_1 = 'All genes have displayable analysis';
my $diag_1 = 'Undisplayed analysis';
my $sql_1 = q/
my $sql_1 = qq/
SELECT analysis.logic_name
FROM gene
INNER JOIN analysis USING (analysis_id)
INNER JOIN analysis_description USING (analysis_id)
INNER JOIN seq_region USING (seq_region_id)
INNER JOIN coord_system USING (coord_system_id)
WHERE analysis_description.displayable = 0
AND coord_system.species_id = $species_id
GROUP BY analysis.logic_name
HAVING COUNT(*) > 1
/;
Expand All @@ -55,12 +60,15 @@ sub tests {

my $desc_2 = 'All genes have associated web_data';
my $diag_2 = 'web_data is not set';
my $sql_2 = q/
my $sql_2 = qq/
SELECT analysis.logic_name
FROM gene
INNER JOIN analysis USING (analysis_id)
INNER JOIN analysis_description USING (analysis_id)
INNER JOIN seq_region USING (seq_region_id)
INNER JOIN coord_system USING (coord_system_id)
WHERE analysis_description.web_data is NULL
AND coord_system.species_id = $species_id
GROUP BY analysis.logic_name
HAVING COUNT(*) > 1
/;
Expand Down
8 changes: 7 additions & 1 deletion lib/Bio/EnsEMBL/DataCheck/Checks/ForeignKeys.pm
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,10 @@ sub core_fk {
fk($self->dba, 'gene', 'gene_id', 'transcript');
fk($self->dba, 'prediction_transcript', 'prediction_transcript_id', 'prediction_exon');
fk($self->dba, 'mapping_session', 'mapping_session_id', 'stable_id_event');
fk($self->dba, 'analysis', 'analysis_id', 'analysis_description');

# I think this one should be enforced, but need to investigate
# downsides, and give people some warning, since a lot of dbs would fail...
#fk($self->dba, 'analysis', 'analysis_id', 'analysis_description');

# Cases in which we need to restrict to a subset of rows, using a constraint
fk($self->dba, 'object_xref', 'ensembl_id', 'gene', 'gene_id', 'ensembl_object_type = "Gene"');
Expand All @@ -114,6 +117,9 @@ sub funcgen_fk {
# Check for incorrect foreign key relationships that are not defined
# in a "foreign_keys.sql" file.

# Cases in which we want to check for the reverse direction of the FK constraint
fk($self->dba, 'read_file', 'read_file_id', 'alignment_read_file');

# Cases in which we need to restrict to a subset of rows, using a constraint
fk($self->dba, 'associated_feature_type', 'table_id', 'external_feature', 'external_feature_id', 'table_name = "external_feature"');
fk($self->dba, 'associated_feature_type', 'table_id', 'regulatory_feature', 'regulatory_feature_id', 'table_name = "regulatory_feature"');
Expand Down
65 changes: 65 additions & 0 deletions lib/Bio/EnsEMBL/DataCheck/Checks/FuncgenAnalysisDescription.pm
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
=head1 LICENSE
Copyright [2018-2019] EMBL-European Bioinformatics Institute
Licensed under the Apache License, Version 2.0 (the 'License');
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an 'AS IS' BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
=cut

package Bio::EnsEMBL::DataCheck::Checks::FuncgenAnalysisDescription;

use warnings;
use strict;

use Moose;
use Test::More;
use Bio::EnsEMBL::DataCheck::Test::DataCheck;

extends 'Bio::EnsEMBL::DataCheck::DbCheck';

# Static metadata for this datacheck: its name, a one-line description,
# the groups and database types it is registered under, and the tables
# it lists as relevant.
use constant {
    NAME        => 'FuncgenAnalysisDescription',
    DESCRIPTION => 'Check that certain features have descriptions and are displayable',
    GROUPS      => [qw/funcgen/],
    DB_TYPES    => [qw/funcgen/],
    TABLES      => [qw/analysis analysis_description feature_set probe_feature/],
};

# For each of the feature_set and probe_feature tables, run two checks via
# is_rows_zero:
#   1. no row has an analysis_id that lacks an analysis_description row;
#   2. no row has an analysis whose description is flagged displayable = 0.
sub tests {
    my ($self) = @_;

    my @tables = qw/feature_set probe_feature/;
    foreach my $table (@tables) {
        # The LEFT OUTER JOIN leaves ad.analysis_id NULL exactly when the
        # description is missing.
        my $desc_1 = "Analysis descriptions for all ${table}s";
        my $sql_1 = qq/
            SELECT COUNT(*) FROM
                $table LEFT OUTER JOIN
                analysis_description ad USING (analysis_id)
            WHERE
                ad.analysis_id IS NULL
        /;
        is_rows_zero($self->dba, $sql_1, $desc_1);

        # The displayable flag is a column of analysis_description, so it
        # must be qualified with the table alias only (ad.displayable);
        # "ad.analysis_id.displayable" is not a valid column reference.
        my $desc_2 = "Displayable analysis for all ${table}s";
        my $sql_2 = qq/
            SELECT COUNT(*) FROM
                $table INNER JOIN
                analysis_description ad USING (analysis_id)
            WHERE
                ad.displayable = 0
        /;
        is_rows_zero($self->dba, $sql_2, $desc_2);
    }
}

1;

0 comments on commit cc20ebf

Please sign in to comment.