Skip to content

Commit

Permalink
Merge pull request #255 from Ensembl/TapToJson
Browse files Browse the repository at this point in the history
Added new module DataCheckTapToJson to convert datacheck tap output t…
  • Loading branch information
vinay-ebi committed Jul 22, 2020
2 parents b64c5c3 + d0f9eb2 commit ef95146
Show file tree
Hide file tree
Showing 6 changed files with 259 additions and 95 deletions.
161 changes: 161 additions & 0 deletions lib/Bio/EnsEMBL/DataCheck/Pipeline/ConvertTapToJson.pm
Original file line number Diff line number Diff line change
@@ -0,0 +1,161 @@
=head1 LICENSE

Copyright [2018-2020] EMBL-European Bioinformatics Institute

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

     http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.


=head1 NAME

Bio::EnsEMBL::DataCheck::Pipeline::ConvertTapToJson

=head1 DESCRIPTION

Parse one or more TAP-format output files into a single JSON file.
By default, only failures are included in the JSON file, and the
results are indexed by datacheck name. Results for passing datachecks
can be included by setting '-json_passed => 1', and can be indexed by the
species name/database by setting '-json_by_species => 1'.

=cut

package Bio::EnsEMBL::DataCheck::Pipeline::ConvertTapToJson;

use strict;
use warnings;
use feature 'say';

use JSON;
use Path::Tiny;
use TAP::Parser;

use base ('Bio::EnsEMBL::Hive::Process');

# Default values for the parameters this module reads.
# Returns a hashref: no output directory or JSON file is preset, and both
# JSON switches (index by species, include passes) are off.
sub param_defaults {
  my ($self) = @_;

  my %defaults = (
    output_dir       => undef,
    json_output_file => undef,
    json_by_species  => 0,
    json_passed      => 0,
  );

  return \%defaults;
}

# Derive a default 'json_output_file' when only 'output_dir' was given.
#
# All TAP input, whether one file or a directory of files, ends up in a
# single JSON document. If no output_dir is defined nothing is set here,
# and the JSON goes to STDOUT instead; that is of little use inside a
# pipeline, but lets the module be run standalone from the command line.
#
# The generated name is 'results', then '_passed' and/or '_by_species'
# according to the matching flags, then '.json'.
sub fetch_input {
  my ($self) = @_;

  return unless $self->param_is_defined('output_dir');
  return if $self->param_is_defined('json_output_file');

  my @name_parts = ('results');
  push @name_parts, 'passed'     if $self->param('json_passed');
  push @name_parts, 'by_species' if $self->param('json_by_species');
  my $basename = join('_', @name_parts) . '.json';

  my $json_path = path($self->param('output_dir'), $basename)->stringify;
  $self->param('json_output_file', $json_path);
}

# Read the job parameters and hand them to parse_results.
# 'tap' is mandatory (fetched with param_required); the other three are
# passed through as-is and may be undefined.
sub run {
  my ($self) = @_;

  my $tap = $self->param_required('tap');

  # Force scalar context so that each parameter fills exactly one slot.
  my ($output_file, $by_species, $passed) =
    map { scalar $self->param($_) }
    qw(json_output_file json_by_species json_passed);

  $self->parse_results($tap, $output_file, $by_species, $passed);
}

# Flow the location of the JSON file (possibly undefined) down branch 1.
sub write_output {
  my ($self) = @_;

  my %output_id = ( json_output_file => $self->param('json_output_file') );

  $self->dataflow_output_id(\%output_id, 1);
}

# Convert TAP output into a nested hash of datacheck results and emit it
# as pretty-printed JSON with canonically ordered keys.
#
# Arguments:
#   $tap         - path of a TAP-format file, or of a directory of them
#   $output_file - where to write the JSON; if false, it is printed to STDOUT
#   $by_species  - true:  index as {species}{datacheck}
#                  false: index as {datacheck}{species}
#   $passed      - true: also record passing/skipped tests and datachecks;
#                  false: only failures are recorded
#
# Each recorded entry has the form
#   { ok => 0|1, tests => { '<test line>' => [ '<diagnostic>', ... ] } }
#
# When $tap is a directory, only regular files in it are parsed, in sorted
# order, and the JSON output file itself is skipped: fetch_input places it
# in the same directory by default, so a re-run would otherwise try to
# parse the previous JSON (or a sub-directory) as TAP.
sub parse_results {
  my ($self, $tap, $output_file, $by_species, $passed) = @_;

  my @tap_files;
  if (-d $tap) {
    my $json_path = $output_file ? path($output_file)->absolute->stringify : '';
    @tap_files =
      sort { $a cmp $b }
      map  { $_->stringify }
      grep { $_->is_file && $_->absolute->stringify ne $json_path }
      path($tap)->children;
  } else {
    push @tap_files, $tap;
  }

  my %results;

  foreach my $tap_file (@tap_files) {
    my $parser = TAP::Parser->new( { tap => path($tap_file)->slurp } );

    # Parsing state is per file, so that a truncated or malformed file
    # cannot attribute its lines to a datacheck/species from another file.
    my ($datacheck, $species, $test);
    my %tests;

    while (my $result = $parser->next) {
      if ($result->is_comment) {
        # Top-level subtest header: names the datacheck.
        if ($result->as_string =~ /^# Subtest: (.+)/) {
          $datacheck = $1;
        }
      } elsif ($result->is_unknown) {
        # Indented (nested subtest) lines are reported by TAP::Parser as
        # 'unknown', so they are classified here by indentation level.
        my $line = $result->as_string;

        if ($line =~ /^\s+# Subtest: (.+)/) {
          # Nested subtest header: names the species/database.
          $species = $1;
          %tests = ();
        } elsif ($line =~ /^\s{8}((?:not ok|# No tests run).*)/) {
          $test = $1;
          $tests{$test} = [];
        } elsif ($line =~ /^\s{8}((?:ok|.* # SKIP).*)/ && $passed) {
          $test = $1;
          $tests{$test} = [];
        } elsif ($line =~ /^\s{8}#\s(\s*.*)/) {
          # Diagnostic line: attach it to the most recently recorded test.
          if (defined $test) {
            push @{$tests{$test}}, $1;
          } else {
            my $label = $species // 'unknown species';
            warn "Premature diagnostication: diagnostics incomplete ".
                 "for $label because they cannot be linked to a test";
          }
        } elsif ($line =~ /^\s{4}((?:ok|not ok))/) {
          # Result line of the species-level subtest: store what was
          # collected for this datacheck/species pair.
          my $ok = $1 eq 'ok' ? 1 : 0;
          if (!$ok || $passed) {
            my ($outer, $inner) =
              $by_species ? ($species, $datacheck) : ($datacheck, $species);
            $results{$outer}{$inner} = { 'ok' => $ok, 'tests' => { %tests } };
          }
          $test = undef;
        }
      }
    }
  }

  my $json = JSON->new->canonical->pretty->encode(\%results);

  if ($output_file) {
    my $json_file = path($output_file);
    $json_file->parent->mkpath;
    $json_file->spew($json)
  } else {
    say $json;
  }
}

1;
4 changes: 4 additions & 0 deletions lib/Bio/EnsEMBL/DataCheck/Pipeline/DataCheckSubmission.pm
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,10 @@ sub write_output {
report_per_db => $self->param('report_per_db'),
report_all => $self->param('report_all'),

tap_to_json => $self->param('tap_to_json'),
json_passed => $self->param('json_passed'),
json_by_species => $self->param('json_by_species'),

submission_job_id => $self->input_job->dbID,
};
$self->dataflow_output_id($params, 1);
Expand Down
51 changes: 35 additions & 16 deletions lib/Bio/EnsEMBL/DataCheck/Pipeline/DataCheckSummary.pm
Original file line number Diff line number Diff line change
Expand Up @@ -37,11 +37,13 @@ sub run {

my $submission_job_id = $self->param('submission_job_id');

my $history_file = $self->param('history_file');
my $output_dir = $self->param('output_dir');
my $tag = $self->param('tag');
my $email = $self->param('email');
my $timestamp = $self->param('timestamp');
my $history_file = $self->param('history_file');
my $output_dir = $self->param('output_dir');
my $json_output_file = $self->param('json_output_file');
my $json_passed = $self->param('json_passed');
my $tag = $self->param('tag');
my $email = $self->param('email');
my $timestamp = $self->param('timestamp');

my $end_timestamp = localtime->cdate;
my $start = Time::Piece->strptime($timestamp,'%a %b %d %H:%M:%S %Y');
Expand Down Expand Up @@ -71,14 +73,16 @@ sub run {
}

my %output = (
databases => \%results,
passed_total => $passed_total,
failed_total => $failed_total,
history_file => $history_file,
output_dir => $output_dir,
tag => $tag,
timestamp => $end_timestamp,
runtime_sec => "$runtime_sec",
databases => \%results,
passed_total => $passed_total,
failed_total => $failed_total,
history_file => $history_file,
output_dir => $output_dir,
json_output_file => $json_output_file,
json_passed => $json_passed,
tag => $tag,
timestamp => $end_timestamp,
runtime_sec => "$runtime_sec",
);

$self->param('output', \%output);
Expand Down Expand Up @@ -138,20 +142,35 @@ sub set_email_parameters {

my $history_file = $output{history_file};
if (defined $history_file) {
$text .= "The datacheck results were stored in a history file: $history_file.\n";
$text .= "The datacheck results were stored in a history file: $history_file\n";
} else {
$text .= "The datacheck results were not stored in a history file.\n";
}

my $output_dir = $output{output_dir};
if (defined $output_dir) {
$text .= "The full output of the datachecks were stored in: $output_dir.\n";
$text .= "The full output of the datachecks were stored in: $output_dir\n";
} else {
$text .= "The full output of the datachecks were not stored.\n";
}

my $json_output_file = $output{json_output_file};
if (defined $json_output_file) {
if ($output{json_passed}) {
$text .= "All results were stored in JSON format: $json_output_file\n";
} else {
$text .= "Failures were stored in JSON format: $json_output_file\n";
}
if (-s $json_output_file < 2e6) {
push @{$self->param('attachments')}, $json_output_file;
} else {
$text .= "(JSON file not attached because it exceeds 2MB limit)";
}
} else {
$text .= "The results were not stored in JSON format.\n";
}

$self->param('text', $text);
}

1;

44 changes: 39 additions & 5 deletions lib/Bio/EnsEMBL/DataCheck/Pipeline/DbDataChecks_conf.pm
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,10 @@ sub default_options {
email => undef,
report_per_db => 0,
report_all => 0,

tap_to_json => 1,
json_passed => 0,
json_by_species => 1,
};
}

Expand Down Expand Up @@ -177,6 +181,10 @@ sub pipeline_analyses {
email => $self->o('email'),
report_per_db => $self->o('report_per_db'),
report_all => $self->o('report_all'),

tap_to_json => $self->o('tap_to_json'),
json_passed => $self->o('json_passed'),
json_by_species => $self->o('json_by_species'),
},
-rc_name => 'default',
-flow_into => {
Expand All @@ -197,7 +205,8 @@ sub pipeline_analyses {
ELSE
['RunDataChecks']
),
'A->1' => ['DataCheckSummary'],
'A->1' => ['DataCheckResults'],

},
-rc_name => 'default',
},
Expand Down Expand Up @@ -294,16 +303,41 @@ sub pipeline_analyses {
},

{
-logic_name => 'DataCheckSummary',
-module => 'Bio::EnsEMBL::DataCheck::Pipeline::DataCheckSummary',
-analysis_capacity => 10,
-logic_name => 'DataCheckResults',
-module => 'Bio::EnsEMBL::Hive::RunnableDB::Dummy',
-max_retry_count => 0,
-parameters => {},
-rc_name => 'default',
-flow_into => {
'1' => ['?table_name=result'],
'1' =>
WHEN('#output_dir# && #tap_to_json#' =>
['ConvertTapToJson'],
ELSE
['DataCheckSummary'],
),
},
},

{
-logic_name => 'ConvertTapToJson',
-module => 'Bio::EnsEMBL::DataCheck::Pipeline::ConvertTapToJson',
-analysis_capacity => 10,
-max_retry_count => 0,
-parameters => {
tap => '#output_dir#',
},
-rc_name => 'default',
-flow_into => ['DataCheckSummary'],
},

{
-logic_name => 'DataCheckSummary',
-module => 'Bio::EnsEMBL::DataCheck::Pipeline::DataCheckSummary',
-analysis_capacity => 10,
-max_retry_count => 0,
-rc_name => 'default',
-flow_into => ['?table_name=result'],
},
];
}

Expand Down

0 comments on commit ef95146

Please sign in to comment.