Skip to content

Commit

Permalink
Merge pull request #933 from nuno-agostinho/add/ProteinFunctionPrediction-datachecks
Browse files Browse the repository at this point in the history

Add DataChecks to protein function prediction eHive pipeline
  • Loading branch information
nakib103 committed Jan 19, 2023
2 parents b0c583c + 2ba1b29 commit bfb52ed
Show file tree
Hide file tree
Showing 2 changed files with 83 additions and 7 deletions.
24 changes: 24 additions & 0 deletions modules/Bio/EnsEMBL/Variation/Pipeline/ProteinFunction/InitJobs.pm
Original file line number Diff line number Diff line change
Expand Up @@ -220,8 +220,28 @@ sub fetch_input {

close $FASTA;
}

# prepare the old_server_uri to compare against
my $old_server_uri = $self->param('old_server_uri');

my $group = 'variation';
my $var_dbc = $self->get_species_adaptor($group)->dbc;
unless (defined $old_server_uri) {
my $user = $var_dbc->user;
my $port = $var_dbc->port;
my $host = $var_dbc->host;
my $species = $self->param('species');
my $release = $self->param('ensembl_release') - 1;
my $assembly = $self->param('assembly');
$old_server_uri ||= sprintf("mysql://%s@%s:%s/%s_%s_%s_%s",
$user, $host, $port,
$species, $group, $release, $assembly);
}

# set up our list of output ids

$self->param('dc_output_ids',
{ 'group' => $group, 'old_server_uri' => [ $old_server_uri ] });
$self->param('pph_output_ids', [ map { {translation_md5 => $_} } @pph_md5s ]);
$self->param('sift_output_ids', [ map { {translation_md5 => $_} } @sift_md5s ]);
$self->param('dbnsfp_output_ids', [ map { {translation_md5 => $_} } @dbnsfp_md5s ]);
Expand Down Expand Up @@ -292,6 +312,10 @@ sub write_output {
$self->dataflow_output_id($self->param('cadd_output_ids'), 5);
}

if ($self->param('run_dc')) {
$self->dataflow_output_id($self->param('dc_output_ids'), 1);
}

}


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ use warnings;

use base ('Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf');

use Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf;
use Bio::EnsEMBL::Variation::Pipeline::ProteinFunction::Constants qw(FULL UPDATE NONE);

sub default_options {
Expand All @@ -59,7 +60,7 @@ sub default_options {
hive_no_init => 0,
hive_default_max_retry_count => 0,
hive_debug_init => 1,
debug_mode => 0,
debug_mode => 0,

# the location of your ensembl checkout, the hive looks here for SQL files etc.

Expand All @@ -80,6 +81,30 @@ sub default_options {

ensembl_registry => $self->o('species_dir').'/ensembl.registry',

# a file containing the history of datachecks run, potentially used to
# determine if a datacheck can be skipped

history_file => '/nfs/production/flicek/ensembl/production/datachecks/history/vertebrates.json',

# output dir where datacheck result will be stored

dc_outdir => $self->o('pipeline_dir')."/".$self->o('pipeline_name')."_dc_output",

# if set, fails the datacheck pipeline job if the datacheck fails
# can be overwritten when running the pipeline

failures_fatal => 1,

# if set, runs the datachecks analysis jobs
# can be overwritten when running the pipeline

run_dc => 1,

# the uri of the database server which stores the database of the previous release;
# supported format is mysql://[a_user]@[some_host]:[port_number]/[old_dbname|old_release_number]

old_server_uri => undef,

# peptide sequences for all unique translations for this species will be dumped to this file

fasta_file => $self->o('species_dir').'/'.$self->o('species').'_translations.fa',
Expand Down Expand Up @@ -241,6 +266,8 @@ sub pipeline_analyses {
fasta_file => $self->o('fasta_file'),
ensembl_registry => $self->o('ensembl_registry'),
species => $self->o('species'),
ensembl_release => $self->o('ensembl_release'),
assembly => $self->o('assembly'),
debug_mode => $self->o('debug_mode'),
);

Expand All @@ -265,16 +292,21 @@ sub pipeline_analyses {
bam => $self->o('bam'),
species_dir => $self->o('species_dir'),
use_compara => $self->o('sift_use_compara'),
run_dc => $self->o('run_dc'),
old_server_uri => $self->o('old_server_uri'),
@common_params,
},
-input_ids => [{}],
-rc_name => 'highmem',
-max_retry_count => 0,
-flow_into => {
2 => [ 'run_polyphen' ],
3 => [ 'run_sift' ],
4 => [ 'run_dbnsfp' ],
5 => [ 'run_cadd' ],
'2->A' => [ 'run_polyphen' ],
'3->A' => [ 'run_sift' ],
'4->A' => [ 'run_dbnsfp' ],
'5->A' => [ 'run_cadd' ],
'A->1' => WHEN(
'#run_dc#' => [ 'datacheck' ]
)
},
},

Expand Down Expand Up @@ -309,7 +341,6 @@ sub pipeline_analyses {
-input_ids => [],
-hive_capacity => $self->o('weka_max_workers'),
-rc_name => 'default',
-flow_into => {},
},

{ -logic_name => 'run_sift',
Expand All @@ -328,7 +359,7 @@ sub pipeline_analyses {
-hive_capacity => $self->o('sift_max_workers'),
-rc_name => 'medmem',
-flow_into => {
-1 => ['run_sift_highmem'],
-1 => ['run_sift_highmem']
}
},

Expand Down Expand Up @@ -374,6 +405,27 @@ sub pipeline_analyses {
-rc_name => 'medmem',
},

{ -logic_name => 'datacheck',
-module => 'Bio::EnsEMBL::DataCheck::Pipeline::RunDataChecks',
-parameters => {
datacheck_names => [
'CompareProteinFunctionPredictions',
'ProteinFunctionPredictions'
],
registry_file => $self->o('ensembl_registry'),
history_file => $self->o('history_file'),
output_dir => $self->o('dc_outdir'),
failures_fatal => $self->o('failures_fatal'),
@common_params
},
-input_ids => [], #default
-hive_capacity => 1,
-analysis_capacity => 1,
-rc_name => 'default',
-failed_job_tolerance => 0,
-max_retry_count => 0,
},

];
}

Expand Down

0 comments on commit bfb52ed

Please sign in to comment.