Skip to content

Commit

Permalink
Merge pull request #933 from nuno-agostinho/add/ProteinFunctionPrediction-datachecks
Browse files Browse the repository at this point in the history

Add DataChecks to protein function prediction eHive pipeline
  • Loading branch information
nakib103 committed Jan 19, 2023
2 parents b0c583c + 2ba1b29 commit bfb52ed
Show file tree
Hide file tree
Showing 2 changed files with 83 additions and 7 deletions.
24 changes: 24 additions & 0 deletions modules/Bio/EnsEMBL/Variation/Pipeline/ProteinFunction/InitJobs.pm
Original file line number Diff line number Diff line change
Expand Up @@ -220,8 +220,28 @@ sub fetch_input {

close $FASTA;
}

# prepare the old_server_uri to compare against
my $old_server_uri = $self->param('old_server_uri');

my $group = 'variation';
my $var_dbc = $self->get_species_adaptor($group)->dbc;
unless (defined $old_server_uri) {
my $user = $var_dbc->user;
my $port = $var_dbc->port;
my $host = $var_dbc->host;
my $species = $self->param('species');
my $release = $self->param('ensembl_release') - 1;
my $assembly = $self->param('assembly');
$old_server_uri ||= sprintf("mysql://%s@%s:%s/%s_%s_%s_%s",
$user, $host, $port,
$species, $group, $release, $assembly);
}

# set up our list of output ids

$self->param('dc_output_ids',
{ 'group' => $group, 'old_server_uri' => [ $old_server_uri ] });
$self->param('pph_output_ids', [ map { {translation_md5 => $_} } @pph_md5s ]);
$self->param('sift_output_ids', [ map { {translation_md5 => $_} } @sift_md5s ]);
$self->param('dbnsfp_output_ids', [ map { {translation_md5 => $_} } @dbnsfp_md5s ]);
Expand Down Expand Up @@ -292,6 +312,10 @@ sub write_output {
$self->dataflow_output_id($self->param('cadd_output_ids'), 5);
}

if ($self->param('run_dc')) {
$self->dataflow_output_id($self->param('dc_output_ids'), 1);
}

}


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ use warnings;

use base ('Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf');

use Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf;
use Bio::EnsEMBL::Variation::Pipeline::ProteinFunction::Constants qw(FULL UPDATE NONE);

sub default_options {
Expand All @@ -59,7 +60,7 @@ sub default_options {
hive_no_init => 0,
hive_default_max_retry_count => 0,
hive_debug_init => 1,
debug_mode => 0,
debug_mode => 0,

# the location of your ensembl checkout, the hive looks here for SQL files etc.

Expand All @@ -80,6 +81,30 @@ sub default_options {

ensembl_registry => $self->o('species_dir').'/ensembl.registry',

# a file containing the history of datachecks run, potentially used to
# determine if a datacheck can be skipped

history_file => '/nfs/production/flicek/ensembl/production/datachecks/history/vertebrates.json',

# output dir where datacheck result will be stored

dc_outdir => $self->o('pipeline_dir')."/".$self->o('pipeline_name')."_dc_output",

# if set, fails the datacheck pipeline job if the datacheck fails
# can be overwritten when running the pipeline

failures_fatal => 1,

# if set, runs the datachecks analysis jobs
# can be overwritten when running the pipeline

run_dc => 1,

# the uri of the database server which stores the database of the previous release;
# supported format is mysql://[a_user]@[some_host]:[port_number]/[old_dbname|old_release_number]

old_server_uri => undef,

# peptide sequences for all unique translations for this species will be dumped to this file

fasta_file => $self->o('species_dir').'/'.$self->o('species').'_translations.fa',
Expand Down Expand Up @@ -241,6 +266,8 @@ sub pipeline_analyses {
fasta_file => $self->o('fasta_file'),
ensembl_registry => $self->o('ensembl_registry'),
species => $self->o('species'),
ensembl_release => $self->o('ensembl_release'),
assembly => $self->o('assembly'),
debug_mode => $self->o('debug_mode'),
);

Expand All @@ -265,16 +292,21 @@ sub pipeline_analyses {
bam => $self->o('bam'),
species_dir => $self->o('species_dir'),
use_compara => $self->o('sift_use_compara'),
run_dc => $self->o('run_dc'),
old_server_uri => $self->o('old_server_uri'),
@common_params,
},
-input_ids => [{}],
-rc_name => 'highmem',
-max_retry_count => 0,
-flow_into => {
2 => [ 'run_polyphen' ],
3 => [ 'run_sift' ],
4 => [ 'run_dbnsfp' ],
5 => [ 'run_cadd' ],
'2->A' => [ 'run_polyphen' ],
'3->A' => [ 'run_sift' ],
'4->A' => [ 'run_dbnsfp' ],
'5->A' => [ 'run_cadd' ],
'A->1' => WHEN(
'#run_dc#' => [ 'datacheck' ]
)
},
},

Expand Down Expand Up @@ -309,7 +341,6 @@ sub pipeline_analyses {
-input_ids => [],
-hive_capacity => $self->o('weka_max_workers'),
-rc_name => 'default',
-flow_into => {},
},

{ -logic_name => 'run_sift',
Expand All @@ -328,7 +359,7 @@ sub pipeline_analyses {
-hive_capacity => $self->o('sift_max_workers'),
-rc_name => 'medmem',
-flow_into => {
-1 => ['run_sift_highmem'],
-1 => ['run_sift_highmem']
}
},

Expand Down Expand Up @@ -374,6 +405,27 @@ sub pipeline_analyses {
-rc_name => 'medmem',
},

{ -logic_name => 'datacheck',
-module => 'Bio::EnsEMBL::DataCheck::Pipeline::RunDataChecks',
-parameters => {
datacheck_names => [
'CompareProteinFunctionPredictions',
'ProteinFunctionPredictions'
],
registry_file => $self->o('ensembl_registry'),
history_file => $self->o('history_file'),
output_dir => $self->o('dc_outdir'),
failures_fatal => $self->o('failures_fatal'),
@common_params
},
-input_ids => [], #default
-hive_capacity => 1,
-analysis_capacity => 1,
-rc_name => 'default',
-failed_job_tolerance => 0,
-max_retry_count => 0,
},

];
}

Expand Down

0 comments on commit bfb52ed

Please sign in to comment.