Skip to content

Commit

Permalink
Merge pull request #67 from Ensembl/feature/big_red_button
Browse files Browse the repository at this point in the history
Add a beekeeper option for attempting to stop the whole pipeline
  • Loading branch information
ens-bwalts committed Feb 14, 2019
2 parents 2e4dfdd + d835d14 commit c89a39a
Show file tree
Hide file tree
Showing 7 changed files with 231 additions and 3 deletions.
22 changes: 22 additions & 0 deletions modules/Bio/EnsEMBL/Hive/Beekeeper.pm
Original file line number Diff line number Diff line change
Expand Up @@ -187,5 +187,27 @@ sub check_if_blocked {
return $self->is_blocked;
}


=head2 toString

    Example     : print $beekeeper->toString();
    Description : Builds a one-line, human-readable summary of this beekeeper:
                  the owning process rendered as user@host#pid, followed by
                  the value of its options() accessor in single quotes.
    Returntype  : String
    Exceptions  : none
    Caller      : general
    Status      : Stable

=cut

sub toString {
    my $self = shift;

    # Identify the process as <meadow user>@<meadow host>#<process id>
    my $user    = $self->meadow_user();
    my $host    = $self->meadow_host();
    my $pid     = $self->process_id();
    my $process = "process=${user}\@${host}#${pid}";

    my $options = "options='" . $self->options() . "'";

    return "${process}, ${options}";
}


1;

29 changes: 29 additions & 0 deletions modules/Bio/EnsEMBL/Hive/DBSQL/BeekeeperAdaptor.pm
Original file line number Diff line number Diff line change
Expand Up @@ -131,5 +131,34 @@ sub reload_beekeeper_is_blocked {
}


=head2 block_all_alive_beekeepers

    Example     : $bk_adaptor->block_all_alive_beekeepers();
    Description : Sets is_blocked on every row of the beekeeper table whose
                  cause_of_death is still NULL, i.e. every beekeeper known
                  to the pipeline which has not died yet. Part of the "shut
                  everything down" feature: as eHive stands we cannot tell
                  other beekeepers to kill their respective active workers
                  (unless said workers happen to belong to the same meadow,
                  in which case we can essentially hijack them) but at
                  least we can prevent them from spawning new workers.
    Returntype  : none
    Exception   : none
    Caller      : beekeeper.pl
    Status      : Stable

=cut

sub block_all_alive_beekeepers {
    my $self = shift;

    # A single UPDATE covers all live beekeepers, whether or not they
    # are already blocked.
    my $block_sql = q{UPDATE beekeeper SET is_blocked = 1 WHERE cause_of_death IS NULL};

    my $update_sth = $self->dbc()->prepare( $block_sql );
    $update_sth->execute();
    $update_sth->finish();

    return;
}


1;
3 changes: 2 additions & 1 deletion modules/Bio/EnsEMBL/Hive/Meadow/LOCAL.pm
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,8 @@ sub check_worker_is_alive_and_mine {
# Forcibly terminate a worker process by running kill(1) with signal 9.
#
# Arguments : $worker - object whose process_id() yields the PID to signal
#             $fast   - accepted but not used by this implementation
# Returns   : 0 if the kill command ran and exited successfully,
#             a non-zero value otherwise:
#               -1          the kill command could not be launched
#               1..255      exit code of the kill command
#               128+signal  the kill command itself died from a signal
sub kill_worker {
    my ($self, $worker, $fast) = @_;

    # Issue the kill exactly once: a repeated call only reports on the
    # redundant second attempt and hides the outcome of the first.
    my $exec_status = system('kill', '-9', $worker->process_id());

    # system() returns -1 when the command could not be started at all;
    # shifting that value would produce a meaningless huge number.
    return -1 if $exec_status == -1;

    my $exit_code = $exec_status >> 8;
    my $signal    = $exec_status & 127;

    # An exit code of 0 only means success if the command was not itself
    # terminated by a signal.
    return $exit_code || ( $signal ? 128 + $signal : 0 );
}


Expand Down
7 changes: 5 additions & 2 deletions modules/Bio/EnsEMBL/Hive/Meadow/LSF.pm
Original file line number Diff line number Diff line change
Expand Up @@ -177,11 +177,14 @@ sub check_worker_is_alive_and_mine {
# Ask LSF to terminate a worker by running bkill on its process id.
#
# Arguments : $worker - object whose process_id() yields the id passed to bkill
#             $fast   - when true, bkill is invoked with its '-r' flag
# Returns   : 0 if bkill ran and exited successfully,
#             a non-zero value otherwise:
#               -1          bkill could not be launched
#               1..255      exit code of bkill
#               128+signal  bkill itself died from a signal
sub kill_worker {
    my ($self, $worker, $fast) = @_;

    # Build the command once and run it once: a repeated bkill would only
    # report on the redundant second request.
    my @bkill_cmd = ( 'bkill', ( $fast ? '-r' : () ), $worker->process_id() );
    my $exec_status = system( @bkill_cmd );

    # system() returns -1 when the command could not be started at all;
    # shifting that value would produce a meaningless huge number.
    return -1 if $exec_status == -1;

    my $exit_code = $exec_status >> 8;
    my $signal    = $exec_status & 127;

    # An exit code of 0 only means success if bkill was not itself
    # terminated by a signal.
    return $exit_code || ( $signal ? 128 + $signal : 0 );
}


Expand Down
39 changes: 39 additions & 0 deletions modules/Bio/EnsEMBL/Hive/Queen.pm
Original file line number Diff line number Diff line change
Expand Up @@ -384,6 +384,45 @@ sub register_worker_death {
}


# Request termination of every worker not yet marked DEAD and print one
# report line per worker, including the outcome of the attempt.
#
# Arguments : $valley - object used to look up the meadow responsible for
#                       each worker
# Returns   : nothing
# Output    : one line per worker on the currently selected filehandle:
#             "Killing worker <dbID>: <worker summary> -> <outcome>"
sub kill_all_workers {
    my ( $self, $valley ) = @_;

    my $all_workers_considered_alive = $self->fetch_all( "status!='DEAD'" );
    foreach my $worker ( @{ $all_workers_considered_alive } ) {
        my $kill_status;

        my $meadow = $valley->find_available_meadow_responsible_for_worker( $worker );
        if ( ! defined $meadow ) {
            # Most likely a meadow not reachable for the current beekeeper,
            # e.g. a LOCAL one started on a different host.
            $kill_status = 'meadow not reachable';
        }
        elsif ( ! $meadow->can('kill_worker') ) {
            $kill_status = 'killing workers not supported by the meadow';
        }
        else {
            # The actual termination of a worker might well be asynchronous
            # but at least we check for obvious problems, e.g. insufficient
            # permissions to execute a kill.
            my $kill_return_value = $meadow->kill_worker( $worker, 1 );
            if ( $kill_return_value != 0 ) {
                $kill_status = "request failure (return code: ${kill_return_value})";
            }
            else {
                # Only record the death once the kill request was accepted.
                $kill_status = 'requested successfully';
                $worker->cause_of_death( 'KILLED_BY_USER' );
                $self->register_worker_death( $worker );
            }
        }

        # Include the outcome: without it the report reads the same for
        # workers that were killed and for those that could not be.
        print 'Killing worker ' . $worker->dbID() . ': '
            . $worker->toString( 1 ) . ' -> ' . $kill_status . "\n";
    }

    return;
}


sub cached_resource_mapping {
my $self = shift;
$self->{'_cached_resource_mapping'} ||= { map { $_->dbID => $_->name } $self->db->hive_pipeline->collection_of('ResourceClass')->list };
Expand Down
53 changes: 53 additions & 0 deletions scripts/beekeeper.pl
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ sub main {
my $job_id_for_output = 0;
my $show_worker_stats = 0;
my $kill_worker_id = 0;
my $big_red_button = 0;
my $keep_alive = 0; # DEPRECATED
my $reset_job_id = 0;
my $reset_all_jobs_for_analysis = 0; # DEPRECATED
Expand Down Expand Up @@ -142,6 +143,7 @@ sub main {
'dead!' => \$check_for_dead,
'unkwn!' => \$bury_unkwn_workers,
'killworker=i' => \$kill_worker_id,
'big_red_button' => \$big_red_button,
'alldead!' => \$all_dead,
'balance_semaphores'=> \$balance_semaphores,
'worker_stats' => \$show_worker_stats,
Expand Down Expand Up @@ -340,6 +342,10 @@ sub main {
}
}

if ( $big_red_button ) {
return big_red_button( $self, $valley );
}

my $run_job;
if($run_job_id) {
eval {$run_job = $self->{'dba'}->get_AnalysisJobAdaptor->fetch_by_dbID( $run_job_id ) or die};
Expand Down Expand Up @@ -507,6 +513,49 @@ sub register_beekeeper {
return $beekeeper;
}


# Emergency shutdown: block every beekeeper that is still alive, report
# the ones blocked by this call, then ask the Queen to kill all workers.
#
# Arguments : $self   - hash with 'dba' (database adaptor) and 'beekeeper'
#                       (the beekeeper object of the current process)
#             $valley - passed through to Queen::kill_all_workers()
# Returns   : 0
sub big_red_button {
    my ( $self, $valley ) = @_;

    my $beekeeper_adaptor = $self->{dba}->get_BeekeeperAdaptor();

    # Remember which beekeepers were blocked before we touched anything,
    # so that they are left out of the report below.
    my $blocked_before = $beekeeper_adaptor->fetch_all( 'is_blocked = 1' );
    my %was_already_blocked = map { $_->dbID() => 1 } @{ $blocked_before };

    # Block first, kill second: no beekeeper should be able to spawn new
    # workers while this one is busy killing the existing ones.
    $beekeeper_adaptor->block_all_alive_beekeepers();

    # Report every beekeeper blocked just now, except the current one.
    my $blocked_now = $beekeeper_adaptor->fetch_all( 'is_blocked = 1' );
    my $own_dbid    = $self->{'beekeeper'}->dbID();
    foreach my $candidate ( @{ $blocked_now } ) {
        next if exists $was_already_blocked{ $candidate->dbID() };
        next if $candidate->dbID() == $own_dbid;

        print 'Blocked beekeeper ' . $candidate->dbID() . ': '
            . $candidate->toString() . "\n";
    }

    # Finally, kill all workers which are still alive.
    # FIXME: double-check correct job status:
    #  - running ones should be marked as 'failed'
    #  - claimed but unstarted ones should get back to 'unclaimed'
    $self->{'dba'}->get_Queen()->kill_all_workers( $valley );

    return 0;
}


sub run_autonomously {
my ($self, $pipeline, $max_loops, $loop_until, $valley, $list_of_analyses, $analyses_pattern, $run_job_id) = @_;

Expand Down Expand Up @@ -973,6 +1022,10 @@ =head2 Other commands/options
detect all workers in UNKWN state and reset their Jobs for resubmission (careful, they *may* reincarnate!)
=item --big_red_button
shut everything down: block all beekeepers connected to the pipeline and terminate workers
=item --alldead
tell the database all workers are dead (no checks are performed in this mode, so be very careful!)
Expand Down
81 changes: 81 additions & 0 deletions t/03.scripts/beekeeper_big_red_button.t
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
#!/usr/bin/env perl

# Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
# Copyright [2016-2019] EMBL-European Bioinformatics Institute
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

package BeekeeperBigRedButtonTests;

# End-to-end test of beekeeper.pl's -big_red_button option: initialise a
# pipeline, start a worker, trigger the shutdown and verify that all live
# beekeepers end up blocked and no worker is left in a non-DEAD state.

use strict;
use warnings;

# Load the core modules whose functions are called by fully-qualified
# name below, rather than relying on another module having loaded them.
use Cwd ();
use File::Basename ();

use English qw( -no_match_vars );
use Test::More;

# NOTE(review): Bio::EnsEMBL::Hive::DBSQL::DBAdaptor is used below without
# an explicit 'use'; presumably it is loaded via Utils::Test - confirm.
use Bio::EnsEMBL::Hive::Utils::Test
    qw( init_pipeline runWorker beekeeper get_test_url_or_die run_sql_on_db );


# eHive needs this to initialize the pipeline (and run db_cmd.pl).
# The root directory is three levels above this script.
$ENV{'EHIVE_ROOT_DIR'} //=
    File::Basename::dirname( File::Basename::dirname(
        File::Basename::dirname( Cwd::realpath($PROGRAM_NAME) )
    ) );

my $pipeline_url = get_test_url_or_die();

init_pipeline(
    'Bio::EnsEMBL::Hive::Examples::Factories::PipeConfig::LongWorker_conf',
    $pipeline_url );

my $hive_dba =
    Bio::EnsEMBL::Hive::DBSQL::DBAdaptor->new( -url => $pipeline_url );

# Check that the -big_red_button is recognised. Of course if it is it
# will trigger the shutdown - but all it does at this point is
# Beekeeper blocking itself, which has no effect because it doesn't
# actually try to run anything.
beekeeper( $pipeline_url, [ '-big_red_button' ], "beekeeper.pl recognises option '-big_red_button'" );

# This will both spawn a worker to claim a job and register another
# beekeeper with the pipeline. Ideally we would run this one in loop
# mode so that we can confirm blocking works, then again having it run
# in the background so that the test suite can continue would be a bit
# messy and given it is quicker to block *all* beekeepers than just
# the active ones, a single-shot run doesn't make that much of a
# difference.
beekeeper( $pipeline_url, [ '-run' ] );
# Give the worker(s) some time to start
sleep(10);

# Now trigger the shutdown for real
beekeeper( $pipeline_url, [ '-big_red_button' ], 'Pipeline shutdown triggered without errors' );
# Give the worker(s) some time to die
sleep(10);

# Every beekeeper that has not died must now be flagged as blocked.
my $bk_nta = $hive_dba->get_NakedTableAdaptor( 'table_name' => 'beekeeper' );
my $unblocked_beekeeper_rows = $bk_nta->fetch_all( 'cause_of_death IS NULL AND is_blocked != 1' );
is( scalar @{ $unblocked_beekeeper_rows }, 0, 'All non-dead beekeepers have been blocked' );

# Every worker must have been registered as DEAD.
my $w_nta = $hive_dba->get_NakedTableAdaptor( 'table_name' => 'worker' );
my $alive_worker_rows = $w_nta->fetch_all( "status != 'DEAD'" );
is( scalar @{ $alive_worker_rows }, 0, 'No non-dead workers remaining' );

# Release the connection before dropping the test database.
$hive_dba->dbc->disconnect_if_idle();
run_sql_on_db( $pipeline_url, 'DROP DATABASE' );

done_testing();


1;

0 comments on commit c89a39a

Please sign in to comment.