Permalink
Switch branches/tags
Nothing to show
Find file
Fetching contributors…
Cannot retrieve contributors at this time
executable file 456 lines (380 sloc) 10.4 KB
#!/usr/bin/perl
use strict;
use Getopt::Long;
use Data::Dumper;
Getopt::Long::Configure(qw(passthrough require_order no_ignore_case));
# TODO: put these in a config file, it can still be perl...for now it's simple enough to edit here
my %nologic = map {($_, 1)} qw(if [ calc echo . alc for export);
# always send to grid engine, no matter how big
my %wrap = map {($_, 1)} qw(bowtie-build tophat juncsdb bwa cufflinks countannot calc-numaligned fastx_quality_stats);
# never send to grid engine, no matter how big
my %nowrap = map {($_, 1)} qw(condor_run run-analysis mv grun qpipe ea);
# always run these, regardless of dependencies - still send to grid if needed
my %nodep = map {($_, 1)} qw();
# this can be condor_run, qsub... whatever you want
my $grid_engine = 'grun';
# alter default 'big' size and parameter parsing
my %info = (
cp => {big=>'20g'},
# input doesn't need to exist, output is calculated from the input
gunzip => {big=>'10g', in=>{_noex=>1}, out=>{_eval=>sub{$_[0]=~m/(.*)\.gz$/; return $1}}},
gzip => {big=>'20g', in=>{_noex=>1}, out=>{_eval=>sub{$_[0].".gz"}}},
'auto-clip.sh' => {out=>{_eval=>sub{$_[0]=~ s/\.(fastq|fq)//; return ($_[0].".qcStats", $_[0].".adapters")}}},
bowtie => {big=>'20m'},
bwa => {big=>'20m'},
tophat => {big=>'10m'},
cufflinks => {big=>'10m', in=>{b=>1, r=>1, G=>1}},
'cufflinks0.9.3' => {big=>'10m', in=>{b=>1, r=>1, G=>1}},
fastx_quality_stats => {in=>{i=>1}, out=>{o=>1}},
samtools => { match => [
{ pat=>qr/sort\s*(?:-n)?\s+(\S+bam)\s*(\S+)/, post=>'$out[0] .= ".bam"' } ,
{ pat=>qr/pileup.*\s(\S+[bs]am)/ } ,
] },
bwasam => { match => [
{ pat=>qr/(?:-o \S+\s+)?(?:\S+)\s+(\S+)/, post=>'$out[1] = "$out[0].sam"; $out[0] = "$out[0].bam"'} ,
] },
# change to asterisk for dependency globbing
'fastq-multx' => {out=>{o=>sub {s/%/\*/}}, in=>{l=>1, g=>1, B=>1}},
'fastq-mcf' => {out=>{o=>1}},
);
our $VERSION = '0.9.46';
my ($debug, $check, $inline, $verbose, $help, $rules_file, $list, $noexec, $passopts, $waitout, $force);
$| = 1;
$waitout=1; # by default we wait for grid jobs output
GetOptions("DEBUG"=>\$debug, "K"=>\$check, "force"=>\$force, "grid"=>\$grid_engine, "waitout"=>\$waitout, "cmd=s"=>\$inline, "noexec"=>\$noexec, "l"=>\$list, "verbose|v"=>\$verbose, "help"=>\$help, "rules=s"=>\$rules_file) || die usage();
$verbose = 1 if $debug;
$noexec = 1 if $debug;
$passopts = '-v' if $verbose;
$passopts .= ' -l' if $list;
$passopts .= ' -D' if $debug;
$passopts .= ' -n' if $noexec;
$passopts .= ' -f' if $force;
if ($help || $ARGV[0] =~ /^-\?/) {
print usage();
exit 0
}
sub reptail {
my ($f, $r) = @_;
$f =~ s/\Q$r\E$//;
return $f;
}
sub repwith {
my ($f, $r, $t) = @_;
$f =~ s/\Q$r\E/$t/;
return $f;
}
if ($check) {
# base executable name
my $orig_cmd = shift @ARGV;
my $cmd = $orig_cmd;
$cmd =~ s/.*\///;
my $cmd_nv = $cmd;
$cmd_nv =~ s/\-[0-9.-]+//;
$cmd_nv = $cmd if $cmd_nv !~ /^[\w-]+$/;
my @argv = @ARGV;
print STDERR "QSH preex '$orig_cmd @argv'\n" if $debug;
for (@argv) {
# bash vars come in unexpanded... expand them
s/\$(\w+)/$ENV{$1}/g;
s/\${(\w+)}/$ENV{$1}/g;
s/\${(\w+)\%([^{}]+)}/reptail($ENV{$1},$2)/ge;
s/\${(\w+)\/([^\/]+)\/([^{}]+)}/repwith($ENV{$1},$2,$3)/ge;
}
print STDERR "QSH top '$cmd @argv'\n" if $debug;
# don't try to alter non executable lines
exit 0 if $cmd !~ /^[\w.-]+$/;
# don't ever check dependencies, or wrap these commands
if ($nologic{$cmd}) {
print STDERR "QSH ignore command '$cmd'\n" if $verbose;
if ($debug) {
print STDERR "QSH debug: '$cmd @ARGV'";
}
exit 0;
}
print STDERR "QSH check command $cmd\n" if $verbose;
# walk through arguments, determining inputs, outputs and total sizes
my ($opt, %opt, $first, $firstnox, $last, @in, @out, $sz);
my $info = $info{$cmd} ? $info{$cmd} : $info{$cmd_nv};
for (@argv) {
if (s/^--?//) {
$opt = $_;
next;
}
if ($opt) {
if ($opt eq '<' || $info->{in}->{$opt}) {
push @in, $_;
}
if ($opt eq '>>' || $opt eq '>' || $info->{out}->{$opt}) {
push @out, $_;
}
$opt = '';
next;
}
$opt = '>' if ($_ eq '>');
$opt = '<' if ($_ eq '<');
$_ =~ s/^~/$ENV{HOME}/;
if (!$firstnox) {
$firstnox = $_
}
if (!$first) {
if ( $info->{in}->{_noex} || -e $_ ) {
$first = $_
}
} else {
$last = $_;
}
$sz += -s $_;
}
push @in, $first if $first && (!@in); # no input ? assume first file that exists
push @in, $firstnox if $firstnox && (!@in); # no input ? assume first file
if ( $info->{match} ) {
my $args = "@ARGV";
for my $m (@{$info->{match}}) {
my @m = $args =~ /$m->{pat}/;
if (@m) {
@in = shift @m;
if (@m) {
@out = @m;
if ($m->{post}) {
eval($m->{post});
}
}
}
}
}
if ($info->{post}) {
eval($info->{post});
}
if ( $info->{in}->{_noin} ) {
@in = ();
$last = $first;
}
if ($info->{out}->{_eval}) {
my ($i, $o) = ($in[0], $out[0]);
@out = &{$info->{out}->{_eval}}($i, $o);
print STDERR "QSH eval $cmd: @out\n" if $verbose;
if ($@) {
print STDERR "QSH error: $@\n";
}
}
push @out, $last if $last && (!@out); # no output ? assume last argument
pop @out if $out[0] eq $in[0]; # no in's = outs
print STDERR "QSH check: $cmd : sz: $sz, in: @in, out: @out\n" if $verbose;
my $mx;
for (@in) {
print STDERR "$_\n" if $list;
my $m = fmodtime($_);
$mx = $m if ($m > $mx);
}
my ($nox, $need);
for (@out) {
if ($_ =~ /\*/) {
# TODO: fileglob all outputs, pick oldest, use that.
}
if ($mx < fmodtime($_)) {
$nox=1;
} else {
$need=1;
}
}
if (!$need && $nox && !$force) {
print STDERR "QSH suppress: @out, already done\n" if $verbose || $noexec || $debug;
print "true";
} else {
my $wrap = $wrap{$cmd} || $wrap{$cmd_nv};
my $big = $info->{big};
if (!$big) {
$big = '50m';
}
if (!$wrap) {
my $t = $big;
$t *= 1000000 if ($t =~ /m$/i);
$t *= 1000000000 if ($t =~ /g$/i);
$wrap = $sz >= $t;
}
# never submit a job to the condor, if we're already in condor
$wrap = 0 if $ENV{"_CONDOR_SLOT"};
$wrap = 0 if $nowrap{$cmd};
$wrap = 0 if $cmd eq $grid_engine;
# || $ENV{"_GRUN"}; # grun's more friendly about sub-submits... not sure whether to allow?
if ($list) {
print STDERR `which $orig_cmd 2>/dev/null`;
}
my $suff = "#>\"" . join('" "',@out) . "\"";
$suff .= "#wait" if !$debug && ($wrap || ($cmd eq $grid_engine));
print STDERR "QSH debug: wait '$out[0]' " . ($wrap || ($cmd eq $grid_engine)) . "\n" if $debug;
if ($wrap) {
print STDERR "QSH wrap $orig_cmd, input size:$sz big:$big\n" if $verbose;
quoteargv();
if ($list) {
print "false";
} else {
print "$grid_engine \"$orig_cmd @ARGV\" $suff";
}
} else {
print STDERR "QSH no wrap $orig_cmd @ARGV, input size $sz big: $big\n" if $verbose;
if ($list) {
print "false";
} elsif ($suff) {
print "$orig_cmd @ARGV $suff";
}
}
}
exit 0;
}
sub bash_prefix;
#TODO ... would be nice if this could be interactive...
# but there's a whole TTY attach/reattach thing I can't figure out
open STDERR, "|sort|uniq\n" if $list;
my $in;
if (!$inline) {
$in = shift @ARGV;
if (!$in) {
die "qsh: missing input file operand\n"
}
open(IN, $in) || die "qsh: can't open $in: $!\n";
}
no strict 'refs';
if (pipe_to_fork('FOO')) {
print STDERR "QSH start\n" if $verbose;
use strict;
# parent
print FOO bash_prefix();
if ($inline) {
print FOO "\n$inline\n";
} else {
while ( <IN> ) {
print FOO $_
}
}
close FOO;
while (wait > 1) {};
} else {
quoteargv();
exec("bash -s @ARGV");
while (wait > 1) {};
}
sub quoteargv {
for (@ARGV) {
s/(['"()])/\\$1/g;
}
}
sub fmodtime {
return (stat($_[0]))[9];
}
sub bash_prefix {
my $r;
my $wx;
my ($won, $woff);
$wx = '[ -n "$wait" ] && qsh_waitforoutput $out' if ($waitout);
$won = 'wait=1' if $waitout;
$woff = 'wait=' if $waitout;
my $dx;
$dx = 'echo QSH debug: Cmd=$cmd Out=$out Wait=$wait 1>&2' if $debug;
$dx = 'echo "$cmd"' if $noexec && !$debug;
$r = "POPTS=\"$passopts\"\n" if $passopts;
$r .= <<EOF;
function qsh_on_debug {
if [ -n "\$BASH_COMMAND" ]; then
cmd=`$0 -K \$POPTS \$BASH_COMMAND`
if [[ \$cmd =~ "(.*)#>(.*)" ]]; then
cmd=\${BASH_REMATCH[1]}
out=\${BASH_REMATCH[2]}
fi
$woff
if [[ \$out =~ "(.*)#wait" ]]; then
out=\${BASH_REMATCH[1]}
$won
fi
$dx
if [ -n "\$cmd" ]; then
if [ -n "\$out" ]; then
if [ -z "$noexec" ]; then
echo "+ \$cmd" 1>&2
bash -c "\$cmd" || qsh_rmdie \$out
$wx
fi
else
if [ -z "$noexec" ]; then
echo "+ \$cmd" 1>&2
\$cmd
fi
fi
false
fi
fi
}
function qsh_rmdie {
if [ ! "" == "\$*" ]; then
for xout in \$*; do
if [ -n \$xout ]; then
echo QSH removing \$xout 1>&2
fi
rm -rf \$xout
done
fi
exit 1
}
function qsh_waitforoutput {
i=0
while [ \$i -lt 80 ]
do
(( i++ ))
if [ -z "\$1" ]; then
break;
fi
if [ -e "\$1" ]; then
break;
fi
sleep .25
done
}
shopt -s extdebug
set -o functrace
set -a
trap qsh_on_debug DEBUG
EOF
return $r;
die $r;
}
sub pipe_to_fork ($) {
my $parent = shift;
pipe my $child, $parent or die;
my $pid = fork();
die "fork() failed: $!" unless defined $pid;
if ($pid) {
close $child;
}
else {
close $parent;
open(STDIN, "<&=" . fileno($child)) or die;
}
$pid;
}
sub usage {
<<EOF
qsh version $VERSION
Usage: qsh [options] <bash-script>
or: qsh [options] -c "commands to be run"
Options:
-d|ebug debug
-l|ist guess list of dependencies
-v|erbose verbose output to STDERR
-h|elp show this help
-r|ules FILE use FILE instead of /etc/qsh.conf
Description:
This is a wrapper around 'bash' that passes each command executed
to an evaluator that determines whether the command should run
based on it's parsed inputs and outputs.
In addition, the wrapper determines if the command should be
sent to a "grid engine" based on the size of the inputs and
the type of command.
The purpose is to enable "fluid" development of simple pipeline
scripts without having to recode them into Makefiles or other
even more cumbersome DAG scripts, etc.
Problems:
- It's inevitable that qsh might get some things wrong,
and prevent maximum efficiency.
- It's often better to carefully plan workflows
EOF
}