Permalink
Cannot retrieve contributors at this time
Fetching contributors…
| #!/usr/bin/perl | |
| use strict; | |
| use Getopt::Long; | |
| use Data::Dumper; | |
| Getopt::Long::Configure(qw(passthrough require_order no_ignore_case)); | |
| # TODO: put these in a config file, it can still be perl...for now it's simple enough to edit here | |
| my %nologic = map {($_, 1)} qw(if [ calc echo . alc for export); | |
| # always send to grid engine, no matter how big | |
| my %wrap = map {($_, 1)} qw(bowtie-build tophat juncsdb bwa cufflinks countannot calc-numaligned fastx_quality_stats); | |
| # never send to grid engine, no matter how big | |
| my %nowrap = map {($_, 1)} qw(condor_run run-analysis mv grun qpipe ea); | |
| # always run these, regardless of dependencies - still send to grid if needed | |
| my %nodep = map {($_, 1)} qw(); | |
| # this can be condor_run, qsub... whatever you want | |
| my $grid_engine = 'grun'; | |
| # alter default 'big' size and parameter parsing | |
| my %info = ( | |
| cp => {big=>'20g'}, | |
| # input doesn't need to exist, output is calculated from the input | |
| gunzip => {big=>'10g', in=>{_noex=>1}, out=>{_eval=>sub{$_[0]=~m/(.*)\.gz$/; return $1}}}, | |
| gzip => {big=>'20g', in=>{_noex=>1}, out=>{_eval=>sub{$_[0].".gz"}}}, | |
| 'auto-clip.sh' => {out=>{_eval=>sub{$_[0]=~ s/\.(fastq|fq)//; return ($_[0].".qcStats", $_[0].".adapters")}}}, | |
| bowtie => {big=>'20m'}, | |
| bwa => {big=>'20m'}, | |
| tophat => {big=>'10m'}, | |
| cufflinks => {big=>'10m', in=>{b=>1, r=>1, G=>1}}, | |
| 'cufflinks0.9.3' => {big=>'10m', in=>{b=>1, r=>1, G=>1}}, | |
| fastx_quality_stats => {in=>{i=>1}, out=>{o=>1}}, | |
| samtools => { match => [ | |
| { pat=>qr/sort\s*(?:-n)?\s+(\S+bam)\s*(\S+)/, post=>'$out[0] .= ".bam"' } , | |
| { pat=>qr/pileup.*\s(\S+[bs]am)/ } , | |
| ] }, | |
| bwasam => { match => [ | |
| { pat=>qr/(?:-o \S+\s+)?(?:\S+)\s+(\S+)/, post=>'$out[1] = "$out[0].sam"; $out[0] = "$out[0].bam"'} , | |
| ] }, | |
| # change to asterisk for dependency globbing | |
| 'fastq-multx' => {out=>{o=>sub {s/%/\*/}}, in=>{l=>1, g=>1, B=>1}}, | |
| 'fastq-mcf' => {out=>{o=>1}}, | |
| ); | |
| our $VERSION = '0.9.46'; | |
| my ($debug, $check, $inline, $verbose, $help, $rules_file, $list, $noexec, $passopts, $waitout, $force); | |
| $| = 1; | |
| $waitout=1; # by default we wait for grid jobs output | |
| GetOptions("DEBUG"=>\$debug, "K"=>\$check, "force"=>\$force, "grid"=>\$grid_engine, "waitout"=>\$waitout, "cmd=s"=>\$inline, "noexec"=>\$noexec, "l"=>\$list, "verbose|v"=>\$verbose, "help"=>\$help, "rules=s"=>\$rules_file) || die usage(); | |
| $verbose = 1 if $debug; | |
| $noexec = 1 if $debug; | |
| $passopts = '-v' if $verbose; | |
| $passopts .= ' -l' if $list; | |
| $passopts .= ' -D' if $debug; | |
| $passopts .= ' -n' if $noexec; | |
| $passopts .= ' -f' if $force; | |
| if ($help || $ARGV[0] =~ /^-\?/) { | |
| print usage(); | |
| exit 0 | |
| } | |
| sub reptail { | |
| my ($f, $r) = @_; | |
| $f =~ s/\Q$r\E$//; | |
| return $f; | |
| } | |
| sub repwith { | |
| my ($f, $r, $t) = @_; | |
| $f =~ s/\Q$r\E/$t/; | |
| return $f; | |
| } | |
| if ($check) { | |
| # base executable name | |
| my $orig_cmd = shift @ARGV; | |
| my $cmd = $orig_cmd; | |
| $cmd =~ s/.*\///; | |
| my $cmd_nv = $cmd; | |
| $cmd_nv =~ s/\-[0-9.-]+//; | |
| $cmd_nv = $cmd if $cmd_nv !~ /^[\w-]+$/; | |
| my @argv = @ARGV; | |
| print STDERR "QSH preex '$orig_cmd @argv'\n" if $debug; | |
| for (@argv) { | |
| # bash vars come in unexpanded... expand them | |
| s/\$(\w+)/$ENV{$1}/g; | |
| s/\${(\w+)}/$ENV{$1}/g; | |
| s/\${(\w+)\%([^{}]+)}/reptail($ENV{$1},$2)/ge; | |
| s/\${(\w+)\/([^\/]+)\/([^{}]+)}/repwith($ENV{$1},$2,$3)/ge; | |
| } | |
| print STDERR "QSH top '$cmd @argv'\n" if $debug; | |
| # don't try to alter non executable lines | |
| exit 0 if $cmd !~ /^[\w.-]+$/; | |
| # don't ever check dependencies, or wrap these commands | |
| if ($nologic{$cmd}) { | |
| print STDERR "QSH ignore command '$cmd'\n" if $verbose; | |
| if ($debug) { | |
| print STDERR "QSH debug: '$cmd @ARGV'"; | |
| } | |
| exit 0; | |
| } | |
| print STDERR "QSH check command $cmd\n" if $verbose; | |
| # walk through arguments, determining inputs, outputs and total sizes | |
| my ($opt, %opt, $first, $firstnox, $last, @in, @out, $sz); | |
| my $info = $info{$cmd} ? $info{$cmd} : $info{$cmd_nv}; | |
| for (@argv) { | |
| if (s/^--?//) { | |
| $opt = $_; | |
| next; | |
| } | |
| if ($opt) { | |
| if ($opt eq '<' || $info->{in}->{$opt}) { | |
| push @in, $_; | |
| } | |
| if ($opt eq '>>' || $opt eq '>' || $info->{out}->{$opt}) { | |
| push @out, $_; | |
| } | |
| $opt = ''; | |
| next; | |
| } | |
| $opt = '>' if ($_ eq '>'); | |
| $opt = '<' if ($_ eq '<'); | |
| $_ =~ s/^~/$ENV{HOME}/; | |
| if (!$firstnox) { | |
| $firstnox = $_ | |
| } | |
| if (!$first) { | |
| if ( $info->{in}->{_noex} || -e $_ ) { | |
| $first = $_ | |
| } | |
| } else { | |
| $last = $_; | |
| } | |
| $sz += -s $_; | |
| } | |
| push @in, $first if $first && (!@in); # no input ? assume first file that exists | |
| push @in, $firstnox if $firstnox && (!@in); # no input ? assume first file | |
| if ( $info->{match} ) { | |
| my $args = "@ARGV"; | |
| for my $m (@{$info->{match}}) { | |
| my @m = $args =~ /$m->{pat}/; | |
| if (@m) { | |
| @in = shift @m; | |
| if (@m) { | |
| @out = @m; | |
| if ($m->{post}) { | |
| eval($m->{post}); | |
| } | |
| } | |
| } | |
| } | |
| } | |
| if ($info->{post}) { | |
| eval($info->{post}); | |
| } | |
| if ( $info->{in}->{_noin} ) { | |
| @in = (); | |
| $last = $first; | |
| } | |
| if ($info->{out}->{_eval}) { | |
| my ($i, $o) = ($in[0], $out[0]); | |
| @out = &{$info->{out}->{_eval}}($i, $o); | |
| print STDERR "QSH eval $cmd: @out\n" if $verbose; | |
| if ($@) { | |
| print STDERR "QSH error: $@\n"; | |
| } | |
| } | |
| push @out, $last if $last && (!@out); # no output ? assume last argument | |
| pop @out if $out[0] eq $in[0]; # no in's = outs | |
| print STDERR "QSH check: $cmd : sz: $sz, in: @in, out: @out\n" if $verbose; | |
| my $mx; | |
| for (@in) { | |
| print STDERR "$_\n" if $list; | |
| my $m = fmodtime($_); | |
| $mx = $m if ($m > $mx); | |
| } | |
| my ($nox, $need); | |
| for (@out) { | |
| if ($_ =~ /\*/) { | |
| # TODO: fileglob all outputs, pick oldest, use that. | |
| } | |
| if ($mx < fmodtime($_)) { | |
| $nox=1; | |
| } else { | |
| $need=1; | |
| } | |
| } | |
| if (!$need && $nox && !$force) { | |
| print STDERR "QSH suppress: @out, already done\n" if $verbose || $noexec || $debug; | |
| print "true"; | |
| } else { | |
| my $wrap = $wrap{$cmd} || $wrap{$cmd_nv}; | |
| my $big = $info->{big}; | |
| if (!$big) { | |
| $big = '50m'; | |
| } | |
| if (!$wrap) { | |
| my $t = $big; | |
| $t *= 1000000 if ($t =~ /m$/i); | |
| $t *= 1000000000 if ($t =~ /g$/i); | |
| $wrap = $sz >= $t; | |
| } | |
| # never submit a job to the condor, if we're already in condor | |
| $wrap = 0 if $ENV{"_CONDOR_SLOT"}; | |
| $wrap = 0 if $nowrap{$cmd}; | |
| $wrap = 0 if $cmd eq $grid_engine; | |
| # || $ENV{"_GRUN"}; # grun's more friendly about sub-submits... not sure whether to allow? | |
| if ($list) { | |
| print STDERR `which $orig_cmd 2>/dev/null`; | |
| } | |
| my $suff = "#>\"" . join('" "',@out) . "\""; | |
| $suff .= "#wait" if !$debug && ($wrap || ($cmd eq $grid_engine)); | |
| print STDERR "QSH debug: wait '$out[0]' " . ($wrap || ($cmd eq $grid_engine)) . "\n" if $debug; | |
| if ($wrap) { | |
| print STDERR "QSH wrap $orig_cmd, input size:$sz big:$big\n" if $verbose; | |
| quoteargv(); | |
| if ($list) { | |
| print "false"; | |
| } else { | |
| print "$grid_engine \"$orig_cmd @ARGV\" $suff"; | |
| } | |
| } else { | |
| print STDERR "QSH no wrap $orig_cmd @ARGV, input size $sz big: $big\n" if $verbose; | |
| if ($list) { | |
| print "false"; | |
| } elsif ($suff) { | |
| print "$orig_cmd @ARGV $suff"; | |
| } | |
| } | |
| } | |
| exit 0; | |
| } | |
| sub bash_prefix; | |
| #TODO ... would be nice if this could be interactive... | |
| # but there's a whole TTY attach/reattach thing I can't figure out | |
| open STDERR, "|sort|uniq\n" if $list; | |
| my $in; | |
| if (!$inline) { | |
| $in = shift @ARGV; | |
| if (!$in) { | |
| die "qsh: missing input file operand\n" | |
| } | |
| open(IN, $in) || die "qsh: can't open $in: $!\n"; | |
| } | |
| no strict 'refs'; | |
| if (pipe_to_fork('FOO')) { | |
| print STDERR "QSH start\n" if $verbose; | |
| use strict; | |
| # parent | |
| print FOO bash_prefix(); | |
| if ($inline) { | |
| print FOO "\n$inline\n"; | |
| } else { | |
| while ( <IN> ) { | |
| print FOO $_ | |
| } | |
| } | |
| close FOO; | |
| while (wait > 1) {}; | |
| } else { | |
| quoteargv(); | |
| exec("bash -s @ARGV"); | |
| while (wait > 1) {}; | |
| } | |
| sub quoteargv { | |
| for (@ARGV) { | |
| s/(['"()])/\\$1/g; | |
| } | |
| } | |
| sub fmodtime { | |
| return (stat($_[0]))[9]; | |
| } | |
| sub bash_prefix { | |
| my $r; | |
| my $wx; | |
| my ($won, $woff); | |
| $wx = '[ -n "$wait" ] && qsh_waitforoutput $out' if ($waitout); | |
| $won = 'wait=1' if $waitout; | |
| $woff = 'wait=' if $waitout; | |
| my $dx; | |
| $dx = 'echo QSH debug: Cmd=$cmd Out=$out Wait=$wait 1>&2' if $debug; | |
| $dx = 'echo "$cmd"' if $noexec && !$debug; | |
| $r = "POPTS=\"$passopts\"\n" if $passopts; | |
| $r .= <<EOF; | |
| function qsh_on_debug { | |
| if [ -n "\$BASH_COMMAND" ]; then | |
| cmd=`$0 -K \$POPTS \$BASH_COMMAND` | |
| if [[ \$cmd =~ "(.*)#>(.*)" ]]; then | |
| cmd=\${BASH_REMATCH[1]} | |
| out=\${BASH_REMATCH[2]} | |
| fi | |
| $woff | |
| if [[ \$out =~ "(.*)#wait" ]]; then | |
| out=\${BASH_REMATCH[1]} | |
| $won | |
| fi | |
| $dx | |
| if [ -n "\$cmd" ]; then | |
| if [ -n "\$out" ]; then | |
| if [ -z "$noexec" ]; then | |
| echo "+ \$cmd" 1>&2 | |
| bash -c "\$cmd" || qsh_rmdie \$out | |
| $wx | |
| fi | |
| else | |
| if [ -z "$noexec" ]; then | |
| echo "+ \$cmd" 1>&2 | |
| \$cmd | |
| fi | |
| fi | |
| false | |
| fi | |
| fi | |
| } | |
| function qsh_rmdie { | |
| if [ ! "" == "\$*" ]; then | |
| for xout in \$*; do | |
| if [ -n \$xout ]; then | |
| echo QSH removing \$xout 1>&2 | |
| fi | |
| rm -rf \$xout | |
| done | |
| fi | |
| exit 1 | |
| } | |
| function qsh_waitforoutput { | |
| i=0 | |
| while [ \$i -lt 80 ] | |
| do | |
| (( i++ )) | |
| if [ -z "\$1" ]; then | |
| break; | |
| fi | |
| if [ -e "\$1" ]; then | |
| break; | |
| fi | |
| sleep .25 | |
| done | |
| } | |
| shopt -s extdebug | |
| set -o functrace | |
| set -a | |
| trap qsh_on_debug DEBUG | |
| EOF | |
| return $r; | |
| die $r; | |
| } | |
| sub pipe_to_fork ($) { | |
| my $parent = shift; | |
| pipe my $child, $parent or die; | |
| my $pid = fork(); | |
| die "fork() failed: $!" unless defined $pid; | |
| if ($pid) { | |
| close $child; | |
| } | |
| else { | |
| close $parent; | |
| open(STDIN, "<&=" . fileno($child)) or die; | |
| } | |
| $pid; | |
| } | |
| sub usage { | |
| <<EOF | |
| qsh version $VERSION | |
| Usage: qsh [options] <bash-script> | |
| or: qsh [options] -c "commands to be run" | |
| Options: | |
| -d|ebug debug | |
| -l|ist guess list of dependencies | |
| -v|erbose verbose output to STDERR | |
| -h|elp show this help | |
| -r|ules FILE use FILE instead of /etc/qsh.conf | |
| Description: | |
| This is a wrapper around 'bash' that passes each command executed | |
| to an evaluator that determines whether the command should run | |
| based on it's parsed inputs and outputs. | |
| In addition, the wrapper determines if the command should be | |
| sent to a "grid engine" based on the size of the inputs and | |
| the type of command. | |
| The purpose is to enable "fluid" development of simple pipeline | |
| scripts without having to recode them into Makefiles or other | |
| even more cumbersome DAG scripts, etc. | |
| Problems: | |
| - It's inevitable that qsh might get some things wrong, | |
| and prevent maximum efficiency. | |
| - It's often better to carefully plan workflows | |
| EOF | |
| } | |