Permalink
Switch branches/tags
Nothing to show
Find file
Fetching contributors…
Cannot retrieve contributors at this time
executable file 408 lines (337 sloc) 12.2 KB
#!/usr/bin/perl
####
#Copyright (c) 2012 Erik Aronesty (erik@q32.com)
#
#Permission is hereby granted, free of charge, to any person obtaining a copy
#of this software and associated documentation files (the "Software"), to deal
#in the Software without restriction, including without limitation the rights
#to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
#copies of the Software, and to permit persons to whom the Software is
#furnished to do so, subject to the following conditions:
#
#The above copyright notice and this permission notice shall be included in
#all copies or substantial portions of the Software.
#
#THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
#IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
#FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
#AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
#LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
#OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
#THE SOFTWARE.
#
#ALSO, IT WOULD BE NICE IF YOU LET ME KNOW YOU USED IT.
#
use strict;
use POSIX qw(mkfifo);
use Data::Dumper;
use Getopt::Long qw(:config no_ignore_case);
my %opt;
GetOptions(\%opt, "sort|s", "fai=s", "noindel", "strand|S=s", "keep", "debug");
my $f = shift @ARGV;
my $o = shift @ARGV;
my $keep = $opt{keep};
die "Usage: $0 [options] <input-bam> <output-bam>
Convert a bam file into a file similar to bowtie output
This is suitable for passing to RSEM if (-n, -f) are used,
or to EXPRESS (without -n or -f).
-s sort by id, use if the source bam is not sorted by id
-n no indels ... removes indels and soft-clip information
from the cigar string... ruining the validity of the MD tag
and any variation results... but good for pipelines (like
rna counting) that don't use/want this info
" unless $f && $o =~ /bam$/;
my %FAI;
if ($opt{fai}) {
warn "Obsolete argument -f, ignoring\n";
}
open IN, $f =~ /\.bam$/ ? "samtools view -h $f |" : "$f";
if ($opt{sort} && $f !~ /\.bam$/) {
die "Specify presorted or use a regular bam file\n";
}
my ($pe, $cnt, @lens, @IN);
# first 5000 proper aligned reads... make an insert size
while (<IN>) {
# save to stack
if (/^\@SQ\s+SN:(\S+)\s+LN:(\d+)/ ) {
$FAI{$1}=$2+0;
}
push @IN, $_;
my ($id, $bits, $nmo, $pos, $qual, $cig, $f1, $f2, $f3, $seq, $qseq, @fdx) = split /\t/, $_;
next unless $pos > 0;
++$cnt;
if (abs($f3)>0) {
if ($f3 > 0) {
push @lens, $f3;
}
$pe = 1;
}
last if $cnt > 5000;
}
$opt{fai} = "-";
# trimmed mean
@lens=sort {$a<=>$b} (@lens);
$cnt = 0;
my $insert = 0;
for(my $i=(@lens*.10);$i<(@lens*.90);++$i) {
$insert+=$lens[$i]; $cnt+=1;
}
if (!$cnt && $pe) {
die "No paired alignments in '$f' quitting\n";
}
if (!$cnt) {
$insert="n/a";
} else {
$insert/=$cnt;
}
warn "Insert size: $insert\n" if $opt{debug};
if ($opt{sort}) {
die "-sort only if input is bam\n" unless $f =~ /\.bam$/;
my $sb = $f;
$sb =~ s/\.bam$//;
$sb .= ".ntmp";
unlink("$sb.bam");
warn("+mkfifo $sb.bam\n");
if (!mkfifo("$sb.bam", 0664)) {
die "Can't create fifo $sb.bam: $!\n";
}
if (!fork()) {
# write to sb.bam fifo in a fork
system("samtools sort -n $f $sb");
exit(0);
}
# now read from the id-sorted bam, instead of the chromosome-sorted bam
$f = "$sb.bam";
close IN;
@IN=();
open IN, $f =~ /\.bam$/ ? "samtools view $f |" : "$f";
}
my (@m);
sub saveout {
my ($out, $mate, $nmo, $pos, $len, $isrev, $origid, $cig) = @_;
if ($pe) {
if ($mate == 0) {
if ($m[1]) {
# first pair is bwa's default
my $f1 = shift @{$m[0]};
my $f2 = shift @{$m[1]};
if ($f1->[1] eq $f2->[1] ) {
if (!($f1->[1] eq '*') && !($f2->[1] eq '*') && !($f1->[6] eq '*') && !($f2->[6] eq '*')) {
if (((($f1->[2]+$f1->[3]) < $FAI{$f1->[1]}) &&
(($f2->[2]+$f2->[3]) < $FAI{$f2->[1]}))) {
# only output proper pair, with '*' nmo's and cigs, and where both alignments are within the fai
print OUT $f1->[0];
print OUT $f2->[0];
} else {
# warn discard?
}
} else {
if ($keep && (($f1->[1] eq '*') && ($f2->[1] eq '*'))) {
print OUT $f1->[0];
print OUT $f2->[0];
} else {
# warn discard
}
}
}
if (@{$m[0]} && @{$m[1]}) {
# prune proper ... only matching mates
my %have;
for (@{$m[0]}) {
next if $_->[6] eq '*';
next if $opt{fai} && (($_->[2]+$_->[3]) > $FAI{$_->[1]});
$have{$_->[1]}=1;
}
for (@{$m[1]}) {
next if $_->[6] eq '*';
next if $opt{fai} && (($_->[2]+$_->[3]) > $FAI{$_->[1]});
$have{$_->[1]}=2 if $have{$_->[1]} == 1;
}
%have = map { $have{$_}==2 ? ($_=>1) : () } keys(%have);
$have{'*'} = undef;
my (@m1, @m0);
# only keep good alignments
for (@{$m[0]}) {
next if $opt{fai} && (($_->[2]+$_->[3]) > $FAI{$_->[1]});
push @m0, $_ if $have{$_->[1]} && !($_->[6] eq '*');
}
for (@{$m[1]}) {
next if $opt{fai} && (($_->[2]+$_->[3]) > $FAI{$_->[1]});
push @m1, $_ if $have{$_->[1]} && !($_->[6] eq '*');
}
@{$m[0]}=@m0;
@{$m[1]}=@m1;
my %taken;
@m0 = ();
@m1 = ();
# now pick best mate from the pruned set (first part above not really necessary, but might speed things up)
for my $a (@{$m[0]}) {
my $min = 1000000;
my $best;
for my $b (@{$m[1]}) {
if ($a->[1] eq $b->[1]) {
# distance closest to true insert size
my $dist = abs(abs($a->[2]-$b->[2])+$b->[3]-$insert);
if ($dist < $min && !$taken{scalar $b} && $dist < $insert) {
# if distance no more than double insert size, then it's OK (bwa should filter, but doesn't always)
$min=$dist;
$best=$b;
}
}
}
if ($best) {
# found a mate? output it
$taken{scalar $best}=1;
push @m0, $a;
push @m1, $best;
}
}
@{$m[0]}=@m0;
@{$m[1]}=@m1;
for (my $i=0;$i<@{$m[0]};++$i) {
if ($m[0][$i][5] gt $m[1][$i][5]) {
# swap mates so they are in the ORIGINAL-ID (read1, read2) order
my @tmp = @{$m[0][$i]};
@{$m[0][$i]} = @{$m[1][$i]};
@{$m[1][$i]} = @tmp;
}
my ($out1, $nmo1, $pos1, $len1, $isrev1, $origid, $cig) = @{$m[0][$i]};
my ($out2, $nmo2, $pos2, $len2, $isrev2, $origid, $cig) = @{$m[1][$i]};
# replace insert-size and mate-pos (slow!)
my $dist1 = $pos2-$pos1-$len2 if $isrev1 && $pos2 && $pos1;
$dist1 = $pos2-$pos1+$len2 if !$isrev1 && $pos2 && $pos1;
$out1 = replacetab($out1, 8, $dist1);
$out1 = replacetab($out1, 7, $pos2);
print OUT $out1;
my $dist2 = -$dist1;
$out2 = replacetab($out2, 8, $dist2);
$out2 = replacetab($out2, 7, $pos1);
print OUT $out2;
}
}
@m = ();
}
}
push @{$m[$mate]}, [$out, $nmo, $pos, $len, $isrev, $origid, $cig];
} else {
# only print clearly good or clearly unaligned
print OUT $out if ($keep && $nmo eq '*') || ((!($nmo eq '*')) && (!($cig eq '*')) && (($pos+$len) < $FAI{$nmo}));
# warn discard?
}
}
warn "Reading '$f', writing to '$o'\n" if $opt{debug};
#open OUT, "|samtools view -S -b - > $o 2> /dev/null";
open OUT, "|samtools view -S -b - > $o";
my $mate = 1;
my $previd;
while (1){
if (@IN) {
# pop stack
$_ = shift @IN;
} else {
$_ = <IN>;
}
if (!$_) {
warn "Finished reading $f\n" if $opt{debug};
last;
}
next if /^\@PG/ && $opt{noindel};
print(OUT) && next if /^\@/;
chomp;
$mate = !$mate;
my ($id, $bits, $nmo, $pos, $qual, $cig, $f1, $f2, $f3, $seq, $qseq, @fdx) = split /\t/, $_;
my $origid = $id;
if (!$pe && ($cig eq '*' || $nmo eq '*')) {
print OUT $_, "\n" if $keep;
next;
}
# ensure progression even if errors in code
eval {
if ($pe && $mate && $previd) {
# paired-end read id's have to match each other ... IE: bowtie output
$_ = replacetab($_, 0, $previd);
$id = $previd;
}
# previd set
$previd = $id;
my ($xa, $nm);
for (@fdx) {
# get rid of XA tag
if (s/^XA:Z://) {
$xa = $_;
$_ = '';
}
# get rid of NM tag
if (s/^NM:i://) {
$nm = $_;
$_ = '';
}
}
if ($opt{noindel}) {
# clean cig : todo: maybe remove innacurate MD tag? or keep it because it *was* OK?
$cig =~ s/(\d+)I/\1M/g;
$cig =~ s/(\d+)D//g;
$cig =~ s/(\d+)S/\1M/g;
while ($cig =~ s/(\d+)M(\d+)M/($1+$2)."M"/e) {};
}
$_=replacetab($_, 5, $cig);
saveout($_."\n", $mate, $nmo, $pos, length($seq), $bits & 16, $origid, $cig) && next if !$xa;
# remove XA tag
$_ =~ s/\tXA:Z:[^\t]+//;
saveout($_."\n", $mate, $nmo, $pos, length($seq), $bits & 16, $origid, $cig);
my $isrev = $bits & 16;
for (split /;/, $xa) {
# for each x alignment
my ($nmo, $pos, $cig, $mm) = m/(.*),([^,]+),([^,]+),([^,]+)/;
# set reverse bits as appropriate
$bits = $bits & ~16;
$bits = $bits | 16 if $pos =~ /^-/;
if ($pos =~ /^-/) {
# mate not rev
$bits = $bits & ~0x020;
} else {
# mate rev
$bits = $bits | 0x020;
}
if ($pe) {
$bits = $bits & ~0x040;
$bits = $bits & ~0x080;
$bits = $bits | 0x040 if !$mate;
$bits = $bits | 0x080 if $mate;
}
my $s = $seq;
my $q = $qseq;
if ( ($bits & 16) != $isrev) {
# reverse sequence if reverse alignment
$s=revcomp($s);
$q=reverse($q);
}
if ($opt{noindel}) {
# clean cigar of all indels/soft-clips
$cig =~ s/(\d+)I/\1M/g;
$cig =~ s/(\d+)D//g;
$cig =~ s/(\d+)S/\1M/g;
while ($cig =~ s/(\d+)M(\d+)M/($1+$2)."M"/e) {};
}
$pos =~ s/^[+-]//;
my @tmp=@fdx;
push @tmp, "NM:i:$nm";
@tmp = grep /:/, @tmp;
my $fdx = join "\t", @tmp;
# save output
saveout("$id\t$bits\t$nmo\t$pos\t$qual\t$cig\t$f1\t$f2\t$f3\t$s\t$q\t$fdx\n", $mate, $nmo, $pos, length($seq), $bits & 16, $origid, $cig);
}
}}
$mate = !$mate;
saveout("",$mate) if $mate == 0;
sub revcomp {
my $r = reverse(shift @_);
$r =~ tr/ACGT/TGCA/;
return $r;
}
sub replacetab {
my ($str, $p, $rep) = @_;
$p;
$str=~ s/((?:[^\t]+\t){$p})[^\t]+\t/$1$rep\t/;
return $str;
}