Permalink
Cannot retrieve contributors at this time
Fetching contributors…
| #!/usr/bin/perl | |
| #### | |
| #Copyright (c) 2012 Erik Aronesty (erik@q32.com) | |
| # | |
| #Permission is hereby granted, free of charge, to any person obtaining a copy | |
| #of this software and associated documentation files (the "Software"), to deal | |
| #in the Software without restriction, including without limitation the rights | |
| #to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |
| #copies of the Software, and to permit persons to whom the Software is | |
| #furnished to do so, subject to the following conditions: | |
| # | |
| #The above copyright notice and this permission notice shall be included in | |
| #all copies or substantial portions of the Software. | |
| # | |
| #THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
| #IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
| #FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |
| #AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
| #LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |
| #OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN | |
| #THE SOFTWARE. | |
| # | |
| #ALSO, IT WOULD BE NICE IF YOU LET ME KNOW YOU USED IT. | |
| # | |
| use strict; | |
| use POSIX qw(mkfifo); | |
| use Data::Dumper; | |
| use Getopt::Long qw(:config no_ignore_case); | |
| my %opt; | |
| GetOptions(\%opt, "sort|s", "fai=s", "noindel", "strand|S=s", "keep", "debug"); | |
| my $f = shift @ARGV; | |
| my $o = shift @ARGV; | |
| my $keep = $opt{keep}; | |
| die "Usage: $0 [options] <input-bam> <output-bam> | |
| Convert a bam file into a file similar to bowtie output | |
| This is suitable for passing to RSEM if (-n, -f) are used, | |
| or to EXPRESS (without -n or -f). | |
| -s sort by id, use if the source bam is not sorted by id | |
| -n no indels ... removes indels and soft-clip information | |
| from the cigar string... ruining the validity of the MD tag | |
| and any variation results... but good for pipelines (like | |
| rna counting) that don't use/want this info | |
| " unless $f && $o =~ /bam$/; | |
| my %FAI; | |
| if ($opt{fai}) { | |
| warn "Obsolete argument -f, ignoring\n"; | |
| } | |
| open IN, $f =~ /\.bam$/ ? "samtools view -h $f |" : "$f"; | |
| if ($opt{sort} && $f !~ /\.bam$/) { | |
| die "Specify presorted or use a regular bam file\n"; | |
| } | |
| my ($pe, $cnt, @lens, @IN); | |
| # first 5000 proper aligned reads... make an insert size | |
| while (<IN>) { | |
| # save to stack | |
| if (/^\@SQ\s+SN:(\S+)\s+LN:(\d+)/ ) { | |
| $FAI{$1}=$2+0; | |
| } | |
| push @IN, $_; | |
| my ($id, $bits, $nmo, $pos, $qual, $cig, $f1, $f2, $f3, $seq, $qseq, @fdx) = split /\t/, $_; | |
| next unless $pos > 0; | |
| ++$cnt; | |
| if (abs($f3)>0) { | |
| if ($f3 > 0) { | |
| push @lens, $f3; | |
| } | |
| $pe = 1; | |
| } | |
| last if $cnt > 5000; | |
| } | |
| $opt{fai} = "-"; | |
| # trimmed mean | |
| @lens=sort {$a<=>$b} (@lens); | |
| $cnt = 0; | |
| my $insert = 0; | |
| for(my $i=(@lens*.10);$i<(@lens*.90);++$i) { | |
| $insert+=$lens[$i]; $cnt+=1; | |
| } | |
| if (!$cnt && $pe) { | |
| die "No paired alignments in '$f' quitting\n"; | |
| } | |
| if (!$cnt) { | |
| $insert="n/a"; | |
| } else { | |
| $insert/=$cnt; | |
| } | |
| warn "Insert size: $insert\n" if $opt{debug}; | |
| if ($opt{sort}) { | |
| die "-sort only if input is bam\n" unless $f =~ /\.bam$/; | |
| my $sb = $f; | |
| $sb =~ s/\.bam$//; | |
| $sb .= ".ntmp"; | |
| unlink("$sb.bam"); | |
| warn("+mkfifo $sb.bam\n"); | |
| if (!mkfifo("$sb.bam", 0664)) { | |
| die "Can't create fifo $sb.bam: $!\n"; | |
| } | |
| if (!fork()) { | |
| # write to sb.bam fifo in a fork | |
| system("samtools sort -n $f $sb"); | |
| exit(0); | |
| } | |
| # now read from the id-sorted bam, instead of the chromosome-sorted bam | |
| $f = "$sb.bam"; | |
| close IN; | |
| @IN=(); | |
| open IN, $f =~ /\.bam$/ ? "samtools view $f |" : "$f"; | |
| } | |
| my (@m); | |
| sub saveout { | |
| my ($out, $mate, $nmo, $pos, $len, $isrev, $origid, $cig) = @_; | |
| if ($pe) { | |
| if ($mate == 0) { | |
| if ($m[1]) { | |
| # first pair is bwa's default | |
| my $f1 = shift @{$m[0]}; | |
| my $f2 = shift @{$m[1]}; | |
| if ($f1->[1] eq $f2->[1] ) { | |
| if (!($f1->[1] eq '*') && !($f2->[1] eq '*') && !($f1->[6] eq '*') && !($f2->[6] eq '*')) { | |
| if (((($f1->[2]+$f1->[3]) < $FAI{$f1->[1]}) && | |
| (($f2->[2]+$f2->[3]) < $FAI{$f2->[1]}))) { | |
| # only output proper pair, with '*' nmo's and cigs, and where both alignments are within the fai | |
| print OUT $f1->[0]; | |
| print OUT $f2->[0]; | |
| } else { | |
| # warn discard? | |
| } | |
| } else { | |
| if ($keep && (($f1->[1] eq '*') && ($f2->[1] eq '*'))) { | |
| print OUT $f1->[0]; | |
| print OUT $f2->[0]; | |
| } else { | |
| # warn discard | |
| } | |
| } | |
| } | |
| if (@{$m[0]} && @{$m[1]}) { | |
| # prune proper ... only matching mates | |
| my %have; | |
| for (@{$m[0]}) { | |
| next if $_->[6] eq '*'; | |
| next if $opt{fai} && (($_->[2]+$_->[3]) > $FAI{$_->[1]}); | |
| $have{$_->[1]}=1; | |
| } | |
| for (@{$m[1]}) { | |
| next if $_->[6] eq '*'; | |
| next if $opt{fai} && (($_->[2]+$_->[3]) > $FAI{$_->[1]}); | |
| $have{$_->[1]}=2 if $have{$_->[1]} == 1; | |
| } | |
| %have = map { $have{$_}==2 ? ($_=>1) : () } keys(%have); | |
| $have{'*'} = undef; | |
| my (@m1, @m0); | |
| # only keep good alignments | |
| for (@{$m[0]}) { | |
| next if $opt{fai} && (($_->[2]+$_->[3]) > $FAI{$_->[1]}); | |
| push @m0, $_ if $have{$_->[1]} && !($_->[6] eq '*'); | |
| } | |
| for (@{$m[1]}) { | |
| next if $opt{fai} && (($_->[2]+$_->[3]) > $FAI{$_->[1]}); | |
| push @m1, $_ if $have{$_->[1]} && !($_->[6] eq '*'); | |
| } | |
| @{$m[0]}=@m0; | |
| @{$m[1]}=@m1; | |
| my %taken; | |
| @m0 = (); | |
| @m1 = (); | |
| # now pick best mate from the pruned set (first part above not really necessary, but might speed things up) | |
| for my $a (@{$m[0]}) { | |
| my $min = 1000000; | |
| my $best; | |
| for my $b (@{$m[1]}) { | |
| if ($a->[1] eq $b->[1]) { | |
| # distance closest to true insert size | |
| my $dist = abs(abs($a->[2]-$b->[2])+$b->[3]-$insert); | |
| if ($dist < $min && !$taken{scalar $b} && $dist < $insert) { | |
| # if distance no more than double insert size, then it's OK (bwa should filter, but doesn't always) | |
| $min=$dist; | |
| $best=$b; | |
| } | |
| } | |
| } | |
| if ($best) { | |
| # found a mate? output it | |
| $taken{scalar $best}=1; | |
| push @m0, $a; | |
| push @m1, $best; | |
| } | |
| } | |
| @{$m[0]}=@m0; | |
| @{$m[1]}=@m1; | |
| for (my $i=0;$i<@{$m[0]};++$i) { | |
| if ($m[0][$i][5] gt $m[1][$i][5]) { | |
| # swap mates so they are in the ORIGINAL-ID (read1, read2) order | |
| my @tmp = @{$m[0][$i]}; | |
| @{$m[0][$i]} = @{$m[1][$i]}; | |
| @{$m[1][$i]} = @tmp; | |
| } | |
| my ($out1, $nmo1, $pos1, $len1, $isrev1, $origid, $cig) = @{$m[0][$i]}; | |
| my ($out2, $nmo2, $pos2, $len2, $isrev2, $origid, $cig) = @{$m[1][$i]}; | |
| # replace insert-size and mate-pos (slow!) | |
| my $dist1 = $pos2-$pos1-$len2 if $isrev1 && $pos2 && $pos1; | |
| $dist1 = $pos2-$pos1+$len2 if !$isrev1 && $pos2 && $pos1; | |
| $out1 = replacetab($out1, 8, $dist1); | |
| $out1 = replacetab($out1, 7, $pos2); | |
| print OUT $out1; | |
| my $dist2 = -$dist1; | |
| $out2 = replacetab($out2, 8, $dist2); | |
| $out2 = replacetab($out2, 7, $pos1); | |
| print OUT $out2; | |
| } | |
| } | |
| @m = (); | |
| } | |
| } | |
| push @{$m[$mate]}, [$out, $nmo, $pos, $len, $isrev, $origid, $cig]; | |
| } else { | |
| # only print clearly good or clearly unaligned | |
| print OUT $out if ($keep && $nmo eq '*') || ((!($nmo eq '*')) && (!($cig eq '*')) && (($pos+$len) < $FAI{$nmo})); | |
| # warn discard? | |
| } | |
| } | |
| warn "Reading '$f', writing to '$o'\n" if $opt{debug}; | |
| #open OUT, "|samtools view -S -b - > $o 2> /dev/null"; | |
| open OUT, "|samtools view -S -b - > $o"; | |
| my $mate = 1; | |
| my $previd; | |
| while (1){ | |
| if (@IN) { | |
| # pop stack | |
| $_ = shift @IN; | |
| } else { | |
| $_ = <IN>; | |
| } | |
| if (!$_) { | |
| warn "Finished reading $f\n" if $opt{debug}; | |
| last; | |
| } | |
| next if /^\@PG/ && $opt{noindel}; | |
| print(OUT) && next if /^\@/; | |
| chomp; | |
| $mate = !$mate; | |
| my ($id, $bits, $nmo, $pos, $qual, $cig, $f1, $f2, $f3, $seq, $qseq, @fdx) = split /\t/, $_; | |
| my $origid = $id; | |
| if (!$pe && ($cig eq '*' || $nmo eq '*')) { | |
| print OUT $_, "\n" if $keep; | |
| next; | |
| } | |
| # ensure progression even if errors in code | |
| eval { | |
| if ($pe && $mate && $previd) { | |
| # paired-end read id's have to match each other ... IE: bowtie output | |
| $_ = replacetab($_, 0, $previd); | |
| $id = $previd; | |
| } | |
| # previd set | |
| $previd = $id; | |
| my ($xa, $nm); | |
| for (@fdx) { | |
| # get rid of XA tag | |
| if (s/^XA:Z://) { | |
| $xa = $_; | |
| $_ = ''; | |
| } | |
| # get rid of NM tag | |
| if (s/^NM:i://) { | |
| $nm = $_; | |
| $_ = ''; | |
| } | |
| } | |
| if ($opt{noindel}) { | |
| # clean cig : todo: maybe remove innacurate MD tag? or keep it because it *was* OK? | |
| $cig =~ s/(\d+)I/\1M/g; | |
| $cig =~ s/(\d+)D//g; | |
| $cig =~ s/(\d+)S/\1M/g; | |
| while ($cig =~ s/(\d+)M(\d+)M/($1+$2)."M"/e) {}; | |
| } | |
| $_=replacetab($_, 5, $cig); | |
| saveout($_."\n", $mate, $nmo, $pos, length($seq), $bits & 16, $origid, $cig) && next if !$xa; | |
| # remove XA tag | |
| $_ =~ s/\tXA:Z:[^\t]+//; | |
| saveout($_."\n", $mate, $nmo, $pos, length($seq), $bits & 16, $origid, $cig); | |
| my $isrev = $bits & 16; | |
| for (split /;/, $xa) { | |
| # for each x alignment | |
| my ($nmo, $pos, $cig, $mm) = m/(.*),([^,]+),([^,]+),([^,]+)/; | |
| # set reverse bits as appropriate | |
| $bits = $bits & ~16; | |
| $bits = $bits | 16 if $pos =~ /^-/; | |
| if ($pos =~ /^-/) { | |
| # mate not rev | |
| $bits = $bits & ~0x020; | |
| } else { | |
| # mate rev | |
| $bits = $bits | 0x020; | |
| } | |
| if ($pe) { | |
| $bits = $bits & ~0x040; | |
| $bits = $bits & ~0x080; | |
| $bits = $bits | 0x040 if !$mate; | |
| $bits = $bits | 0x080 if $mate; | |
| } | |
| my $s = $seq; | |
| my $q = $qseq; | |
| if ( ($bits & 16) != $isrev) { | |
| # reverse sequence if reverse alignment | |
| $s=revcomp($s); | |
| $q=reverse($q); | |
| } | |
| if ($opt{noindel}) { | |
| # clean cigar of all indels/soft-clips | |
| $cig =~ s/(\d+)I/\1M/g; | |
| $cig =~ s/(\d+)D//g; | |
| $cig =~ s/(\d+)S/\1M/g; | |
| while ($cig =~ s/(\d+)M(\d+)M/($1+$2)."M"/e) {}; | |
| } | |
| $pos =~ s/^[+-]//; | |
| my @tmp=@fdx; | |
| push @tmp, "NM:i:$nm"; | |
| @tmp = grep /:/, @tmp; | |
| my $fdx = join "\t", @tmp; | |
| # save output | |
| saveout("$id\t$bits\t$nmo\t$pos\t$qual\t$cig\t$f1\t$f2\t$f3\t$s\t$q\t$fdx\n", $mate, $nmo, $pos, length($seq), $bits & 16, $origid, $cig); | |
| } | |
| }} | |
| $mate = !$mate; | |
| saveout("",$mate) if $mate == 0; | |
| sub revcomp { | |
| my $r = reverse(shift @_); | |
| $r =~ tr/ACGT/TGCA/; | |
| return $r; | |
| } | |
| sub replacetab { | |
| my ($str, $p, $rep) = @_; | |
| $p; | |
| $str=~ s/((?:[^\t]+\t){$p})[^\t]+\t/$1$rep\t/; | |
| return $str; | |
| } | |