Permalink
Cannot retrieve contributors at this time
Fetching contributors…
| #!/usr/bin/perl | |
| #Copyright (c) 2012 Erik Aronesty (erik@q32.com) | |
| # | |
| #Permission is hereby granted, free of charge, to any person obtaining a copy | |
| #of this software and associated documentation files (the "Software"), to deal | |
| #in the Software without restriction, including without limitation the rights | |
| #to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |
| #copies of the Software, and to permit persons to whom the Software is | |
| #furnished to do so, subject to the following conditions: | |
| # | |
| #The above copyright notice and this permission notice shall be included in | |
| #all copies or substantial portions of the Software. | |
| # | |
| #THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
| #IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
| #FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |
| #AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
| #LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |
| #OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN | |
| #THE SOFTWARE. | |
| # | |
| #ALSO, IT WOULD BE NICE IF YOU LET ME KNOW YOU USED IT. | |
| use strict; | |
| use Getopt::Long; | |
| use IO::File; | |
| use Data::Dumper; | |
| use Carp qw(croak); | |
| my %opt; | |
| GetOptions(\%opt, "gtf=s") || die usage(); | |
| die usage() if !$opt{gtf}; | |
| # open gtf | |
| open(G, $opt{gtf}) || die("$opt{gtf}: $!\n"); | |
| my $f = shift @ARGV; | |
| # open counts | |
| open(IN, $f) || die "$f: $!\n"; | |
| # read gtf into hash | |
| my %gs; | |
| my %tids; | |
| while(<G>) { | |
| my @fds = split /\t/, $_; | |
| my ($g) = $fds[8]=~/gene_id "([^"]+)"/; | |
| my ($t) = $fds[8]=~/transcript_id "([^"]+)"/; | |
| $gs{$t}=$g; | |
| $tids{$g}{$t}=1; | |
| } | |
| die "gtf file has no genes\n" unless %gs; | |
| my $head1 = <IN>; chomp $head1; | |
| my $format = $head1 =~ /est_counts/ ? "express" : | |
| $head1 =~ /^\S*:\S+\t\d+\t\d+\t\d/ ? "coverage-matrix" : | |
| undef; | |
| die "File is not known express 1.x, or sam-stats coverage-matrix output\n" unless $format; | |
| close IN; | |
| for (keys(%tids)) { | |
| $tids{$_}=join ',', keys(%{$tids{$_}}); | |
| } | |
| my @fields = (); | |
| if ($format eq 'coverage-matrix') { | |
| @fields = (fields=>["transcript_id", "length", "count", "coverage", "skew"]); | |
| } | |
| my %agg; | |
| my $tot_count; | |
| xsvparse($f, sub => sub { | |
| my ($d) = @_; | |
| if ($format eq 'express') { | |
| my $g = $gs{$d->{target_id}}; | |
| return unless $g; | |
| $agg{$g}{fpkm}+=$d->{fpkm}; | |
| $agg{$g}{eff_length}+=$d->{eff_length}*($d->{est_counts}+1); | |
| $agg{$g}{length}+=$d->{length}*($d->{est_counts}+1); | |
| $agg{$g}{est_counts}+=$d->{est_counts}; | |
| $agg{$g}{eff_counts}+=$d->{eff_counts}; | |
| $agg{$g}{junk_counts}+=$d->{est_counts}+1; | |
| $agg{$g}{fpkm_conf_high}+=$d->{fpkm_conf_high}; | |
| $agg{$g}{fpkm_conf_low}+=$d->{fpkm_conf_low}; | |
| $agg{$g}{transcript_ids}=$tids{$g}; | |
| } else { | |
| $tot_count+=$d->{count}; | |
| my $g = $gs{$d->{transcript_id}}; | |
| return unless $g; | |
| $agg{$g}{fpkm}+=1000*$d->{count}/$d->{length}; | |
| $agg{$g}{count}+=$d->{count}; | |
| $agg{$g}{length}+=$d->{length}*($d->{count}+1); | |
| $agg{$g}{transcript_ids}=$tids{$g}; | |
| $agg{$g}{junk_counts}+=$d->{count}+1; | |
| } | |
| },nocase=>1, @fields); | |
| my @fds = qw(gene_id est_counts eff_counts length eff_length fpkm fpkm_conf_low fpkm_conf_high transcript_ids); | |
| if ($format eq 'coverage-matrix') { | |
| @fds = qw(gene_id count length fpkm transcript_ids); | |
| } | |
| print join("\t", @fds), "\n"; | |
| for my $g (sort {$agg{$b}->{fpkm} <=> $agg{$a}->{fpkm}} keys(%agg) ) { | |
| $agg{$g}{gene_id}=$g; | |
| $agg{$g}{eff_length}/=$agg{$g}{junk_counts}; | |
| $agg{$g}{length}/=$agg{$g}{junk_counts}; | |
| if ($format eq 'coverage-matrix') { | |
| $agg{$g}{fpkm}/=($tot_count/1000000); | |
| } | |
| print join("\t", map {$agg{$g}{$_}} @fds), "\n"; | |
| } | |
| sub usage { | |
| <<EOF | |
| usage: $0 -g gtf_file IN1 IN2 ... | |
| Aggregates isoform counts by the gene information in a GTF file gene info | |
| Works with express-1.X output, or ea-utils/sam-stats coverage-matrix output. | |
| EOF | |
| } | |
| # this ia a really cool function that turns a csv file into a hash table | |
| # it's not "complete" but it works well... and at least i understand it | |
| # usage: xsvparse($file, option1=>val, option1=>val) | |
| # | |
| # all options are optional... by defualt it guesses the quote char, and the delimiter, and returns an array of hashes | |
| # if you specify a key, then it will return a hash of hashes indexed by that key | |
| # for example xsvparse("Summary.tsv", key=lane); | |
| # OPTIONS: | |
| # nocase BOOL : lowercases all keys and field names | |
| # skip INT|REGEX : skips lines of file before reading | |
| # fields AREF : reference to an array of field names, for files without headers | |
| # key NAME : if specified, you get a hash of hashes indexed by a key field | |
| # delim TEXT : if specified, the delimiter to use (otherwise autodetect) | |
| # quot TEXT : if specified, the quot char to use (otherwise autodetect) | |
| # multi BOOL : if specified, each keyed entry is an arrayref of all matches | |
| # sub FUNCREF : if specified, each row is passed to the funcref | |
| sub xsvparse { | |
| my ($file, %op) = @_; | |
| my $in = new IO::File; | |
| my $fin = $file =~ /\.gz$/ ? "gunzip -c $file|" : $file; | |
| open ($in, $fin) || return undef; | |
| lcasehash(\%op); | |
| my $l1; | |
| if (!$op{delim}) { | |
| $l1 = <$in>; | |
| $l1 =~ s/[\r\n]+$//o; | |
| } | |
| my $skip_me = sub { | |
| if ($op{skip}) { | |
| if (ref($op{skip}) eq 'Regexp') { | |
| do { | |
| $l1 = <$in>; $l1 =~ s/[\r\n]+$//o; | |
| } while ($l1 =~ $op{skip}); | |
| } else { | |
| my $skip = $op{skip}; | |
| while ($skip > 0) { | |
| $l1 = <$in>; $l1 =~ s/[\r\n]+$//o; | |
| --$skip; | |
| } | |
| } | |
| } | |
| }; | |
| &$skip_me(); | |
| return undef unless $l1 || $op{delim}; | |
| local $_; | |
| if (!$op{delim}) { | |
| my $n = 0; | |
| my $d; | |
| for ("\t", ",", "|") { | |
| my $c = $_ eq "\t" ? $l1 =~ tr/\t// : $_ eq "," ? $l1 =~ tr/,// : $l1 =~ tr/|//; | |
| if ($c > $n) {$d = $_; $n = $c}; | |
| } | |
| $op{delim} = $d if $n > 0; | |
| croak "xsvparse can't determine delimiter for $file" if $n == 0; | |
| if ($op{fields}) { | |
| close($in); | |
| open($in, $fin); | |
| &$skip_me(); | |
| } | |
| } | |
| my $rxm = undef; | |
| if (!$op{quot}) { | |
| my $n = $l1 =~ tr/\"//; | |
| if ($n > 1) { | |
| $op{quot} = '"'; | |
| } | |
| } | |
| if ($op{quot}) { | |
| # quoted csv parser... slower, but works fine | |
| $rxm = qr{$op{quot}([^$op{quot}\\]*(?:\\.[^$op{quot}\\]*)*)$op{quot}$op{delim}?|([^$op{delim}]+)$op{delim}?|$op{delim}}; | |
| } | |
| if ($op{fields}) { | |
| if (!ref($op{fields}) eq 'ARRAY') { | |
| die "list of field names should be an array"; | |
| } | |
| } else { | |
| if (!$op{fields}) { | |
| $l1 = <$in> if ! defined $l1; | |
| if ($rxm) { | |
| @{$op{fields}} = (); | |
| push(@{$op{fields}} ,$+) while $l1 =~ m/$rxm/gx; | |
| } else { | |
| $op{fields} = [split($op{delim}, $l1)]; | |
| map {s/\s+$//} @{$op{fields}}; | |
| } | |
| } | |
| if ($op{rnames}) { | |
| $op{fields} = ['<key>', @{$op{fields}}] unless $op{fields}->[0] eq ''; | |
| $op{fields}->[0]=$op{key}='<key>'; | |
| } | |
| if ($op{nocase}) { | |
| for (@{$op{fields}}) { | |
| $_=lc($_); | |
| } | |
| } | |
| } | |
| if ($op{nocase} && $op{key}) { | |
| $op{key} = lc($op{key}); | |
| } | |
| no warnings; | |
| my $r; | |
| my @d; | |
| my @d; | |
| while (<$in>) { | |
| s/[\r\n]+$//o; | |
| if ($rxm) { | |
| @d = (); | |
| push(@d ,$+) while m/$rxm/gx; | |
| push(@d,undef) if substr($_, -1,1) eq $op{delim}; | |
| } else { | |
| @d = split($op{delim}, $_); | |
| } | |
| my %h; | |
| for (my $i = 0; $i < @{$op{fields}}; ++$i) { | |
| $h{$op{fields}->[$i]}=$d[$i]; | |
| } | |
| if ($op{sub}) { | |
| last if &{$op{sub}}(\%h) eq 'last'; | |
| } elsif ($op{key}) { | |
| if ($op{nocase}) { | |
| if ($op{multi}) { | |
| push @{$r->{lc($h{$op{key}})}}, \%h; | |
| } else { | |
| $r->{lc($h{$op{key}})}=\%h; | |
| } | |
| } else { | |
| if ($op{multi}) { | |
| push @{$r->{$h{$op{key}}}}, \%h; | |
| } else { | |
| $r->{$h{$op{key}}}=\%h; | |
| } | |
| } | |
| } else { | |
| push @{$r}, \%h; | |
| } | |
| } | |
| return $r; | |
| } | |
| sub lcasehash { | |
| my $h = shift; | |
| for (keys(%$h)) { | |
| if (! $_ eq lc($_) ) { | |
| $h->{lc($_)}=$h->{$_}; | |
| delete $h->{$_}; | |
| } | |
| } | |
| } | |