Permalink
Switch branches/tags
Nothing to show
Find file
Fetching contributors…
Cannot retrieve contributors at this time
480 lines (396 sloc) 10.7 KB
package Chrdex;
# Licensed via the "Artistic License"
# See: http://dev.perl.org/licenses/artistic.html
# Copyright 2011, Expression Analysis
# Author: Erik Aronesty <earonesty@expressionanalysis.com>
# "Let me know if it's useful"
#
# EXMAPLE:
# $x = Chrdex->new("CCDS_Exome_annot.txt", chr=>2, beg=>5, end=>6, skip=>1);
# $x->search(1, 153432255);
# TODO:
# work with ranges...should be easy
use Inline 'C';
use strict;
use warnings::register;
use Storable qw(store retrieve);
use Data::Dumper;
use locale; ##added by vjw to control case of reference bases
our $VERSION = '1.2.15'; # major = object interface is different, minor = new features, release=fixes/performance improve
my $FILEVER = 4; # only increment this if existing files won't work with the new version
my $tmpb;
sub new {
my ($class, $path, %opts) = @_;
if (ref($class)) {
$class = ref($class);
}
$opts{delim} = "\t" if ! $opts{delim};
$opts{skip} = 0 if ! $opts{skip};
$opts{chr} = 0 if ! $opts{chr};
$opts{beg} = 1 if ! $opts{beg};
$opts{end} = 2 if ! $opts{end};
$opts{ver} = $FILEVER;
if (! -s $path) { # be a little more careful about this one
if (! -s $path || -d $path) {
die "Can't open $path.\n";
}
}
# location of data store
my $annob = $path;
$annob =~ s/([^\/]+)$/\.$1/;
$annob = "$annob.chrdex";
$annob = $opts{index_path} if $opts{index_path};
my $ref;
my $mt = (stat($path))[9];
# if index is new
if (!$opts{force} && (stat($annob))[9] > $mt) {
$ref = eval {retrieve $annob};
# if arguments were different, then clear ref
for (qw(delim skip chr beg end ver byref)) {
last if !$ref;
$ref = undef if !($ref->{_opts}->{$_} eq $opts{$_});
}
if ($ref) {
# if begin != end, then type is range
if ($ref->{_opts}->{beg} != $ref->{_opts}->{end}) {
eval{chrdex_check($ref)};
if ($@) {
$ref = undef;
}
}
}
}
if (!$ref) {
my %locs;
$tmpb = "$annob.$$";
open( IN, $path ) or die "Can't open $path: $!\n";
my $skip = $opts{skip};
while ($skip > 0) { scalar <IN>; --$skip };
my $pos = 0;
$pos = tell IN if $opts{byref};
while(<IN>) {
my ($chr, $beg, $end);
$_ =~ s/\s+$//;
my @data = split /\t/;
$chr = $data[$opts{chr}];
$beg = $data[$opts{beg}];
$end = $data[$opts{end}];
if (!(($beg+0) eq $beg)) {
die "Invalid data in $path at line $., expected a number, got '$beg'\n";
}
$chr=~s/^chr//i;
$_ = $pos if $opts{byref};
# here's where you put the annotation info
if ($opts{beg} == $opts{end}) {
if ($locs{"$chr:$beg"}) {
push @{$locs{"$chr:$beg"}}, $_;;
} else {
$locs{"$chr:$beg"} = [$_];
}
} else {
push @{$locs{$chr}}, [$beg+0, $end+0, [$_]];
}
$pos = tell IN if $opts{byref};
}
close IN;
if ($opts{beg} == $opts{end}) {
goto DONE;
}
# sort & cache annotation, deal with overlaps nicely
my $i;
for my $chr (keys(%locs)) {
my $arr = $locs{$chr};
@{$locs{$chr}} = sort {$a->[0]-$b->[0]} @{$locs{$chr}};
for ($i=0;$i<$#{$arr};++$i) {
next unless $arr->[$i+1]->[0]; # empty? skip
if ($arr->[$i]->[1] >= $arr->[$i+1]->[0]) { # if i overlap the next one
# warn 1, Dumper($arr->[$i], $arr->[$i+1], $arr->[$i+2]);
# frag after next
my $new_st = $arr->[$i+1]->[1]+1;
my $new_en = $arr->[$i]->[1];
my $new_ro = $arr->[$i]->[2];
# TODO: store as array... string folding will save lots of space when there are many overlaps
# but hasn't been a problem so far
if ($arr->[$i]->[1] < $arr->[$i+1]->[1]) {
# overlap next
$new_st = $arr->[$i]->[1] + 1;
$new_en = $arr->[$i+1]->[1];
$new_ro = [@{$arr->[$i+1]->[2]}];
$arr->[$i+1]->[1] = $arr->[$i]->[1];
push @{$arr->[$i+1]->[2]}, @{$arr->[$i]->[2]};
} else {
push @{$arr->[$i+1]->[2]}, @{$arr->[$i]->[2]};
}
# shorten my end to less than the next's start
$arr->[$i]->[1] = $arr->[$i+1]->[0]-1;
if ($new_en >= $new_st) {
# warn "NEW: $new_st $new_en $new_ro\n";
# put the fragment where it belongs
my $j=$i+2;
while ($j<=$#{$arr} & $new_st > $arr->[$j]->[0]) {
++$j;
}
splice(@{$arr}, $j, 0, [$new_st, $new_en, $new_ro]);
}
if ($arr->[$i]->[1] < $arr->[$i]->[0]) {
splice(@{$arr}, $i, 1);
--$i;
}
# warn 2, Dumper($arr->[$i], $arr->[$i+1], $arr->[$i+2]);
}
}
}
DONE:
$locs{_opts} = \%opts;
$ref = \%locs;
store \%locs, "$tmpb";
rename "$tmpb", "$annob";
}
# stuff these into the top level, since tests showed it was significantly more expensive to doubly-reference
$ref->{_type}='C';
$ref->{_type}='I' if ($opts{beg} == $opts{end});
if (($ref->{_type} eq 'C')) {
chrdex_check($ref);
}
if ($opts{byref}) {
# only storing pointers to file, not whole record
require IO::File;
$ref->{_byref}=1;
$ref->{_refh} = new IO::File;
open($ref->{_refh}, $path) || die "Can't open $path\n";
}
bless $ref, $class;
return $ref;
}
END {
unlink("$tmpb.chrdex");
}
sub search {
return join "\n", query(@_);
}
sub query {
my ($self, $chr, $loc, $loc2) = @_;
$chr=~s/^chr//io;
my $type = $self->{_type};
my $list;
if ($type eq 'C') {
if ($loc2) {
$list = chrdex_search_range($self, $chr, $loc, $loc2);
return () if ! defined $list;
my $prev;
for (my $i = 0; $i < @$list; ++$i) {
if ($prev eq $list->[$i]) {
splice @$list, $i, 1;
--$i;
}
$prev = $list->[$i];
}
return @$list if !($self->{_byref});
} else {
$list = chrdex_search($self, $chr, $loc);
return () if ! defined $list;
return @$list if !($self->{_byref});
}
} elsif ($type eq 'I') {
if ($loc2) {
# if only this wasn't a hash.... sheesh
my %hv;
for (my $i=$loc;$i<$loc2;++$i) {
$list = $self->{"$chr:$loc"};
next unless defined $list;
for (@$list) {
$hv{$_}=1;
}
}
$list = [keys(%hv)];
} else {
$list = $self->{"$chr:$loc"};
return () if ! defined $list;
}
}
if ($self->{_byref}) {
for (@$list) {
seek $self->{_refh}, $_, 0;
$_=$self->{_refh}->getline();
chomp $_;
}
}
return @$list;
}
1;
__DATA__
__C__
bool get_sten(AV *arr, int i, int *st, int*en);
SV * av_fetch_2(AV *arr, int i, int j);
int chrdex_search_n(AV *arr, SV *schr, SV* sloc);
void chrdex_search(SV *self, SV *schr, SV* sloc) {
SV *roi;
AV *arr;
SV **pav;
HV *map= (HV*) SvRV(self);
char *chr = SvPV_nolen(schr);
int loc = SvIV(sloc);
pav = hv_fetch(map, chr, strlen(chr), 0);
if (!pav)
return;
arr = (AV*) SvRV(*pav);
int i = chrdex_search_n(arr, schr, sloc);
if (i >=0) {
roi = av_fetch_2(arr, i, 2);
if (!roi)
return;
Inline_Stack_Vars;
Inline_Stack_Reset;
Inline_Stack_Push(sv_2mortal(newSVsv(roi)));
Inline_Stack_Done;
Inline_Stack_Return(1);
}
return;
}
void chrdex_search_range(SV *self, SV *schr, SV* sloc, SV* eloc) {
SV *roi = NULL;
AV *arr;
SV **pav;
HV *map= (HV*) SvRV(self);
char *chr = SvPV_nolen(schr);
int isloc = SvIV(sloc);
int ieloc = SvIV(eloc);
pav = hv_fetch(map, chr, strlen(chr), 0);
if (!pav)
return;
arr = (AV*) SvRV(*pav);
int i = chrdex_search_n(arr, schr, sloc);
int j = chrdex_search_n(arr, schr, eloc);
if (!i) i=j;
if (!j) j=i;
if (i >=0) {
int x;
char *rx;
AV *rav=NULL;
for (x=i; x<=j; ++x) {
int st, en;
if (get_sten(arr, x, &st, &en)) {
if (ieloc >= st && isloc <= en) {
SV* ret = av_fetch_2(arr, x, 2);
if (ret) {
int z;
if (!rav) rav = newAV();
for (z=0;z<=av_len(SvRV(ret));++z) {
SV ** s = av_fetch(SvRV(ret), z, 0);
if (s) {
SvREFCNT_inc(*s);
av_push(rav, *s);
}
}
}
}
}
}
if (!rav)
return;
Inline_Stack_Vars;
Inline_Stack_Reset;
Inline_Stack_Push(newRV_noinc((SV*)rav));
Inline_Stack_Push(roi);
Inline_Stack_Done;
Inline_Stack_Return(1);
}
return;
}
int chrdex_search_n(AV *arr, SV *schr, SV* sloc) {
int b=0, t, i, st, en;
char *chr = SvPV_nolen(schr);
int loc = SvIV(sloc);
b = 0;
t = av_len(arr);
if (t <= b) {
get_sten(arr, i=0, &st, &en);
} else {
while (t > b) {
i = (t+b)/2;
if (!get_sten(arr, i, &st, &en))
return;
if ((i == b) || (i == t))
break;
if (loc > en) {
b = i;
} else if (loc < st) {
t = i;
} else {
break;
}
}
}
// printf("chr:%s loc: %d, st: %d en: %d i: %d t: %d b: %d\n", chr, loc, st, en, i, t, b);
if (loc < st) {
--i;
if (i < 0 || !get_sten(arr, i, &st, &en))
return;
} else if (loc > en) {
++i;
if (!get_sten(arr, i, &st, &en))
return;
}
if (loc >= st && loc <= en) {
return i;
}
return -1;
}
// doubly indexed array ... fetch 1, fetch 2
SV * av_fetch_2(AV *arr, int i, int j) {
SV **pav;
if (!(pav = av_fetch(arr,i,0)))
return &PL_sv_undef;
arr = (AV*) SvRV(*pav);
if (!(pav = av_fetch(arr,j,0)))
return &PL_sv_undef;
return *pav;
}
bool get_sten(AV *arr, int i, int *st, int*en) {
SV **pav;
if (!(pav = av_fetch(arr,i,0)))
return 0;
arr = (AV*) SvRV(*pav);
if (!(pav = av_fetch(arr,0,0)))
return 0;
*st = SvIV(*pav);
if (!(pav = av_fetch(arr,1,0)))
return 0;
*en = SvIV(*pav);
return 1;
}
void chrdex_check(SV *annoR) {
HE *he; // hash entry
SV *ent; // hash value
HV *hv; // annotation hash table
AV *av; // array in hash
SV **v; // entry in array
char *key;
int len;
if (!SvRV(annoR))
croak("annotation hash must be a reference");
annoR = SvRV(annoR);
if (SvTYPE(annoR) != SVt_PVHV)
croak("annotation array must be a hash ref");
hv = (HV *) annoR;
if (!hv_iterinit(hv)) {
croak("empty hash, fix that in perl");
}
he = hv_iternext(hv);
ent = hv_iternextsv(hv, &key, &len);
while (key && *key == '_') {
ent = hv_iternextsv(hv, &key, &len);
}
if ( SvTYPE(ent) != SVt_RV || (SvTYPE(SvRV(ent)) != SVt_PVAV) ) {
croak("each entry in the annotation hash must be a reference to an array");
}
av = (AV*) SvRV(ent);
v = av_fetch(av, 0, 0);
if (!v) {
croak("no empty annotation arrays, please");
}
if ( SvTYPE(*v) != SVt_RV || (SvTYPE(SvRV(*v)) != SVt_PVAV) ) {
croak("each entry in the array should contain a start and end region");
}
// ok.... reference should be safe enough not to segfault later
}