Skip to content

Commit

Permalink
refactor flat-file reformatting to remove redundant and empty tags
Browse files Browse the repository at this point in the history
  • Loading branch information
rbuels committed Nov 19, 2012
1 parent 5c74c8e commit 13a6ce9
Show file tree
Hide file tree
Showing 8 changed files with 220 additions and 54 deletions.
2 changes: 1 addition & 1 deletion src/perl5/Bio/JBrowse/Cmd/FlatFileToJson.pm
Expand Up @@ -207,7 +207,7 @@ sub _find_passing_features {
# if this feature passes, we're done, just return it
? ( $feature )
# otherwise, look for passing features in its subfeatures
: _find_passing_features( $pass_feature, @{$feature->{child_features}} );
: _find_passing_features( $pass_feature, @{$feature->{subfeatures}} );
} @_;
}

Expand Down
76 changes: 52 additions & 24 deletions src/perl5/Bio/JBrowse/FeatureStream.pm
Expand Up @@ -9,7 +9,7 @@ package Bio::JBrowse::FeatureStream;
use strict;
use warnings;

use Digest::MurmurHash ();
use List::MoreUtils 'uniq';

sub new {
my $class = shift;
Expand All @@ -24,16 +24,24 @@ sub new {

sub flatten_to_feature {
my ( $self, $f ) = @_;
my $subfeatures = [ map $self->flatten_to_feature($_), @{$f->{child_features}} ];

my $class = $self->_get_class( $f );

my @f = ( $class->{index},
@{$f}{ $self->_fixed_fields },
(map $f->{attributes}{$_}[0], @{$class->{variable_fields}}),
$subfeatures
@{$f}{ @{$class->{fields}} }
);

for my $subfeature_field (qw( subfeatures derived_features )) {
if( my $sfi = $class->{field_idx}{ $subfeature_field } ) {
$f[ $sfi+1 ] = [
map {
$self->flatten_to_feature($_)
} @{$f[$sfi+1]}
];
}
}
# use Data::Dump 'dump';
# print dump($_)."\n" for \@f, $class;

# convert start to interbase and numify it
$f[1] -= 1;
# numify end
Expand All @@ -45,33 +53,36 @@ sub flatten_to_feature {
return \@f;
}

sub _fixed_fields {
return qw{ start end strand source phase type score };
}

my %skip_field = map { $_ => 1 } qw( start end strand );
sub _get_class {
my ( $self, $f ) = @_;

my @attrs = keys %{$f->{attributes}};
my $attr_fingerprint = Digest::MurmurHash::murmur_hash( join '-', @attrs );

return $self->{classes}{$attr_fingerprint} ||= {
index => $self->{class_count}++, # the classes start from 1. so what.
fields => [ $self->_fixed_fields, @attrs],
fixed_fields => [ $self->_fixed_fields ],
variable_fields => \@attrs,
my @attrs = keys %$f;
my $attr_fingerprint = join '-', @attrs;

return $self->{classes}{$attr_fingerprint} ||= do {
my @fields = ( 'start', 'end', 'strand', ( grep !$skip_field{$_}, @attrs ) );
my $i = 0;
{
index => $self->{class_count}++, # the classes start from 1. so what.
fields => \@fields,
field_idx => { map { $_ => $i++ } @fields },
# assumes that if a field is an array for one feature, it will be for all of them
array_fields => [ grep ref($f->{$_}) eq 'ARRAY', @attrs ]
}
};
}

sub flatten_to_name {
my ( $self, $f ) = @_;
my @nameattrs = grep /^(name|id|alias)\d*$/, keys %$f;
my @namerec = (
[ grep defined, @{ $f->{attributes}{Name} || $f->{attributes}{ID} || [] }, @{$f->{attributes}{Alias}} ],
[ grep defined, @{$f}{@nameattrs} ],
$self->{track_label},
$f->{attributes}{Name}[0],
$f->{seq_id},
$f->{name},
$f->{seq_id} || die,
(map $_+0, @{$f}{'start','end'}),
$f->{attributes}{ID}[0],
$f->{id}
);
$namerec[4]--; #< to one-based
return \@namerec;
Expand All @@ -80,8 +91,8 @@ sub arrayReprClasses {
my ( $self ) = @_;
return [
map {
attributes => [ map ucfirst, @{$_->{fields}}, 'Subfeatures' ],
isArrayAttr => { Subfeatures => 1 }
attributes => [ map ucfirst, @{$_->{fields}} ],
isArrayAttr => { map { ucfirst($_) => 1 } @{$_->{array_fields}} },
},
sort { $a->{index} <=> $b->{index} }
values %{ $self->{classes} }
Expand All @@ -91,4 +102,21 @@ sub arrayReprClasses {
sub startIndex { 1 }
sub endIndex { 2 }


# given a hashref like { tagname => [ value1, value2 ], ... }
# flatten it to numbered tagnames like { tagname => value1, tagname2 => value2 }
sub _flatten_multivalues {
my ( $self, $h ) = @_;
my %flattened;

for my $key ( keys %$h ) {
my $v = $h->{$key};
for( my $i = 0; $i < @$v; $i++ ) {
$flattened{ $key.($i ? $i+1 : '')} = $v->[$i];
}
}

return \%flattened;
}

1;
45 changes: 33 additions & 12 deletions src/perl5/Bio/JBrowse/FeatureStream/BioPerl.pm
Expand Up @@ -10,6 +10,8 @@ use strict;
use warnings;
use base 'Bio::JBrowse::FeatureStream';

use List::MoreUtils 'uniq';

sub next_items {
my ( $self ) = @_;
return map $self->_bp_to_hashref( $_ ),
Expand All @@ -22,30 +24,49 @@ sub _bp_to_hashref {
no warnings 'uninitialized';

my %h;
@h{qw{ seq_id start end strand source phase type child_features }} =
@h{qw{ seq_id start end strand source phase type }} =
( $f->seq_id,
$f->start,
$f->end,
$f->strand,
$f->source_tag,
{0=>0,1=>1,2=>2}->{$f->phase},
$f->primary_tag || undef,
[ map $self->_bp_to_hashref($_), $f->get_SeqFeatures ],
);
for(qw( seq_id start end strand source type )) {
$h{$_} = undef if $h{$_} eq '.';
if( $h{$_} eq '.' ) {
delete $h{$_};
}
}
for ( keys %h ) {
if( ! defined $h{$_} ) {
delete $h{$_};
} else {
$h{$_} = [ $h{$_} ];
}
}
my @subfeatures = $f->get_SeqFeatures;
if( @subfeatures ) {
$h{subfeatures} = [[ map $self->_bp_to_hashref($_), @subfeatures ]];
}

for my $tag ( $f->get_all_tags ) {
my $lctag = lc $tag;
push @{ $h{ $lctag } ||= [] }, $f->get_tag_values($tag);
}

for ( keys %h ) {
$h{$_} = [ uniq grep { defined && ($_ ne '.') } @{$h{$_}} ];
unless( @{$h{$_}} ) {
delete $h{$_};
}
}
$h{attributes} = {
map {
my $t = $_;
$t => [ grep $_ ne '.', $f->get_tag_values($t) ]
} $f->get_all_tags
};

if( ! $h{attributes}{Name} and defined( my $label = $self->_label( $f ) )) {
$h{attributes}{Name} = [ $label ];
if( ! $h{name} and defined( my $label = $self->_label( $f ) )) {
$h{name} = [ $label ];
}
return \%h;

return $self->_flatten_multivalues( \%h );
};

sub _label {
Expand Down
51 changes: 49 additions & 2 deletions src/perl5/Bio/JBrowse/FeatureStream/GFF3_LowLevel.pm
Expand Up @@ -12,10 +12,57 @@ use warnings;
use base 'Bio::JBrowse::FeatureStream';

sub next_items {
while ( my $i = $_[0]->{parser}->next_item ) {
return $i if $i->{child_features};
my ( $self ) = @_;
while ( my $i = $self->{parser}->next_item ) {
return $self->_to_hashref( $i ) if $i->{child_features};
}
return;
}

sub _to_hashref {
my ( $self, $f ) = @_;
# use Data::Dump 'dump';
# if( ref $f ne 'HASH' ) {
# Carp::confess( dump $f );
# }
$f = { %$f };
my $a = delete $f->{attributes};
my %h;
for my $key ( keys %$f) {
my $lck = lc $key;
my $v = $f->{$key};
if( defined $v && ( ref($v) ne 'ARRAY' || @$v ) ) {
unshift @{ $h{ $lck } ||= [] }, $v;
}
}
# rename child_features to subfeatures
if( $h{child_features} ) {
$h{subfeatures} = [
map {
[ map $self->_to_hashref( $_ ), @$_ ]
} @{delete $h{child_features}}
];
}
if( $h{derived_features} ) {
$h{derived_features} = [
map {
[ map $self->_to_hashref( $_ ), @$_ ]
} @{$h{derived_features}}
];
}

my %skip_attributes = ( Parent => 1 );
for my $key ( sort keys %{ $a || {} } ) {
my $lck = lc $key;
if( !$skip_attributes{$key} ) {
push @{ $h{$lck} ||= [] }, @{$a->{$key}};
}
}

my $flat = $self->_flatten_multivalues( \%h );
return $flat;
}



1;
20 changes: 20 additions & 0 deletions tests/data/foo.bed
@@ -0,0 +1,20 @@
chr10 102746607 102747330 chr10_100010101 1 +
chr10 102756707 102757063 chr10_100020212 1 +
chr10 102757103 102757628 chr10_100020608 1 +
chr10 102758816 102759393 chr10_100022332 1 +
chr10 102759402 102759798 chr10_100023087 1 +
chr10 102763502 102763900 chr10_100027151 1 +
chr10 102778574 102779028 chr10_100042744 1 +
chr10 102790835 102791548 chr10_100059467 1 +
chr10 102807700 102808458 chr10_100076094 1 +
chr10 102809834 102810339 chr10_100078435 1 +
chr10 102820652 102821466 chr10_100089212 1 +
chr10 102821565 102821902 chr10_100090127 1 +
chr10 102822305 102822874 chr10_100090866 1 +
chr10 102825930 102826474 chr10_100094501 1 +
chr10 102826938 102827286 chr10_100095501 1 +
chr10 102827366 102827778 chr10_100095928 1 +
chr10 102883063 102883551 chr10_100154436 1 +
chr10 102891173 102891794 chr10_100162617 1 +
chr10 102893599 102894301 chr10_100165069 1 +
chr10 102905801 102906693 chr10_100177356 1 +
2 changes: 2 additions & 0 deletions tests/data/redundant.gff3
@@ -0,0 +1,2 @@
##gff-version 3
Group1.36 AU9 gene 176975 180744 0.84 + . ID=au9.g1002;Name=au9.g1002,foobar;score=20;start=99839;Alias=noggin
49 changes: 49 additions & 0 deletions tests/perl_tests/featurestream.t
@@ -0,0 +1,49 @@
use strict;
use warnings;

use JBlibs;

use Test::More;
use File::Temp;

use Bio::JBrowse::FeatureStream::GFF3_LowLevel;

sub tempdir {
my $tempdir = File::Temp->newdir( CLEANUP => $ENV{KEEP_ALL} ? 0 : 1 );
#diag "using temp dir $tempdir";
return $tempdir;
}


{
require Bio::GFF3::LowLevel::Parser;
my $p = Bio::GFF3::LowLevel::Parser->new( 'tests/data/redundant.gff3' );

my $s = Bio::JBrowse::FeatureStream::GFF3_LowLevel->new(
parser => $p,
track_label => 'faketracklabel'
);

my @i = $s->next_items;
is_deeply( \@i,
[
{
'alias' => 'noggin',
'end' => '180744',
'id' => 'au9.g1002',
'name' => 'au9.g1002',
'name2' => 'foobar',
'score' => '0.84',
'score2' => '20',
'seq_id' => 'Group1.36',
'source' => 'AU9',
'start' => '176975',
'start2' => '99839',
'strand' => '+',
'type' => 'gene'
}
]
) or diag explain \@i;
}

done_testing;

0 comments on commit 13a6ce9

Please sign in to comment.