Skip to content

Commit

Permalink
make flatfile-to-json load all attributes in a gff3 or bam file, part…
Browse files Browse the repository at this point in the history
…ially addressing #72
  • Loading branch information
rbuels committed May 29, 2012
1 parent dcad190 commit 26fbef7
Show file tree
Hide file tree
Showing 3 changed files with 57 additions and 29 deletions.
12 changes: 1 addition & 11 deletions lib/Bio/JBrowse/Cmd/FlatFileToJson.pm
Original file line number Diff line number Diff line change
Expand Up @@ -127,16 +127,6 @@ sub run {
$self->opt('sortMem'),
);

my @arrayrepr_classes = (
{
attributes => $feature_stream->featureHeaders,
isArrayAttr => { Subfeatures => 1 },
},
{
attributes => $feature_stream->subfeatureHeaders,
isArrayAttr => {},
},
);

# build a filtering subroutine for the features
my $filter = $self->make_feature_filter( $types );
Expand Down Expand Up @@ -174,7 +164,7 @@ sub run {
$track->finishLoad; #< does nothing if no load happening
$track->startLoad( $curChrom,
$self->opt('nclChunk'),
\@arrayrepr_classes,
$feature_stream->arrayReprClasses,
);
}
$totalMatches++;
Expand Down
54 changes: 46 additions & 8 deletions lib/Bio/JBrowse/Cmd/FlatFileToJson/FeatureStream.pm
Original file line number Diff line number Diff line change
Expand Up @@ -9,18 +9,31 @@ package Bio::JBrowse::Cmd::FlatFileToJson::FeatureStream;
use strict;
use warnings;

use Digest::MurmurHash ();

# Constructor. Blesses the caller-supplied key/value pairs into $class and
# sets class_count to 0; class_count is the counter _get_class uses to number
# the attribute classes it creates.
sub new {
my $class = shift;
# NOTE(review): the object built by this statement is discarded. It looks
# like the pre-change line of the diff, superseded by the assignment below;
# confirm before relying on it.
bless { @_ }, $class;

my $self = bless {
@_,
# listed after @_, so any caller-supplied class_count is overridden with 0
class_count => 0
}, $class;

return $self;
}

sub flatten_to_feature {
my ( $self, $f, $class_index ) = @_;
my @f = ( $class_index || 0,
@{$f}{qw{ start end strand source phase type score }},
(map $f->{attributes}{$_}[0], qw(ID Name)),
[ map $self->flatten_to_feature($_,1), @{$f->{child_features}} ],
my ( $self, $f ) = @_;
my $subfeatures = [ map $self->flatten_to_feature($_), @{$f->{child_features}} ];

my $class = $self->_get_class( $f );

my @f = ( $class->{index},
@{$f}{ $self->_fixed_fields },
(map $f->{attributes}{$_}[0], @{$class->{fields}}),
$subfeatures
);

# convert start to interbase and numify it
$f[1] -= 1;
# numify end
Expand All @@ -31,6 +44,22 @@ sub flatten_to_feature {
return \@f;
}

# Names of the per-feature fields that are always emitted, in output order.
# Returned as a plain list (not an array reference).
sub _fixed_fields {
    return (
        'start',  'end',  'strand', 'source',
        'phase',  'type', 'score',
    );
}

# Look up, or create on first sight, the array-representation class for the
# feature $f.
#
# A class is identified by the set of attribute names present in
# $f->{attributes}. Classes are cached in $self->{classes} and numbered in
# order of creation using $self->{class_count}.
#
# Returns a hashref:
#   index  => the class number
#   fields => arrayref of the fixed field names followed by the (sorted)
#             attribute names
sub _get_class {
    my ( $self, $f ) = @_;

    # Sort the names: keys() order is not guaranteed to be the same for two
    # hashes holding the same keys, so without the sort, features with an
    # identical attribute set could be given different classes (and a
    # different column order).  "|| {}" avoids autovivifying an attributes
    # hash on a feature that has none.
    my @attrs = sort keys %{ $f->{attributes} || {} };

    # Key the cache on the names themselves rather than on a hash of them, so
    # two different attribute sets can never share a class through a hash
    # collision.  NUL is used as the separator because joining on '-' made
    # ('a-b','c') and ('a','b-c') indistinguishable; attribute names are
    # assumed not to contain NUL.
    my $attr_fingerprint = join "\0", @attrs;

    # NOTE(review): 'fields' contains the fixed field names as well as the
    # attribute names, while flatten_to_feature emits the fixed fields via a
    # hash slice and then also maps over every entry of 'fields' as an
    # attribute name.  Verify that the emitted rows line up with the headers
    # produced by arrayReprClasses.
    return $self->{classes}{$attr_fingerprint} ||= {
        # post-increment: the first class created gets the counter's current
        # value (0 for an object fresh from new), the next one 1, and so on
        index  => $self->{class_count}++,
        fields => [ $self->_fixed_fields, @attrs ],
    };
}

sub flatten_to_name {
my ( $self, $f ) = @_;
my @namerec = (
Expand All @@ -44,9 +73,18 @@ sub flatten_to_name {
$namerec[4]--; #< to one-based
return \@namerec;
}
# Build the list of array-representation class descriptors for every class
# recorded in $self->{classes}, ordered by ascending class index.
#
# Each descriptor is a hashref with:
#   attributes  => the class's field names with the first letter upper-cased,
#                  followed by 'Subfeatures'
#   isArrayAttr => { Subfeatures => 1 }
#
# Returns an array reference of these descriptors.
sub arrayReprClasses {
    my ( $self ) = @_;

    my @ordered_classes =
        sort { $a->{index} <=> $b->{index} }
        values %{ $self->{classes} };

    my @descriptors;
    for my $class ( @ordered_classes ) {
        my @headers = map { ucfirst($_) } @{ $class->{fields} }, 'Subfeatures';
        push @descriptors, {
            attributes  => \@headers,
            isArrayAttr => { Subfeatures => 1 },
        };
    }

    return \@descriptors;
}

sub featureHeaders { [qw[ Start End Strand Source Phase Type Score Id Name Subfeatures ]] }
*subfeatureHeaders = \&featureHeaders;
sub startIndex { 1 }
sub endIndex { 2 }

Expand Down
20 changes: 10 additions & 10 deletions tests/perl_tests/flatfile-to-json.pl.t
Original file line number Diff line number Diff line change
Expand Up @@ -88,9 +88,9 @@ sub tempdir {
scalar( @{$cds_trackdata->{histograms}{stats}}),
'have stats for each precalculated hist' );

is( ref $cds_trackdata->{intervals}{nclist}[2][10], 'ARRAY', 'exonerate mRNA has its subfeatures' )
is( ref $cds_trackdata->{intervals}{nclist}[2][18], 'ARRAY', 'exonerate mRNA has its subfeatures' )
or diag explain $cds_trackdata;
is( scalar @{$cds_trackdata->{intervals}{nclist}[2][10]}, 5, 'exonerate mRNA has 5 subfeatures' );
is( scalar @{$cds_trackdata->{intervals}{nclist}[2][18]}, 5, 'exonerate mRNA has 5 subfeatures' );

my $tracklist = $read_json->('trackList.json');
is_deeply( $tracklist->{tracks}[1]{style},
Expand Down Expand Up @@ -127,9 +127,9 @@ sub tempdir {
my $read_json = sub { slurp( $tempdir, @_ ) };
my $cds_trackdata = $read_json->(qw( tracks AU_mRNA Group1.33 trackData.json ));
is( $cds_trackdata->{featureCount}, 1, 'got right feature count' ) or diag explain $cds_trackdata;
is( ref $cds_trackdata->{intervals}{nclist}[0][10], 'ARRAY', 'mRNA has its subfeatures' )
is( ref $cds_trackdata->{intervals}{nclist}[0][18], 'ARRAY', 'mRNA has its subfeatures' )
or diag explain $cds_trackdata;
is( scalar @{$cds_trackdata->{intervals}{nclist}[0][10]}, 7, 'mRNA has 7 subfeatures' );
is( scalar @{$cds_trackdata->{intervals}{nclist}[0][18]}, 7, 'mRNA has 7 subfeatures' );

my $tracklist = $read_json->( 'trackList.json' );
is( $tracklist->{tracks}[0]{key}, 'AU mRNA', 'got a tracklist' ) or diag explain $tracklist;
Expand All @@ -155,9 +155,9 @@ sub tempdir {
# check that we got the same data as before
$cds_trackdata = $read_json->(qw( tracks AU_mRNA Group1.33 trackData.json ));
is( $cds_trackdata->{featureCount}, 1, 'got right feature count' ) or diag explain $cds_trackdata;
is( ref $cds_trackdata->{intervals}{nclist}[0][10], 'ARRAY', 'mRNA has its subfeatures' )
is( ref $cds_trackdata->{intervals}{nclist}[0][18], 'ARRAY', 'mRNA has its subfeatures' )
or diag explain $cds_trackdata;
is( scalar @{$cds_trackdata->{intervals}{nclist}[0][10]}, 7, 'mRNA has 7 subfeatures' );
is( scalar @{$cds_trackdata->{intervals}{nclist}[0][18]}, 7, 'mRNA has 7 subfeatures' );
}

{ #diag "running on single_au9_gene.gff3, testing that we emit 2 levels of subfeatures";
Expand All @@ -182,12 +182,12 @@ sub tempdir {
my $read_json = sub { slurp( $tempdir, @_ ) };
my $cds_trackdata = $read_json->(qw( tracks AU_mRNA Group1.33 trackData.json ));
is( $cds_trackdata->{featureCount}, 1, 'got right feature count' ) or diag explain $cds_trackdata;
is( ref $cds_trackdata->{intervals}{nclist}[0][10], 'ARRAY', 'gene has its subfeatures' )
is( ref $cds_trackdata->{intervals}{nclist}[0][17], 'ARRAY', 'gene has its subfeatures' )
or diag explain $cds_trackdata;
is( scalar @{$cds_trackdata->{intervals}{nclist}[0][10]}, 1, 'gene has 1 subfeature' );
is( ref $cds_trackdata->{intervals}{nclist}[0][10][0][10], 'ARRAY', 'mRNA has its subfeatures' )
is( scalar @{$cds_trackdata->{intervals}{nclist}[0][17]}, 1, 'gene has 1 subfeature' );
is( ref $cds_trackdata->{intervals}{nclist}[0][17][0][18], 'ARRAY', 'mRNA has its subfeatures' )
or diag explain $cds_trackdata;
is( scalar @{$cds_trackdata->{intervals}{nclist}[0][10][0][10]}, 7, 'mRNA has 7 subfeatures' );
is( scalar @{$cds_trackdata->{intervals}{nclist}[0][17][0][18]}, 7, 'mRNA has 7 subfeatures' );
}

for my $testfile ( "tests/data/au9_scaffold_subset.gff3", "tests/data/au9_scaffold_subset_sync.gff3" ) {
Expand Down

0 comments on commit 26fbef7

Please sign in to comment.