Skip to content

Commit

Permalink
refactored flatfile-to-json.pl to use Bio::GFF3::LowLevel::Parser,
Browse files Browse the repository at this point in the history
which is not broken for files with sync marks
  • Loading branch information
rbuels committed Mar 22, 2012
1 parent 229478e commit 5e0feda
Show file tree
Hide file tree
Showing 7 changed files with 465 additions and 291 deletions.
288 changes: 8 additions & 280 deletions bin/flatfile-to-json.pl
@@ -1,5 +1,13 @@
#!/usr/bin/env perl

use FindBin qw($RealBin);
use lib "$RealBin/../lib";
use Script::FlatFileToJson;

exit Script::FlatFileToJson->new(@ARGV)->run;

__END__
=head1 NAME
flatfile-to-json.pl - format data into JBrowse JSON format from an annotation file
Expand Down Expand Up @@ -146,283 +154,3 @@ =head2 BED-SPECIFIC
=back
=cut

use strict;
use warnings;

use FindBin qw($Bin);

use Getopt::Long;
use Pod::Usage;

use Bio::FeatureIO;

use lib "$Bin/../lib";
use ArrayRepr;
use GenomeDB;
use BioperlFlattener;
use ExternalSorter;
use JSON 2;

my ($gff, $bed, $bam, $trackLabel, $key,
$urlTemplate, $subfeatureClasses, $arrowheadClass,
$clientConfig, $thinType, $thickType,
$types, $nclChunk);
$types = [];
my $autocomplete = "none";
my $outdir = "data";
my $cssClass = "feature";
my ($getType, $getPhase, $getSubs, $getLabel, $compress) = (0, 0, 0, 0, 0);
my $sortMem = 1024 * 1024 * 512;
my $help;
GetOptions("gff=s" => \$gff,
"bed=s" => \$bed,
"bam=s" => \$bam,
"out=s" => \$outdir,
"tracklabel|trackLabel=s" => \$trackLabel,
"key=s" => \$key,
"cssClass=s" => \$cssClass,
"autocomplete=s" => \$autocomplete,
"getType" => \$getType,
"getPhase" => \$getPhase,
"getSubs|getSubfeatures" => \$getSubs,
"getLabel" => \$getLabel,
"urltemplate=s" => \$urlTemplate,
"arrowheadClass=s" => \$arrowheadClass,
"subfeatureClasses=s" => \$subfeatureClasses,
"clientConfig=s" => \$clientConfig,
"thinType=s" => \$thinType,
"thickType=s" => \$thickType,
"type=s@" => \$types,
"nclChunk=i" => \$nclChunk,
"compress" => \$compress,
"sortMem=i" =>\$sortMem,
"help|h|?" => \$help,
);
@$types = split /,/, join( ',', @$types);

pod2usage( -verbose => 2 ) if $help;

my $gdb = GenomeDB->new( $outdir );

pod2usage( "Must provide a --tracklabel parameter." ) unless defined $trackLabel;
pod2usage( "You must supply either a --gff or --bed parameter." )
unless defined $gff || defined $bed || defined $bam;

$bam and die "BAM support has been moved to a separate program: bam-to-json.pl\n";

if (!defined($nclChunk)) {
# default chunk size is 50KiB
$nclChunk = 50000;
# $nclChunk is the uncompressed size, so we can make it bigger if
# we're compressing
$nclChunk *= 4 if $compress;
}

my $idSub = sub {
return $_[0]->load_id if $_[0]->can('load_id') && defined $_[0]->load_id;
return $_[0]->can('primary_id') ? $_[0]->primary_id : $_[0]->id;
};

my $stream;
my $labelStyle = 1;
if ($gff) {
my $io = Bio::FeatureIO->new(
-format => 'gff',
-version => '3',
-file => $gff,
);
$stream = sub { $io->next_feature_group };
} elsif ($bed) {
my $io = Bio::FeatureIO->new(
-format => 'bed',
-file => $bed,
($thinType ? ("-thin_type" => $thinType) : ()),
($thickType ? ("-thick_type" => $thickType) : ()),
);
$labelStyle = sub {
#label sub for features returned by Bio::FeatureIO::bed
return $_[0]->name;
};
$stream = sub { $io->next_feature };
} else {
die "Please specify --gff or --bed.\n";
}

$clientConfig = JSON::from_json( $clientConfig )
if defined $clientConfig;

$subfeatureClasses = JSON::from_json($subfeatureClasses)
if defined $subfeatureClasses;

my %config = (
autocomplete => $autocomplete,
type => $getType || @$types ? 1 : 0,
phase => $getPhase,
subfeatures => $getSubs,
style => {
%{ $clientConfig || {} },
className => $cssClass,
( $arrowheadClass ? ( arrowheadClass => $arrowheadClass ) : () ),
( $subfeatureClasses ? ( subfeatureClasses => $subfeatureClasses ) : () ),
},
key => defined($key) ? $key : $trackLabel,
compress => $compress,
urlTemplate => $urlTemplate,
);


my $flattener = BioperlFlattener->new(
$trackLabel,
{
"idSub" => $idSub,
"label" => ($getLabel || ($autocomplete ne "none"))
? $labelStyle : 0,
%config,
},
[], [] );

# The ExternalSorter will get [chrom, [start, end, ...]] arrays from
# the flattener
my $sorter = ExternalSorter->new(
do {
my $startIndex = BioperlFlattener->startIndex;
my $endIndex = BioperlFlattener->endIndex;
sub ($$) {
$_[0]->[0] cmp $_[1]->[0]
||
$_[0]->[1]->[$startIndex] <=> $_[1]->[1]->[$startIndex]
||
$_[1]->[1]->[$endIndex] <=> $_[0]->[1]->[$endIndex];
}
},
$sortMem
);

my @arrayrepr_classes = (
{
attributes => $flattener->featureHeaders,
isArrayAttr => { Subfeatures => 1 },
},
{
attributes => $flattener->subfeatureHeaders,
isArrayAttr => {},
},
);

# build a filtering subroutine for the features
my $filter = make_feature_filter( $types );

my %featureCounts;
while ( my @feats = $filter->( $stream->() ) ) {
for my $feat ( @feats ) {
my $chrom = ref $feat->seq_id ? $feat->seq_id->value : $feat->seq_id;
$featureCounts{$chrom} += 1;

my $row = [ $chrom,
$flattener->flatten_to_feature( $feat ),
$flattener->flatten_to_name( $feat, $chrom ),
];
$sorter->add( $row );
}
}
$sorter->finish();

################################

my $track = $gdb->getTrack( $trackLabel )
|| $gdb->createFeatureTrack( $trackLabel,
\%config,
$config{key},
);

my $curChrom = 'NONE YET';
my $totalMatches = 0;
while( my $feat = $sorter->get ) {

unless( $curChrom eq $feat->[0] ) {
$curChrom = $feat->[0];
$track->finishLoad; #< does nothing if no load happening
$track->startLoad( $curChrom,
$nclChunk,
\@arrayrepr_classes,
);
}
$totalMatches++;
$track->addSorted( $feat->[1] );

# load the feature's name record into the track if necessary
if( my $namerec = $feat->[2] ) {
$track->nameHandler->addName( $namerec );
}
}

$gdb->writeTrackEntry( $track );

# If no features are found, check for mistakes in user input
if( !$totalMatches && defined $types ) {
warn "WARNING: No matching features found for @$types\n";
}


################

sub make_feature_filter {

my @filters;

# add a filter for type:source if --type was specified
if( $types && @$types ) {
my @type_regexes = map {
my $t = $_;
$t .= ":.*" unless $t =~ /:/;
qr/^$t$/
} @$types;

push @filters, sub {
my ($f) = @_;
my $type = $f->primary_tag
or return 0;
my $source = $f->source_tag;
my $t_s = "$type:$source";
for( @type_regexes ) {
return 1 if $t_s =~ $_;
}
return 0;
};
}

# if no filtering, just return a pass-through now.
return sub { @_ } unless @filters;

# make a sub that tells whether a single feature passes
my $pass_feature = sub {
my ($f) = @_;
$_->($f) || return 0 for @filters;
return 1;
};

# Apply this filtering rule through the whole feature hierarchy,
# returning features that pass. If a given feature passes, return
# it *and* all of its subfeatures, with no further filtering
# applied to the subfeatures. If a given feature does NOT pass,
# search its subfeatures to see if they do.
return sub {
_find_passing_features( $pass_feature, @_ );
}
};

# given a subref that says whether an individual feature passes,
# return the LIST of features among the whole feature hierarchy that
# pass the filtering rule
sub _find_passing_features {
my $pass_feature = shift;
return map {
my $feature = $_;
$pass_feature->( $feature )
# if this feature passes, we're done, just return it
? ( $feature )
# otherwise, look for passing features in its subfeatures
: _find_passing_features( $pass_feature, $feature->get_SeqFeatures );
} @_;
}
56 changes: 56 additions & 0 deletions lib/Script.pm
@@ -0,0 +1,56 @@
package Script;
use strict;
use warnings;

use Getopt::Long ();
use Pod::Usage ();

=head1 NAME
Script - base class for a JBrowse command-line script
=head1 DESCRIPTION
This wheel is smaller than the ones on CPAN, but not really rounder.
=cut

sub new {
my $class = shift;
my $opts = $class->getopts(@_);
return bless { opt => $opts }, $class;
}

sub getopts {
my $class = shift;
my $opts = {
$class->option_defaults,
};
local @ARGV = @_;
Getopt::Long::GetOptions( $opts, $class->option_definitions );
Pod::Usage::pod2usage( -verbose => 2 ) if $opts->{help};
return $opts;
}

sub opt {
if( @_ > 2 ) {
return $_[0]->{opt}{$_[1]} = $_[2];
} else {
return $_[0]->{opt}{$_[1]}
}
}

#override me
sub option_defaults {
( )
}

#override me
sub option_definitions {
( "help|h|?" )
}

sub run {
}

1;

0 comments on commit 5e0feda

Please sign in to comment.