Skip to content

Commit

Permalink
Merge pull request #1222 from GMOD/arbitrary_fields_tabix
Browse files Browse the repository at this point in the history
Add arbitrary field indexing on gff3tabix tracks ref #1115
  • Loading branch information
rbuels committed Oct 9, 2018
2 parents 526a65b + 3a20779 commit f449965
Show file tree
Hide file tree
Showing 19 changed files with 92 additions and 11 deletions.
8 changes: 7 additions & 1 deletion docs/site/generate_names.md
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,12 @@ Note that generate-names.pl does not require any arguments. However, some option

View bin/generate-names.pl --help for more options. Note that if you are getting 404 errors for names/root.json then JBrowse is falling back to the legacy names store (and failing) so it is likely that you need to retry generate-names.

Note: by default, the Name, ID, and Alias fields are indexed by generate-names.pl, because those are what are specified to be the "names" of the features when you run flatfile-to-json.pl. However, if you run flatfile-to-json.pl with, for example, --nameAttributes "name,id,alias,gene_id", then it will also load the "gene_id" field as a name, and you can then re-run generate-names.pl so that the gene_id can be searched for.
## Indexing custom fields in GFF

By default, the Name, ID, and Alias fields are indexed by generate-names.pl.

If you want to index more or different custom fields, you can run flatfile-to-json.pl (not generate-names.pl!) with, for example, --nameAttributes "name,id,alias,gene_id". It will then also load the "gene_id" field as a name, and once you re-run generate-names.pl the gene_id can be searched for.

Also note that if you have a GFF3Tabix track (which is not loaded via flatfile-to-json.pl), you can instead add, for example, nameAttributes=name,id,alias,gene_id to the config for that track.


29 changes: 19 additions & 10 deletions src/perl5/Bio/JBrowse/Cmd/IndexNames.pm
Original file line number Diff line number Diff line change
Expand Up @@ -203,10 +203,10 @@ sub make_file_record {
$file =~ /\.vcf(\.gz)?$/ ? 'vcf' :
$file =~ /\.gff3?(\.gz)?(\.\d+)?$/ ? 'gff' :
undef;

if( $type ) {
return {
gzipped => $gzipped,
nameAttributes => $track->{nameAttributes},
fullpath => $file,
type => $type,
trackName => $track->{label}
Expand Down Expand Up @@ -374,9 +374,8 @@ sub find_names_files {
}

# try to detect VCF tracks and index their VCF files
if( $track->{storeClass}
&& ( $track->{urlTemplate} && $track->{urlTemplate} =~ /\.vcf\.gz/
|| $track->{storeClass} =~ /VCFTabix$/ )
if( $track->{urlTemplate} && $track->{urlTemplate} =~ /\.vcf\.gz/
|| ($track->{storeClass}||'') =~ /VCFTabix$/
) {
my $path = File::Spec->catfile( $self->opt('dir'), $track->{urlTemplate} );
if( -r $path ) {
Expand All @@ -388,9 +387,8 @@ sub find_names_files {
}

# try to detect GFF3 tracks and index their GFF3 files
if( $track->{storeClass}
&& ( $track->{urlTemplate} && $track->{urlTemplate} =~ /\.gff3?\.gz(\.\d+)?/
|| $track->{storeClass} =~ /GFF3Tabix$/ )
if( $track->{urlTemplate} && $track->{urlTemplate} =~ /\.gff3?\.gz(\.\d+)?/
|| ($track->{storeClass}||'') =~ /GFF3Tabix$/
) {
my $path = File::Spec->catfile( $self->opt('dir'), $track->{urlTemplate} );
if( -r $path ) {
Expand Down Expand Up @@ -587,14 +585,14 @@ sub make_names_iterator {

my ( $ref, $start, $name, $basevar ) = split "\t", $line, 5;
$start--;
return [[$name],$file_record->{trackName},$name,$ref, $start, $start+length($basevar)];
my @names = split /\s*;\s*/, $name;
return [\@names,$file_record->{trackName},$name,$ref, $start, $start+length($basevar)];
};
}
elsif( $file_record->{type} eq 'gff' ) {
my $input_fh = $self->open_names_file( $file_record );
no warnings 'uninitialized';
return sub {

# find the next feature in the file that has a name
my $line;
my $feature;
Expand All @@ -605,7 +603,18 @@ sub make_names_iterator {
$feature = gff3_parse_feature($line);
my $Name = $feature->{attributes}{Name} || [];
my $ID = $feature->{attributes}{ID} || [];
@names = $Name->[0] ? (@$Name, @$ID) : @$ID;
my $Alias = $feature->{attributes}{Alias} || [];
my @fields;
my @computedFields;
if(ref(\$file_record->{nameAttributes}) eq 'ARRAY') {
@fields = $file_record->{nameAttributes}
} elsif(ref(\$file_record->{nameAttributes}) eq 'SCALAR') {
@fields = split /\s*,\s*/, $file_record->{nameAttributes};
}
if(@fields) {
@computedFields = map { $feature->{attributes}{$_} || [] } @fields;
}
@names = @fields ? @computedFields : $Name->[0] ? (@$Name, @$ID) : @$ID;
last if scalar @names;
}
}
Expand Down
10 changes: 10 additions & 0 deletions tests/data/volvox_tabix_names/.htaccess
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# This Apache .htaccess file is generated by JBrowse (GenomeDB) for
# allowing cross-origin requests as defined by the Cross-Origin
# Resource Sharing working draft from the W3C
# (http://www.w3.org/TR/cors/). In order for Apache to pay attention
# to this, it must have mod_headers enabled, and its AllowOverride
# configuration directive must allow FileInfo overrides.
<IfModule mod_headers.c>
Header onsuccess set Access-Control-Allow-Origin *
Header onsuccess set Access-Control-Allow-Headers X-Requested-With,Range
</IfModule>
1 change: 1 addition & 0 deletions tests/data/volvox_tabix_names/names/0.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"val2":{"prefix":[],"exact":[["val2",0,"ctgA","ctgA",0,50001]]}}
1 change: 1 addition & 0 deletions tests/data/volvox_tabix_names/names/1.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"va":{"prefix":["val1","val2","val3"],"exact":[]}}
1 change: 1 addition & 0 deletions tests/data/volvox_tabix_names/names/2.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"ctg":{"prefix":["ctgB","ctgA"],"exact":[]}}
1 change: 1 addition & 0 deletions tests/data/volvox_tabix_names/names/3.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"ctga":{"prefix":[],"exact":[["ctgA",50001,"ctgA",null,0,50001,20000]]}}
1 change: 1 addition & 0 deletions tests/data/volvox_tabix_names/names/4.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"v":{"prefix":["val1","val2","val3"],"exact":[]}}
1 change: 1 addition & 0 deletions tests/data/volvox_tabix_names/names/5.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"val":{"exact":[],"prefix":["val1","val2","val3"]}}
1 change: 1 addition & 0 deletions tests/data/volvox_tabix_names/names/6.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"val3":{"prefix":[],"exact":[["val3",0,"ctgA","ctgA",0,50001]]}}
1 change: 1 addition & 0 deletions tests/data/volvox_tabix_names/names/9.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"ctgb":{"exact":[["ctgB",6079,"ctgB",null,0,6079,20000]],"prefix":[]}}
1 change: 1 addition & 0 deletions tests/data/volvox_tabix_names/names/a.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"val1":{"prefix":[],"exact":[["val1",0,"ctgA","ctgA",0,50001]]}}
1 change: 1 addition & 0 deletions tests/data/volvox_tabix_names/names/e.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"ct":{"exact":[],"prefix":["ctgB","ctgA"]}}
1 change: 1 addition & 0 deletions tests/data/volvox_tabix_names/names/f.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"c":{"exact":[],"prefix":["ctgB","ctgA"]}}
1 change: 1 addition & 0 deletions tests/data/volvox_tabix_names/names/meta.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"compress":0,"lowercase_keys":1,"hash_bits":"4","format":"json","track_names":["volvox_gff3_tabix"]}
1 change: 1 addition & 0 deletions tests/data/volvox_tabix_names/seq/refSeqs.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
[{"length":6079,"name":"ctgB","seqChunkSize":20000,"end":6079,"start":0},{"length":50001,"name":"ctgA","seqChunkSize":20000,"end":50001,"start":0}]
17 changes: 17 additions & 0 deletions tests/data/volvox_tabix_names/trackList.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
{
"formatVersion" : 1,
"names" : {
"type" : "Hash",
"url" : "names/"
},
"tracks" : [
{
"label" : "volvox_gff3_tabix",
"nameAttributes" : "multivalue",
"storeClass" : "JBrowse/Store/SeqFeature/GFF3Tabix",
"tbiUrlTemplate" : "volvox.sort.gff3.gz.tbi",
"type" : "CanvasFeatures",
"urlTemplate" : "volvox.sort.gff3.gz.1"
}
]
}
Empty file.
26 changes: 26 additions & 0 deletions tests/perl_tests/generate-names.pl.t
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,18 @@ run_with (
my @files = glob("$tempdir/names/*");
is( scalar @files, 17 , 'the dir has some stuff in it' );

# Check for nameAttributes on a GFF3Tabix track: build a fresh sandbox,
# run the indexer restricted to the volvox_gff3_tabix track, and compare the
# resulting names data with the checked-in fixture.
# NOTE(review): run_with is defined outside this view; presumably it invokes
# generate-names.pl with these arguments -- confirm.
$tempdir = new_volvox_sandbox_tabix_nameattributes();
run_with (
'--dir' => "$tempdir",
'--tracks' => 'volvox_gff3_tabix'
);
{
# Both sides go through read_names, so the comparison is between the freshly
# generated index in the sandbox and the fixture under tests/data.
my $got = read_names($tempdir);
my $expected = read_names('tests/data/volvox_tabix_names');
is_deeply( $got, $expected, 'same data using tabix nameAttributes config' );
# Debugging aid, left disabled: dump the generated data when the check fails.
# or diag explain read_names($tempdir);
}

done_testing;

sub read_names {
Expand All @@ -92,3 +104,17 @@ sub new_volvox_sandbox {
rmtree( "$tempdir/names" );
return $tempdir;
}

# Build a throwaway sandbox for the GFF3Tabix nameAttributes test.
#
# Copies the tests/data/volvox_tabix_names fixture into a new temporary
# directory, adds the compressed GFF3 file and its tabix index from
# sample_data/raw/volvox, and deletes the copied names/ directory so the
# index has to be regenerated by the code under test.
#
# Returns the File::Temp directory object (it stringifies to the path).
# Cleanup of the directory is disabled when $ENV{KEEP_ALL} is true.
# Dies if any of the copy steps fails.
sub new_volvox_sandbox_tabix_nameattributes {
    my $tempdir = File::Temp->newdir( CLEANUP => $ENV{KEEP_ALL} ? 0 : 1 );

    # Report the sandbox location through the test harness instead of a raw
    # print to STDERR.
    diag( "$tempdir" );

    # A failed fixture copy would otherwise only show up later as a
    # confusing comparison failure, so stop here like the copies below do.
    dircopy( 'tests/data/volvox_tabix_names', $tempdir )
        or die "dircopy tests/data/volvox_tabix_names: $!";

    for my $data_file ( 'volvox.sort.gff3.gz.1', 'volvox.sort.gff3.gz.tbi' ) {
        copy( "sample_data/raw/volvox/$data_file", "$tempdir/$data_file" )
            or die "copy sample_data/raw/volvox/$data_file: $!";
    }

    # Drop the pre-built index that came along with the fixture.
    rmtree( "$tempdir/names" );

    return $tempdir;
}

0 comments on commit f449965

Please sign in to comment.