diff --git a/bin/generate-names.pl b/bin/generate-names.pl index 1b6f34ebe4..ce13ea0fb7 100755 --- a/bin/generate-names.pl +++ b/bin/generate-names.pl @@ -38,15 +38,6 @@ =head1 OPTIONS built. If not passed, tries to estimate this based on the size of the input names files. -=item --incremental - -Add new entries to the names index, not deleting the old ones. When -using this option, it is best to pass the --totalNames parameter as -well. Otherwise, the first call to generate-names.pl will initialize -the names index to be optimal for the number of names added just in -that first call, which could lead to the names index being constructed -non-optimally. - =item --verbose Print more progress messages. @@ -82,13 +73,10 @@ =head1 OPTIONS use GenomeDB; -my %trackHash; my @includedTrackNames; -my @tracksWithNames; my $outDir = "data"; my $verbose = 0; -my $incremental; my $help; my $max_completions = 20; my $max_locations = 100; @@ -100,7 +88,6 @@ =head1 OPTIONS "locationLimit=i" => \$max_locations, "verbose+" => \$verbose, "thresh=i" => \$thresh, - "incremental" => \$incremental, "totalNames=i" => \$est_total_name_records, 'tracks=s' => \@includedTrackNames, 'hashBits=i' => \$hash_bits, @@ -148,17 +135,15 @@ =head1 OPTIONS #print STDERR "Names files:\n", map " $_->{fullpath}\n", @names_files; -unless( $incremental ) { - # estimate the total number of name records we probably have based on the input file sizes - $est_total_name_records ||= int( (sum( map { -s $_->{fullpath} } @names_files )||0) / 70 ); - if( $verbose ) { - print STDERR "Estimated $est_total_name_records total name records to index.\n"; - } +# estimate the total number of name records we probably have based on the input file sizes +$est_total_name_records ||= int( (sum( map { -s $_->{fullpath} } @names_files )||0) / 70 ); +if( $verbose ) { + print STDERR "Estimated $est_total_name_records total name records to index.\n"; } my $nameStore = Bio::JBrowse::HashStore->open( dir => catdir( $outDir, "names" ), - empty => !$incremental, + empty => 1, # set the hash size to try to get about 10 name records per file # (does not count prefix completions) if the store has existing @@ -181,7 +166,11 @@ =head1 OPTIONS for my $ref ( @refSeqs ) { push @namerecord_buffer, [ @{$ref}{ qw/ name length name seqDir start end seqChunkSize/ }]; } -my $record_stream = $nameStore->sort_stream( sub { + + +my %trackHash; +my @tracksWithNames; +my $record_stream = sub { while( ! @namerecord_buffer ) { my $nameinfo = $name_records_iterator->() || do { my $file = shift @names_files; @@ -204,8 +193,12 @@ =head1 OPTIONS } } return shift @namerecord_buffer; -}); +}; + +# sort the stream by hash key to improve cache locality +$record_stream = $nameStore->sort_stream( $record_stream ); +# now write it to the store while( my $record = $record_stream->() ) { insert( $nameStore, $record ); } @@ -213,6 +206,7 @@ =head1 OPTIONS # store the list of tracks that have names $nameStore->{meta}{track_names} = \@tracksWithNames; + # set up the name store in the trackList.json $gdb->modifyTrackList( sub { my ( $data ) = @_; diff --git a/src/perl5/Bio/JBrowse/HashStore.pm b/src/perl5/Bio/JBrowse/HashStore.pm index cd8be10b5f..7cd3914b6e 100644 --- a/src/perl5/Bio/JBrowse/HashStore.pm +++ b/src/perl5/Bio/JBrowse/HashStore.pm @@ -55,10 +55,11 @@ sub open { %$self = ( %$self, - %{$self->_read_meta} + meta => $self->_read_meta ); - $self->{hash_bits} ||= 16; + $self->{hash_bits} ||= $self->{meta}{hash_bits} || 16; + $self->{meta}{hash_bits} = $self->{hash_bits}; $self->{hash_characters} = int( $self->{hash_bits}/4 ); $self->{file_extension} = '.json'; @@ -74,12 +75,8 @@ sub DESTROY { File::Path::mkpath( $self->{dir} ); my $meta_path = $self->_meta_path; CORE::open my $out, '>', $meta_path or die "$! writing $meta_path"; - $out->print( JSON::to_json( - { - hash_bits => $self->{hash_bits}, - %{ $self->{meta} || {} } - } - )) or die "$! writing $meta_path"; + $out->print( JSON::to_json( $self->{meta} ) ) + or die "$! writing $meta_path"; } sub _meta_path { File::Spec->catfile( shift->{dir}, 'meta.json' ); diff --git a/tests/data/volvox_formatted_names/names/2.json b/tests/data/volvox_formatted_names/names/2.json index aa019bef62..99fcd8aa55 100644 --- a/tests/data/volvox_formatted_names/names/2.json +++ b/tests/data/volvox_formatted_names/names/2.json @@ -1 +1 @@ -{"f0":{"exact":[],"prefix":["f05","f02","f03","f04","f01","f06","f09","f07","f08"]},"ctg":{"exact":[],"prefix":["ctgB","ctgA"]},"ag":{"exact":[],"prefix":["agt767.5","agt221.3","agt830.3","agt221.5","agt767.3","agt830.5"]},"protein":{"exact":[],"prefix":["Protein:HGA","Protein:HGB"]},"f12":{"exact":[["f12",0,"f12","ctgA",49757,50000]],"prefix":[]},"m02":{"exact":[["m02",3,"m02","ctgA",28331,30033]],"prefix":[]},"b10":{"exact":[],"prefix":["b101.2"]},"seg":{"exact":[],"prefix":["seg04","seg14","seg13","seg03","seg12","seg02","seg05","seg15","seg10","seg07","seg08","seg06","seg09","seg11","seg01"]},"agt830.3":{"exact":[["agt830.3",10,"agt830.3","ctgA",5409,7503]],"prefix":[]},"JBROWSE_TRACKS_WITH_NAMES":["ExampleFeatures","NameTest","snps","Motifs","Alignments","Genes","ReadingFrame","CDS","Transcript","Clones","EST"],"seg13":{"exact":[["seg13",4,"seg13","ctgA",49405,49476],["seg13",4,"seg13","ctgA",49761,50000]],"prefix":[]},"agt7":{"exact":[],"prefix":["agt767.5","agt767.3"]}} \ No newline at end of file +{"f0":{"exact":[],"prefix":["f05","f02","f03","f04","f01","f06","f09","f07","f08"]},"ctg":{"exact":[],"prefix":["ctgB","ctgA"]},"ag":{"exact":[],"prefix":["agt767.5","agt221.3","agt830.3","agt221.5","agt767.3","agt830.5"]},"protein":{"exact":[],"prefix":["Protein:HGA","Protein:HGB"]},"f12":{"exact":[["f12",0,"f12","ctgA",49757,50000]],"prefix":[]},"m02":{"exact":[["m02",3,"m02","ctgA",28331,30033]],"prefix":[]},"b10":{"exact":[],"prefix":["b101.2"]},"seg":{"exact":[],"prefix":["seg04","seg14","seg13","seg03","seg12","seg02","seg05","seg15","seg10","seg07","seg08","seg06","seg09","seg11","seg01"]},"agt830.3":{"exact":[["agt830.3",10,"agt830.3","ctgA",5409,7503]],"prefix":[]},"seg13":{"exact":[["seg13",4,"seg13","ctgA",49405,49476],["seg13",4,"seg13","ctgA",49761,50000]],"prefix":[]},"agt7":{"exact":[],"prefix":["agt767.5","agt767.3"]}} \ No newline at end of file diff --git a/tests/data/volvox_formatted_names/names/meta.json b/tests/data/volvox_formatted_names/names/meta.json index 2672d751a2..2776b0bcce 100644 --- a/tests/data/volvox_formatted_names/names/meta.json +++ b/tests/data/volvox_formatted_names/names/meta.json @@ -1 +1 @@ -{"hash_bits":"4"} \ No newline at end of file +{"track_names":["ExampleFeatures","NameTest","snps","Motifs","Alignments","Genes","ReadingFrame","CDS","Transcript","Clones","EST"],"hash_bits":"4","last_changed_entry":"apple2"} \ No newline at end of file