-
Notifications
You must be signed in to change notification settings - Fork 199
/
generate-names.pl
executable file
·332 lines (264 loc) · 9.04 KB
/
generate-names.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
#!/usr/bin/env perl
=head1 NAME
generate-names.pl - generate a global index of feature names
=head1 USAGE
generate-names.pl \
[ --out <output directory> ] \
[ --tracks <trackname>[,...] ] \
[ --verbose ]
=head1 OPTIONS
=over 4
=item --out <directory>
Data directory to process. Default 'data/'.
=item --tracks <trackname>[,...]
Comma-separated list of which tracks to include in the names index. If
not passed, all tracks are indexed.
=item --locationLimit <number>
Maximum number of distinct locations to store for a single name. Default 100.
=item --completionLimit <number>
Maximum number of completions to store for a given prefix. Default 20.
=item --totalNames <number>
Optional estimate of the total number of names that will go into this
names index. Used to choose some parameters for how the name index is
built. If not passed, tries to estimate this based on the size of the
input names files.
=item --verbose
Print more progress messages.
=item --help | -h | -?
Print a usage message.
=back
=cut
use strict;
use warnings;
use FindBin qw($Bin);
use lib "$Bin/../src/perl5";
use JBlibs;
use Carp::Always;
use Fcntl ":flock";
use File::Spec::Functions;
use Getopt::Long;
use Pod::Usage;
use List::Util qw/ sum min max /;
use PerlIO::gzip;
use JSON 2;
use Bio::JBrowse::HashStore;
use GenomeDB;
# Command-line state. These file-scoped lexicals are read directly by the
# rest of the script and its helper subs.
my @includedTrackNames;
my $outDir = "data";
my $verbose = 0;
my $help;
my $max_completions = 20;   # --completionLimit: max completions stored per prefix
my $max_locations = 100;    # --locationLimit: max distinct locations stored per exact name
my $thresh;                 # --thresh: parsed but not referenced in this file -- TODO confirm still used
my $est_total_name_records; # --totalNames: estimate used to size the hash store
my $hash_bits;              # --hashBits: explicit override of the store's hash size
# NOTE: "dir|out" means both --dir and --out set the output directory;
# --tracks may be given multiple times and/or comma-separated.
GetOptions("dir|out=s" => \$outDir,
"completionLimit=i" => \$max_completions,
"locationLimit=i" => \$max_locations,
"verbose+" => \$verbose,
"thresh=i" => \$thresh,
"totalNames=i" => \$est_total_name_records,
'tracks=s' => \@includedTrackNames,
'hashBits=i' => \$hash_bits,
"help|h|?" => \$help) or pod2usage();
# Flatten repeated/comma-separated --tracks values into a lookup set.
my %includedTrackNames = map { $_ => 1 }
map { split ',', $_ }
@includedTrackNames;
pod2usage( -verbose => 2 ) if $help;
# Bail out early when the output data directory is missing; everything
# below assumes it exists.
-d $outDir or die <<OUTDIR;
Can't find directory "$outDir".
Run this program from a different working directory,
or specify the location of the output directory with
the --dir command line option.
OUTDIR
# Open the JBrowse data directory and load its configuration.
my $gdb = GenomeDB->new( $outDir );
my @refSeqs = @{ $gdb->refSeqs };
unless( @refSeqs ) {
die "No reference sequences defined in configuration, nothing to do.\n";
}
# Keep only the tracks selected with --tracks; with no --tracks given,
# %includedTrackNames is empty and every track passes the grep.
my @tracks = grep { !%includedTrackNames || $includedTrackNames{ $_->{label} } }
@{ $gdb->trackList || [] };
unless( @tracks ) {
die "No tracks defined in configuration, nothing to do.\n";
}
if( $verbose ) {
print STDERR "Tracks:\n".join('', map " $_->{label}\n", @tracks );
}
# read the name list for each track that has one
my $trackNum = 0;
# find the names files we will be working with
my @names_files = find_names_files();
if( ! @names_files ) {
warn "WARNING: No feature names found for indexing, only reference sequence names will be indexed.\n";
}
#print STDERR "Names files:\n", map " $_->{fullpath}\n", @names_files;
# estimate the total number of name records we probably have based on the input file sizes
# (70 is an assumed average bytes-per-record -- rough heuristic only)
$est_total_name_records ||= int( (sum( map { -s $_->{fullpath} } @names_files )||0) / 70 );
if( $verbose ) {
print STDERR "Estimated $est_total_name_records total name records to index.\n";
}
# Open (and empty) the on-disk hash store that will hold the name index.
my $nameStore = Bio::JBrowse::HashStore->open(
dir => catdir( $outDir, "names" ),
empty => 1,
# set the hash size to try to get about 10 name records per file
# (does not count prefix completions) if the store has existing
# data in it, this will be ignored
# The formula below takes log2(records/50), rounds it down to a
# multiple of 4 bits, and clamps the result to the range 4..32;
# an explicit --hashBits wins, and with no estimate we default to 12.
hash_bits => $hash_bits || (
$est_total_name_records
? sprintf('%0.0f',max( 4, min( 32, 4*int( log( ($est_total_name_records||0) / 50 )/ 4 / log(2)) )))
: 12
),
);
if( $verbose ) {
print STDERR "Using ".$nameStore->{hash_bits}."-bit hashing.\n";
}
# insert a name record for all of the reference sequences
my $name_records_iterator = sub {};
my @namerecord_buffer;
for my $ref ( @refSeqs ) {
# NOTE(review): 'name' appears twice in the slice -- presumably the
# first is the indexed name and the second the location's display
# label expected by the store's record layout; confirm against the
# names-index consumer before changing.
push @namerecord_buffer, [ @{$ref}{ qw/ name length name seqDir start end seqChunkSize/ }];
}
my %trackHash;          # track label -> compact integer index
my @tracksWithNames;    # track labels in first-seen order (parallel to the indices)
# A pull-based stream of name records: drains @namerecord_buffer, and
# refills it by advancing the current file iterator (or opening the next
# names file) whenever the buffer is empty. Returns undef when all files
# are exhausted.
my $record_stream = sub {
while( ! @namerecord_buffer ) {
# Get the next raw record; if the current iterator is done, move on
# to the next names file. The trailing "or return" ends the stream
# when a freshly opened file yields nothing either.
my $nameinfo = $name_records_iterator->() || do {
my $file = shift @names_files;
return unless $file;
#print STDERR "Processing $file->{fullpath}\n";
$name_records_iterator = make_names_iterator( $file );
$name_records_iterator->();
} or return;
# Fan out one raw record into one buffered record per alias,
# replacing the track label with its compact integer index.
foreach my $alias ( @{$nameinfo->[0]} ) {
my $track = $nameinfo->[1];
unless ( defined $trackHash{$track} ) {
$trackHash{$track} = $trackNum++;
push @tracksWithNames, $track;
}
push @namerecord_buffer, [
$alias,
$trackHash{$track},
@{$nameinfo}[2..$#{$nameinfo}]
];
}
}
return shift @namerecord_buffer;
};
# sort the stream by hash key to improve cache locality
$record_stream = $nameStore->sort_stream( $record_stream );
# now write it to the store
while( my $record = $record_stream->() ) {
insert( $nameStore, $record );
}
# store the list of tracks that have names
$nameStore->{meta}{track_names} = \@tracksWithNames;
# set up the name store in the trackList.json so clients know a
# Hash-type names index is available under names/
$gdb->modifyTrackList( sub {
my ( $data ) = @_;
$data->{names}{type} = 'Hash';
$data->{names}{url} = 'names/';
return $data;
});
exit;
################ HELPER SUBROUTINES ##############################
# Scan every track/refseq directory under $outDir for feature-name
# dumps, preferring the newer names.txt format over legacy names.json.
#
# Returns a list of hashrefs:
#   { fullpath => $path, type => 'txt' }                       for names.txt
#   { fullpath => $path, type => 'json', namestxt => $path }   for names.json
# (namestxt is where make_names_iterator writes the converted file).
#
# Reads the file-scoped @tracks, @refSeqs and $outDir.
sub find_names_files {
    my @files;
    for my $track (@tracks) {
        for my $ref (@refSeqs) {
            my $dir = catdir( $outDir,
                              "tracks",
                              $track->{label},
                              $ref->{name}
                            );
            # read either names.txt or names.json files
            my $names_txt = catfile( $dir, 'names.txt' );
            if( -f $names_txt ) {
                push @files, { fullpath => $names_txt, type => 'txt' };
            }
            else {
                my $names_json = catfile( $dir, 'names.json' );
                if( -f $names_json ) {
                    push @files, { fullpath => $names_json, type => 'json', namestxt => $names_txt };
                }
            }
        }
    }
    return @files;
}
# Insert one name record into the hash store, indexing it both under the
# exact (lowercased) name and under every proper prefix of that name for
# autocompletion.
#
# $store  - Bio::JBrowse::HashStore to write to
# $record - arrayref whose first element is the display name; the whole
#           record is stored as one "exact" hit
#
# Honors the file-scoped $max_locations and $max_completions limits.
sub insert {
    my ( $store, $record ) = @_;
    my $name = $record->[0];
    my $lc = lc $name;

    { # store the exact name match, capped at $max_locations per name
        my $r = $store->get( $lc ) || { exact => [], prefix => [] };
        if( $max_locations && @{ $r->{exact} } < $max_locations ) {
            push @{ $r->{exact} }, $record;
            $store->set( $lc, $r );
        }
        elsif( $verbose ) {
            #print STDERR "Warning: $name has more than --locationLimit ($max_locations) distinct locations, not all of them will be indexed.\n";
        }
    }

    # generate all the proper prefixes (the full name itself is excluded)
    my @prefixes;
    chop $lc;
    # test length(), not truthiness: a prefix consisting of the single
    # string "0" is false in boolean context and would otherwise be
    # silently dropped from the index
    while( length $lc ) {
        push @prefixes, $lc;
        chop $lc;
    }

    # store the prefixes; each prefix's completion list is capped at
    # $max_completions, and the first overflow appends a hitLimit marker
    # exactly once (afterwards the list size exceeds the cap, so neither
    # branch fires again)
    for my $prefix ( @prefixes ) {
        my $r = $store->get( $prefix ) || { exact => [], prefix => [] };
        my $p = $r->{prefix};
        if( @$p < $max_completions ) {
            if( ! grep $name eq $_, @$p ) {
                push @{ $r->{prefix} }, $name;
                $store->set( $prefix, $r );
            }
        }
        elsif( @{ $r->{prefix} } == $max_completions ) {
            push @{ $r->{prefix} }, { name => 'too many matches', hitLimit => 1 };
            $store->set( $prefix, $r );
        }
    }
}
# each of these takes an input filename and returns a subroutine that
# returns name records until there are no more, for either names.txt
# files or old-style names.json files
sub make_names_iterator {
my ( $file_record ) = @_;
if( $file_record->{type} eq 'txt' ) {
my $input_fh = open_names_file( $file_record->{fullpath} );
# read the input json partly with low-level parsing so that we
# can parse incrementally from the filehandle. names list
# files can be very big.
return sub {
my $t = <$input_fh>;
return $t ? eval { JSON::from_json( $t ) } : undef;
};
}
elsif( $file_record->{type} eq 'json' ) {
# read old-style names.json files all from memory
my $input_fh = open_names_file( $file_record->{fullpath} );
my $data = JSON::from_json(do {
local $/;
scalar <$input_fh>
});
open my $nt, '>', $file_record->{namestxt} or die;
return sub {
my $rec = shift @$data;
if( $rec ) {
$nt->print(JSON::to_json($rec)."\n");
}
return $rec;
};
}
}
# Open a names file for reading, transparently decompressing *.txtz and
# *.jsonz files via the PerlIO ':gzip' layer. Dies with $! on failure.
sub open_names_file {
    my ( $infile ) = @_;
    my $layer = $infile =~ /\.(txt|json)z$/ ? ':gzip' : '';
    open( my $fh, "<$layer", $infile )
        or die "$! reading $infile";
    return $fh;
}