Permalink
Switch branches/tags
Nothing to show
Find file
Fetching contributors…
Cannot retrieve contributors at this time
executable file 179 lines (157 sloc) 5.54 KB
#!/usr/bin/perl
# Copyright (c) 2011 Erik Aronesty (erik@q32.com)
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
#
# NOTE: Please let me know if you use it or like it, AKA: "Thank You Mask Man"
use strict;
use Getopt::Long;
our $VERSION = 1.2;
my %nl;
my $w=200000; # window size
my $x=10; # segment count
my $only;
GetOptions("only"=>\$only, "window=i"=>\$w, "segs=i"=>\$x);
die usage() unless @ARGV;
die usage() if $only && @ARGV > 1;
$|=1;
#$x-=1;
my $tl=0;
for my $f (@ARGV) {
my $s = -s $f;
my $gz = $f =~ /\.gz$/;
open (IN, ($gz ? "gunzip -c '$f'|" : $f)) || (warn("$f: $!\n"), next);
my $nl = 0;
my $gzratio = 1;
if ($gz) {
if ( $s > $w ) {
# todo, this can be done all-in-one stream, by buffering the read and gzipping in a loop
# it will be much faster!
my (@bz, @lz);
$x = 5; # 5 points
my $ss = $w/$x;
for (my $i=0; $i<$x; ++$i) {
my $o = int($ss * ($i+1));
$bz[$i]=`gunzip -c '$f' | head -$o | gzip -c | wc -c`;
if (abs($s-$bz[$i]) < ($ss+length($f))) {
$nl=`gunzip -c '$f' | wc -l`;
goto MAXXED;
} else {
$lz[$i]=$s*$o/$bz[$i];
}
# warn("o: ", $o, ", lzi:", $lz[$i], ", bzi:", $bz[$i], "\n");
}
# simple linear model log(bytes)~lines
my ($xxs, $xys, $xs, $ys);
for (my $i=0; $i<$x; ++$i) {
$xs+=log($bz[$i]);
$ys+=$lz[$i];
$xxs+=log($bz[$i])*log($bz[$i]);
$xys+=$lz[$i]*log($bz[$i]);
}
my $slope = ($xys - $xs*$ys/$x) / ($xxs - $xs*$xs/$x);
my $intercept = ($ys - ($slope*$xs))/$x;
# warn("intercept: $intercept, slope: $slope, s: $s\n");
# log size predictive of lines.....
$nl = $intercept + $slope * log($s);
MAXXED:
} else {
close(IN);
$nl = `gunzip -c '$f' | wc -l` + 0;
}
} else {
my $rc = 0;
if ($s > ($w*2)) {
my $ss = $x == 1 ? $w : (($s-($w/$x))/($x-1));
my $d;
# warn("segments: $x, window: $w, ss:$ss\n");
my $tweight;
for (my $i=0;$i<$x;++$i) {
seek(IN, $i*$ss, 0) if $i > 0;
read(IN, $d, $w/$x);
#whole lines only, prevents overcounting
if ($i > 0) {
# seek before current block, getting more accurate "long-first-lines"
my $z;
seek(IN, $i*$ss-1000, 0);
read(IN, $z, 1000);
$z=substr($z, rindex($z, "\n")+1);
$d=$z.$d;
} else {
#$d=substr($d, index($d, "\n")+1); # remove first line
}
$d=substr($d, 0, rindex($d, "\n")+1); # remove last line
$rc += length($d);
$nl += xlc($d);
# printf STDERR "seek: %d, len: %d, xlc: %d, rc: %d, nl: %d\n", $i*$ss, length($d), xlc($d), $rc, $nl;
}
} else {
my $d;
read(IN, $d, $w);
$d=substr($d, 0, rindex($d, "\n")+1); # remove last line
$rc += length($d);
$nl += xlc($d);
}
$nl = int($rc > 0 ? $nl * $s / $rc : 0);
}
$tl += $nl;
print "$nl" if $only;
print "\n" if $only && -t STDOUT;
$nl{$f} = $nl if !$only;
}
if (!$only) {
my $m=length($tl)+1;
for my $f (@ARGV) {
printf "%*d %s\n", $m, $nl{$f}, $f;
}
printf "%*d %s\n", $m, $tl, "total" if @ARGV > 1;
}
sub xlc {
my $d = $_[0];
my $p=0;
my $c=0;
my $i=0;
while (($i=index($d, "\n", $p))>=0) {
++$c;
$p=$i+1;
}
return $p<length($d) ? $c+1 : $c; # correct count
}
sub usage {<<EOF
Usage: alc [-o] <file1> [<file2>] ...
Approximate line counts for each file. Attempts to be
somewhat compatible with "wc -l" by default.
-o|--only Output line count only for a single file.
-w|--window <int> Read <int> bytes from head, mid, and tail.
-s|--segs <int> Divide file & window into <int> segments.
EOF
};
__END__
=head1 NAME
alc - Approximate line count
=head1 DESCRIPTION
Approximate line counts for each file. Attempts to be
somewhat compatible with "wc -l" by default.
=head1 AUTHOR
Erik Aronesty C<earonesty@cpan.org>
=head1 LICENSE
This program is free software; you can redistribute it and/or
modify it under the same terms as Perl itself.
See L<http://www.perl.com/perl/misc/Artistic.html>.
=cut