Permalink
Cannot retrieve contributors at this time
Fetching contributors…
| #!/usr/bin/perl | |
| # Copyright (c) 2011 Erik Aronesty (erik@q32.com) | |
| # | |
| # Permission is hereby granted, free of charge, to any person obtaining a copy | |
| # of this software and associated documentation files (the "Software"), to deal | |
| # in the Software without restriction, including without limitation the rights | |
| # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |
| # copies of the Software, and to permit persons to whom the Software is | |
| # furnished to do so, subject to the following conditions: | |
| # | |
| # The above copyright notice and this permission notice shall be included in | |
| # all copies or substantial portions of the Software. | |
| # | |
| # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
| # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
| # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |
| # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
| # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |
| # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN | |
| # THE SOFTWARE. | |
| # | |
| # NOTE: Please let me know if you use it or like it, AKA: "Thank You Mask Man" | |
| use strict; | |
| use Getopt::Long; | |
| our $VERSION = 1.2; | |
| my %nl; | |
| my $w=200000; # window size | |
| my $x=10; # segment count | |
| my $only; | |
| GetOptions("only"=>\$only, "window=i"=>\$w, "segs=i"=>\$x); | |
| die usage() unless @ARGV; | |
| die usage() if $only && @ARGV > 1; | |
| $|=1; | |
| #$x-=1; | |
| my $tl=0; | |
| for my $f (@ARGV) { | |
| my $s = -s $f; | |
| my $gz = $f =~ /\.gz$/; | |
| open (IN, ($gz ? "gunzip -c '$f'|" : $f)) || (warn("$f: $!\n"), next); | |
| my $nl = 0; | |
| my $gzratio = 1; | |
| if ($gz) { | |
| if ( $s > $w ) { | |
| # todo, this can be done all-in-one stream, by buffering the read and gzipping in a loop | |
| # it will be much faster! | |
| my (@bz, @lz); | |
| $x = 5; # 5 points | |
| my $ss = $w/$x; | |
| for (my $i=0; $i<$x; ++$i) { | |
| my $o = int($ss * ($i+1)); | |
| $bz[$i]=`gunzip -c '$f' | head -$o | gzip -c | wc -c`; | |
| if (abs($s-$bz[$i]) < ($ss+length($f))) { | |
| $nl=`gunzip -c '$f' | wc -l`; | |
| goto MAXXED; | |
| } else { | |
| $lz[$i]=$s*$o/$bz[$i]; | |
| } | |
| # warn("o: ", $o, ", lzi:", $lz[$i], ", bzi:", $bz[$i], "\n"); | |
| } | |
| # simple linear model log(bytes)~lines | |
| my ($xxs, $xys, $xs, $ys); | |
| for (my $i=0; $i<$x; ++$i) { | |
| $xs+=log($bz[$i]); | |
| $ys+=$lz[$i]; | |
| $xxs+=log($bz[$i])*log($bz[$i]); | |
| $xys+=$lz[$i]*log($bz[$i]); | |
| } | |
| my $slope = ($xys - $xs*$ys/$x) / ($xxs - $xs*$xs/$x); | |
| my $intercept = ($ys - ($slope*$xs))/$x; | |
| # warn("intercept: $intercept, slope: $slope, s: $s\n"); | |
| # log size predictive of lines..... | |
| $nl = $intercept + $slope * log($s); | |
| MAXXED: | |
| } else { | |
| close(IN); | |
| $nl = `gunzip -c '$f' | wc -l` + 0; | |
| } | |
| } else { | |
| my $rc = 0; | |
| if ($s > ($w*2)) { | |
| my $ss = $x == 1 ? $w : (($s-($w/$x))/($x-1)); | |
| my $d; | |
| # warn("segments: $x, window: $w, ss:$ss\n"); | |
| my $tweight; | |
| for (my $i=0;$i<$x;++$i) { | |
| seek(IN, $i*$ss, 0) if $i > 0; | |
| read(IN, $d, $w/$x); | |
| #whole lines only, prevents overcounting | |
| if ($i > 0) { | |
| # seek before current block, getting more accurate "long-first-lines" | |
| my $z; | |
| seek(IN, $i*$ss-1000, 0); | |
| read(IN, $z, 1000); | |
| $z=substr($z, rindex($z, "\n")+1); | |
| $d=$z.$d; | |
| } else { | |
| #$d=substr($d, index($d, "\n")+1); # remove first line | |
| } | |
| $d=substr($d, 0, rindex($d, "\n")+1); # remove last line | |
| $rc += length($d); | |
| $nl += xlc($d); | |
| # printf STDERR "seek: %d, len: %d, xlc: %d, rc: %d, nl: %d\n", $i*$ss, length($d), xlc($d), $rc, $nl; | |
| } | |
| } else { | |
| my $d; | |
| read(IN, $d, $w); | |
| $d=substr($d, 0, rindex($d, "\n")+1); # remove last line | |
| $rc += length($d); | |
| $nl += xlc($d); | |
| } | |
| $nl = int($rc > 0 ? $nl * $s / $rc : 0); | |
| } | |
| $tl += $nl; | |
| print "$nl" if $only; | |
| print "\n" if $only && -t STDOUT; | |
| $nl{$f} = $nl if !$only; | |
| } | |
| if (!$only) { | |
| my $m=length($tl)+1; | |
| for my $f (@ARGV) { | |
| printf "%*d %s\n", $m, $nl{$f}, $f; | |
| } | |
| printf "%*d %s\n", $m, $tl, "total" if @ARGV > 1; | |
| } | |
| sub xlc { | |
| my $d = $_[0]; | |
| my $p=0; | |
| my $c=0; | |
| my $i=0; | |
| while (($i=index($d, "\n", $p))>=0) { | |
| ++$c; | |
| $p=$i+1; | |
| } | |
| return $p<length($d) ? $c+1 : $c; # correct count | |
| } | |
| sub usage {<<EOF | |
| Usage: alc [-o] <file1> [<file2>] ... | |
| Approximate line counts for each file. Attempts to be | |
| somewhat compatible with "wc -l" by default. | |
| -o|--only Output line count only for a single file. | |
| -w|--window <int> Read <int> bytes from head, mid, and tail. | |
| -s|--segs <int> Divide file & window into <int> segments. | |
| EOF | |
| }; | |
| __END__ | |
| =head1 NAME | |
| alc - Approximate line count | |
| =head1 DESCRIPTION | |
| Approximate line counts for each file. Attempts to be | |
| somewhat compatible with "wc -l" by default. | |
| =head1 AUTHOR | |
| Erik Aronesty C<earonesty@cpan.org> | |
| =head1 LICENSE | |
| This program is free software; you can redistribute it and/or | |
| modify it under the same terms as Perl itself. | |
| See L<http://www.perl.com/perl/misc/Artistic.html>. | |
| =cut |