
+cluster_text_files

1 parent 6787e8a commit 65fae6acc0894673f6b85fd611cf117666a3d3f2 @DeeNewcum committed May 11, 2012
Showing with 188 additions and 0 deletions.
  1. +188 −0 cluster_text_files
188 cluster_text_files
@@ -0,0 +1,188 @@
+#!/usr/bin/perl
+
+# do cluster-analysis on text files, based on the Levenshtein metric
+
+# it analyzes ALL text files under the current directory
+
+# see also:
+# http://paperlined.org/dev/perl/modules/related_modules/text_cluster.html
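+#
+# A rough, added illustration of the metric: under a Levenshtein-style
+# similarity, "kitten" vs "sitting" differ by 3 edits over 7 characters,
+# i.e. roughly 1 - 3/7 ~= 0.57 similar, so at the 0.75 threshold used
+# below they would land in different clusters.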
+
+ use strict;
+ use warnings;
+
+ use Const::Fast;
+ use File::Find;
+ use String::Cluster::Hobohm;
+ use Time::HiRes qw[time];
+ use List::Util qw[min max];
+
+ use Data::Dumper;
+ #use Devel::Comments; # uncomment this during development to enable the ### debugging statements
+ #use Carp::Always;
+
+
+ const my $simil => 0.75; # a number between 0 and 1; 1 means "exactly the same" (see String::Cluster::Hobohm docs)
+ const my $SIMPLIFY => 1; # when true, cluster simplify_string() output instead of the raw file contents
+
+
+setpriority 0, 0, getpriority(0, 0) + 20; # nice ourselves... this can be incredibly CPU-intensive
+
+
+my %text_files;
+my %text_files_inv; # inverted map: file contents => arrayref of filenames having those contents
+
+
+## list all text files, load their contents
+find({ wanted => \&file_find_wanted, follow => 1, no_chdir => 1 }, '.');
+
+ sub file_find_wanted {
+ if (-f $File::Find::name && -T $File::Find::name) {
+ my $cont = slurp($File::Find::name);
+ $text_files{$File::Find::name} = $cont;
+ push(@{$text_files_inv{$cont}}, $File::Find::name);
+ }
+ }
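+ # added note: -T is Perl's heuristic text-file test; it samples the start of
+ # each file, so unusually-encoded text may be misdetected as binary and skipped.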
+
+
+print Dumper [keys %text_files]; # debug: show which files were found
+
+
+## demonstrate the output of simplify_string
+if (0) {
+ my @f = values %text_files;
+ print "$f[0]\n", "-"x80, "\n";
+ print simplify_string($f[0]), "\n";
+ exit;
+}
+
+
+## monitor the progress
+install_progress_monitor();
+
+
+## cluster them
+my $clusterer = String::Cluster::Hobohm->new( similarity => $simil );
+my @f = values %text_files;
+my %unsimplify;
+if ($SIMPLIFY) {
+ @f = map {my $s = simplify_string($_); $unsimplify{$s} = $_; $s} @f;
+ # ^^^ assume no conflicts? TODO: fix this
+ @f = uniq(@f);
+ #die Dumper \@f;
+}
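+# note on the TODO above: if two different originals simplify to the same
+# string, %unsimplify keeps only the last one seen, so the earlier original's
+# filenames silently disappear from the report.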
+#die Dumper \@f;
+#die Dumper \%unsimplify;
+my $groups = $clusterer->cluster( \@f );
+#die Dumper $groups;
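+# note (added): cluster() hands back an arrayref of groups, each group holding
+# REFERENCES to the input strings, e.g. [ [ \$a, \$b ], [ \$c ] ]; hence the
+# $$cont dereference in the loop below.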
+
+
+## output the findings
+my $first_time = 1;
+foreach my $group (@$groups) {
+ if (scalar(@$group) > 1) {
+ print "-"x80, "\n" unless ($first_time);
+ $first_time = 0;
+ foreach my $cont (@$group) {
+ $cont = $$cont;
+ #print "---$cont---\n";
+ if ($SIMPLIFY) {
+ $cont = $unsimplify{$cont};
+ #die "$cont\n\t";
+ }
+ print join("\n", @{$text_files_inv{$cont}}), "\n";
+ }
+ }
+}
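+# illustrative output shape (hypothetical paths): each group of near-duplicate
+# files prints as a block of filenames, with the 80-dash rule between groups:
+#     ./notes/draft1.txt
+#     ./notes/draft2.txt
+#     --------------------------------------------------------------------
+#     ./todo.old
+#     ./todo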
+
+
+
+# Calculating the distance between two very long strings is expensive.
+#
+# Reduce the string's size and/or complexity, in an effort to APPROXIMATE clustering.
+#
+# http://en.wikipedia.org/wiki/Dimension_reduction
+# http://en.wikipedia.org/wiki/Feature_extraction
+sub simplify_string {
+ my $string = shift;
+
+ # http://en.wikipedia.org/wiki/Feature_extraction
+ # keep only structure-carrying characters: parens, braces, semicolons, quotes, and spaces
+ $string =~ s/[^(){}; "']//sg;
+ # append a space to work around a bug: https://rt.cpan.org/Public/Bug/Display.html?id=77148
+ return $string . ' ';
+
+
+ ## NOTE: the early return above makes everything below in this sub unreachable;
+ ## these are alternative feature-extraction experiments, kept for reference.
+ ## study perlrecharclass before trying to alter this
+ if (0) {
+ my @s;
+ foreach (split /[\n\r]+/, $string) {
+
+ ## group 1) spaces, group 2) nonspaces
+ #s/\S+/S/g;
+ s/\S/S/g;
+
+ ## group1) word characters group2) non-word characters
+ #s/\w+/S/g;
+
+
+ #s/[^S]+/ /g;
+ s/[^S]/ /g;
+ push(@s, $_);
+ }
+ $string = join("\n", @s);
+ }
+
+ return substr($string, 0, 4 * 1024);
+
+
+
+# ## separate into 1) space/tab, 2) nonspace, 3) newlines
+# if (0) {
+# s/\S+/S/g;
+# s/\h+/ /g; # horizontal whitespace
+# }
+#
+# ## separate into 1) word-characters, 2) non-word characters => " ", 3) newlines
+# if (1) {
+# s/\w+/S/g;
+# s/[^\w\n\r]+/ /gs;
+# }
+#
+# s/\x00/\n/sg;
+#
+# return $_;
+}
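+# worked example (added): for the input line
+#     printf("hello, %s\n", $name);
+# the active substitution above keeps only (){}; quotes and spaces, yielding
+#     (" " );
+# followed by the appended trailing space.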
+
+
+
+
+sub install_progress_monitor {
+ my $orig = \&String::Cluster::Hobohm::_similarity;
+ my $mon_count = 0;
+ my $mon_time = 0;
+ my $mon_max = 0;
+ my $mon_min = 9999;
+ no warnings 'redefine';
+ *String::Cluster::Hobohm::_similarity = sub {
+ my $t0 = time();
+ my $return = $orig->(@_);
+ my $ttot = time() - $t0;
+ $mon_count++;
+ $mon_time += $ttot;
+ $mon_max = max($mon_max, $ttot);
+ $mon_min = min($mon_min, $ttot);
+ #printf "%d comparisons at an average of %0.2fs each\n", $mon_count, $mon_time / $mon_count;
+ # the per-comparison figure is in milliseconds; min/avg/max are in seconds
+ printf "comparison %d = %6d ms (%0.2fs min, %0.2fs avg, %0.2fs max)\n",
+ $mon_count, $ttot * 1000,
+ $mon_min, $mon_time / $mon_count, $mon_max;
+ return $return;
+ };
+ use warnings;
+}
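+# illustrative progress line (made-up timings), printed once per comparison:
+#     comparison 42 =    137 ms (0.01s min, 0.05s avg, 0.31s max)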
+
+
+# quickly read a whole file
+sub slurp {open(my $f, '<', $_[0]) or die "$_[0]: $!"; my @o = <$f>; close $f; wantarray ? @o : join('', @o)}
+
+# remove duplicate elements from a list
+sub uniq {my %seen; grep {!$seen{$_}++} @_}
+
