<?xml version="1.0" encoding="UTF-8"?>
<commit>
  <added type="array">
    <added>
      <filename>Rakefile~</filename>
    </added>
    <added>
      <filename>filtering-with-tokenise.diff</filename>
    </added>
  </added>
  <modified type="array">
    <modified>
      <diff>@@ -12,7 +12,8 @@ Hoe.new(&quot;pci4r&quot;, Pci4R::VERSION) do |p|
   p.url = &quot;http://github.com/alexvollmer/pci4r/tree/master&quot;
   p.extra_deps = [
     ['activerecord', '&gt;=2.0.0'],
-    ['sqlite3-ruby', '&gt;=1.2.1']
+    ['sqlite3-ruby', '&gt;=1.2.1'],
+    ['stemmer'     , '&gt;=1.0.1']
   ]
 end
 </diff>
      <filename>Rakefile</filename>
    </modified>
    <modified>
      <diff>@@ -1,4 +1,5 @@
 require &quot;set&quot;
+require &quot;stemmer&quot;
 
 ##
 # This module provides a number of +Classifier+ classes that are instantiated,
@@ -516,4 +517,62 @@ module Filtering
       [sum, 1.0].min
     end
   end
-end
\ No newline at end of file
+end
+
+class String
+
+  @@n_gram_length = 1
+
+  # These are copied uncritically from Lucas Carlson's Classifier library.
+  @@stop_words = %w(a again all along are also an and as at but by came can cant couldnt did didn didnt do doesnt dont ever first from have her here him how i if in into is isnt it itll just last least like most my new no not now of on or should sinc so some th than this that the their then those to told too true try until url us were when whether while with within yes you youll)
+
+  def self.n_gram_length=(newval)
+    @@n_gram_length = newval
+  end
+
+  def self.stop_words=(newval)
+    @@stop_words = newval
+  end
+  def self.stop_words
+    @@stop_words
+  end
+  def self.add_stop_words(adds)
+    @@stop_words += adds.map {|w| w.downcase}
+    @@stop_words.uniq!
+  end
+  def self.remove_stop_words(removes)
+    @@stop_words -= removes
+  end
+  def tokenise
+    # Doing some regex massaging.
+    # Replace single or double dashes w/a single space.  Replace tildes w/spaces
+	  text = gsub(/(-{1,2}|~)/, ' ').downcase
+
+    # Eat punctuation
+    text.gsub!(/[^\w\s]/, '')
+
+    # Change digits to placeholder
+    text.gsub!(/\d/, '#')
+
+    words = text.split.select {|w| w.length &gt; 1 and not(@@stop_words.include?(w))}
+
+    tokens = []
+    0.upto(words.length) do |i|
+      @@n_gram_length.downto(1) do |k|
+        chunk = words[i, k]
+        if chunk.length == k then
+          # TODO?: keep from crossing obvious sentence boundaries (e.g., test for periods in elements other than chunk[-1])
+          tokens &lt;&lt; chunk.map do |w|
+            # puts(w)
+            w.stem
+          end.join('_')
+        end
+      end
+    end
+
+    return tokens # + words.map {|w| w.stem}
+
+  end
+
+end
+</diff>
      <filename>lib/filtering.rb</filename>
    </modified>
    <modified>
      <diff>@@ -4,6 +4,40 @@ require File.join(File.dirname(__FILE__), &quot;..&quot;, &quot;lib&quot;, &quot;filtering&quot;)
 require &quot;tempfile&quot;
 
 describe &quot;Filtering&quot; do
+  describe &quot;tokenise&quot; do
+    it &quot;should ignore stop words&quot; do
+      @words = &quot;I told him to ignore most of these&quot;.tokenise
+      @words.should_not be_member(&quot;told&quot;.stem)
+      @words.should_not be_member(&quot;him&quot;.stem)
+      @words.should be_member(&quot;ignore&quot;.stem)
+      @words.should be_member(&quot;these&quot;.stem)
+    end
+
+    it &quot;should honor added stop words&quot; do
+      String.add_stop_words(%w(ignore all these))
+      @words = &quot;I told him to ignore most of these&quot;.tokenise
+      @words.should_not be_member(&quot;told&quot;.stem)
+      @words.should_not be_member(&quot;him&quot;.stem)
+      @words.should_not be_member(&quot;ignore&quot;.stem)
+      @words.should_not be_member(&quot;these&quot;.stem)
+    end
+
+    it &quot;should honor removed stop words&quot; do
+      String.remove_stop_words(%w(dont not cant couldnt))
+      @words = &quot;I don't want you trying to do things you can't&quot;.tokenise
+      @words.should be_member(&quot;dont&quot;.stem)
+      @words.should be_member(&quot;cant&quot;.stem)
+    end
+
+    it &quot;should tokenise properly&quot; do
+      str = &quot;The fisherman fished a fish out of the fishing holes&quot;
+      expected = %w(fisherman fish fish out fish hole)
+      str.tokenise.should == expected
+      String.n_gram_length = 2
+      expected = %w(fisherman_fish fisherman fish_fish fish fish_out fish out_fish out fish_hole fish hole)
+      str.tokenise.should == expected
+    end
+  end
   describe &quot;get_words&quot; do
 
     before(:each) do
@@ -130,6 +164,11 @@ describe &quot;Filtering&quot; do
       @classifier.minimums[:bad] = 0.8
       @classifier.classify(&quot;quick money&quot;).should == :good
     end
+
+	it &quot;should handle long documents w/out barfing&quot; do
+		@classifier.classify(&quot;make penis fast &quot; * 100000).should == :bad
+	end
+
   end
 
   describe &quot;ActiveRecord persistence&quot; do
@@ -161,5 +200,6 @@ describe &quot;Filtering&quot; do
       @classifier.feature_count(&quot;dog&quot;, :good).should == 1.0
       @classifier.feature_count(&quot;dog&quot;, :bad).should == 0.0
     end
+		
   end
-end
\ No newline at end of file
+end</diff>
      <filename>spec/filtering_spec.rb</filename>
    </modified>
  </modified>
  <removed type="array"/>
  <parents type="array">
    <parent>
      <id>6fa6608acc023e95be1579ed72b63bdd612b9e96</id>
    </parent>
  </parents>
  <author>
    <name>Roy Pardee</name>
    <email>roy@kif.(none)</email>
  </author>
  <url>http://github.com/alexvollmer/pci4r/commit/4d88b00958defd05827217b27bfb4daf6b0a7084</url>
  <id>4d88b00958defd05827217b27bfb4daf6b0a7084</id>
  <committed-date>2009-04-12T12:40:49-07:00</committed-date>
  <authored-date>2009-04-12T12:40:49-07:00</authored-date>
  <message>Added reference to Stemmer lib to hoe...spec?</message>
  <tree>a2e9891e9f0def52dadcc97fff62c865cf8c84dd</tree>
  <committer>
    <name>Roy Pardee</name>
    <email>roy@kif.(none)</email>
  </committer>
</commit>
