public
Description: A Simple Ruby Rule-Based Part of Speech Tagger based on Eric Brill's Tagger
Clone URL: git://github.com/taf2/rb-brill-tagger.git
Avoid dropping possible tags when the number of matched tags is very
small. For example, if the number of resulting tags is less than the max
(defaults to 10), it would always return n-1 matched tags.
taf2 (author)
Wed Aug 27 13:56:21 -0700 2008
commit  f6d9f4ee06c7710cc67397d6fcd724ce7fa762f2
tree    51f3bca0fbee36216e9ff10a954d582a97da97be
parent  6783c87ab172a8b928f6601d1a71015ae9d1ea9c
...
101
102
103
104
105
106
107
 
108
109
110
111
 
112
113
114
...
129
130
131
 
 
 
 
 
 
 
 
 
 
 
132
133
134
135
136
 
137
138
139
...
142
143
144
145
146
147
148
149
150
151
152
 
 
153
154
155
...
101
102
103
 
104
105
106
107
108
109
110
111
112
113
114
115
...
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
 
148
149
150
151
...
154
155
156
 
157
158
159
160
161
 
 
162
163
164
165
166
0
@@ -101,14 +101,15 @@ std::vector<std::string> NWordTagger::execute( const char *text, short max )cons
0
     for( short j = 1; (j <= this->nwords) && ((i+j) < words.size()); ++j ) {
0
       matched = this->tags.find( match_word );
0
       if( matched != this->tags.end() ){
0
- //printf( "word: %d:(%s->%s)\n", i, match_word.c_str(), matched->second.c_str() );
0
         std::map<std::string, int>::iterator mloc = matched_tags.find( matched->second );
0
         if( mloc == matched_tags.end() ) {
0
           matched_tags[matched->second] = 1; // count 1
0
+ //printf( "word: %d:(%s->%s) %d, hits: 1\n", i, match_word.c_str(), matched->second.c_str(), j );
0
         }
0
         else {
0
           mloc->second++;
0
           if( max_count < mloc->second ) { max_count = mloc->second; }
0
+ //printf( "word: %d:(%s->%s) %d, hits: %d\n", i, match_word.c_str(), matched->second.c_str(), j, mloc->second );
0
         }
0
       }
0
       // stem each word and compare against our tag bank
0
@@ -129,11 +130,22 @@ std::vector<std::string> NWordTagger::execute( const char *text, short max )cons
0
       }
0
     }
0
   }
0
+
0
+ std::vector< std::string > reduced_tags;
0
+
0
+ // now we have a list of tags that match within the document text, check if we need to reduce the tags
0
+ if( matched_tags.size() < max ) {
0
+ // prepare the return vector
0
+ for( std::map<std::string, int>::iterator mloc = matched_tags.begin(); mloc != matched_tags.end(); ++mloc ){
0
+ reduced_tags.push_back( mloc->first );
0
+ }
0
+ return reduced_tags;
0
+ }
0
 
0
   // now that we have all the matched tags reduce to max using the tag frequency as a reduction measure
0
   std::vector< std::pair<std::string,int> > sorted_tags;
0
 
0
- //printf( "max frequency: %d\n", max_count );
0
+ //printf( "max frequency: %d, total tagged: %d, reducing to %d\n", max_count, matched_tags.size(), max );
0
   for( std::map<std::string, int>::iterator mloc = matched_tags.begin(); mloc != matched_tags.end(); ++mloc ){
0
     //printf( "word: %s, frequency: %d\n", mloc->first.c_str(), mloc->second );
0
     sorted_tags.push_back(*mloc);
0
@@ -142,14 +154,13 @@ std::vector<std::string> NWordTagger::execute( const char *text, short max )cons
0
   // sort the tags in frequency order
0
   std::sort( sorted_tags.begin(), sorted_tags.end(), WordComparitor() );
0
 
0
- std::vector< std::string > reduced_tags;
0
 
0
   std::vector< std::pair<std::string, int> >::iterator mloc;
0
   do {
0
     for(mloc = sorted_tags.begin(); mloc != sorted_tags.end(); ++mloc ) {
0
       std::pair< std::string, int > word_freq = *mloc;
0
- // printf( "word: %s, frequency: %d\n", word_freq.first.c_str(), word_freq.second );
0
- //printf( "word: %s, frequency: %d\n", mloc->first.c_str(), mloc->second );
0
+ printf( "word: %s, frequency: %d\n", word_freq.first.c_str(), word_freq.second );
0
+ printf( "word: %s, frequency: %d\n", mloc->first.c_str(), mloc->second );
0
       if( word_freq.second < max_count ) {
0
         sorted_tags.erase( mloc );
0
         break;
...
16
17
18
 
 
 
 
 
 
 
 
19
20
21
...
16
17
18
19
20
21
22
23
24
25
26
27
28
29
0
@@ -16,6 +16,14 @@ class TestWordTagger < Test::Unit::TestCase
0
     puts "Duration: #{Time.now - timer} sec"
0
   end
0
 
0
+ def test_sample_bug
0
+ tags = ["foo", "bar", "baz", "squishy", "yummy"]
0
+ txt = 'This is some sample text. Foo walked into a bar. The bartender said "What can I get you?" Foo said he wanted something yummy - like a baz.'
0
+ tagger = Word::Tagger.new tags, :words => 4
0
+ result_tags = tagger.execute( txt )
0
+ assert_equal ["bar", "baz", "foo", "yummy"], result_tags
0
+ end
0
+
0
   def test_ngram_size3
0
     timer = Time.now
0
     text = "This body of text contains something like ventricular septal defect"

Comments

    No one has commented yet.