0
@@ -101,14 +101,15 @@ std::vector<std::string> NWordTagger::execute( const char *text, short max )cons
0
for( short j = 1; (j <= this->nwords) && ((i+j) < words.size()); ++j ) {
0
matched = this->tags.find( match_word );
0
if( matched != this->tags.end() ){
0
- //printf( "word: %d:(%s->%s)\n", i, match_word.c_str(), matched->second.c_str() );
0
std::map<std::string, int>::iterator mloc = matched_tags.find( matched->second );
0
if( mloc == matched_tags.end() ) {
0
matched_tags[matched->second] = 1; // count 1
0
+ //printf( "word: %d:(%s->%s) %d, hits: 1\n", i, match_word.c_str(), matched->second.c_str(), j );
0
if( max_count < mloc->second ) { max_count = mloc->second; }
0
+ //printf( "word: %d:(%s->%s) %d, hits: %d\n", i, match_word.c_str(), matched->second.c_str(), j, mloc->second );
0
// stem each word and compare against our tag bank
0
@@ -129,11 +130,22 @@ std::vector<std::string> NWordTagger::execute( const char *text, short max )cons
0
+ std::vector< std::string > reduced_tags;
0
+ // now we have a list of tags that match within the document text; check whether we need to reduce the tags
0
+ if( matched_tags.size() < max ) {
0
+ // prepare the return vector
0
+ for( std::map<std::string, int>::iterator mloc = matched_tags.begin(); mloc != matched_tags.end(); ++mloc ){
0
+ reduced_tags.push_back( mloc->first );
0
// now that we have all the matched tags, reduce to max using the tag frequency as the reduction measure
0
std::vector< std::pair<std::string,int> > sorted_tags;
0
- //printf( "max frequency: %d\n", max_count );
0
+ //printf( "max frequency: %d, total tagged: %d, reducing to %d\n", max_count, matched_tags.size(), max );
0
for( std::map<std::string, int>::iterator mloc = matched_tags.begin(); mloc != matched_tags.end(); ++mloc ){
0
//printf( "word: %s, frequency: %d\n", mloc->first.c_str(), mloc->second );
0
sorted_tags.push_back(*mloc);
0
@@ -142,14 +154,13 @@ std::vector<std::string> NWordTagger::execute( const char *text, short max )cons
0
// sort the tags in frequency order
0
std::sort( sorted_tags.begin(), sorted_tags.end(), WordComparitor() );
0
- std::vector< std::string > reduced_tags;
0
std::vector< std::pair<std::string, int> >::iterator mloc;
0
for(mloc = sorted_tags.begin(); mloc != sorted_tags.end(); ++mloc ) {
0
std::pair< std::string, int > word_freq = *mloc;
0
- // printf( "word: %s, frequency: %d\n", word_freq.first.c_str(), word_freq.second );
0
- //printf( "word: %s, frequency: %d\n", mloc->first.c_str(), mloc->second );
0
+ printf( "word: %s, frequency: %d\n", word_freq.first.c_str(), word_freq.second );
0
+ printf( "word: %s, frequency: %d\n", mloc->first.c_str(), mloc->second );
0
if( word_freq.second < max_count ) {
0
sorted_tags.erase( mloc );
Comments
No one has commented yet.