Skip to content

Commit

Permalink
implemented pruning of subsumed ngrams
Browse files Browse the repository at this point in the history
  • Loading branch information
proycon committed Apr 10, 2020
1 parent 317537b commit 33957ed
Show file tree
Hide file tree
Showing 6 changed files with 65 additions and 5 deletions.
1 change: 1 addition & 0 deletions colibricore_classes.in.pxd
Expand Up @@ -331,6 +331,7 @@ cdef extern from "patternmodel.h":
bool DOREMOVEFLEXGRAMS
bool DORESET
int PRUNENONSUBSUMED
int PRUNESUBSUMED

cdef cppclass IndexedDataHandler:
unsigned int count(IndexedData &)
Expand Down
7 changes: 6 additions & 1 deletion colibricore_wrapper.in.pyx
Expand Up @@ -899,7 +899,8 @@ cdef class PatternModelOptions:
* DOREMOVESKIPGRAMS - Remove skipgrams from the model
* DOREMOVEFLEXGRAMS - Remove flexgrams from the model
* DORESET - Reset all counts before training
* PRUNENONSUBSUMED - Prune all n-grams up to this length that are not subsumed by higher-order ngrams
* PRUNENONSUBSUMED - Prune all n-grams up to this length that are *NOT* subsumed by higher-order ngrams
* PRUNESUBSUMED - Prune all n-grams up to this length that are subsumed by higher-order ngrams
* DEBUG
* QUIET (default: False)
Expand Down Expand Up @@ -950,6 +951,8 @@ cdef class PatternModelOptions:
self.coptions.QUIET = value
elif key == 'PRUNENONSUBSUMED':
self.coptions.PRUNENONSUBSUMED = value
elif key == 'PRUNESUBSUMED':
self.coptions.PRUNESUBSUMED = value
else:
raise KeyError

Expand Down Expand Up @@ -990,6 +993,8 @@ cdef class PatternModelOptions:
return self.coptions.QUIET
elif key == 'PRUNENONSUBSUMED':
return self.coptions.PRUNENONSUBSUMED
elif key == 'PRUNESUBSUMED':
return self.coptions.PRUNESUBSUMED
else:
raise KeyError

Expand Down
2 changes: 1 addition & 1 deletion configure.ac
Expand Up @@ -4,7 +4,7 @@
# $URL: https://ilk.uvt.nl/svn/sources/ucto/trunk/configure.ac $

AC_PREREQ([2.67])
AC_INIT([colibri-core],[2.5.3],[proycon@anaproy.nl])
AC_INIT([colibri-core],[2.5.4],[proycon@anaproy.nl])
AC_CONFIG_SRCDIR([configure.ac])
AC_CONFIG_MACRO_DIR([m4])
AC_CONFIG_HEADER([config.h])
Expand Down
57 changes: 55 additions & 2 deletions include/patternmodel.h
Expand Up @@ -141,7 +141,8 @@ class PatternModelOptions {
bool DOREVERSEINDEX; ///< Obsolete now, only here for backward-compatibility with v1
bool DOPATTERNPERLINE; ///< Assume each line contains one integral pattern, rather than actively extracting all subpatterns on a line (default: false)

int PRUNENONSUBSUMED; //< Prune all n-grams that are not subsumed by higher-order ngrams
int PRUNENONSUBSUMED; ///< Prune all n-grams that are **NOT** subsumed by higher-order ngrams
int PRUNESUBSUMED; ///< Prune all n-grams that are subsumed by higher-order ngrams; the value is the highest order n whose (n-1)-gram sub-patterns are pruned (capped at MAXLENGTH), 0 (the default) disables this

bool DOREMOVEINDEX; ///< Do not load index information (for indexed models), loads just the patterns without any counts
bool DOREMOVENGRAMS; ///< Remove n-grams from the model upon loading it
Expand Down Expand Up @@ -178,7 +179,8 @@ class PatternModelOptions {
DOREMOVESKIPGRAMS = false;
DOREMOVEFLEXGRAMS = false;

PRUNENONSUBSUMED = false;
PRUNENONSUBSUMED = 0;
PRUNESUBSUMED = 0;

DEBUG = false;
QUIET = false;
Expand Down Expand Up @@ -210,6 +212,7 @@ class PatternModelOptions {
DOREMOVEFLEXGRAMS = ref.DOREMOVEFLEXGRAMS;

PRUNENONSUBSUMED = ref.PRUNENONSUBSUMED;
PRUNESUBSUMED = ref.PRUNESUBSUMED;

DEBUG = ref.DEBUG;
QUIET = ref.QUIET;
Expand Down Expand Up @@ -1209,6 +1212,28 @@ class PatternModel: public MapType, public PatternModelInterface {
if (!options.QUIET) std::cerr << " pruned " << prunednonsubsumed << " non-subsumed " << (n-1) << "-grams" << std::endl;
}
}
// Prune subsumed n-grams: for every order n from 2 up to PRUNESUBSUMED, remove the
// (n-1)-grams that occur inside some n-gram currently in the model. Skipped when PRUNESUBSUMED is 0.
if (options.PRUNESUBSUMED) {
if (!options.QUIET) std::cerr << "Pruning subsumed n-grams" << std::endl;
int end_n = options.PRUNESUBSUMED;
// never go beyond the maximum pattern length
if ((end_n > options.MAXLENGTH)) end_n = options.MAXLENGTH;
for (int n = 2; n <= end_n; n++) {
// (n-1)-grams found inside at least one n-gram of the model; rebuilt for every order
std::unordered_set<Pattern> subsumed;
unsigned int prunedsubsumed = 0;
// first pass: walk the whole model and gather the sub-patterns of every n-gram
PatternModel::iterator iter = this->begin();
while (iter != this->end()) {
const unsigned int pattern_n = iter->first.n();
if (pattern_n == (unsigned int) n) {
// NOTE(review): subngrams is declared outside this block; from its use below it is a std::vector<PatternPointer> reused as scratch space - confirm
subngrams.clear();
// presumably fills subngrams with all (n-1)-grams contained in this pattern - confirm against Pattern::ngrams
iter->first.ngrams(subngrams, n-1);
// store as Pattern, since that is the element type of the set passed to pruneinset()
for (std::vector<PatternPointer>::iterator iter2 = subngrams.begin(); iter2 != subngrams.end(); iter2++) subsumed.insert(Pattern(*iter2));
}
iter++;
};
// second pass: erase the collected patterns, restricted to patterns of size n-1
prunedsubsumed += this->pruneinset(subsumed, n-1);
if (!options.QUIET) std::cerr << " pruned " << prunedsubsumed << " subsumed " << (n-1) << "-grams" << std::endl;
}

}
if ((options.MINLENGTH > 1) && (options.DOSKIPGRAMS || options.DOSKIPGRAMS_EXHAUSTIVE)) {
unsigned int pruned = this->prunebylength(options.MINLENGTH-1);
if (!options.QUIET) std::cerr << " pruned " << pruned << " patterns below minimum length (" << options.MINLENGTH << ")" << std::endl;
Expand Down Expand Up @@ -2081,6 +2106,34 @@ class PatternModel: public MapType, public PatternModelInterface {
return pruned;
}

/**
 * Prune all patterns that are in the specified set.
 * @param s The set containing the patterns to prune
 * @param _n The size constraint: only patterns of exactly this size are pruned (0, the default, means no size constraint)
 * @return the number of distinct patterns pruned
 */
unsigned int pruneinset(const std::unordered_set<Pattern> & s, int _n = 0) {
    unsigned int pruned = 0;
    if (s.empty()) {
        //nothing can match, no need to scan the model
        return pruned;
    }
    PatternModel::iterator iter = this->begin();
    while (iter != this->end()) {
        //bind by reference rather than copying every pattern in the model
        const PatternType & pattern = iter->first;
        const bool sizematch = (_n == 0) || (pattern.n() == (unsigned int) _n);
        if (sizematch && (s.find(pattern) != s.end())) {
            //found in set: continue from the iterator returned by erase();
            //'pattern' refers to the erased element and must not be used past this point
            iter = this->erase(iter);
            pruned++;
        } else {
            ++iter;
        }
    }
    return pruned;
}

/**
* Prune all patterns that are not in the second model
* @return the number of distinct patterns pruned
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Expand Up @@ -183,7 +183,7 @@ def read(fname):
license = "GPL",
keywords = "nlp computational_linguistics frequency ngram skipgram pmi cooccurrence linguistics",
long_description=read('README.rst'),
version = '2.5.3',
version = '2.5.4',
ext_modules = extensions,
cmdclass = {'build_ext': build_ext},
classifiers=[
Expand Down
1 change: 1 addition & 0 deletions src/patternmodeller.cpp
Expand Up @@ -47,6 +47,7 @@ void usage() {
cerr << "\t-W|--wordthreshold <number> Word occurrence threshold (secondary threshold): only count patterns in which the words/unigrams occur at least this many times, only effective when the primary " << endl;
cerr << "\t occurrence threshold (-t) is lower than this threshold (default: disabled)" << endl;
cerr << "\t-p|--prune <number> Prune all lower-order n-grams below the specified order that are *NOT* subsumed by higher order n-grams (default: 0, disabled). Only effective when used with -l, usually set to equal values" << endl;
cerr << "\t --prunesubsumed <number> Prune all lower-order n-grams below the specified order that are subsumed by higher order n-grams (default: 0, disabled). Only effective when used with -l, usually set to equal values" << endl;
cerr << "\t-s|--skipgrams Compute skipgrams (costs extra memory and time)" << endl;
cerr << "\t-y|--skipthreshold <number> Occurrence threshold for skipgrams (overrides -t for skipgrams, defaults to -t). Skipgrams occurring less than this will be pruned. Value must be equal to or higher than -t." << endl;
cerr << "\t-T|--skiptypes <number> Skip type threshold (for use with -s): only skipgrams with at least x possible types for the skip will be considered, otherwise the skipgram " << endl;
Expand Down

0 comments on commit 33957ed

Please sign in to comment.