From 33957eddbeae7c11ef5cf33cae0846363c9517c8 Mon Sep 17 00:00:00 2001 From: Maarten van Gompel Date: Fri, 10 Apr 2020 20:12:46 +0200 Subject: [PATCH] implemented pruning of subsumed ngrams --- colibricore_classes.in.pxd | 1 + colibricore_wrapper.in.pyx | 7 ++++- configure.ac | 2 +- include/patternmodel.h | 57 ++++++++++++++++++++++++++++++++++++-- setup.py | 2 +- src/patternmodeller.cpp | 1 + 6 files changed, 65 insertions(+), 5 deletions(-) diff --git a/colibricore_classes.in.pxd b/colibricore_classes.in.pxd index 92164000..ad1c6f05 100644 --- a/colibricore_classes.in.pxd +++ b/colibricore_classes.in.pxd @@ -331,6 +331,7 @@ cdef extern from "patternmodel.h": bool DOREMOVEFLEXGRAMS bool DORESET int PRUNENONSUBSUMED + int PRUNESUBSUMED cdef cppclass IndexedDataHandler: unsigned int count(IndexedData &) diff --git a/colibricore_wrapper.in.pyx b/colibricore_wrapper.in.pyx index e5711182..c76c4f53 100644 --- a/colibricore_wrapper.in.pyx +++ b/colibricore_wrapper.in.pyx @@ -899,7 +899,8 @@ cdef class PatternModelOptions: * DOREMOVESKIPGRAMS - Remove skipgrams from the model * DOREMOVEFLEXGRAMS - Remove flexgrams from the model * DORESET - Reset all counts before training - * PRUNENONSUBSUMED - Prune all n-grams up to this length that are not subsumed by higher-order ngrams + * PRUNENONSUBSUMED - Prune all n-grams up to this length that are *NOT* subsumed by higher-order ngrams + * PRUNESUBSUMED - Prune all n-grams up to this length that are subsumed by higher-order ngrams * DEBUG * QUIET (default: False) @@ -950,6 +951,8 @@ cdef class PatternModelOptions: self.coptions.QUIET = value elif key == 'PRUNENONSUBSUMED': self.coptions.PRUNENONSUBSUMED = value + elif key == 'PRUNESUBSUMED': + self.coptions.PRUNESUBSUMED = value else: raise KeyError @@ -990,6 +993,8 @@ cdef class PatternModelOptions: return self.coptions.QUIET elif key == 'PRUNENONSUBSUMED': return self.coptions.PRUNENONSUBSUMED + elif key == 'PRUNESUBSUMED': + return self.coptions.PRUNESUBSUMED else: raise 
KeyError diff --git a/configure.ac b/configure.ac index 07eec28a..6c572ef6 100644 --- a/configure.ac +++ b/configure.ac @@ -4,7 +4,7 @@ # $URL: https://ilk.uvt.nl/svn/sources/ucto/trunk/configure.ac $ AC_PREREQ([2.67]) -AC_INIT([colibri-core],[2.5.3],[proycon@anaproy.nl]) +AC_INIT([colibri-core],[2.5.4],[proycon@anaproy.nl]) AC_CONFIG_SRCDIR([configure.ac]) AC_CONFIG_MACRO_DIR([m4]) AC_CONFIG_HEADER([config.h]) diff --git a/include/patternmodel.h b/include/patternmodel.h index abdaa574..282df9ce 100644 --- a/include/patternmodel.h +++ b/include/patternmodel.h @@ -141,7 +141,8 @@ class PatternModelOptions { bool DOREVERSEINDEX; ///< Obsolete now, only here for backward-compatibility with v1 bool DOPATTERNPERLINE; ///< Assume each line contains one integral pattern, rather than actively extracting all subpatterns on a line (default: false) - int PRUNENONSUBSUMED; //< Prune all n-grams that are not subsumed by higher-order ngrams + int PRUNENONSUBSUMED; ///< Prune all n-grams below this order that are **NOT** subsumed by higher-order ngrams (0 = disabled) + int PRUNESUBSUMED; ///< Prune all n-grams below this order that are subsumed by higher-order ngrams (0 = disabled) bool DOREMOVEINDEX; ///< Do not load index information (for indexed models), loads just the patterns without any counts bool DOREMOVENGRAMS; ///< Remove n-grams from the model upon loading it @@ -178,7 +179,8 @@ class PatternModelOptions { DOREMOVESKIPGRAMS = false; DOREMOVEFLEXGRAMS = false; - PRUNENONSUBSUMED = false; + PRUNENONSUBSUMED = 0; + PRUNESUBSUMED = 0; DEBUG = false; QUIET = false; @@ -210,6 +212,7 @@ class PatternModelOptions { DOREMOVEFLEXGRAMS = ref.DOREMOVEFLEXGRAMS; PRUNENONSUBSUMED = ref.PRUNENONSUBSUMED; + PRUNESUBSUMED = ref.PRUNESUBSUMED; DEBUG = ref.DEBUG; QUIET = ref.QUIET; @@ -1209,6 +1212,28 @@ class PatternModel: public MapType, public PatternModelInterface { if (!options.QUIET) std::cerr << " pruned " << prunednonsubsumed << " non-subsumed " << (n-1) << "-grams" << std::endl; } } + if (options.PRUNESUBSUMED) { + if (!options.QUIET) 
std::cerr << "Pruning subsumed n-grams" << std::endl; + int end_n = options.PRUNESUBSUMED; + if ((end_n > options.MAXLENGTH)) end_n = options.MAXLENGTH; + for (int n = 2; n <= end_n; n++) { + std::unordered_set subsumed; + unsigned int prunedsubsumed = 0; + PatternModel::iterator iter = this->begin(); + while (iter != this->end()) { + const unsigned int pattern_n = iter->first.n(); + if (pattern_n == (unsigned int) n) { + subngrams.clear(); + iter->first.ngrams(subngrams, n-1); + for (std::vector::iterator iter2 = subngrams.begin(); iter2 != subngrams.end(); iter2++) subsumed.insert(Pattern(*iter2)); + } + iter++; + }; + prunedsubsumed += this->pruneinset(subsumed, n-1); + if (!options.QUIET) std::cerr << " pruned " << prunedsubsumed << " subsumed " << (n-1) << "-grams" << std::endl; + } + + } if ((options.MINLENGTH > 1) && (options.DOSKIPGRAMS || options.DOSKIPGRAMS_EXHAUSTIVE)) { unsigned int pruned = this->prunebylength(options.MINLENGTH-1); if (!options.QUIET) std::cerr << " pruned " << pruned << " patterns below minimum length (" << options.MINLENGTH << ")" << std::endl; @@ -2081,6 +2106,34 @@ class PatternModel: public MapType, public PatternModelInterface { return pruned; } + /** + * Prune all patterns that are in the specified set. 
+ * @param s The set containing the patterns to prune + * @param _n The size constraint, limit to patterns of this size only (set to 0 for no constraint) + * @return the number of distinct patterns pruned + */ + unsigned int pruneinset(const std::unordered_set & s, int _n) { + unsigned int pruned = 0; + if (s.empty()) { + return pruned; + } + PatternModel::iterator iter = this->begin(); + while (iter != this->end()) { + const PatternType pattern = iter->first; + if ( (_n == 0) || (pattern.n() == (unsigned int) _n) ) { + if (s.find(pattern) != s.end()) { + //found in set: resume from the iterator returned by erase() instead of incrementing + iter = this->erase(iter); + pruned++; + continue; + } + } + iter++; + }; + + return pruned; + } + /** * Prune all patterns that are not in the second model * @return the number of distinct patterns pruned diff --git a/setup.py b/setup.py index 0e69b25a..f94e2a4c 100755 --- a/setup.py +++ b/setup.py @@ -183,7 +183,7 @@ def read(fname): license = "GPL", keywords = "nlp computational_linguistics frequency ngram skipgram pmi cooccurrence linguistics", long_description=read('README.rst'), - version = '2.5.3', + version = '2.5.4', ext_modules = extensions, cmdclass = {'build_ext': build_ext}, classifiers=[ diff --git a/src/patternmodeller.cpp b/src/patternmodeller.cpp index 0db4f19f..679e6d73 100644 --- a/src/patternmodeller.cpp +++ b/src/patternmodeller.cpp @@ -47,6 +47,7 @@ void usage() { cerr << "\t-W|--wordthreshold Word occurrence threshold (secondary threshold): only count patterns in which the words/unigrams occur at least this many times, only effective when the primary " << endl; cerr << "\t occurrence threshold (-t) is lower than this threshold (default: disabled)" << endl; cerr << "\t-p|--prune Prune all lower-order n-grams below the specified order that are *NOT* subsumed by higher order n-grams (default: 0, disabled). 
Only effective when used with -l, usually set to equal values" << endl; + cerr << "\t --prunesubsumed Prune all lower-order n-grams below the specified order that are subsumed by higher order n-grams (default: 0, disabled). Only effective when used with -l, usually set to equal values" << endl; cerr << "\t-s|--skipgrams Compute skipgrams (costs extra memory and time)" << endl; cerr << "\t-y|--skipthreshold Occurrence threshold for skipgrams (overrides -t for skipgrams, defaults to -t). Skipgrams occurring less than this will be pruned. Value must be equal to or higher than -t." << endl; cerr << "\t-T|--skiptypes Skip type threshold (for use with -s): only skipgrams with at least x possible types for the skip will be considered, otherwise the skipgram " << endl;