added chunking to Similarity
piskvorky committed Jun 13, 2011
1 parent 4c5cf51 commit 482c73f
Showing 5 changed files with 23 additions and 14 deletions.
4 changes: 2 additions & 2 deletions src/gensim/corpora/dictionary.py
@@ -49,7 +49,7 @@ def __init__(self, documents=None):

    def __getitem__(self, tokenid):
        if len(self.id2token) != len(self.token2id):
-           # the word->id mapping has changed (presumably via addDocuments);
+           # the word->id mapping has changed (presumably via add_documents);
            # recompute id->word accordingly
            self.id2token = dict((v, k) for k, v in self.token2id.iteritems())
        return self.id2token[tokenid] # will throw for non-existent ids
@@ -84,7 +84,7 @@ def add_documents(self, documents):
        This is only a convenience wrapper for calling `doc2bow` on each document
        with `allow_update=True`.
-       >>> print Dictionary.fromDocuments(["máma mele maso".split(), "ema má máma".split()])
+       >>> print Dictionary(["máma mele maso".split(), "ema má máma".split()])
        Dictionary(5 unique tokens)
        """
        for docno, document in enumerate(documents):
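As a quick aside, a minimal usage sketch of the renamed API, assuming gensim's public `Dictionary` class and reusing the sample documents from the docstring above; it is illustrative only and not part of the commit:

from gensim.corpora import Dictionary

# build a mapping straight from tokenized documents (the constructor now does
# what the old fromDocuments classmethod used to)
dictionary = Dictionary(["máma mele maso".split(), "ema má máma".split()])
print(dictionary)  # Dictionary(5 unique tokens)

# extend the mapping later; this is the add_documents referred to above
dictionary.add_documents(["ema má mísu".split()])

# token lookup goes through __getitem__, which lazily rebuilds id2token
# whenever the word->id mapping has changed
print(dictionary[0])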
6 changes: 3 additions & 3 deletions src/gensim/models/ldamodel.py
@@ -145,7 +145,7 @@ class LdaModel(interfaces.TransformationABC):
    The constructor estimates Latent Dirichlet Allocation model parameters based
    on a training corpus:
-   >>> lda = LdaModel(corpus, numTopics=10)
+   >>> lda = LdaModel(corpus, num_topics=10)
    You can then infer topic distributions on new, unseen documents, with
@@ -168,7 +168,7 @@ def __init__(self, corpus=None, num_topics=100, id2word=None, distributed=False,
        printing.
        `alpha` and `eta` are hyperparameters on document-topic (theta) and
-       topic-word (lambda) distributions. Both default to a symmetric 1.0/numTopics
+       topic-word (lambda) distributions. Both default to a symmetric 1.0/num_topics
        (but can be set to a vector, for assymetric priors).
        Turn on `distributed` to force distributed computing (see the web tutorial
@@ -254,7 +254,7 @@ def __init__(self, corpus=None, num_topics=100, id2word=None, distributed=False,
    def setstate(self, state, compute_diff=False):
        """
        Reset word-topic mixtures lambda (and beta) using collected counts of
-       sufficient statistics (a `numTopics x numTerms` matrix).
+       sufficient statistics (a `num_topics x num_terms` matrix).
        Return the aggregate amount of change in topics, log(old_lambda-new_lambda).
        """
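For reference, a short sketch of the renamed parameters from the docstrings above, with a tiny placeholder corpus; the tokens and topic count are arbitrary and the printed output is only indicative:

from gensim.corpora import Dictionary
from gensim.models import LdaModel

texts = [["human", "computer", "interaction"],
         ["graph", "minors", "survey"],
         ["human", "system", "survey"]]
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

# num_topics replaces numTopics; alpha and eta default to a symmetric 1.0/num_topics
lda = LdaModel(corpus, num_topics=2, id2word=dictionary)

# infer the topic distribution of a new, unseen document
new_doc = dictionary.doc2bow(["human", "graph", "survey"])
print(lda[new_doc])  # e.g. [(0, 0.63), (1, 0.37)]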
4 changes: 2 additions & 2 deletions src/gensim/models/lsimodel.py
@@ -198,7 +198,7 @@ class LsiModel(interfaces.TransformationABC):
    1. constructor, which initializes the projection into latent topics space,
    2. the ``[]`` method, which returns representation of any input document in the
       latent space,
-   3. the `addDocuments()` method, which allows for incrementally updating the model with new documents.
+   3. `add_documents()` for incrementally updating the model with new documents.
    Model persistency is achieved via its load/save methods.
@@ -216,7 +216,7 @@ def __init__(self, corpus=None, num_topics=200, id2word=None, chunks=20000,
        LSI transformation is available at any point.
        If you specify a `corpus`, it will be used to train the model. See the
-       method `addDocuments` for a description of the `chunks` and `decay` parameters.
+       method `add_documents` for a description of the `chunks` and `decay` parameters.
        Turn `onepass` off to force a multi-pass stochastic algorithm.
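Again for reference, a minimal sketch of the three entry points listed in the class docstring above (constructor, ``[]``, add_documents), using a placeholder corpus; it assumes the public LsiModel API and is not part of the commit:

from gensim.corpora import Dictionary
from gensim.models import LsiModel

texts = [["tree", "graph", "minors"], ["graph", "survey"], ["human", "interface"]]
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

# 1. the constructor trains the projection right away
lsi = LsiModel(corpus, num_topics=2, id2word=dictionary)

# 2. [] folds any document into the latent space
print(lsi[dictionary.doc2bow(["graph", "minors"])])

# 3. add_documents (formerly addDocuments) updates the model incrementally
lsi.add_documents([dictionary.doc2bow(["human", "graph", "survey"])])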
2 changes: 1 addition & 1 deletion src/gensim/models/rpmodel.py
@@ -49,7 +49,7 @@ def __init__(self, corpus, id2word=None, num_topics=300):


    def __str__(self):
-       return "RpModel(numTerms=%s, numTopics=%s)" % (self.num_terms, self.num_topics)
+       return "RpModel(num_terms=%s, num_topics=%s)" % (self.num_terms, self.num_topics)


    def initialize(self, corpus):
21 changes: 15 additions & 6 deletions src/gensim/similarities/docsim.py
@@ -96,7 +96,7 @@ class Similarity(interfaces.SimilarityABC):
    The shards themselves are simply stored as files to disk and mmap'ed back as needed.
    """
-   def __init__(self, output_prefix, corpus, num_features, num_best=None, shardsize=5000):
+   def __init__(self, output_prefix, corpus, num_features, num_best=None, chunks=512, shardsize=5000):
        """
        Construct the index from `corpus`. The index can be later extended by calling
        the `add_documents` method. Documents are split into shards of `shardsize`
@@ -127,6 +127,7 @@ def __init__(self, output_prefix, corpus, num_features, num_best=None, shardsize
        self.num_features = num_features
        self.num_best = num_best
        self.normalize = True
+       self.chunks = int(chunks)
        self.shardsize = shardsize
        self.shards = []
        self.fresh_docs, self.fresh_nnz = [], 0
@@ -255,12 +256,20 @@ def __iter__(self):
        self.normalize = False

        for shard in self.shards:
-           # use the entire shard index as a gigantic query!
+           # split each shard index into smaller chunks (of size self.chunks) and
+           # use each chunk as a query
            query = shard.get_index().index
-           # TODO: or maybe chunk it into smaller pieces? better for memory with
-           # very large indexes (memory = |query| * |index| * 4)
-           for sims in self[query]:
-               yield sims
+           for chunk_start in xrange(0, query.shape[0], self.chunks):
+               # scipy.sparse doesn't allow slicing beyond real size of the matrix
+               # (unlike numpy). so, clip the end of the chunk explicitly to make
+               # scipy.sparse happy
+               chunk_end = min(query.shape[0], chunk_start + self.chunks)
+               chunk = query[chunk_start : chunk_end] # create a view
+               if chunk.shape[0] > 1:
+                   for sim in self[chunk]:
+                       yield sim
+               else:
+                   yield self[chunk]
        self.normalize = norm
#endclass Similarity

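To show what the new chunking buys, here is a hedged sketch of building a Similarity index and iterating over it all-against-all. The path, corpus and chunk size are placeholders, the constructor signature is the one introduced in this commit (later gensim releases renamed some of these arguments), and the example has not been run against this exact revision. Per the removed TODO above, querying with a whole shard needs roughly |shard| * |index| * 4 bytes at once; with chunking each batch is only `chunks` rows, so peak memory drops to about chunks * |index| * 4 bytes, while iteration still yields one similarity vector per indexed document:

from gensim.corpora import Dictionary
from gensim.similarities import Similarity

texts = [["graph", "minors", "survey"], ["human", "interface", "computer"],
         ["graph", "trees"], ["human", "system", "survey"]]
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

# chunks controls how many index rows are batched into a single query during
# iteration; shardsize keeps its default of 5000 here
index = Similarity('/tmp/gensim_index', corpus,
                   num_features=len(dictionary), chunks=256)

# all-against-all similarities, computed chunk by chunk under the hood
for sims in index:
    print(sims)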
