added chunking to Similarity
piskvorky committed Jun 13, 2011
1 parent 4c5cf51 commit 482c73f
Showing 5 changed files with 23 additions and 14 deletions.
4 changes: 2 additions & 2 deletions src/gensim/corpora/dictionary.py
@@ -49,7 +49,7 @@ def __init__(self, documents=None):

    def __getitem__(self, tokenid):
        if len(self.id2token) != len(self.token2id):
-           # the word->id mapping has changed (presumably via addDocuments);
+           # the word->id mapping has changed (presumably via add_documents);
            # recompute id->word accordingly
            self.id2token = dict((v, k) for k, v in self.token2id.iteritems())
        return self.id2token[tokenid] # will throw for non-existent ids
@@ -84,7 +84,7 @@ def add_documents(self, documents):
        This is only a convenience wrapper for calling `doc2bow` on each document
        with `allow_update=True`.
-       >>> print Dictionary.fromDocuments(["máma mele maso".split(), "ema má máma".split()])
+       >>> print Dictionary(["máma mele maso".split(), "ema má máma".split()])
        Dictionary(5 unique tokens)
        """
        for docno, document in enumerate(documents):
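As a quick aside, a minimal usage sketch of the renamed API, assuming gensim's public `Dictionary` class and reusing the sample documents from the docstring above; it is illustrative only and not part of the commit:

from gensim.corpora import Dictionary

# build a mapping straight from tokenized documents (the constructor now does
# what the old fromDocuments classmethod used to)
dictionary = Dictionary(["máma mele maso".split(), "ema má máma".split()])
print(dictionary)  # Dictionary(5 unique tokens)

# extend the mapping later; this is the add_documents referred to above
dictionary.add_documents(["ema má mísu".split()])

# token lookup goes through __getitem__, which lazily rebuilds id2token
# whenever the word->id mapping has changed
print(dictionary[0])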
6 changes: 3 additions & 3 deletions src/gensim/models/ldamodel.py
@@ -145,7 +145,7 @@ class LdaModel(interfaces.TransformationABC):
    The constructor estimates Latent Dirichlet Allocation model parameters based
    on a training corpus:
-   >>> lda = LdaModel(corpus, numTopics=10)
+   >>> lda = LdaModel(corpus, num_topics=10)
    You can then infer topic distributions on new, unseen documents, with
@@ -168,7 +168,7 @@ def __init__(self, corpus=None, num_topics=100, id2word=None, distributed=False,
        printing.
        `alpha` and `eta` are hyperparameters on document-topic (theta) and
-       topic-word (lambda) distributions. Both default to a symmetric 1.0/numTopics
+       topic-word (lambda) distributions. Both default to a symmetric 1.0/num_topics
        (but can be set to a vector, for assymetric priors).
        Turn on `distributed` to force distributed computing (see the web tutorial
@@ -254,7 +254,7 @@ def __init__(self, corpus=None, num_topics=100, id2word=None, distributed=False,
    def setstate(self, state, compute_diff=False):
        """
        Reset word-topic mixtures lambda (and beta) using collected counts of
-       sufficient statistics (a `numTopics x numTerms` matrix).
+       sufficient statistics (a `num_topics x num_terms` matrix).
        Return the aggregate amount of change in topics, log(old_lambda-new_lambda).
        """
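For reference, a short sketch of the renamed parameters from the docstrings above, with a tiny placeholder corpus; the tokens and topic count are arbitrary and the printed output is only indicative:

from gensim.corpora import Dictionary
from gensim.models import LdaModel

texts = [["human", "computer", "interaction"],
         ["graph", "minors", "survey"],
         ["human", "system", "survey"]]
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

# num_topics replaces numTopics; alpha and eta default to a symmetric 1.0/num_topics
lda = LdaModel(corpus, num_topics=2, id2word=dictionary)

# infer the topic distribution of a new, unseen document
new_doc = dictionary.doc2bow(["human", "graph", "survey"])
print(lda[new_doc])  # e.g. [(0, 0.63), (1, 0.37)]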
4 changes: 2 additions & 2 deletions src/gensim/models/lsimodel.py
@@ -198,7 +198,7 @@ class LsiModel(interfaces.TransformationABC):
    1. constructor, which initializes the projection into latent topics space,
    2. the ``[]`` method, which returns representation of any input document in the
       latent space,
-   3. the `addDocuments()` method, which allows for incrementally updating the model with new documents.
+   3. `add_documents()` for incrementally updating the model with new documents.
    Model persistency is achieved via its load/save methods.
@@ -216,7 +216,7 @@ def __init__(self, corpus=None, num_topics=200, id2word=None, chunks=20000,
        LSI transformation is available at any point.
        If you specify a `corpus`, it will be used to train the model. See the
-       method `addDocuments` for a description of the `chunks` and `decay` parameters.
+       method `add_documents` for a description of the `chunks` and `decay` parameters.
        Turn `onepass` off to force a multi-pass stochastic algorithm.
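Again for reference, a minimal sketch of the three entry points listed in the class docstring above (constructor, ``[]``, add_documents), using a placeholder corpus; it assumes the public LsiModel API and is not part of the commit:

from gensim.corpora import Dictionary
from gensim.models import LsiModel

texts = [["tree", "graph", "minors"], ["graph", "survey"], ["human", "interface"]]
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

# 1. the constructor trains the projection right away
lsi = LsiModel(corpus, num_topics=2, id2word=dictionary)

# 2. [] folds any document into the latent space
print(lsi[dictionary.doc2bow(["graph", "minors"])])

# 3. add_documents (formerly addDocuments) updates the model incrementally
lsi.add_documents([dictionary.doc2bow(["human", "graph", "survey"])])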
2 changes: 1 addition & 1 deletion src/gensim/models/rpmodel.py
@@ -49,7 +49,7 @@ def __init__(self, corpus, id2word=None, num_topics=300):


    def __str__(self):
-       return "RpModel(numTerms=%s, numTopics=%s)" % (self.num_terms, self.num_topics)
+       return "RpModel(num_terms=%s, num_topics=%s)" % (self.num_terms, self.num_topics)


    def initialize(self, corpus):
21 changes: 15 additions & 6 deletions src/gensim/similarities/docsim.py
@@ -96,7 +96,7 @@ class Similarity(interfaces.SimilarityABC):
    The shards themselves are simply stored as files to disk and mmap'ed back as needed.
    """
-   def __init__(self, output_prefix, corpus, num_features, num_best=None, shardsize=5000):
+   def __init__(self, output_prefix, corpus, num_features, num_best=None, chunks=512, shardsize=5000):
        """
        Construct the index from `corpus`. The index can be later extended by calling
        the `add_documents` method. Documents are split into shards of `shardsize`
@@ -127,6 +127,7 @@ def __init__(self, output_prefix, corpus, num_features, num_best=None, shardsize
        self.num_features = num_features
        self.num_best = num_best
        self.normalize = True
+       self.chunks = int(chunks)
        self.shardsize = shardsize
        self.shards = []
        self.fresh_docs, self.fresh_nnz = [], 0
@@ -255,12 +256,20 @@ def __iter__(self):
        self.normalize = False

        for shard in self.shards:
-           # use the entire shard index as a gigantic query!
+           # split each shard index into smaller chunks (of size self.chunks) and
+           # use each chunk as a query
            query = shard.get_index().index
-           # TODO: or maybe chunk it into smaller pieces? better for memory with
-           # very large indexes (memory = |query| * |index| * 4)
-           for sims in self[query]:
-               yield sims
+           for chunk_start in xrange(0, query.shape[0], self.chunks):
+               # scipy.sparse doesn't allow slicing beyond real size of the matrix
+               # (unlike numpy). so, clip the end of the chunk explicitly to make
+               # scipy.sparse happy
+               chunk_end = min(query.shape[0], chunk_start + self.chunks)
+               chunk = query[chunk_start : chunk_end] # create a view
+               if chunk.shape[0] > 1:
+                   for sim in self[chunk]:
+                       yield sim
+               else:
+                   yield self[chunk]
        self.normalize = norm
#endclass Similarity

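To show what the new chunking buys, here is a hedged sketch of building a Similarity index and iterating over it all-against-all. The path, corpus and chunk size are placeholders, the constructor signature is the one introduced in this commit (later gensim releases renamed some of these arguments), and the example has not been run against this exact revision. Per the removed TODO above, querying with a whole shard needs roughly |shard| * |index| * 4 bytes at once; with chunking each batch is only `chunks` rows, so peak memory drops to about chunks * |index| * 4 bytes, while iteration still yields one similarity vector per indexed document:

from gensim.corpora import Dictionary
from gensim.similarities import Similarity

texts = [["graph", "minors", "survey"], ["human", "interface", "computer"],
         ["graph", "trees"], ["human", "system", "survey"]]
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

# chunks controls how many index rows are batched into a single query during
# iteration; shardsize keeps its default of 5000 here
index = Similarity('/tmp/gensim_index', corpus,
                   num_features=len(dictionary), chunks=256)

# all-against-all similarities, computed chunk by chunk under the hood
for sims in index:
    print(sims)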
