Skip to content

Commit

Permalink
Changed the default dense chunk size to 256 in indexing (was: 100)
Browse files Browse the repository at this point in the history
* powers of 2 give the best performance, I guess due to better cache alignment
  • Loading branch information
piskvorky committed May 19, 2011
1 parent 5eec39d commit a02ad76
Show file tree
Hide file tree
Showing 2 changed files with 3 additions and 3 deletions.
2 changes: 1 addition & 1 deletion src/gensim/similarities/docsim.py
Expand Up @@ -88,7 +88,7 @@ class MatrixSimilarity(interfaces.SimilarityABC):
The matrix is internally stored as a numpy array.
"""
- def __init__(self, corpus, numBest=None, dtype=numpy.float32, numFeatures=None, chunks=100):
+ def __init__(self, corpus, numBest=None, dtype=numpy.float32, numFeatures=None, chunks=256):
"""
If `numBest` is left unspecified, similarity queries return a full list (one
float for every document in the corpus):
Expand Down
4 changes: 2 additions & 2 deletions src/gensim/test/simspeed.py
Expand Up @@ -50,7 +50,7 @@

logging.info("test 1: similarity of all vs. all (%i documents, %i features)" %
(len(corpus_dense), index_dense.numFeatures))
- for chunks in [0, 1, 5, 10, 100, 200, 500, 1000]:
+ for chunks in [0, 1, 4, 8, 16, 64, 128, 256, 512, 1024]:
index_dense.chunks = chunks
start = time()
# `sims` stores the entire N x N sim matrix in memory!
Expand All @@ -71,7 +71,7 @@

index_dense.numBest = 10
logging.info("test 2: as above, but only ask for top-10 most similar for each document")
- for chunks in [0, 1, 5, 10, 100, 200, 500, 1000]:
+ for chunks in [0, 1, 4, 8, 16, 64, 128, 256, 512, 1024]:
index_dense.chunks = chunks
start = time()
sims = [sim for sim in index_dense]
Expand Down

0 comments on commit a02ad76

Please sign in to comment.