changed default dense chunk size to 256 in indexing (was: 100)

* powers of 2 give the best performance, I guess due to better cache alignment
Dieterbe · May 19, 2011 · a02ad76 · a02ad76
1 parent 5eec39d
commit a02ad76
Show file tree

Hide file tree

Showing 2 changed files with 3 additions and 3 deletions.
diff --git a/src/gensim/similarities/docsim.py b/src/gensim/similarities/docsim.py
@@ -88,7 +88,7 @@ class MatrixSimilarity(interfaces.SimilarityABC):
 
     The matrix is internally stored as a numpy array.
     """
-    def __init__(self, corpus, numBest=None, dtype=numpy.float32, numFeatures=None, chunks=100):
+    def __init__(self, corpus, numBest=None, dtype=numpy.float32, numFeatures=None, chunks=256):
         """
         If `numBest` is left unspecified, similarity queries return a full list (one
         float for every document in the corpus):

diff --git a/src/gensim/test/simspeed.py b/src/gensim/test/simspeed.py
@@ -50,7 +50,7 @@
 
     logging.info("test 1: similarity of all vs. all (%i documents, %i features)" %
                  (len(corpus_dense), index_dense.numFeatures))
-    for chunks in [0, 1, 5, 10, 100, 200, 500, 1000]:
+    for chunks in [0, 1, 4, 8, 16, 64, 128, 256, 512, 1024]:
         index_dense.chunks = chunks
         start = time()
         # `sims` stores the entire N x N sim matrix in memory!
@@ -71,7 +71,7 @@
 
     index_dense.numBest = 10
     logging.info("test 2: as above, but only ask for top-10 most similar for each document")
-    for chunks in [0, 1, 5, 10, 100, 200, 500, 1000]:
+    for chunks in [0, 1, 4, 8, 16, 64, 128, 256, 512, 1024]:
         index_dense.chunks = chunks
         start = time()
         sims = [sim for sim in index_dense]