In [1]:
import numpy as np
import pickle
import os

In [2]:
# Seed NumPy's legacy global RNG so the random benchmark arrays are reproducible.
np.random.seed(seed=97)

---

# Summary

For an array of shape `(65000, 2048)` and dtype `float32`, the results are:

|      	|  Pickle  	|   Numpy  	| Memmap 	|
|:----:	|:--------:	|:--------:	|:------:	|
| Save 	|  ~9 sec  	|  ~9 sec  	| ~9 sec 	|
| Load 	| ~3.5 sec 	| ~4.3 sec 	| ~11 ms 	|
| Size 	|  0.66 G  	|  0.66 G  	|  0.5 G 	|

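Thus, in this case, `np.memmap` is the best option for saving the array and reading it back from disk. Note that `np.memmap` only maps the file and reads data lazily on access, so its load time does not include reading the whole array into memory.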

---

# Experiments 

In [3]:
# Benchmark array shape: number of row vectors and the dimensionality of each.
num_vectors, dim = 65000, 2048

# Pickle

## Float_64

In [4]:
# float64 baseline: np.random.random yields float64 samples from [0, 1).
path = 'vector_64.pkl'
vectors = np.random.random(size=(num_vectors, dim))
print(path, vectors.shape, type(vectors.dtype))

vector_64.pkl (65000, 2048) <class 'numpy.dtype[float64]'>


### Save array:

In [5]:
# Label the timing measurement reported by the next cell.
print('Pickle save float_64 ({}):'.format(path))

Pickle save float_64 (vector_64.pkl):


In [6]:
%%timeit
with open(path, 'wb') as handle:
    pickle.dump(vectors, handle, protocol=2)

30.2 s ± 9.2 s per loop (mean ± std. dev. of 7 runs, 1 loop each)


### Load array:

In [7]:
# Label the timing measurement reported by the next cell.
print('Pickle load float_64 ({}):'.format(path))

Pickle load float_64 (vector_64.pkl):


In [8]:
%%timeit
vectors = np.array(pickle.load(open(path, 'rb')), np.float32)

10.4 s ± 2.73 s per loop (mean ± std. dev. of 7 runs, 1 loop each)


### Array size:

In [9]:
# Report the on-disk size of the file in GiB (bytes / 1024**3).
size_gib = os.path.getsize(path) / 1024 ** 3
print('Pickle float_64:')
print(f"Size {path}: {round(size_gib, 2)} G")

Pickle float_64:
Size vector_64.pkl: 1.49 G


---

## Float_32

In [10]:
# float32 variant for the pickle benchmark: draw float64 samples, then cast down.
path = 'vector_32.pkl'
vectors = np.random.random((num_vectors, dim)).astype(np.float32)
print(path, vectors.shape, type(vectors.dtype))

vector_32.pkl (65000, 2048) <class 'numpy.dtype[float32]'>


### Save array:

In [11]:
# Label the timing measurement reported by the next cell.
print('Pickle save float_32 ({}):'.format(path))

Pickle save float_32 (vector_32.pkl):


In [12]:
%%timeit
with open(path, 'wb') as handle:
    pickle.dump(vectors, handle, protocol=2)

8.66 s ± 861 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


### Load array:

In [13]:
# Label the timing measurement reported by the next cell.
print('Pickle load float_32 ({}):'.format(path))

Pickle load float_32 (vector_32.pkl):


In [14]:
%%timeit
vectors = np.array(pickle.load(open(path, 'rb')), np.float32)

3.35 s ± 241 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


### Array size:

In [15]:
# Report the on-disk size of the file in GiB (bytes / 1024**3).
size_gib = os.path.getsize(path) / 1024 ** 3
print('Pickle float_32:')
print(f"Size {path}: {round(size_gib, 2)} G")

Pickle float_32:
Size vector_32.pkl: 0.66 G


---

# Numpy

In [16]:
# float32 test array for the NumPy benchmark: float64 samples cast to float32.
path = 'vector_32.npy'
vectors = np.asarray(np.random.random((num_vectors, dim)), dtype=np.float32)
print(path, vectors.shape, type(vectors.dtype))

vector_32.npy (65000, 2048) <class 'numpy.dtype[float32]'>


### Save array:

In [17]:
# Label the timing measurement reported by the next cell.
print('Numpy save float_32 ({}):'.format(path))

Numpy save float_32 (vector_32.npy):


In [18]:
%%timeit
vectors.dump(path)

9.31 s ± 1.15 s per loop (mean ± std. dev. of 7 runs, 1 loop each)


### Load array:

In [19]:
# Label the timing measurement reported by the next cell.
print('Numpy load float_32 ({}):'.format(path))

Numpy load float_32 (vector_32.npy):


In [20]:
%%timeit
vectors = np.load(path, allow_pickle=True)

4.26 s ± 847 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


### Array size:

In [21]:
# Report the on-disk size of the file in GiB (bytes / 1024**3).
size_gib = os.path.getsize(path) / 1024 ** 3
print('Numpy float_32:')
print(f"Size {path}: {round(size_gib, 2)} G")

Numpy float_32:
Size vector_32.npy: 0.66 G


---

# Memmap

In [22]:
# float32 test array for the memmap benchmark: float64 samples cast to float32.
path = 'vector_32.dat'
vectors = np.random.random((num_vectors, dim)).astype('float32')
print(path, vectors.shape, type(vectors.dtype))

vector_32.dat (65000, 2048) <class 'numpy.dtype[float32]'>


### Save array:

In [23]:
# Label the timing measurement reported by the next cell.
print('Memmap save float_32 ({}):'.format(path))

Memmap save float_32 (vector_32.dat):


In [24]:
%%timeit
fp = np.memmap(path, dtype='float32', mode='w+', shape=(num_vectors, dim))
fp[:] = vectors[:]
fp.flush()

8.26 s ± 1.61 s per loop (mean ± std. dev. of 7 runs, 1 loop each)


### Load array:

In [25]:
# Label the timing measurement reported by the next cell.
print('Memmap load float_32 ({}):'.format(path))

Memmap load float_32 (vector_32.dat):


In [26]:
%%timeit
newfp = np.memmap(path, dtype='float32', mode='r', shape=(num_vectors, dim))

The slowest run took 59.91 times longer than the fastest. This could mean that an intermediate result is being cached.
11.1 ms ± 23.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


### Array size:

In [27]:
# Report the on-disk size of the file in GiB (bytes / 1024**3).
size_gib = os.path.getsize(path) / 1024 ** 3
print('Memmap float_32:')
print(f"Size {path}: {round(size_gib, 2)} G")

Memmap float_32:
Size vector_32.dat: 0.5 G
