In [1]:
import os
import shutil

import dill
import numpy as np

# local module
import py_utils.data_repo as data_repo

---
Write-time comparison: numpy vs. dill

In [2]:
# one hundred million float64 values (1,000,000 rows x 100 columns)
foo = np.random.rand(1000000, 100)

# estimate how many megabytes the array will take on the local file system;
# the size is read from the array itself so it follows any change to the
# shape or dtype above instead of relying on a hand-computed constant
num_bytes = foo.nbytes
num_bits = num_bytes * 8
num_mb = num_bytes / 2**20
num_mb

762.939453125

In [3]:
%%time
for i in range(10):
    np.save('foo.npy', foo)

CPU times: user 7.23 ms, sys: 3.07 s, total: 3.08 s
Wall time: 3.1 s


In [4]:
%%time
for i in range(10):
    with open('foo.dill', 'wb') as fd:
        dill.dump(foo, fd)

CPU times: user 642 ms, sys: 5.74 s, total: 6.38 s
Wall time: 10.8 s


In [5]:
ll -h | grep foo

-rw-rw-r-- 1 chad 763M Mar 18 12:32 foo.dill
-rw-rw-r-- 1 chad 763M Mar 18 12:32 foo.npy


In [6]:
# delete the two files produced by the timing cells
for benchmark_file in ("foo.npy", "foo.dill"):
    os.remove(benchmark_file)

---

In [7]:
# create the scratch directory backing the demo repository; exist_ok=True lets
# this cell be re-run (or run after an aborted session) without FileExistsError
os.makedirs("temp", exist_ok=True)
repo = data_repo.DataRepositoryObserver("temp")

In [8]:
# sample payload mixing several kinds of values
foo = {
    "A": 1,                     # plain scalar
    "B": [1, 2, 3],             # list
    "C": lambda x: x**2,        # callable
    "D": np.random.rand(3, 3),  # numpy array
}

In [9]:
repo.add_item("foo.1", foo, version=1, include_function=True)

In [10]:
repo.list_items()

{'foo.1.dill': {'version': 1, 'include_function': True}}

In [11]:
# store the same object under three more names; note that "foo.3" is given
# version 4 and "foo.4" is given version 3
for item_name, item_version in (("foo.2", 2), ("foo.3", 4), ("foo.4", 3)):
    repo.add_item(item_name, foo, version=item_version, include_function=True)

In [12]:
repo.list_items()

{'foo.1.dill': {'version': 1, 'include_function': True},
 'foo.2.dill': {'version': 2, 'include_function': True},
 'foo.3.dill': {'version': 4, 'include_function': True},
 'foo.4.dill': {'version': 3, 'include_function': True}}

In [13]:
def filter_by(name, attr):
    """Decide whether an item should be kept, based on its attributes.

    Parameters
    ----------
    name
        Item name; accepted to match the callback signature but not inspected.
    attr
        Mapping of the item's attributes.

    Returns
    -------
    bool
        True only when ``attr`` has a "version" entry that is not greater
        than 3; False when the entry is missing or exceeds 3.
    """
    return "version" in attr and not (attr["version"] > 3)


repo.list_items(filter_by=filter_by)

{'foo.1.dill': {'version': 1, 'include_function': True},
 'foo.2.dill': {'version': 2, 'include_function': True},
 'foo.4.dill': {'version': 3, 'include_function': True}}

In [14]:
# fetch the stored object and its attributes for the same item
item_name = "foo.1.dill"
obj, attr = repo.get_item(item_name), repo.get_attribute(item_name)
attr

{'version': 1, 'include_function': True}

In [15]:
obj

{'A': 1,
 'B': [1, 2, 3],
 'C': <function __main__.<lambda>(x)>,
 'D': array([[0.63352109, 0.73267988, 0.34943828],
        [0.13278247, 0.91478017, 0.55377615],
        [0.79264813, 0.85459829, 0.23482393]])}

In [16]:
obj["C"](3)

9

In [17]:
# a plain ndarray this time, added under three names with identical attributes
# NOTE(review): the listing that follows shows these under ".npy" names --
# presumably include_function=False selects numpy serialization; confirm in data_repo
foo = np.random.rand(10, 3)
for item_name in ("foo.1", "foo.2", "foo.3"):
    repo.add_item(item_name, foo, version=1, include_function=False)

In [18]:
repo.list_items()

{'foo.1.dill': {'version': 1, 'include_function': True},
 'foo.2.dill': {'version': 2, 'include_function': True},
 'foo.3.dill': {'version': 4, 'include_function': True},
 'foo.4.dill': {'version': 3, 'include_function': True},
 'foo.1.npy': {'version': 1, 'include_function': False},
 'foo.2.npy': {'version': 1, 'include_function': False},
 'foo.3.npy': {'version': 1, 'include_function': False}}

In [19]:
# remove the scratch repository directory in pure Python (no reliance on the
# IPython automagic alias or a Unix `rm`); ignore_errors=True mirrors `rm -f`
# by not raising when the directory is already gone
shutil.rmtree("temp", ignore_errors=True)