# First-class Caching in Hamilton

In [1]:
import logging

# Surface DEBUG-level messages from Hamilton's fingerprinting logger.
logger = logging.getLogger("hamilton.caching.fingerprinting")
logger.setLevel(logging.DEBUG)
# `logging.getLogger` returns the same logger object for a given name, so
# re-running this cell would otherwise stack one more StreamHandler each time
# and print every message repeatedly. Attach the handler only once.
if not any(type(h) is logging.StreamHandler for h in logger.handlers):
    logger.addHandler(logging.StreamHandler())

%load_ext hamilton.plugins.jupyter_magic

  from .autonotebook import tqdm as notebook_tqdm


# Fingerprinting
Fingerprinting derives an identifier from a data object.

In [2]:
import pandas as pd
from hamilton import driver
from hamilton.caching.fingerprinting import FingerprintingAdapter

In [3]:
%%cell_to_module primitives
import pandas as pd

def A() -> int:
    """Return the constant integer 7 (takes no inputs)."""
    return 7

def B(A: int) -> float:
    """Convert the value received as `A` to a float."""
    return float(A)

def C(B: float, D: bool = True) -> bool:
    """Return True when `B` is not equal to `D`.

    `D` falls back to True when no value is supplied.
    """
    # Mixed float/bool comparison: Python compares them numerically (True == 1.0).
    return B != D

def E(C: bool) -> str:
    """Return the fixed string "hello-world".

    `C` is accepted but never read in the body, so the result is the same
    for any value of `C`.
    """
    return "hello-world"

def F(E: str) -> dict:
    """Return a one-entry dict mapping `E` to `E` repeated three times."""
    return {E: E*3}

def G() -> pd.DataFrame:
    """Return a fixed two-row DataFrame with an integer column "a" and a string column "b"."""
    return pd.DataFrame({"a": [323, 3235], "b": ["hello", "vorld"]})

In [4]:
# Build a driver over the `primitives` module with the fingerprinting adapter attached.
fingerprint_adapter = FingerprintingAdapter()
builder = driver.Builder().with_modules(primitives)
dr = builder.with_adapters(fingerprint_adapter).build()

# Request every output; the returned values themselves are not needed here.
requested = ["A", "B", "C", "D", "E", "F", "G"]
dr.execute(requested)

# Display the fingerprints the adapter recorded during this run.
fingerprint_adapter.run_fingerprints

{'A': 'jxTkX87qFnpaNt7dS-olQw==',
 'B': 'Ad7r-oe5OMieyz3LTv_L4g==',
 'C': '-CfPRi9ihI3zfF4elKTadA==',
 'E': 'IJUxIYl1PeatR9_iDL6X7A==',
 'F': 'eOXd-WuG4ZFy8yz1O9NH5uzyw1wzWbsypyuNNw==',
 'G': 'TCtsLwovEJ2EIAYt60Q4_w36aBdTnSXVNPQpsA=='}

# Caching
In simple terms, an effective caching feature should guarantee "for the same inputs and code, if this was previously computed, read the stored values instead of recomputing".

In practice, this requires multiple decoupled parts:

- **Execution context**: When a single node executes, it uses a specific code version and specific input values. Note: the node name is not part of the execution context. This matters for `parameterize`, where multiple parameterizations might lead to the same results under different node names.
- **Fingerprinting strategy / function**: the function used to determine a fingerprint (the default is recursive primitive hashing). An alternative fingerprinting strategy could be to hash the index of a dataframe rather than its row content (the user is then responsible for guaranteeing a unique id for each row).
- **Store**: a generic term for data storage
- **Repository**: store for fingerprints and cache metadata. Maps `{execution_context: fingerprint}`. This is the core piece to avoid having to recompute expensive fingerprints on each run. Also, this is not human-readable by default, but the execution_context key can be parsed
- **Cache**: store for node results. It's a `{fingerprint: data}` that doesn't know about anything else. The choice of storage affects latency, parallelism, fault tolerance, etc.
- **Caching strategy / algorithm**: decides whether to compute a node and when to read from or write to the cache.

In [5]:
from hamilton.caching import caching

# Same driver setup over `primitives`, this time with the caching adapter attached.
cache_adapter = caching.SmartCache()
builder = driver.Builder().with_modules(primitives)
dr = builder.with_adapters(cache_adapter).build()

# Request every output and keep the returned values for inspection.
requested = ["A", "B", "C", "D", "E", "F", "G"]
results = dr.execute(requested)

print(results)
# Display the fingerprints the adapter recorded during this run.
cache_adapter.run_fingerprints

{'A': 7, 'B': 7.0, 'C': True, 'D': None, 'E': 'hello-world', 'F': {'hello-world': 'hello-worldhello-worldhello-world'}, 'G':       a      b
0   323  hello
1  3235  vorld}


{'A': 'jxTkX87qFnpaNt7dS-olQw==',
 'B': 'Ad7r-oe5OMieyz3LTv_L4g==',
 'C': '-CfPRi9ihI3zfF4elKTadA==',
 'E': 'IJUxIYl1PeatR9_iDL6X7A==',
 'F': 'eOXd-WuG4ZFy8yz1O9NH5uzyw1wzWbsypyuNNw==',
 'G': 'TCtsLwovEJ2EIAYt60Q4_w36aBdTnSXVNPQpsA=='}

In [6]:
# get the store from adapter
result_store = cache_adapter.result_store
result_store.open()
try:
    node_name = "F"
    # get the fingerprint from the latest run
    fingerprint = cache_adapter.run_fingerprints.get(node_name)
    # get the value from the result_store
    # NOTE `node_name` is currently required but unused
    result = result_store.get(key=fingerprint, node_name=node_name)
    print(result)
finally:
    # always close the store, even when the lookup raises, so that a failed
    # run of this cell does not leave it open for later cells or re-runs
    result_store.close()

{'hello-world': 'hello-worldhello-worldhello-world'}


In [7]:
# get the store from adapter
fingerprint_store = cache_adapter.fingerprint_store
fingerprint_store.open()
try:
    node_name = "F"
    # get the code version from the current driver
    # (the `=:` f-string form prints `name=` followed by str() of the value,
    # i.e. without repr quotes)
    code_version = cache_adapter.code_versions[node_name]
    print(f"{code_version=:}", "\n")
    # gather the fingerprints for its inputs, which are known before executing `F`
    node_inputs = ["E"]
    inputs_fingerprints = {
        i: cache_adapter.run_fingerprints[i] for i in node_inputs
    }
    print(f"{inputs_fingerprints=:}", "\n")
    # encode these fingerprints as a dictionary (reversible operation)
    inputs_encoded = caching.encode_dict(inputs_fingerprints)
    print(f"{inputs_encoded=:}", "\n")
    # recreate the "execution context key"
    context = {code_version: inputs_encoded}
    print(f"{context=:}", "\n")
    context_encoded = caching.encode_dict(context)
    print(f"{context_encoded=:}")

    # NOTE `node_name` is currently required but unused
    retrieved_fingerprint = fingerprint_store.get(key=context_encoded, node_name=node_name)
    print(f"{retrieved_fingerprint=:}", "\n")

    # `fingerprint` is the value looked up for "F" in the previous cell
    print(retrieved_fingerprint==fingerprint, "\n")
finally:
    # always close the store, even when a lookup raises, so that a failed
    # run of this cell does not leave it open for later cells or re-runs
    fingerprint_store.close()

code_version=a5dba266c921b22973b9c21047c98add69a902c49924e647a750c8956dcfa7e2 

inputs_fingerprints={'E': 'IJUxIYl1PeatR9_iDL6X7A=='} 

inputs_encoded=eF5zVfD0Cq3wjMwxDEhNLAmyjM908TGLMHe0tQUAbV0H8g== 

context={'a5dba266c921b22973b9c21047c98add69a902c49924e647a750c8956dcfa7e2': 'eF5zVfD0Cq3wjMwxDEhNLAmyjM908TGLMHe0tQUAbV0H8g=='} 

context_encoded=eF4FwUEKwjAQBdCreITpNJnkL7ooVu2iEQTtfjJJlYILQah6et9TX7KyiIGbzIzQZhg35IIhaikCBbE5gF0VFzR4sggvxRYNlXf16H/zMtD+1W5r2j7D4XGe+ud3TaB4PU1prPS+3Po80xjvXfcHAdkgog==
retrieved_fingerprint=eOXd-WuG4ZFy8yz1O9NH5uzyw1wzWbsypyuNNw== 

True 



# API Design

```python

dr = (
    driver.Builder()
    .with_cache()
    .build()
)
```


## Interface
```python
# ... Builder
def with_cache(
    self,
    fingerprint: Optional[Callable] = None,
    fingerprint_store: Optional["Repository"] = None,
    data_repo: Optional["Repository"] = None,
    # ... other behavior kwargs like "recompute", "skip", etc.
):
    """Attach a cache adapter to the builder and return the builder.

    :param fingerprint: function used to fingerprint values; falls back to
        `fingerprinting.hash_value` when not provided.
    :param fingerprint_store: repository for fingerprints / cache metadata;
        falls back to `repository.dbmRepository()` when not provided.
    :param data_repo: repository for node results; falls back to
        `repository.ShelveRepository()` when not provided.
    :return: `self`, so the call can be chained
        (e.g. `driver.Builder().with_cache().build()`).
    """
    from hamilton.caching import caching
    from hamilton.caching import fingerprinting
    from hamilton.caching import repository

    if not fingerprint:
        fingerprint = fingerprinting.hash_value

    # "fingerprint repository" could also be named "the repository" i.e., metadata
    # used to manage data storage / "the cache"
    # dbm uses {str | bytes: str | bytes} on disk
    fingerprint_store = (
        fingerprint_store if fingerprint_store else repository.dbmRepository()
    )

    # the data repository ("result store") could also be named "the cache"
    # i.e., where data is stored
    # shelve uses {str | bytes: pickle} on disk
    data_repo = data_repo if data_repo else repository.ShelveRepository()

    # NOTE(review): `adapters` is not imported in this sketch — confirm which
    # module is meant to provide `CacheAdapter`.
    adapter = adapters.CacheAdapter(
        fingerprint=fingerprint,
        fingerprint_repo=fingerprint_store,
        data_repo=data_repo,
        # ... other behavior kwargs
    )
    self.adapters.append(adapter)
    # return the builder so that `.with_cache().build()` can be chained
    return self

```