# Caching Nodes with Hamilton

In [1]:
import pandas as pd


In [2]:
# Data Loaders
# When developing, we would likely want to cache our data loaders because of latencies in loading data from networked sources or slow disks.
# Functions marked by `tag(cache="SERIALIZATION_FORMAT")` are automatically cached by the CachingGraphAdapter (discussed later).

from hamilton.function_modifiers import tag

spends_data = [10, 10, 20, 40, 40, 50]
signups_data = [1, 10, 50, 100, 200, 400]

@tag(cache="parquet")
def spend() -> pd.Series:
    """Emulates potentially expensive data extraction."""
    return pd.Series(spends_data)


@tag(cache="parquet")
def signups() -> pd.Series:
    """Emulates potentially expensive data extraction."""
    return pd.Series(signups_data)


In [3]:
# Functions holding bussiness logic


def avg_3wk_spend(spend: pd.Series) -> pd.Series:
    """Rolling 3 week average spend."""
    return spend.rolling(3).mean()


def spend_per_signup(spend: pd.Series, signups: pd.Series) -> pd.Series:
    """The cost per signup in relation to spend."""
    return spend / signups


def spend_mean(spend: pd.Series) -> float:
    """Shows function creating a scalar. In this case it computes the mean of the entire column."""
    return spend.mean()


def spend_zero_mean(spend: pd.Series, spend_mean: float) -> pd.Series:
    """Shows function that takes a scalar. In this case to zero mean spend."""
    return spend - spend_mean


def spend_std_dev(spend: pd.Series) -> float:
    """Function that computes the standard deviation of the spend column."""
    return spend.std()


def spend_zero_mean_unit_variance(spend_zero_mean: pd.Series, spend_std_dev: float) -> pd.Series:
    """Function showing one way to make spend have zero mean and unit variance."""
    return spend_zero_mean / spend_std_dev


In [4]:
# Place the functions into a temporary module -- the idea is that this should house a curated set of functions.
# Don't be afraid to make multiple of them -- however we'd advise you to not use this method for production.
# Also note, that using a temporary function module does not work for scaling onto Ray, Dask, or Pandas on Spark.
from hamilton import ad_hoc_utils


data_loaders = ad_hoc_utils.create_temporary_module(
    spend, signups, module_name="data_loaders"
)

business_logic = ad_hoc_utils.create_temporary_module(
    avg_3wk_spend, 
    spend_per_signup,
    spend_mean,
    spend_zero_mean,
    spend_std_dev,
    spend_zero_mean_unit_variance, 
    module_name="business_logic"
)

In [5]:
from hamilton import base, driver
from hamilton.experimental import h_cache
import pathlib
import sys

In [6]:
# This is empty, we get the data from the data_loaders module
initial_columns = {}

# Initialise the cache directory
cache_path = "tmp"
pathlib.Path(cache_path).mkdir(exist_ok=True)

adapter = h_cache.CachingGraphAdapter(cache_path, base.PandasDataFrameResult())
dr = driver.Driver(initial_columns, business_logic, data_loaders, adapter=adapter)
output_columns = [
    "spend",
    "signups",
    "avg_3wk_spend",
    "spend_per_signup",
    "spend_zero_mean_unit_variance",
]

df = dr.execute(output_columns)
print(df.to_string())

Note: Hamilton collects completely anonymous data about usage. This will help us improve Hamilton over time. See https://github.com/dagworks-inc/hamilton#usage-analytics--data-privacy for details.


   spend  signups  avg_3wk_spend  spend_per_signup  spend_zero_mean_unit_variance
0     10        1            NaN            10.000                      -1.064405
1     10       10            NaN             1.000                      -1.064405
2     20       50      13.333333             0.400                      -0.483821
3     40      100      23.333333             0.400                       0.677349
4     40      200      33.333333             0.200                       0.677349
5     50      400      43.333333             0.125                       1.257934


In [None]:
# Lets change the source values for our data loaders. 

spends_data = [i * 1000 for i in spends_data]
signups_data = [i * 1000 for i in spends_data]

In [7]:
# Since the data loaders are cached, they should continue returning the old values.

spends_data = [i * 1000 for i in spends_data]
signups_data = [i * 1000 for i in spends_data]

# CachingGraphAdapter handles the actual caching during exection.
adapter = h_cache.CachingGraphAdapter(cache_path, base.PandasDataFrameResult())

# Hamilton caches are valid accross new instances of the driver. 
dr = driver.Driver(initial_columns, business_logic, data_loaders, adapter=adapter)
output_columns = [
    "spend",
    "signups",
    "avg_3wk_spend",
    "spend_per_signup",
    "spend_zero_mean_unit_variance",
]

df = dr.execute(output_columns)
print(df.to_string())

   spend  signups  avg_3wk_spend  spend_per_signup  spend_zero_mean_unit_variance
0     10        1            NaN            10.000                      -1.064405
1     10       10            NaN             1.000                      -1.064405
2     20       50      13.333333             0.400                      -0.483821
3     40      100      23.333333             0.400                       0.677349
4     40      200      33.333333             0.200                       0.677349
5     50      400      43.333333             0.125                       1.257934


In [9]:
# Now lets force hamilton to recompute the cached data loaders.

adapter = h_cache.CachingGraphAdapter(cache_path, base.PandasDataFrameResult(), force_compute=set(["spend", "signups"]))
dr = driver.Driver(initial_columns, business_logic, data_loaders, adapter=adapter)
output_columns = [
    "spend",
    "signups",
    "avg_3wk_spend",
    "spend_per_signup",
    "spend_zero_mean_unit_variance",
]

df = dr.execute(output_columns)
print(df.to_string())

   spend   signups  avg_3wk_spend  spend_per_signup  spend_zero_mean_unit_variance
0  10000  10000000            NaN             0.001                      -1.064405
1  10000  10000000            NaN             0.001                      -1.064405
2  20000  20000000   13333.333333             0.001                      -0.483821
3  40000  40000000   23333.333333             0.001                       0.677349
4  40000  40000000   33333.333333             0.001                       0.677349
5  50000  50000000   43333.333333             0.001                       1.257934
