## A basic notebook to run the pipeline defined in `doc_pipeline.py`.

By default this runs parts of the pipeline in parallel using threads or processes.

To scale processing further, see the subsequent cells, which show how to run the
same pipeline on Ray or Dask. For Spark, see spark/notebook.ipynb.

In [0]:
import doc_pipeline

from hamilton import driver
from hamilton.execution import executors

import pprint

# Build a Hamilton driver over `doc_pipeline` with dynamic execution enabled.
dr = (
    driver.Builder()
    .with_modules(doc_pipeline)
    .enable_dynamic_execution(allow_experimental_mode=True)
    .with_config({})
    .with_local_executor(executors.SynchronousLocalTaskExecutor())
    # The remote executor is the backend for the parallel parts of the
    # pipeline. Threads are used here; to use processes instead, swap in:
    #   .with_remote_executor(executors.MultiProcessingExecutor(max_tasks=5))
    .with_remote_executor(executors.MultiThreadingExecutor(max_tasks=5))
    .build()
)

# Keep the rendered graph so it can be the cell's final (displayed) value.
dag = dr.display_all_functions()

# Request the collected chunks, supplying the chunking parameters as inputs.
result = dr.execute(
    ["collect_chunked_url_text"],
    inputs=dict(chunk_size=256, chunk_overlap=32),
)

# do something with the result... (here: pretty-print every chunk)
for chunk in result["collect_chunked_url_text"]:
    pprint.pprint(chunk)

dag

# Ray

In [ ]:
import logging

import doc_pipeline
import ray

from hamilton import driver, log_setup
from hamilton.plugins import h_ray

import pprint

log_setup.setup_logging(logging.INFO)

# Ray must be initialised before the RayTaskExecutor is used
# (alternatively, pass in config).
ray.init()

try:
    # Build a Hamilton driver over `doc_pipeline` with dynamic execution
    # enabled, delegating the parallel parts of the pipeline to Ray.
    dr = (
        driver.Builder()
        .with_modules(doc_pipeline)
        .enable_dynamic_execution(allow_experimental_mode=True)
        .with_config({})
        # Choose a backend to process the parallel parts of the pipeline.
        # The alternatives below additionally need
        # `from hamilton.execution import executors`, which this cell
        # does not import:
        # .with_remote_executor(executors.MultiThreadingExecutor(max_tasks=5))
        # .with_remote_executor(executors.MultiProcessingExecutor(max_tasks=5))
        .with_remote_executor(h_ray.RayTaskExecutor())
        .build()
    )

    # Keep the rendered graph so it can be the cell's final (displayed) value.
    dag = dr.display_all_functions()

    result = dr.execute(
        ["collect_chunked_url_text"],
        inputs={"chunk_size": 256, "chunk_overlap": 32},
    )

    # do something with the result...
    for chunk in result["collect_chunked_url_text"]:
        pprint.pprint(chunk)
finally:
    # Shut Ray down even when building or executing the pipeline fails, so
    # the runtime is not left running and this cell can be re-run cleanly.
    ray.shutdown()

dag

# Dask

In [ ]:
import logging

import doc_pipeline
from dask import distributed

from hamilton import driver, log_setup
from hamilton.plugins import h_dask

import pprint

log_setup.setup_logging(logging.INFO)

# Start a local Dask cluster and connect a client to it; the executor
# submits the parallel parts of the pipeline through this client.
cluster = distributed.LocalCluster()
client = distributed.Client(cluster)
remote_executor = h_dask.DaskExecutor(client=client)

try:
    # Build a Hamilton driver over `doc_pipeline` with dynamic execution
    # enabled, delegating the parallel parts of the pipeline to Dask.
    dr = (
        driver.Builder()
        .with_modules(doc_pipeline)
        .enable_dynamic_execution(allow_experimental_mode=True)
        .with_config({})
        # Choose a backend to process the parallel parts of the pipeline.
        # The alternatives below additionally need
        # `from hamilton.execution import executors`, which this cell
        # does not import:
        # .with_remote_executor(executors.MultiThreadingExecutor(max_tasks=5))
        # .with_remote_executor(executors.MultiProcessingExecutor(max_tasks=5))
        # Reuse the executor created above rather than constructing a
        # second, identical one.
        .with_remote_executor(remote_executor)
        .build()
    )

    # Keep the rendered graph so it can be the cell's final (displayed) value.
    dag = dr.display_all_functions()

    result = dr.execute(
        ["collect_chunked_url_text"],
        inputs={"chunk_size": 256, "chunk_overlap": 32},
    )

    # do something with the result...
    for chunk in result["collect_chunked_url_text"]:
        pprint.pprint(chunk)
finally:
    # Shut the Dask client down even when building or executing the pipeline
    # fails, so the local cluster is not left running after an error.
    client.shutdown()

dag