In [3]:
import apache_beam as beam
from apache_beam.runners.interactive.interactive_runner import InteractiveRunner
from apache_beam.ml.inference.base import KeyedModelHandler, RunInference
from apache_beam.dataframe.convert import to_dataframe

import apache_beam.runners.interactive.interactive_beam as ib

from modelhandler import SpacyModelHandler

In [4]:
# Input examples as (key, sentence) pairs; the key is carried alongside each
# sentence so results can be matched back to the example they came from.
text_strings_with_keys = [
    (
        "example_0",
        "The New York Times is an American daily newspaper based in New York City with a worldwide readership.",
    ),
    (
        "example_1",
        "It was founded in 1851 by Henry Jarvis Raymond and George Jones, and was initially published by Raymond, Jones & Company.",
    ),
]

# Wrap the project's spaCy handler in KeyedModelHandler because the inputs
# above are keyed tuples rather than bare strings.
# NOTE(review): "en_core_web_sm" is presumably the spaCy model name the
# handler loads -- confirm against modelhandler.SpacyModelHandler.
keyed_spacy_model_handler = KeyedModelHandler(SpacyModelHandler("en_core_web_sm"))

# Pipeline bound to the InteractiveRunner so it can be used with the
# interactive_beam (`ib`) helpers in later cells.
pipeline = beam.Pipeline(InteractiveRunner())

In [5]:
# Assemble the inference pipeline: create the keyed sentences, run them
# through the keyed spaCy handler, then reshape each result into a Row.
with pipeline as p:
    results = (
        p
        | "CreateSentences" >> beam.Create(text_strings_with_keys)
        | "RunInferenceSpacy" >> RunInference(keyed_spacy_model_handler)
        # Each element is indexed as (key, (text, predictions)); mapping it to
        # a beam.Row with named fields gives the PCollection a schema that
        # to_dataframe can use.
        | "ToRows" >> beam.Map(
            lambda kv: beam.Row(key=kv[0], text=kv[1][0], predictions=kv[1][1])
        )
    )

# Convert the Row PCollection to a Beam DataFrame, then collect it through
# interactive_beam for display in the notebook.
beam_df = to_dataframe(results)
df = ib.collect(beam_df)

In [6]:
# Display the collected results: one row per input example, with the
# `key`, `text` and `predictions` columns produced by the ToRows step.
df

Unnamed: 0,key,text,predictions
0,example_0,The New York Times is an American daily newspa...,"[(The New York Times, 0, 18, ORG), (American, ..."
1,example_1,It was founded in 1851 by Henry Jarvis Raymond...,"[(1851, 18, 22, DATE), (Henry Jarvis Raymond, ..."
