Side-Inputs

In [1]:
import apache_beam as beam

# Define a function to count occurrences of target words in a given sentence
def count_words(element, target_words):
    """Tally how often each target word appears in one sentence.

    Args:
        element: The sentence to scan. It is tokenised with ``str.split()``,
            i.e. on whitespace only, so punctuation stays attached to a
            token (``'Strawberry,'`` does not match ``'Strawberry'``).
        target_words: Iterable of the words to look for. Matching is exact
            and case-sensitive; repeated entries are collapsed.

    Returns:
        A list of ``(word, count)`` tuples, in the order the words first
        appear in ``target_words``, restricted to words seen at least once.
    """
    # Seed every target with a zero tally; the dict also gives O(1) lookups.
    tally = dict.fromkeys(target_words, 0)

    for token in element.split():
        # Tokens that are not targets are simply skipped.
        if token not in tally:
            continue
        tally[token] = tally[token] + 1

    # Drop targets that never showed up in this sentence.
    return [(target, hits) for target, hits in tally.items() if hits > 0]

# Define the main pipeline
with beam.Pipeline() as pipeline:
    # Input corpus: one sentence per element
    sentences = [
        'Strawberry is a delicious fruit',
        'Carrot is a healthy vegetable',
        'I love eating Strawberry and Carrot',
        'Potato is also a versatile vegetable',
        'Strawberry, Carrot, and Potato are popular ingredients',
    ]

    # Side input: the words to look for, wrapped with AsList so that
    # count_words receives them as its second argument
    target_words = beam.pvalue.AsList(
        pipeline
        | 'Create target words' >> beam.Create(['Strawberry', 'Carrot', 'Potato'])
    )

    # Main input: a PCollection built from the in-memory sentences
    sentence_pcoll = pipeline | 'Create sentences' >> beam.Create(sentences)

    # Emit (word, count) pairs per sentence, using the side input as the filter
    per_sentence_hits = (
        sentence_pcoll
        | 'Count target words' >> beam.FlatMap(count_words, target_words)
    )

    # Add up the per-sentence counts for each word
    totals = per_sentence_hits | 'Combine counts' >> beam.CombinePerKey(sum)

    # Print each (word, total) pair
    word_counts = totals | 'Print results' >> beam.Map(print)


('Strawberry', 2)
('Carrot', 2)
('Potato', 2)


Composite Transformation

In [2]:
import apache_beam as beam

# Define a composite transform named CountWords
class CountWords(beam.PTransform):
    """Composite transform that counts every whitespace-separated token.

    Tokens are produced with ``str.split()``, so punctuation stays attached
    (``'Strawberry,'`` and ``'Strawberry'`` are counted as different words).
    """

    def expand(self, pcoll):
        """Turn a PCollection of sentences into ``(word, count)`` pairs.

        Args:
            pcoll: Input PCollection; each element must support ``.split()``.

        Returns:
            The PCollection produced by summing a 1 per token, keyed by token.
        """
        # Stage 1: explode each sentence into its individual tokens
        tokens = pcoll | 'Split words' >> beam.FlatMap(lambda sentence: sentence.split())

        # Stage 2: tag every token with a count of 1
        tagged = tokens | 'Pair with 1' >> beam.Map(lambda token: (token, 1))

        # Stage 3: add up the 1s for each distinct token
        return tagged | 'Count per word' >> beam.CombinePerKey(sum)

# Define the main pipeline
with beam.Pipeline() as pipeline:
    # Input corpus: one sentence per element
    sentences = [
        'Strawberry is a delicious fruit',
        'Carrot is a healthy vegetable',
        'I love eating Strawberry and Carrot',
        'Potato is also a versatile vegetable',
        'Strawberry, Carrot, and Potato are popular ingredients',
    ]

    # Step 1: build a PCollection from the in-memory sentences
    sentence_pcoll = pipeline | 'Create sentences' >> beam.Create(sentences)

    # Step 2: run the CountWords composite transform over the sentences
    tallies = sentence_pcoll | 'Count words using composite transform' >> CountWords()

    # Step 3: print each (word, count) pair
    word_counts = tallies | 'Print results' >> beam.Map(print)


('Strawberry', 2)
('is', 3)
('a', 3)
('delicious', 1)
('fruit', 1)
('Carrot', 2)
('healthy', 1)
('vegetable', 2)
('I', 1)
('love', 1)
('eating', 1)
('and', 2)
('Potato', 2)
('also', 1)
('versatile', 1)
('Strawberry,', 1)
('Carrot,', 1)
('are', 1)
('popular', 1)
('ingredients', 1)
