Map: One to one

In [None]:
import apache_beam as beam

with beam.Pipeline() as pipeline:
    # Build the whole flow as one chain: create the numbers, square each one
    # with Map (one output per input), then print every result.
    (
        pipeline
        | beam.Create([1, 2, 3, 4, 5])
        | beam.Map(lambda n: n * n)
        | beam.Map(print)
    )


FlatMap: One to many

In [2]:
import apache_beam as beam

with beam.Pipeline() as pipeline:
    # Input sentences; each one is expanded into several output elements.
    sentence_list = [
        'Hello world',
        'Apache Beam is powerful',
        'FlatMap transformation example',
    ]

    # FlatMap flattens the list returned for each sentence, so the printed
    # PCollection contains individual words rather than lists of words.
    (
        pipeline
        | beam.Create(sentence_list)
        | beam.FlatMap(str.split)
        | beam.Map(print)
    )


Hello
world
Apache
Beam
is
powerful
FlatMap
transformation
example


ParDo

In [3]:
import apache_beam as beam

class SplitWords(beam.DoFn):
    """DoFn that breaks a sentence into its individual words."""

    def process(self, element):
        """Return the words of ``element``, treating commas and whitespace as separators."""
        # Turn every comma into a space first so that a single whitespace
        # split handles both kinds of separator.
        comma_free = ' '.join(element.split(','))
        return comma_free.split()

with beam.Pipeline() as pipeline:
    # Sentences that mix commas and spaces as word separators.
    sentence_list = [
        'Hello,world',
        'Apache Beam,is powerful',
        'ParDo transformation,example',
    ]

    # ParDo runs SplitWords.process on every sentence and emits each word of
    # the returned list as its own element, which is then printed.
    (
        pipeline
        | beam.Create(sentence_list)
        | beam.ParDo(SplitWords())
        | beam.Map(print)
    )


Hello
world
Apache
Beam
is
powerful
ParDo
transformation
example


Filter

In [4]:
import apache_beam as beam

with beam.Pipeline() as pipeline:
    # Create the numbers 1..10, keep only those divisible by two, and print
    # each surviving element.
    (
        pipeline
        | beam.Create([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
        | beam.Filter(lambda n: n % 2 == 0)
        | beam.Map(print)
    )


2
4
6
8
10


Flatten

In [6]:
import apache_beam as beam

with beam.Pipeline() as pipeline:
    # Two independent inputs. The first Create keeps its default label; the
    # second is labelled explicitly so the two transforms have unique labels.
    first_words = pipeline | beam.Create(['Hello', 'Apache', 'Beam'])
    second_words = pipeline | 'CreatePCollection2' >> beam.Create(
        ['is', 'powerful', 'and', 'flexible'])

    # Flatten merges both PCollections into a single one, which is printed
    # element by element.
    (first_words, second_words) | beam.Flatten() | beam.Map(print)



is
powerful
and
flexible
Hello
Apache
Beam


Partition

In [8]:
import apache_beam as beam

def print_with_prefix(element, prefix):
    """Print ``element`` on its own line, preceded by ``prefix`` and a colon.

    Args:
        element: The value to display.
        prefix: Text placed before the element, separated by ``": "``.
    """
    print("{}: {}".format(prefix, element))

def partition_fn(element, num_partitions):
    """Choose a partition index for ``element`` based on its parity.

    Args:
        element: The number to classify.
        num_partitions: Accepted as the second argument but not used here;
            the parity rule always yields 0 or 1.

    Returns:
        0 when ``element`` is even, 1 when it is odd.
    """
    return 0 if element % 2 == 0 else 1

with beam.Pipeline() as pipeline:
    # Source PCollection holding the numbers 1..10.
    numbers = pipeline | beam.Create([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])

    # Partition routes each element to the output index chosen by
    # partition_fn; unpacking yields the outputs in index order (0, then 1).
    even_part, odd_part = numbers | beam.Partition(partition_fn, 2)

    # Print every element tagged with the partition it landed in. Each
    # printing step gets its own label.
    even_part | 'Print Partition 0' >> beam.Map(print_with_prefix, prefix='Partition 0')
    odd_part | 'Print Partition 1' >> beam.Map(print_with_prefix, prefix='Partition 1')


Partition 1: 1
Partition 0: 2
Partition 1: 3
Partition 0: 4
Partition 1: 5
Partition 0: 6
Partition 1: 7
Partition 0: 8
Partition 1: 9
Partition 0: 10


GroupByKey

In [9]:
import apache_beam as beam

with beam.Pipeline() as pipeline:
    # (key, value) pairs; keys 'a' and 'b' each appear twice, 'c' once.
    keyed_values = [('a', 1), ('b', 2), ('a', 3), ('b', 4), ('c', 5)]

    # GroupByKey gathers all values that share a key into one
    # (key, values) element, which is then printed.
    (
        pipeline
        | beam.Create(keyed_values)
        | beam.GroupByKey()
        | beam.Map(print)
    )


('a', [1, 3])
('b', [2, 4])
('c', [5])


CoGroupByKey

In [18]:
import apache_beam as beam
from apache_beam.runners.interactive.interactive_runner import InteractiveRunner
import apache_beam.runners.interactive.interactive_beam as ib

# Two keyed datasets sharing the same person names as keys:
# one maps each name to a job title, the other to a city.
jobs = [
    ("sam", "Engineer"),
    ("sandeep", "Devops Engineer"),
    ("alice", "Data Scientist"),
    ("bob", "Software Developer"),
]
city = [
    ("sam", "Delhi"),
    ("sandeep", "Pune"),
    ("alice", "New York"),
    ("bob", "San Francisco"),
]

# The pipeline is built on the InteractiveRunner (rather than inside a
# ``with`` block) so its result can be displayed with interactive_beam below.
p = beam.Pipeline(runner=InteractiveRunner())

# One PCollection per dataset, each created under its own label.
jobs_create = p | "Create Jobs" >> beam.Create(jobs)
city_create = p | "Create City" >> beam.Create(city)

# CoGroupByKey joins the two PCollections on their shared key.
cogbk = (jobs_create, city_create) | "CoGroupByKey" >> beam.CoGroupByKey()

# Display the joined PCollection in the notebook.
ib.show(cogbk)
