# Apache Beam: End‑to‑End Demo
This notebook walks through the following Apache Beam concepts:

- Composite Transform
- Pipeline IO
- ParDo
- Windowing
- Map
- Filter
- Partition

## 1) Install & Imports

In [None]:
!pip -q install apache-beam==2.56.0 scikit-learn==1.5.2

In [1]:
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.transforms.window import FixedWindows
from apache_beam.testing.test_stream import TestStream

import re, os, numpy as np, pandas as pd
print("Beam version:", beam.__version__)

Beam version: 2.56.0


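## 2) First Minimal Pipeline
A minimal pipeline to see how Beam works: create a small in-memory PCollection, upper-case each element, and print the results.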

In [2]:
def run_hello():
    """Build and run a tiny pipeline that upper-cases four words and prints each one."""
    greeting_words = ["hello","beam","from","colab"]
    with beam.Pipeline(options=PipelineOptions()) as pipeline:
        source = pipeline | beam.Create(greeting_words)
        # Element-wise transform: one upper-cased output per input word.
        shouted = source | beam.Map(lambda word: word.upper())
        # print is used as the final element-wise step so results show up in the cell output.
        shouted | beam.Map(print)

run_hello()





HELLO
BEAM
FROM
COLAB


## 3) Pipeline IO: ReadFromText & WriteToText

In [13]:
# Sample corpus for the I/O demo: four sentences, one per line, with no trailing newline.
input_path = "./beam_io_input.txt"
_io_demo_lines = (
    "Apache Beam makes data processing portable and unified",
    "Map and Filter are element-wise transforms",
    "ParDo runs user code (DoFn) on each element",
    "Windowing groups elements by event time windows",
)
with open(input_path, "w", encoding="utf-8") as handle:
    handle.write("\n".join(_io_demo_lines))

# Path prefix handed to WriteToText by run_io().
output_prefix = "./beam_io_output.txt"

def run_io():
    """Count the words in the file at ``input_path`` and write the pairs under ``output_prefix``.

    Each line is lower-cased and split into ``[a-z]+`` tokens; the resulting
    ``(word, count)`` tuples are written with ``WriteToText``.
    """
    with beam.Pipeline(options=PipelineOptions()) as pipeline:
        raw_lines = pipeline | beam.io.ReadFromText(input_path)
        lowered = raw_lines | "Lower" >> beam.Map(lambda line: line.lower())
        tokens = lowered | "Tokens" >> beam.FlatMap(lambda line: re.findall(r"[a-z]+", line))
        # Pair every token with 1, then sum the ones per distinct token.
        ones = tokens | beam.Map(lambda word: (word, 1))
        totals = ones | beam.CombinePerKey(sum)
        totals | beam.io.WriteToText(output_prefix)

run_io()



In [15]:
# Print the output written by WriteToText.
# WriteToText typically writes sharded files, so the "*-of-*" wildcard after
# the prefix is used to match every shard that was produced.
import glob

output_path = output_prefix + "*-of-*"

# glob.glob returns matches in arbitrary order; sort them so the shards are
# always printed in the same order and the cell output is reproducible.
output_files = sorted(glob.glob(output_path))

# Read and print the content of each output file
for file_path in output_files:
    with open(file_path, 'r', encoding='utf-8') as f:
        print(f.read())

('apache', 1)
('beam', 1)
('makes', 1)
('data', 1)
('processing', 1)
('portable', 1)
('and', 2)
('unified', 1)
('map', 1)
('filter', 1)
('are', 1)
('element', 2)
('wise', 1)
('transforms', 1)
('pardo', 1)
('runs', 1)
('user', 1)
('code', 1)
('dofn', 1)
('on', 1)
('each', 1)
('windowing', 1)
('groups', 1)
('elements', 1)
('by', 1)
('event', 1)
('time', 1)
('windows', 1)



## 4) Map · Filter · ParDo (DoFn)



In [16]:
class CleanAndLength(beam.DoFn):
    """DoFn that emits ``(token, length)`` for each lowercase alphabetic token of at least 4 letters."""

    def process(self, element: str):
        """Lower-case ``element``, split it into ``[a-z]+`` runs, and yield the ones with 4+ letters."""
        tokens = re.findall(r"[a-z]+", element.lower())
        yield from ((token, len(token)) for token in tokens if len(token) >= 4)

def run_elementwise():
    """Demonstrate ParDo, Filter and Map on three sentences.

    Tokens come out of ``CleanAndLength`` as ``(token, length)`` pairs, pairs
    with a length below 5 are dropped, and the rest are printed as
    ``token:length``.
    """
    sentences = ["Beam combines batch and streaming.",
                 "ParDo lets you run your own functions.",
                 "Filter discards, Map transforms."]
    with beam.Pipeline(options=PipelineOptions()) as pipeline:
        token_lengths = (pipeline
                         | beam.Create(sentences)
                         | "ParDoCleanLen" >> beam.ParDo(CleanAndLength()))
        long_tokens = token_lengths | "FilterLen>=5" >> beam.Filter(lambda pair: pair[1] >= 5)
        formatted = long_tokens | "Fmt" >> beam.Map(lambda kv: f"{kv[0]}:{kv[1]}")
        formatted | beam.Map(print)

run_elementwise()



combines:8
batch:5
streaming:9
pardo:5
functions:9
filter:6
discards:8
transforms:10


## 5) Composite Transform (PTransform)

In [17]:
class CleanTokenizeCount(beam.PTransform):
    """Composite transform: lower-case each string, split into ``[a-z]+`` tokens, count every token."""

    def expand(self, pcoll):
        """Return ``(token, count)`` pairs computed from the strings in ``pcoll``."""
        # Each unlabelled lambda step sits on its own line, as in the original chain.
        lowered = pcoll | beam.Map(lambda line: line.lower())
        tokens = lowered | beam.FlatMap(lambda line: re.findall(r"[a-z]+", line))
        ones = tokens | beam.Map(lambda token: (token, 1))
        return ones | beam.CombinePerKey(sum)

def run_composite():
    """Apply ``CleanTokenizeCount`` to two sample sentences and print each ``(token, count)`` pair."""
    sentences = ["Composite transforms encapsulate reusable logic.",
                 "Encapsulation makes pipelines cleaner."]
    with beam.Pipeline(options=PipelineOptions()) as pipeline:
        lines = pipeline | beam.Create(sentences)
        token_counts = lines | CleanTokenizeCount()
        token_counts | beam.Map(print)

run_composite()



('composite', 1)
('transforms', 1)
('encapsulate', 1)
('reusable', 1)
('logic', 1)
('encapsulation', 1)
('makes', 1)
('pipelines', 1)
('cleaner', 1)


## 6) Partition
Split a PCollection into multiple PCollections with `beam.Partition` and a partitioning function.

In [20]:
def part_fn(x, n):
    """Return the partition index for ``x``: 0 if negative, 1 if equal to zero, 2 otherwise.

    ``n`` is part of the signature but is not used by this function.
    """
    if x < 0:
        return 0
    if x == 0:
        return 1
    return 2

def run_partition():
    """Split the integers -5..5 into three partitions with ``part_fn``, tag each value, and print it."""
    values = list(range(-5, 6))
    with beam.Pipeline(options=PipelineOptions()) as pipeline:
        numbers = pipeline | beam.Create(values)
        negatives, zeros, positives = numbers | beam.Partition(part_fn, 3)

        # Each branch first tags its values with the partition name, then prints the tagged pair.
        tagged_neg = negatives | "PrintNeg" >> beam.Map(lambda value: ("neg", value))
        tagged_neg | "PrintNegOutput" >> beam.Map(print)

        tagged_zero = zeros | "PrintZero" >> beam.Map(lambda value: ("zero", value))
        tagged_zero | "PrintZeroOutput" >> beam.Map(print)

        tagged_pos = positives | "PrintPos" >> beam.Map(lambda value: ("pos", value))
        tagged_pos | "PrintPosOutput" >> beam.Map(print)

run_partition()



('neg', -5)
('neg', -4)
('neg', -3)
('neg', -2)
('neg', -1)
('zero', 0)
('pos', 1)
('pos', 2)
('pos', 3)
('pos', 4)
('pos', 5)


## 7) Windowing
Fixed 5-second event-time windows over a `TestStream`, summing the elements that fall into each window.

In [22]:
from apache_beam.utils import timestamp
from apache_beam.transforms.window import FixedWindows
from apache_beam.testing.test_stream import TestStream

def run_windowing():
    """Sum TestStream elements per 5-second fixed window and print each window's total.

    The stream carries the value 1 at event time 0, then (after the watermark
    advances to 5) the values 2 and 3 at event times 6 and 7.
    """
    origin = timestamp.Timestamp(0)

    first_batch = [beam.window.TimestampedValue(1, origin + 0)]
    second_batch = [beam.window.TimestampedValue(2, origin + 6),
                    beam.window.TimestampedValue(3, origin + 7)]

    events = (TestStream()
              .add_elements(first_batch)
              .advance_watermark_to(origin + 5)
              .add_elements(second_batch)
              .advance_watermark_to_infinity())

    with beam.Pipeline(options=PipelineOptions()) as pipeline:
        windowed = pipeline | events | "Win5s" >> beam.WindowInto(FixedWindows(5))
        # without_defaults() is kept from the original combine step.
        window_sums = windowed | "Sum" >> beam.CombineGlobally(sum).without_defaults()
        window_sums | beam.Map(print)

run_windowing()

1
5
