In [2]:
!pip install apache-beam

Collecting apache-beam
  Downloading apache_beam-2.63.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.3 kB)
Collecting crcmod<2.0,>=1.7 (from apache-beam)
  Downloading crcmod-1.7.tar.gz (89 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m89.7/89.7 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting dill<0.3.2,>=0.3.1.1 (from apache-beam)
  Downloading dill-0.3.1.1.tar.gz (151 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m152.0/152.0 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting cloudpickle~=2.2.1 (from apache-beam)
  Downloading cloudpickle-2.2.1-py3-none-any.whl.metadata (6.9 kB)
Collecting fastavro<2,>=0.23.6 (from apache-beam)
  Downloading fastavro-1.10.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.5 kB)
Collecting fasteners<1.0,>=0.3 (from apache-beam)
  D

**Structure of Beam Code**
1.   Create Pipeline Object
2.   Input Data
3.   Transform Data
4.   Output Data
5.   Display Data

In [3]:
import apache_beam as beam
from apache_beam.runners.interactive.interactive_runner import InteractiveRunner
import apache_beam.runners.interactive.interactive_beam as ib
from apache_beam import Create,Map

# Create the pipeline on the InteractiveRunner so that `ib.show_graph` / `ib.show`
# can inspect it. (Plain `print` sinks also work without the InteractiveRunner.)
p1 = beam.Pipeline(InteractiveRunner())

# Keep the squared values as their own PCollection. `Map(print)` emits the return
# value of `print` (None) for every element, so a PCollection that ends with the
# print step contains only None values and gives `ib.show` nothing useful to render.
findsquare = (
    p1
    | "Create Element" >> Create(range(10))
    | "Find Square" >> Map(lambda x: x * x)
)

# Side branch used only for its console output.
findsquare | "Print Square" >> Map(print)

p1.run()
ib.show_graph(p1)
# Shows the squared values themselves rather than the None results of `print`.
ib.show(findsquare)



0
1
4
9
16
25
36
49
64
81


**Create in Apache Beam**

In [4]:
import apache_beam as beam
from apache_beam import Create, Map

# Cube the integers 0..4 and print every result.
p1 = beam.Pipeline()

source_numbers = p1 | "Create Element" >> Create(range(5))
cubed_numbers = source_numbers | "Find Cube" >> Map(lambda n: n ** 3)
findCube = cubed_numbers | "Print" >> Map(print)

p1.run()

0
1
8
27
64


<apache_beam.runners.portability.fn_api_runner.fn_runner.RunnerResult at 0x784fbcdac150>

In [5]:
import apache_beam as beam
from apache_beam import Create, Map

# Same cube pipeline, this time fed from an explicit Python list.
p1 = beam.Pipeline()
findCube = (
    p1
    | "Create Element" >> Create([4, 5])
    | "Find Cube" >> Map(lambda n: n ** 3)
    | "Print" >> Map(print)
)
p1.run()

64
125


<apache_beam.runners.portability.fn_api_runner.fn_runner.RunnerResult at 0x784fbce34c10>

In [6]:
import apache_beam as beam
from apache_beam import Create, Map

# Create a PCollection of strings, upper-case each one and print it.
# (The result variable keeps its original name so other cells are unaffected.)
p1 = beam.Pipeline()
findCube = (
    p1
    | "Create Element" >> Create(["Hello", "Welcome"])
    # The step label names the real operation, so the pipeline graph and any
    # error message point at an upper-casing step rather than a cube step.
    | "To Upper Case" >> Map(lambda x: x.upper())
    | "Print" >> Map(print)
)
p1.run()

HELLO
WELCOME


<apache_beam.runners.portability.fn_api_runner.fn_runner.RunnerResult at 0x784fbcb810d0>

In [7]:
import apache_beam as beam
from apache_beam import Create, Map

# Create() on a dict produces one (key, value) pair per entry; index 1 of each
# pair is the value, which is what gets printed.
# (The result variable keeps its original name so other cells are unaffected.)
p1 = beam.Pipeline()
findCube = (
    p1
    | "Create Element" >> Create({"foo": "bar", "hello": "welcome"})
    # The step label names the real operation: keep only the value of each pair.
    | "Extract Value" >> Map(lambda x: x[1])
    | "Print" >> Map(print)
)
p1.run()

bar
welcome


<apache_beam.runners.portability.fn_api_runner.fn_runner.RunnerResult at 0x784fbdefc0d0>

**Flatten in Apache Beam**

In [8]:
import apache_beam as beam
from apache_beam.runners.interactive.interactive_runner import InteractiveRunner
import apache_beam.runners.interactive.interactive_beam as ib

# Build the pipeline on the InteractiveRunner so its graph can be rendered below.
p1 = beam.Pipeline(InteractiveRunner())

# Register the notebook variables with Interactive Beam.
ib.watch(locals())

# Source data, kept as lists (not sets).
odd = [1, 3, 5, 7]
even = [2, 4, 6, 8]

# One PCollection per list.
# Note: appending `| "print" >> beam.Map(print)` to even_p1 would turn it into a
# PCollection of four None values (print returns None). The even numbers would
# still show up on screen, because that print step runs before Flatten does.
even_p1 = p1 | "Create Even" >> beam.Create(even)
odd_p1 = p1 | "Create Odd" >> beam.Create(odd)

# Merge both PCollections into one and print every element.
# Order is not preserved by Flatten.
flatten = (
    (even_p1, odd_p1)
    | "Flatten PCollections" >> beam.Flatten()
    | "Print Flattened" >> beam.Map(print)
)

# Render the pipeline graph, then execute the pipeline.
ib.show_graph(p1)
p1.run()


2
4
6
8
1
3
5
7


<apache_beam.runners.interactive.interactive_runner.PipelineResult at 0x784fbcc1f590>

**Map vs FlatMap**

In [9]:
import apache_beam as beam
from apache_beam import Create,Map
# New pipeline for the Map example; no runner argument is passed to beam.Pipeline().
p1=beam.Pipeline()
def _find_cube(x):
  return x*x*x
# Map with a named function instead of a lambda: cube 0..9 and print each result.
find_cube = (
    p1
    | "Create Element" >> Create(range(10))
    | "Cube" >> Map(_find_cube)
    | "Print" >> beam.Map(print)
)
p1.run()

0
1
8
27
64
125
216
343
512
729


<apache_beam.runners.portability.fn_api_runner.fn_runner.RunnerResult at 0x784fbe0a0d50>

In [10]:
import apache_beam as beam
from apache_beam import Create,Map
# Map emits exactly one output per input, so each sentence becomes ONE list of words.
p2 = beam.Pipeline()

sentences = p2 | "Create Element" >> Create(["foo bar", "hello world"])
word_lists = sentences | "Split" >> Map(lambda sentence: sentence.split())
find_cube = word_lists | "Print" >> beam.Map(print)

p2.run()

['foo', 'bar']
['hello', 'world']


<apache_beam.runners.portability.fn_api_runner.fn_runner.RunnerResult at 0x784fbc89d550>

In [11]:
# Using Flatmap

import apache_beam as beam
from apache_beam import Create,Map,FlatMap
p2 = beam.Pipeline()

find_cube = (
    p2
    | "Create Element" >> Create(["foo bar", "hello world"])
    # Still one list of words per sentence at this point ...
    | "Split" >> Map(lambda sentence: sentence.split())
    # ... FlatMap then emits every item of each list as its own element.
    | "Flatmap" >> FlatMap(lambda words: words)
    | "Print" >> beam.Map(print)
)
p2.run()

foo
bar
hello
world


<apache_beam.runners.portability.fn_api_runner.fn_runner.RunnerResult at 0x784fbc9e3ad0>

**Filter**

In [12]:
import apache_beam as beam
from apache_beam import Create,Filter,Map

# Keep only the even numbers from 0..19 and print them.
p1 = beam.Pipeline()

filter_even = (
    p1
    | "Create Range" >> Create(range(20))
    | "Filter" >> Filter(lambda n: n % 2 == 0)
    | "print" >> Map(print)
)
p1.run()


0
2
4
6
8
10
12
14
16
18


<apache_beam.runners.portability.fn_api_runner.fn_runner.RunnerResult at 0x784fbccf9190>

**ParDo**

In [13]:
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.coders import StrUtf8Coder  # ✅ Import coders to avoid NameError
from apache_beam.transforms.userstate import BagStateSpec, ReadModifyWriteStateSpec
from apache_beam.coders import VarIntCoder # Import the required coder
from apache_beam.transforms.window import FixedWindows
import datetime

# 1. Basic processing: multiply each element by 2
class MultiplyByTwo(beam.DoFn):
    """DoFn that emits every input element multiplied by two."""

    def process(self, element):
        doubled = element * 2
        yield doubled

# 2. Basic processing with an extra argument: multiply each element by an external factor
class MultiplyByFactor(beam.DoFn):
    """DoFn that emits every input element multiplied by ``factor``.

    ``factor`` is an additional argument of ``process``; it has no default, so it
    must be supplied when the DoFn is applied.
    """

    def process(self, element, factor):
        scaled = element * factor
        yield scaled



# 3. Emitting multiple values per element
class EmitMultiple(beam.DoFn):
    """DoFn that emits two outputs per input: the element and the element times 10."""

    def process(self, element):
        # First output: the element unchanged.
        yield element
        # Second output: the transformed version.
        tenfold = element * 10
        yield tenfold

# 4. Working with timestamps
class PrintTimestamp(beam.DoFn):
    """DoFn that emits a string describing each element and its timestamp.

    The ``timestamp`` parameter defaults to ``beam.DoFn.TimestampParam``.
    """

    def process(self, element, timestamp=beam.DoFn.TimestampParam):
        description = f"Element: {element}, Timestamp: {timestamp}"
        yield description

# 5. Using tagged outputs (splitting data)
class CategorizeNumbers(beam.DoFn):
    """DoFn that wraps each element in a ``TaggedOutput`` tagged 'even' or 'odd'."""

    def process(self, element):
        # Elements divisible by 2 get the 'even' tag, all others the 'odd' tag.
        tag = 'even' if element % 2 == 0 else 'odd'
        yield beam.pvalue.TaggedOutput(tag, element)


# ✅ Create Pipeline with Options
options = PipelineOptions()
with beam.Pipeline(options=options) as p:

    # ---- Sources ----------------------------------------------------------
    numbers = p | "Create Numbers" >> beam.Create([1, 2, 3, 4, 5, 6])
    strings = p | "Create String" >> beam.Create(["foo bar", "Hellow World"])

    # ---- Transforms -------------------------------------------------------
    # Step 1: basic ParDo with a DoFn class.
    multiplied = numbers | "Multiply by 2" >> beam.ParDo(MultiplyByTwo())

    # Step 2: ParDo with an extra argument handed to the DoFn.
    factor = 10
    multiplied_factor = (
        numbers
        | "Multiply by Factor" >> beam.ParDo(MultiplyByFactor(), factor)
    )

    # Step 3: ParDo over a callable that returns a list, used like FlatMap.
    faltten_word = strings | "Flatmap" >> beam.ParDo(lambda x: x.split(" "))

    # Step 4: emit both the original and the transformed value.
    emmit_multi = numbers | "Emmit Multiple" >> beam.ParDo(EmitMultiple())

    # Step 5: describe each doubled element together with its timestamp.
    timestamped = multiplied | "Get Timestamp" >> beam.ParDo(PrintTimestamp())

    # Step 6: split into even and odd numbers using tagged outputs.
    categorized = (
        numbers
        | "Categorize Numbers"
        >> beam.ParDo(CategorizeNumbers()).with_outputs('even', 'odd')
    )

    # ---- Sinks (all disabled) ----------------------------------------------
    # Uncomment a line to print the corresponding PCollection.
    #multiplied | "Multiplied" >> beam.Map(print)
    #multiplied_factor | "Multiplied Factor" >> beam.Map(print)
    #faltten_word | "Flatten print" >> beam.Map(print)
    #emmit_multi | "Emmit print" >> beam.Map(print)
    #timestamped | "Timestamped print" >> beam.Map(print)
    #categorized.even | "Print Even Numbers" >> beam.Map(lambda x: print(f"Even: {x}"))
    #categorized.odd | "Print Odd Numbers" >> beam.Map(lambda x: print(f"Odd: {x}"))







**Keys, Values, ToString, KvSwap**

In [14]:
# NOTE(review): POSIX_FADV_SEQUENTIAL is not referenced anywhere in this cell and
# is presumably only available in `os` on some platforms — this looks like an
# accidental auto-import. Confirm and remove.
from os import POSIX_FADV_SEQUENTIAL
import apache_beam as beam
from apache_beam import Create,Map

p1 = beam.Pipeline()

findcube = (
    p1
    | "Create Element" >> Create({"foo": "bar", "Hello": "World"})
    # Alternative steps — enable one of them in place of "String" to try it out:
    # | "All Keys" >> beam.Keys()
    # | "All Values" >> beam.Values()
    # | "String" >> beam.ToString().Element()  # formats each element as a string
    # | "String" >> beam.ToString().Kvs()      # formats each key-value pair as a string
    # | "Kvswap" >> beam.KvSwap()              # swaps keys and values
    # Active step: str() on every element (works for numbers as well).
    | "String" >> beam.Map(str)
    | "print" >> Map(print)
)

p1.run()

('foo', 'bar')
('Hello', 'World')


<apache_beam.runners.portability.fn_api_runner.fn_runner.RunnerResult at 0x784fbcbfc790>

**Partition**

In [15]:
%pip install sympy
# %pip (rather than !pip) is used so the package is installed into the same kernel
# environment in which Apache Beam is installed.



In [16]:
import sympy
import apache_beam as beam
from apache_beam import Create,Map

p1 = beam.Pipeline()


def is_prime_function(element, no_par):
    """Partition function: return 1 when ``element`` is prime, otherwise 0.

    ``no_par`` is the second argument of the partition function (presumably the
    partition count handed to ``beam.Partition``); it is not used here because
    the two partition indices are fixed.
    """
    return 1 if sympy.isprime(element) else 0


is_prime = (
    p1
    | "Create Element" >> Create(range(10))
    | "Partition" >> beam.Partition(is_prime_function, 2)
)

# The step labels must follow the indices returned by is_prime_function:
# partition 1 receives the primes, partition 0 everything else.
is_prime[1] | "Print Prime Number" >> Map(print)
is_prime[0] | "Print Non Prime Number" >> Map(print)
p1.run()

0
1
2
3
4
5
6
7
8
9


<apache_beam.runners.portability.fn_api_runner.fn_runner.RunnerResult at 0x784fb6198b50>

In [17]:
import apache_beam as beam

# Sample function to simulate percentile calculation
def get_percentile(student):
    """Return the value stored under the ``"percentile"`` key of ``student``."""
    return student["percentile"]


# Partitioning function
def partition_fn(student, num_partitions):
    """Map a student's percentile (0-100) to a partition index.

    Args:
        student: record with a ``"percentile"`` entry.
        num_partitions: number of partitions; valid indices are
            ``0 .. num_partitions - 1``.

    Returns:
        ``int(percentile * num_partitions / 100)``, clamped to the valid index
        range. Without the clamp a percentile of exactly 100 yields
        ``num_partitions``, which is one past the last valid partition.
    """
    bucket = int(get_percentile(student) * num_partitions / 100)
    # Clamp so that boundary values (100, or anything outside 0-100) still
    # select an existing partition.
    return min(max(bucket, 0), num_partitions - 1)

# Pipeline: split the students into 10 partitions and print one of them.
with beam.Pipeline() as p:
    student_records = [
        {"name": "Alice", "percentile": 92},
        {"name": "Bob", "percentile": 45},
        {"name": "Charlie", "percentile": 78},
        {"name": "David", "percentile": 30},
    ]
    students = p | "Create Students" >> beam.Create(student_records)

    # partition_fn decides which of the 10 partitions each student goes to.
    by_decile = (
        students
        | "Partition Students" >> beam.Partition(partition_fn, 10)
    )

    # Print the students that landed in partition index 4.
    fortieth_percentile = (
        by_decile[4]
        | "Print 40th Percentile" >> beam.Map(print)
    )


{'name': 'Bob', 'percentile': 45}


**Regex**

In [18]:
import apache_beam as beam
from apache_beam import Create,Map,Regex

# Apply Regex.matches with a digits pattern; the recorded run printed "1" and "2".
p1 = beam.Pipeline()

findnumber = (
    p1
    | "Create Element" >> Create(["1", "2", "Hello", "welcome"])
    | "Regex" >> beam.Regex.matches("[0-9]+")
    | "Print" >> Map(print)
)
p1.run()


1
2


<apache_beam.runners.portability.fn_api_runner.fn_runner.RunnerResult at 0x784fbdfc8190>

In [19]:
import apache_beam as beam
from apache_beam import Create,Map,Regex

# Same input, lower-case letters pattern. The recorded run printed only "welcome":
# "Hello" begins with an upper-case letter, which [a-z] does not cover.
p1 = beam.Pipeline()

findnumber = (
    p1
    | "Create Element" >> Create(["1", "2", "Hello", "welcome"])
    | "Regex" >> beam.Regex.matches("[a-z]+")
    | "Print" >> Map(print)
)
p1.run()

welcome


<apache_beam.runners.portability.fn_api_runner.fn_runner.RunnerResult at 0x784fbcb4d610>