In [8]:
# %pip install -q apache_beam  # uncomment to install Apache Beam on a fresh runtime (e.g. Colab)

**ParDo**

• ParDo is a Beam transform for generic parallel processing.

• The ParDo processing paradigm is similar to the “Map” phase of a Map/Shuffle/Reduce-style algorithm: a ParDo transform considers each element in the input PCollection, performs some processing function (your user code) on that element, and emits zero, one, or multiple elements to an output PCollection.

In [5]:
import apache_beam as beam

In [7]:
class SplitRow(beam.DoFn):
  """DoFn that turns one comma-separated text line into a list of its fields."""

  def process(self, element):
    """Split ``element`` on ',' and emit the resulting field list as one output.

    Args:
      element: a single input line (a string).

    Returns:
      A one-item list whose only item is the list of fields.
    """
    fields = element.split(',')
    # Wrapped in an outer list so the whole field list is emitted as ONE
    # element, rather than each field becoming its own element.
    return [fields]


class ComputeWordLengthFn(beam.DoFn):
  """DoFn that emits ``len()`` of every input element."""

  def process(self, element):
    """Measure ``element`` and emit that single number.

    Args:
      element: any object supporting ``len()`` (a string, a list of fields, ...).

    Returns:
      A one-item list containing ``len(element)``.
    """
    size = len(element)
    return [size]


In [10]:
with beam.Pipeline() as pipeline:
  # Read the comma-separated text file, drop its header row, and split every
  # remaining line into a list of fields.
  # skip_header_lines is a line COUNT, so pass the integer 1 rather than True.
  input_data = (pipeline
                | "read from text" >> beam.io.ReadFromText(
                    '/content/sample_data/students.txt', skip_header_lines=1)
                | "splitting the record" >> beam.ParDo(SplitRow()))

  # Keep only the records whose sixth field is "FAIL". The length guard makes
  # blank or short lines get filtered out instead of aborting the whole
  # pipeline with an IndexError on record[5].
  count_data = (input_data
                | "filtering the data with 'FAIL'" >> beam.Filter(
                    lambda record: len(record) > 5 and record[5] == "FAIL"))

  # Emit len(record) -- the number of fields -- for every failing record.
  word_lengths = (count_data
                  | "count of records" >> beam.ParDo(ComputeWordLengthFn()))

  # Persist the per-record lengths under result/count_data-*.
  counted_data = (word_lengths
                  | "write counted data to Text" >> beam.io.WriteToText("result/count_data"))

  # Persist the failing records themselves under result/fail_data-*.
  output_data = (count_data
                 | "write to Text" >> beam.io.WriteToText("result/fail_data"))


In [13]:
!{('head -n 10 result/count_data-00000-of-00001')}

6
6
6
6


In [14]:
!{('head -n 10 /content/result/fail_data-00000-of-00001')}

['1', 'vignesh', 'chn', '27', '15', 'FAIL']
['2', 'joey', 'us', '51', '20', 'FAIL']
['6', 'sree', 'koc', '25', '27', 'FAIL']
['9', 'tinkle', 'ker', '27', '9', 'FAIL']


**Keys**



* Takes a collection of key-value pairs and returns the key of each element.



In [17]:
with beam.Pipeline() as pipeline:
  # (icon, name) pairs used as the in-memory input collection.
  plant_pairs = [
      ('🍓', 'Strawberry'),
      ('🥕', 'Carrot'),
      ('🍆', 'Eggplant'),
      ('🍅', 'Tomato'),
      ('🥔', 'Potato'),
  ]

  # Keys() keeps only the first item of each pair; each key is then printed.
  icons_fruit = (
      pipeline
      | "Garden plants" >> beam.Create(plant_pairs)
      | "Keys" >> beam.Keys()
      | beam.Map(print)
  )


🍓
🥕
🍆
🍅
🥔


**Values:**



* Takes a collection of key-value pairs and returns the value of each element.



In [19]:
with beam.Pipeline() as pipeline:
  # (icon, name) pairs used as the in-memory input collection.
  plant_pairs = [
      ('🍓', 'Strawberry'),
      ('🥕', 'Carrot'),
      ('🍆', 'Eggplant'),
      ('🍅', 'Tomato'),
      ('🥔', 'Potato'),
  ]

  # Values() keeps only the second item of each pair; each value is then printed.
  values = (
      pipeline
      | "Garten plants" >> beam.Create(plant_pairs)
      | "Values" >> beam.Values()
      | beam.Map(print)
  )

Strawberry
Carrot
Eggplant
Tomato
Potato


**ToString**

Transforms every element in an input collection to a string. Any non-string element can be converted to a string using standard Python functions and methods. Many I/O transforms, such as textio.WriteToText, expect their input elements to be strings.



1.  Key-value pairs to string.
2.  Elements to string.
3. Iterables to string.



In [25]:
with beam.Pipeline() as pipeline:
  # (icon, name) pairs used as the in-memory input collection.
  plant_pairs = [
      ('🍓', 'Strawberry'),
      ('🥕', 'Carrot'),
      ('🍆', 'Eggplant'),
      ('🍅', 'Tomato'),
      ('🥔', 'Potato'),
  ]

  # ToString.Kvs() renders each (key, value) pair as the string "key,value".
  # The other variants listed above can be swapped in for the "To string" step:
  #   beam.ToString.Element()    -- for plain elements
  #   beam.ToString.Iterables()  -- for iterables
  plants = (
      pipeline
      | "Garden plant" >> beam.Create(plant_pairs)
      | "To string" >> beam.ToString.Kvs()
      | beam.Map(print)
  )

🍓,Strawberry
🥕,Carrot
🍆,Eggplant
🍅,Tomato
🥔,Potato


**KvSwap**

• Takes a collection of key-value pairs and returns a collection of key-value pairs which has each **key and value swapped**.

In [28]:
with beam.Pipeline() as pipeline:
  # (icon, name) pairs used as the in-memory input collection.
  plant_pairs = [
      ('🍓', 'Strawberry'),
      ('🥕', 'Carrot'),
      ('🍆', 'Eggplant'),
      ('🍅', 'Tomato'),
      ('🥔', 'Potato'),
  ]

  # KvSwap() turns each (key, value) into (value, key); each pair is then printed.
  plants = (
      pipeline
      | 'Garden plants' >> beam.Create(plant_pairs)
      | "key-value swap" >> beam.KvSwap()
      | beam.Map(print)
  )

('Strawberry', '🍓')
('Carrot', '🥕')
('Eggplant', '🍆')
('Tomato', '🍅')
('Potato', '🥔')
