In [19]:
import apache_beam as beam
import os

# The text sink appends a shard suffix to the "voos_text.txt" prefix; remove the
# shard left by a previous run so the cell starts from a clean output.
if os.path.isfile('voos_text.txt-00000-of-00001'):
    os.remove('voos_text.txt-00000-of-00001')

# beam.Map emits exactly one output element per input element: here every CSV
# line becomes a single list holding its comma-separated fields.
# Leaving the `with` block runs the pipeline and waits for it to finish, so the
# output file is complete before any later cell executes.
with beam.Pipeline() as p:
    p_collection = (
        p
        | "Importar Dados" >> beam.io.ReadFromText("voos_sample.csv", skip_header_lines=1)
        | "Separar por Virgulas" >> beam.Map(lambda record: record.split(','))
        | "Escrever Resultados" >> beam.io.WriteToText("voos_text.txt")
    )

<apache_beam.runners.portability.fn_api_runner.fn_runner.RunnerResult at 0x19851e58380>

In [5]:
import apache_beam as beam

# Use a pipeline owned by this cell instead of the global `p` left behind by
# another cell: adding transforms to an already-built pipeline would re-run
# that cell's steps as well, and executing this cell twice would fail because
# the "Lista" label would already exist in the pipeline.
with beam.Pipeline() as lista_pipeline:
    (
        lista_pipeline
        | "Lista" >> beam.Create([[1, 2, 3], [3, 4, 6]])
        | "Print Lista" >> beam.Map(print)
    )

[1, 2, 3]
[3, 4, 6]


<apache_beam.runners.portability.fn_api_runner.fn_runner.RunnerResult at 0x1984fc68aa0>

In [None]:
import apache_beam as beam
import os

# Remove the shard written by a previous run so the result file is rebuilt
# from scratch.
if os.path.isfile('resultado_poema.txt-00000-of-00001'):
    os.remove('resultado_poema.txt-00000-of-00001')

# beam.FlatMap may emit several output elements per input element: every item
# produced by split(' ') becomes its own element in the output.
# Leaving the `with` block runs the pipeline and waits for completion.
with beam.Pipeline() as p:
    p_collection = (
        p
        | beam.io.ReadFromText("poema.txt")
        | beam.FlatMap(lambda record: record.split(' '))
        | beam.io.WriteToText("resultado_poema.txt")
    )

<apache_beam.runners.portability.fn_api_runner.fn_runner.RunnerResult at 0x19851e590d0>

In [22]:
import apache_beam as beam
import os

# Split every line into words, keep only the words of interest and print them.
# The `with` block runs the pipeline on exit and blocks until it has finished,
# instead of relying on a bare run() call that is never awaited.
with beam.Pipeline() as p:
    p_collection = (
        p
        | beam.io.ReadFromText("poema.txt")
        | beam.FlatMap(lambda record: record.split(' '))
        | beam.Filter(lambda value: value in ['quatro', 'um'])
        | beam.Map(print)
    )

quatro
quatro
um
quatro
quatro
um


<apache_beam.runners.portability.fn_api_runner.fn_runner.RunnerResult at 0x19851e596a0>

In [41]:
import apache_beam as beam
import os

# Pipeline shared by the sum and count aggregation branches built below.
p = beam.Pipeline()


class FilterWithoutDelays(beam.DoFn):
    """Keep only the records whose field at index 8 is a positive integer.

    NOTE(review): despite the class name, this keeps the records that HAVE a
    positive value in that field (presumably the delay) and drops the others.
    The name is kept so existing references keep working.
    """

    def process(self, element, *args, **kwargs):
        """Yield ``element`` when ``int(element[8])`` is greater than zero.

        Args:
            element: Indexable record of fields; index 8 is parsed as an int.
            *args: Accepted for signature compatibility; unused.
            **kwargs: Accepted for signature compatibility; unused.

        Yields:
            The unchanged ``element`` when the parsed value is positive.
            Records that are too short, or whose field is blank or not
            numeric, are dropped instead of aborting the whole pipeline.
        """
        try:
            delay = int(element[8])
        except (IndexError, ValueError):
            # Short row or unparsable field: there is nothing to compare.
            return
        if delay > 0:
            yield element

def process_data(pipeline: beam.Pipeline, type_of_pipeline: str):
    """Attach the read / split / filter / pair steps to ``pipeline``.

    Args:
        pipeline: Pipeline the transforms are applied to.
        type_of_pipeline: Text embedded in every transform label, which keeps
            the labels distinct when this function is applied more than once
            to the same pipeline.

    Returns:
        The result of the last step: one ``(record[4], int(record[8]))``
        tuple per record that passed ``FilterWithoutDelays``.
    """

    def split_fields(record):
        # One text line -> list of its comma-separated fields.
        return record.split(',')

    def to_key_value(record):
        # Key is field 4, value is field 8 converted to int.
        return (record[4], int(record[8]))

    lines = pipeline | f"Importar Dados ({type_of_pipeline})" >> beam.io.ReadFromText(
        "voos_sample.csv", skip_header_lines=1
    )
    fields = lines | f"Separar por Virgulas ({type_of_pipeline})" >> beam.Map(split_fields)
    kept = fields | f"Voos sem atraso ({type_of_pipeline})" >> beam.ParDo(FilterWithoutDelays())
    return kept | f"Criar par ({type_of_pipeline})" >> beam.Map(to_key_value)

# GROUP BY + SUM: total of the paired values for each key.
soma_atrasos = (
    process_data(p, "Soma Atrasos")
    | "Somar por key" >> beam.CombinePerKey(sum)
)

# GROUP BY + COUNT: number of paired records for each key.
contagem_atrasos = (
    process_data(p, "Contagem Atrasos")
    | "Contar por key" >> beam.combiners.Count.PerKey()
)

# Join both aggregations by key. Each printed element has the form
# (key, {'contagem_atrasos': [count], 'soma_atrasos': [total]}).
tabela_atrasos = (
    { 'contagem_atrasos': contagem_atrasos, 'soma_atrasos': soma_atrasos }
    | 'Group By' >> beam.CoGroupByKey()
    | beam.Map(print)
)

# run() only starts the job and may return before it is done on non-blocking
# runners, so wait explicitly for completion.
p.run().wait_until_finish()

('LAX', {'contagem_atrasos': [4], 'soma_atrasos': [92]})
('HNL', {'contagem_atrasos': [1], 'soma_atrasos': [15]})
('DFW', {'contagem_atrasos': [1], 'soma_atrasos': [95]})
('OGG', {'contagem_atrasos': [1], 'soma_atrasos': [138]})
('JFK', {'contagem_atrasos': [4], 'soma_atrasos': [220]})


<apache_beam.runners.portability.fn_api_runner.fn_runner.RunnerResult at 0x19852a07620>