In [None]:
!pip install apache-beam

# Apache Beam Data Processing Tutorial
This notebook contains tasks for formatting and transforming customer data using Apache Beam.


In [None]:
# Step 1: Import necessary libraries
import apache_beam as beam
import csv
from datetime import datetime

## Task 1: Data Formatting
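The goal is to read the `users_v.csv` file, reformat each record, and write the result as semicolon-delimited rows: the join date is converted from `MM/DD/YYYY` to `YYYY-MM-DD`, and the address parts are joined with commas and no spaces.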


In [None]:
class FormatRow(beam.DoFn):
    """Turn one parsed CSV record into a semicolon-delimited output line."""

    def process(self, element):
        """Yield the formatted line for a single record.

        Args:
            element: A sequence that unpacks into exactly six values, in the
                order user_id, name, gender, age, address, date_joined.

        Yields:
            One string of the form
            ``user_id;name;gender;age;address;date_joined``.

        Raises:
            ValueError: If ``element`` does not hold exactly six values or the
                date does not match ``%m/%d/%Y``.
            IndexError: If the address has fewer than three ``', '``-separated
                parts.
        """
        user_id, name, gender, age, address, date_joined = element

        # Re-express the MM/DD/YYYY join date as YYYY-MM-DD.
        parsed_date = datetime.strptime(date_joined, '%m/%d/%Y')
        iso_date = parsed_date.strftime('%Y-%m-%d')

        # Keep the first three address parts and drop the space after each comma.
        pieces = address.split(', ')
        compact_address = '{},{},{}'.format(pieces[0], pieces[1], pieces[2])

        yield '{};{};{};{};{};{}'.format(
            user_id, name, gender, age, compact_address, iso_date)

def run_pipeline(input_path='users_v.csv', output_prefix='marketing_format',
                 skip_header_lines=0):
    """Read the raw user CSV, format every row, and write one output file.

    The result is written to exactly ``<output_prefix>.csv`` (no shard suffix)
    and starts with a column-name line, because the aggregation cells read
    ``marketing_format.csv`` and skip its first line.

    Args:
        input_path: Path of the comma-delimited input file.
        output_prefix: Output path without the ``.csv`` suffix.
        skip_header_lines: Number of leading lines of the input to ignore.
            Defaults to 0 (every line is treated as data); set to 1 if the
            input file starts with a column-name row.
    """
    # Column order matches the line emitted by FormatRow.
    output_header = 'user_id;name;gender;age;address;date_joined'

    # Leaving the context manager runs the pipeline and waits for it to finish.
    with beam.Pipeline() as pipeline:
        (pipeline
         | 'Read CSV' >> beam.io.ReadFromText(
             input_path, skip_header_lines=skip_header_lines)
         | 'Parse CSV' >> beam.Map(lambda line: next(csv.reader([line])))
         | 'Format Data' >> beam.ParDo(FormatRow())
         | 'Write to CSV' >> beam.io.WriteToText(
             output_prefix,
             file_name_suffix='.csv',
             # An empty template disables the default '-SSSSS-of-NNNNN' shard
             # suffix, so a single file named '<output_prefix>.csv' is written.
             shard_name_template='',
             header=output_header))

run_pipeline()

## Task 2: Aggregations
We will now compute gender composition, daily account creations, and customer distribution by state.


In [None]:
class CountGender(beam.DoFn):
    """Extract the gender field from a parsed six-field record."""

    def process(self, element):
        """Yield the third value of ``element``.

        ``element`` must unpack into exactly six values; any other length
        raises ``ValueError``.
        """
        (_user_id, _name, gender,
         _age, _address, _date_joined) = element
        yield gender

def gender_composition(pipeline):
    """Attach the gender-count transforms to ``pipeline``.

    Reads ``marketing_format.csv`` (skipping its first line), parses each line
    as semicolon-delimited, counts the occurrences of each gender value, and
    writes the counts to files prefixed ``gender_count`` with a ``.csv`` suffix.

    This only builds the graph; the caller is responsible for running the
    pipeline.

    Args:
        pipeline: The ``beam.Pipeline`` to add the transforms to.
    """
    (pipeline
     | 'Read File' >> beam.io.ReadFromText('marketing_format.csv', skip_header_lines=1)
     | 'Parse CSV' >> beam.Map(lambda line: next(csv.reader([line], delimiter=';')))
     | 'Get Gender' >> beam.ParDo(CountGender())
     | 'Count Per Gender' >> beam.combiners.Count.PerElement()
     | 'Write Gender Count' >> beam.io.WriteToText('gender_count', file_name_suffix='.csv'))

# Constructing a Pipeline and adding transforms does not execute anything;
# leaving the `with` block runs the pipeline and waits for it to finish.
with beam.Pipeline() as pipeline:
    gender_composition(pipeline)

In [None]:
class CountDateJoined(beam.DoFn):
    """Extract the join-date field from a parsed six-field record."""

    def process(self, element):
        """Yield the sixth (last) value of ``element``.

        ``element`` must unpack into exactly six values; any other length
        raises ``ValueError``.
        """
        (_user_id, _name, _gender,
         _age, _address, date_joined) = element
        yield date_joined

def customers_per_day(pipeline):
    """Attach the per-join-date count transforms to ``pipeline``.

    Reads ``marketing_format.csv`` (skipping its first line), parses each line
    as semicolon-delimited, counts the occurrences of each join date, and
    writes the counts to files prefixed ``date_joined_count`` with a ``.csv``
    suffix.

    This only builds the graph; the caller is responsible for running the
    pipeline.

    Args:
        pipeline: The ``beam.Pipeline`` to add the transforms to.
    """
    (pipeline
     | 'Read File' >> beam.io.ReadFromText('marketing_format.csv', skip_header_lines=1)
     | 'Parse CSV' >> beam.Map(lambda line: next(csv.reader([line], delimiter=';')))
     | 'Get Date Joined' >> beam.ParDo(CountDateJoined())
     | 'Count Per Date' >> beam.combiners.Count.PerElement()
     | 'Write Date Count' >> beam.io.WriteToText('date_joined_count', file_name_suffix='.csv'))

# Constructing a Pipeline and adding transforms does not execute anything;
# leaving the `with` block runs the pipeline and waits for it to finish.
with beam.Pipeline() as pipeline:
    customers_per_day(pipeline)

In [None]:
class CountState(beam.DoFn):
    """Extract the state from the address field of a parsed six-field record."""

    def process(self, element):
        """Yield the second comma-separated part of the address.

        ``element`` must unpack into exactly six values (``ValueError``
        otherwise), and the address must contain at least one comma
        (``IndexError`` otherwise).
        """
        (_user_id, _name, _gender,
         _age, address, _date_joined) = element
        address_fields = address.split(',')
        # NOTE(review): assumes the state is the second address part -- confirm
        # against the actual address layout in the input data.
        yield address_fields[1]

def customers_per_state(pipeline):
    """Attach the per-state count transforms to ``pipeline``.

    Reads ``marketing_format.csv`` (skipping its first line), parses each line
    as semicolon-delimited, counts the occurrences of each value emitted by
    ``CountState``, and writes the counts to files prefixed ``state_count``
    with a ``.csv`` suffix.

    This only builds the graph; the caller is responsible for running the
    pipeline.

    Args:
        pipeline: The ``beam.Pipeline`` to add the transforms to.
    """
    (pipeline
     | 'Read File' >> beam.io.ReadFromText('marketing_format.csv', skip_header_lines=1)
     | 'Parse CSV' >> beam.Map(lambda line: next(csv.reader([line], delimiter=';')))
     | 'Get State' >> beam.ParDo(CountState())
     | 'Count Per State' >> beam.combiners.Count.PerElement()
     | 'Write State Count' >> beam.io.WriteToText('state_count', file_name_suffix='.csv'))

# Constructing a Pipeline and adding transforms does not execute anything;
# leaving the `with` block runs the pipeline and waits for it to finish.
with beam.Pipeline() as pipeline:
    customers_per_state(pipeline)