In [2]:
# Question 4.1 Join the input files as streams using CoGroupByKey
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions

def analyze_purchases():
    """
    Analyze customer purchasing behavior by joining user and order data.

    Reads ``users_v.csv`` (comma-separated: id, name, gender, age, address,
    date joined) and ``orders_v_2022.csv`` (semicolon-separated: order number,
    user id, comma-separated product list, purchase date), joins them on the
    user id with ``CoGroupByKey`` and prints four insights to the console:

    1. the 10 most purchased products overall,
    2. the top 5 products per gender,
    3. the top 5 products per age group,
    4. the top 5 products per state (taken from the user's address).

    User ids that lack either a user record or any order are dropped before
    the analytics run. Nothing is returned; all results are printed.
    """
    options = PipelineOptions(runner='DirectRunner')

    def get_age_group(age):
        """Return the reporting bucket label for an integer age."""
        # Minors get their own bucket so they are not lumped into '65+'.
        if age < 18:
            return 'Under 18'
        if age <= 24:
            return '18-24'
        if age <= 34:
            return '25-34'
        if age <= 44:
            return '35-44'
        if age <= 54:
            return '45-54'
        if age <= 64:
            return '55-64'
        return '65+'

    def get_state(address):
        """Return the second-to-last '-'-separated part of an address."""
        parts = address.split('-')
        # An address without any '-' has no second-to-last part; report it
        # as 'Unknown' instead of failing the whole pipeline.
        return parts[-2] if len(parts) >= 2 else 'Unknown'

    def products_keyed_by(user_key):
        """Build a FlatMap callable emitting ((group, product), 1) per purchased product."""
        def expand(element):
            joined = element[1]
            # The join filter guarantees at least one user record per key.
            group = user_key(joined['users'][0])
            return [
                ((group, product), 1)
                for order in joined['orders']
                for product in order['product_list']
            ]
        return expand

    def regroup(element):
        """Turn ((group, product), count) into (group, (product, count))."""
        (group, product), count = element
        return group, (product, count)

    def top_five(element):
        """Keep the five (product, count) pairs with the highest count for a group."""
        group, counted_products = element
        ranked = sorted(counted_products, key=lambda pc: pc[1], reverse=True)
        return group, ranked[:5]

    with beam.Pipeline(options=options) as p:
        # 1. Read and parse users_v.csv into (user_id, user_record)
        users = (
            p
            | 'ReadUsers' >> beam.io.ReadFromText('users_v.csv', skip_header_lines=1)
            | 'SplitUsers' >> beam.Map(lambda line: line.split(','))
            | 'FilterBlankUsers' >> beam.Filter(lambda fields: len(fields) > 5)
            | 'FormatUsers' >> beam.Map(lambda fields: (fields[0], {
                'name': fields[1],
                'gender': fields[2],
                'age': int(fields[3]),
                'address': fields[4],
                'date_joined': fields[5]
            }))
        )

        # 2. Read and parse orders_v_2022.csv into (user_id, order_record)
        orders = (
            p
            | 'ReadOrders' >> beam.io.ReadFromText('orders_v_2022.csv', skip_header_lines=1)
            | 'SplitOrders' >> beam.Map(lambda line: line.split(';'))
            | 'FilterBlankOrders' >> beam.Filter(lambda fields: len(fields) > 3)
            | 'FormatOrders' >> beam.Map(lambda fields: (fields[1], {
                'order_no': fields[0],
                'product_list': [product.strip() for product in fields[2].split(',')],
                'date_purchased': fields[3]
            }))
        )

        # 3. Join the PCollections and keep only ids present on both sides
        combined_data = (
            {'users': users, 'orders': orders}
            | 'GroupByUser' >> beam.CoGroupByKey()
            | 'FilterEmptyOrders' >> beam.Filter(
                lambda element: len(element[1]['orders']) > 0 and len(element[1]['users']) > 0
            )
        )

        def top_products_per(name, user_key):
            """Return (group, top-5 [(product, count), ...]) for the grouping given by user_key."""
            return (
                combined_data
                | f'Extract{name}Products' >> beam.FlatMap(products_keyed_by(user_key))
                | f'Count{name}Products' >> beam.CombinePerKey(sum)
                | f'RegroupBy{name}' >> beam.Map(regroup)
                | f'GroupProductsBy{name}' >> beam.GroupByKey()
                | f'Top5ProductsBy{name}' >> beam.Map(top_five)
            )

        # 4. Perform Analytics

        # Insight 1: Popular Products
        # Top.Of emits a single element: the list of the 10 best (product, count) pairs.
        product_counts = (
            combined_data
            | 'ExtractProducts' >> beam.FlatMap(
                lambda element: [(product, 1) for order in element[1]['orders'] for product in order['product_list']]
            )
            | 'CountProducts' >> beam.CombinePerKey(sum)
            | 'SortProducts' >> beam.transforms.combiners.Top.Of(10, key=lambda x: x[1])
        )

        # Insight 2: Purchase behavior by gender
        gender_purchases = top_products_per('Gender', lambda user: user['gender'])

        # Insight 3: Purchase behavior by age group
        age_group_purchases = top_products_per('AgeGroup', lambda user: get_age_group(user['age']))

        # Insight 4: Location-based analysis (by State)
        state_purchases = top_products_per('State', lambda user: get_state(user['address']))

        # 5. Print results to the console
        (
            product_counts
            | 'FormatProductCounts' >> beam.Map(
                lambda top_list: "Top 10 most popular products:\n"
                + "\n".join(f"- {product}: {count}" for product, count in top_list)
            )
            | 'PrintProductCounts' >> beam.Map(print)
        )
        (
            gender_purchases
            | 'FormatGenderPurchases' >> beam.Map(
                lambda x: f"\nTop 5 products for {x[0]}:\n"
                + "\n".join(f"- {product}: {count}" for product, count in x[1])
            )
            | 'PrintGenderPurchases' >> beam.Map(print)
        )
        (
            age_group_purchases
            | 'FormatAgeGroupPurchases' >> beam.Map(
                lambda x: f"\nTop 5 products for age group {x[0]}:\n"
                + "\n".join(f"- {product}: {count}" for product, count in x[1])
            )
            | 'PrintAgeGroupPurchases' >> beam.Map(print)
        )
        (
            state_purchases
            | 'FormatStatePurchases' >> beam.Map(
                lambda x: f"\nTop 5 products for state {x[0]}:\n"
                + "\n".join(f"- {product}: {count}" for product, count in x[1])
            )
            | 'PrintStatePurchases' >> beam.Map(print)
        )

# Run the pipeline
analyze_purchases()

ERROR:apache_beam.runners.common:too many values to unpack (expected 2) [while running 'FormatProductCounts']
Traceback (most recent call last):
  File "apache_beam/runners/common.py", line 1498, in apache_beam.runners.common.DoFnRunner.process
  File "apache_beam/runners/common.py", line 685, in apache_beam.runners.common.SimpleInvoker.invoke_process
  File "/usr/local/lib/python3.12/dist-packages/apache_beam/transforms/core.py", line 2098, in <lambda>
    wrapper = lambda x: [fn(x)]
                         ^^^^^
  File "/tmp/ipython-input-1492020752.py", line 132, in <lambda>
    product_counts | 'FormatProductCounts' >> beam.Map(lambda top_list: "Top 10 most popular products:\n" + "\n".join(f"- {p}: {c}" for p, c in top_list[0])) | 'PrintProductCounts' >> beam.Map(print)
                                                                                                            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/tmp/ipython-input-1492020752.py", line 132, in <


Top 5 products for female:
- Swiss Chard: 37868
- Kale: 37655
- Spinach: 37532
- Arugula: 19470
- Tomato: 19208

Top 5 products for male:
- Swiss Chard: 39951
- Kale: 39746
- Spinach: 39524
- Parsnip: 20147
- Cabbage: 20123

Top 5 products for age group 55-64:
- Swiss Chard: 12630
- Kale: 12323
- Spinach: 12220
- Radicchio: 6352
- Jícama: 6334

Top 5 products for age group 65+:
- Swiss Chard: 20021
- Spinach: 19861
- Kale: 19854
- Tomatillo: 10186
- String Beans: 10175

Top 5 products for age group 35-44:
- Kale: 13167
- Swiss Chard: 12954
- Spinach: 12892
- Cassava: 6738
- Arugula: 6671

Top 5 products for age group 25-34:
- Kale: 12928
- Swiss Chard: 12926
- Spinach: 12918
- Green Beans: 6702
- Olive: 6675

Top 5 products for age group 45-54:
- Swiss Chard: 11389
- Spinach: 11362
- Kale: 11339
- Beans: 5843
- Bell Pepper: 5837

Top 5 products for age group 18-24:
- Swiss Chard: 7899
- Spinach: 7803
- Kale: 7790
- Shallots: 4171
- Pumpkin: 4131

Top 5 products for state MN:
- Swiss C

ValueError: too many values to unpack (expected 2) [while running 'FormatProductCounts']

In [3]:
# Question 4.2 Perform a transformation that determines the average number of orders for female and male customers,
# respectively. Output the result as text in the notebook.
# Step 1: Install Apache Beam
!pip install apache-beam

# Step 2: Run the analysis pipeline
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions

def average_orders_by_gender():
    """
    Print the average number of orders per customer for each gender.

    Reads ``users_v.csv`` and ``orders_v_2022.csv``, counts the orders placed
    under each user id, joins those counts with the user's gender and averages
    them per gender. Only user ids found in both files contribute, so users
    without any order are not part of the average.

    Nothing is returned; one line per gender is printed.
    """
    options = PipelineOptions(runner='DirectRunner')

    with beam.Pipeline(options=options) as p:
        # 1. Read and parse users_v.csv to get (user_id, gender)
        users_gender = (
            p
            | 'ReadUsers' >> beam.io.ReadFromText('users_v.csv', skip_header_lines=1)
            | 'SplitUsers' >> beam.Map(lambda line: line.split(','))
            | 'FilterBlankUsers' >> beam.Filter(lambda fields: len(fields) > 2)
            | 'FormatUsers' >> beam.Map(lambda fields: (fields[0], fields[2]))  # (user_id, gender)
        )

        # 2. Read and parse orders_v_2022.csv to count orders per user
        orders_per_user = (
            p
            | 'ReadOrders' >> beam.io.ReadFromText('orders_v_2022.csv', skip_header_lines=1)
            | 'SplitOrders' >> beam.Map(lambda line: line.split(';'))
            | 'FilterBlankOrders' >> beam.Filter(lambda fields: len(fields) > 1)
            | 'MapUserToOrder' >> beam.Map(lambda fields: (fields[1], 1))  # (user_id, 1)
            | 'CountOrders' >> beam.CombinePerKey(sum)  # (user_id, total_orders)
        )

        # 3. Join the two PCollections
        joined_data = (
            {'gender': users_gender, 'orders': orders_per_user}
            | 'CoGroup' >> beam.CoGroupByKey()
            # The result is like: (user_id, {'gender': [gender], 'orders': [total_orders]})
            | 'FilterValidJoins' >> beam.Filter(
                lambda element: len(element[1]['gender']) > 0 and len(element[1]['orders']) > 0
            )
        )

        # 4. Key every user's order total by gender: (gender, total_orders)
        gender_order_counts = (
            joined_data
            | 'MapGenderToOrders' >> beam.Map(
                lambda element: (element[1]['gender'][0], element[1]['orders'][0])
            )
        )

        # 5. Average per gender. The built-in mean combiner keeps its own
        #    (sum, count) accumulator, so no hand-written pair arithmetic
        #    is needed: (gender, average_orders)
        average_by_gender = (
            gender_order_counts
            | 'CalculateAverage' >> beam.combiners.Mean.PerKey()
        )

        # 6. Format the output for printing
        (
            average_by_gender
            | 'FormatOutput' >> beam.Map(lambda kv: f"Average orders for {kv[0]}s: {kv[1]:.2f}")
            | 'PrintOutput' >> beam.Map(print)
        )

# Run the pipeline
average_orders_by_gender()





Average orders for males: 756.22
Average orders for females: 756.83


In [4]:
# Question 4.3 Duplicate the code of the previous question (in a new cell) and split your pipe to produce/emit the total
# number of orders processed as an additional text output (i.e. output the number of orders processed as
# well as the average orders for female and male customers).
# Step 1: Install Apache Beam
#!pip install apache-beam

# Step 2: Run the analysis pipeline
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions

def average_and_total_orders():
    """
    Print the total number of orders and the average orders per gender.

    The parsed order rows from ``orders_v_2022.csv`` feed two branches:

    * a global count of all order rows, printed as the total processed;
    * a per-user order count that is joined with each user's gender from
      ``users_v.csv`` and averaged per gender.

    Only user ids found in both files contribute to the averages, so users
    without any order are not part of them. Nothing is returned; all results
    are printed.
    """
    options = PipelineOptions(runner='DirectRunner')

    with beam.Pipeline(options=options) as p:
        # 1. Read and parse users_v.csv to get (user_id, gender)
        users_gender = (
            p
            | 'ReadUsers' >> beam.io.ReadFromText('users_v.csv', skip_header_lines=1)
            | 'SplitUsers' >> beam.Map(lambda line: line.split(','))
            | 'FilterBlankUsers' >> beam.Filter(lambda fields: len(fields) > 2)
            | 'FormatUsers' >> beam.Map(lambda fields: (fields[0], fields[2]))  # (user_id, gender)
        )

        # 2. Read and parse orders_v_2022.csv; shared by both branches below
        orders = (
            p
            | 'ReadOrders' >> beam.io.ReadFromText('orders_v_2022.csv', skip_header_lines=1)
            | 'SplitOrders' >> beam.Map(lambda line: line.split(';'))
            | 'FilterBlankOrders' >> beam.Filter(lambda fields: len(fields) > 1)
        )

        # --- Branch 1: count and print the total number of orders ---
        (
            orders
            | 'Count All Orders' >> beam.combiners.Count.Globally()
            | 'Format Total' >> beam.Map(lambda count: f"Total orders processed: {count}")
            | 'Print Total' >> beam.Map(print)
        )

        # --- Branch 2: average orders by gender ---
        orders_per_user = (
            orders
            | 'MapUserToOrder' >> beam.Map(lambda fields: (fields[1], 1))  # (user_id, 1)
            | 'CountOrders' >> beam.CombinePerKey(sum)  # (user_id, total_orders)
        )

        joined_data = (
            {'gender': users_gender, 'orders': orders_per_user}
            | 'CoGroup' >> beam.CoGroupByKey()
            | 'FilterValidJoins' >> beam.Filter(
                lambda element: len(element[1]['gender']) > 0 and len(element[1]['orders']) > 0
            )
        )

        # (gender, total_orders) for every joined user
        gender_order_counts = (
            joined_data
            | 'MapGenderToOrders' >> beam.Map(
                lambda element: (element[1]['gender'][0], element[1]['orders'][0])
            )
        )

        # The built-in mean combiner keeps its own (sum, count) accumulator.
        average_by_gender = (
            gender_order_counts
            | 'CalculateAverage' >> beam.combiners.Mean.PerKey()
        )

        (
            average_by_gender
            | 'FormatOutput' >> beam.Map(lambda kv: f"Average orders for {kv[0]}s: {kv[1]:.2f}")
            | 'PrintOutput' >> beam.Map(print)
        )

# Run the pipeline
average_and_total_orders()



Total orders processed: 1783119
Average orders for males: 756.22
Average orders for females: 756.83


In [5]:
# Question 4.4 Generate a graph (using the Beam library) of your pipeline
# Step 1: Install the necessary interactive components for Apache Beam
# This includes the libraries required for rendering graphs.
!pip install apache-beam[interactive]

# Step 2: Import necessary libraries
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions
# Import the interactive runner and visualization tools
from apache_beam.runners.interactive.interactive_runner import InteractiveRunner
import apache_beam.runners.interactive.interactive_beam as ib

# Step 3: Define the pipeline structure within a function
def define_pipeline_for_graphing():
    """
    Build, without running, the totals-and-averages pipeline for graphing.

    The pipeline is created on an ``InteractiveRunner``. It parses
    ``users_v.csv`` and ``orders_v_2022.csv``, then branches the parsed
    orders into a global order count and a per-gender average of the
    per-user order counts. No formatting or printing steps are attached.

    Returns:
        The constructed ``beam.Pipeline`` object; the caller decides whether
        to display or run it.
    """
    # Use the InteractiveRunner, which is required for visualization
    p = beam.Pipeline(InteractiveRunner())

    # --- Pipeline Definition ---

    # 1. Read users data as (user_id, gender)
    users_gender = (
        p
        | 'ReadUsers' >> beam.io.ReadFromText('users_v.csv', skip_header_lines=1)
        | 'SplitUsers' >> beam.Map(lambda line: line.split(','))
        | 'FilterBlankUsers' >> beam.Filter(lambda fields: len(fields) > 2)
        | 'FormatUsers' >> beam.Map(lambda fields: (fields[0], fields[2]))
    )

    # 2. Read orders data; shared by both branches below
    orders = (
        p
        | 'ReadOrders' >> beam.io.ReadFromText('orders_v_2022.csv', skip_header_lines=1)
        | 'SplitOrders' >> beam.Map(lambda line: line.split(';'))
        | 'FilterBlankOrders' >> beam.Filter(lambda fields: len(fields) > 1)
    )

    # 3. Branch 1: Calculate total orders.
    # Applying the transform is what adds it to the graph; the resulting
    # PCollection is not needed afterwards.
    orders | 'Count All Orders' >> beam.combiners.Count.Globally()

    # 4. Branch 2: Calculate average orders by gender
    orders_per_user = (
        orders
        | 'MapUserToOrder' >> beam.Map(lambda fields: (fields[1], 1))
        | 'CountOrders' >> beam.CombinePerKey(sum)
    )

    joined_data = (
        {'gender': users_gender, 'orders': orders_per_user}
        | 'CoGroup' >> beam.CoGroupByKey()
        | 'FilterValidJoins' >> beam.Filter(
            lambda element: len(element[1]['gender']) > 0 and len(element[1]['orders']) > 0
        )
    )

    # (gender, total_orders) for every joined user, averaged with the
    # built-in mean combiner, which keeps its own (sum, count) accumulator.
    (
        joined_data
        | 'MapGenderToOrders' >> beam.Map(
            lambda element: (element[1]['gender'][0], element[1]['orders'][0])
        )
        | 'CalculateAverage' >> beam.combiners.Mean.PerKey()
    )

    return p

# Step 4: Create the pipeline object and display its graph
print("Generating pipeline graph...")
pipeline_to_graph = define_pipeline_for_graphing()

# Use interactive_beam's show_graph() function to render the DAG
ib.show_graph(pipeline_to_graph)

Collecting facets-overview<2,>=1.1.0 (from apache-beam[interactive])
  Downloading facets_overview-1.1.1-py2.py3-none-any.whl.metadata (10 kB)
Collecting ipywidgets<9,>=8 (from apache-beam[interactive])
  Downloading ipywidgets-8.1.7-py3-none-any.whl.metadata (2.4 kB)
Collecting timeloop<2,>=1.0.2 (from apache-beam[interactive])
  Downloading timeloop-1.0.2.tar.gz (2.9 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting jedi>=0.16 (from ipython<9,>=7->apache-beam[interactive])
  Downloading jedi-0.19.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting comm>=0.1.3 (from ipywidgets<9,>=8->apache-beam[interactive])
  Downloading comm-0.2.3-py3-none-any.whl.metadata (3.7 kB)
Collecting widgetsnbextension~=4.0.14 (from ipywidgets<9,>=8->apache-beam[interactive])
  Downloading widgetsnbextension-4.0.14-py3-none-any.whl.metadata (1.6 kB)
INFO: pip is looking at multiple versions of grpcio-status to determine which version is compatible with other requirements. This could take a

Generating pipeline graph...


In [1]:
# Question 4.5: Perform a transformation that groups users into age groups [16-26), [26-36), [36-46), [46-56), and
# determine the total number of orders placed by customers in each age group.
# Step 1: Install Apache Beam
# !pip install apache-beam

# Step 2: Run the analysis pipeline
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions

def orders_by_age_group():
    """
    Print the total number of orders placed by customers in each age bracket.

    Users from ``users_v.csv`` are assigned to the brackets [16-26), [26-36),
    [36-46) and [46-56); users outside those brackets are discarded. Their
    per-user order counts from ``orders_v_2022.csv`` are joined on user id
    and summed per bracket. Nothing is returned; the report is printed.
    """
    def get_age_group(age):
        """Return the bracket label containing age, or None if no bracket does."""
        for lower in (16, 26, 36, 46):
            upper = lower + 10
            if lower <= age < upper:
                return f'[{lower}-{upper})'
        # Users outside these age groups will be ignored
        return None

    def to_user_age_group(fields):
        """(user_id, age_group) from a split users row."""
        return (fields[0], get_age_group(int(fields[3])))

    def has_both_sides(element):
        """True when the join found an age group and an order count for the id."""
        grouped = element[1]
        return len(grouped['age_group']) > 0 and len(grouped['orders']) > 0

    def to_group_total(element):
        """(age_group, total_orders_for_user) from a joined element."""
        grouped = element[1]
        return (grouped['age_group'][0], grouped['orders'][0])

    def print_report(results):
        """Print all (age_group, total) pairs, ordered by bracket label."""
        rows = [f"- {age_group}: {orders}" for age_group, orders in sorted(results)]
        print("Total orders by age group:\n" + "\n".join(rows))

    options = PipelineOptions(runner='DirectRunner')

    with beam.Pipeline(options=options) as p:
        # (user_id, age_group) for users inside one of the brackets
        users_age_group = (
            p
            | 'ReadUsers' >> beam.io.ReadFromText('users_v.csv', skip_header_lines=1)
            | 'SplitUsers' >> beam.Map(lambda line: line.split(','))
            | 'FilterBlankUsers' >> beam.Filter(lambda fields: len(fields) > 3)
            | 'MapUserToAgeGroup' >> beam.Map(to_user_age_group)
            | 'FilterValidAgeGroups' >> beam.Filter(lambda kv: kv[1] is not None)
        )

        # (user_id, total_orders_for_user)
        orders_per_user = (
            p
            | 'ReadOrders' >> beam.io.ReadFromText('orders_v_2022.csv', skip_header_lines=1)
            | 'SplitOrders' >> beam.Map(lambda line: line.split(';'))
            | 'FilterBlankOrders' >> beam.Filter(lambda fields: len(fields) > 1)
            | 'MapUserToOrder' >> beam.Map(lambda fields: (fields[1], 1))
            | 'CountOrdersPerUser' >> beam.CombinePerKey(sum)
        )

        # (user_id, {'age_group': [...], 'orders': [...]}) with both sides present
        joined_data = (
            {'age_group': users_age_group, 'orders': orders_per_user}
            | 'CoGroup' >> beam.CoGroupByKey()
            | 'FilterValidJoins' >> beam.Filter(has_both_sides)
        )

        # (age_group, total_orders_for_group)
        orders_by_group = (
            joined_data
            | 'MapAgeGroupToOrders' >> beam.Map(to_group_total)
            | 'SumOrdersByAgeGroup' >> beam.CombinePerKey(sum)
        )

        # Gather everything into one list so it can be sorted in memory for printing
        (
            orders_by_group
            | 'SortResults' >> beam.transforms.combiners.ToList()
            | 'FormatAndPrint' >> beam.Map(print_report)
        )

# Run the pipeline
orders_by_age_group()



Total orders by age group:
- [16-26): 212360
- [26-36): 302983
- [36-46): 292033
- [46-56): 260480


In [2]:
# Question 4.6 Determine the total number of times that spinach was purchased within the [16-26), [26-36), [36-46),
# [46-56) age groups.
# Step 1: Install Apache Beam
!pip install apache-beam

# Step 2: Import libraries and run the analysis pipeline
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions

def spinach_purchases_by_age_group():
    """
    Print how many times Spinach was purchased within each age bracket.

    Users from ``users_v.csv`` are assigned to the brackets [16-26), [26-36),
    [36-46) and [46-56); users outside those brackets are discarded. Every
    product entry equal to 'Spinach' in ``orders_v_2022.csv`` counts as one
    purchase for the ordering user, and purchases are summed per bracket.
    Nothing is returned; the report is printed.
    """
    def get_age_group(age):
        """Return the bracket label containing age, or None if no bracket does."""
        for lower in (16, 26, 36, 46):
            upper = lower + 10
            if lower <= age < upper:
                return f'[{lower}-{upper})'
        # Users outside these age groups will be ignored
        return None

    def to_user_age_group(fields):
        """(user_id, age_group) from a split users row."""
        return (fields[0], get_age_group(int(fields[3])))

    def to_user_products(fields):
        """One (user_id, product) pair per product listed on an order row."""
        user_id = fields[1]
        return [(user_id, product.strip()) for product in fields[2].split(',')]

    def has_both_sides(element):
        """True when the join found an age group and at least one spinach purchase."""
        grouped = element[1]
        return len(grouped['age_group']) > 0 and len(grouped['spinach_orders']) > 0

    def to_group_count(element):
        """(age_group, number_of_spinach_purchases) for one user."""
        grouped = element[1]
        return (grouped['age_group'][0], len(grouped['spinach_orders']))

    def print_report(results):
        """Print all (age_group, total) pairs, ordered by bracket label."""
        rows = [f"- {age_group}: {orders}" for age_group, orders in sorted(results)]
        print("Total Spinach purchases by age group:\n" + "\n".join(rows))

    options = PipelineOptions(runner='DirectRunner')

    with beam.Pipeline(options=options) as p:
        # (user_id, age_group) for users inside one of the brackets
        users_age_group = (
            p
            | 'ReadUsers' >> beam.io.ReadFromText('users_v.csv', skip_header_lines=1)
            | 'SplitUsers' >> beam.Map(lambda line: line.split(','))
            | 'FilterBlankUsers' >> beam.Filter(lambda fields: len(fields) > 3)
            | 'MapUserToAgeGroup' >> beam.Map(to_user_age_group)
            | 'FilterValidAgeGroups' >> beam.Filter(lambda kv: kv[1] is not None)
        )

        # (user_id, 1) for each spinach purchase
        spinach_orders = (
            p
            | 'ReadOrders' >> beam.io.ReadFromText('orders_v_2022.csv', skip_header_lines=1)
            | 'SplitOrders' >> beam.Map(lambda line: line.split(';'))
            | 'FilterBlankOrders' >> beam.Filter(lambda fields: len(fields) > 2)
            # Explode the product list so each product is checked on its own
            | 'FlattenProducts' >> beam.FlatMap(to_user_products)
            | 'FilterForSpinach' >> beam.Filter(lambda kv: kv[1] == 'Spinach')
            | 'MapSpinachPurchase' >> beam.Map(lambda kv: (kv[0], 1))
        )

        # (user_id, {'age_group': [...], 'spinach_orders': [...]}) with both sides present
        joined_data = (
            {'age_group': users_age_group, 'spinach_orders': spinach_orders}
            | 'CoGroup' >> beam.CoGroupByKey()
            | 'FilterValidJoins' >> beam.Filter(has_both_sides)
        )

        # (age_group, total_spinach_orders_for_group)
        spinach_by_group = (
            joined_data
            | 'MapAgeGroupToSpinachCount' >> beam.Map(to_group_count)
            | 'SumSpinachByAgeGroup' >> beam.CombinePerKey(sum)
        )

        # Gather everything into one list so it can be sorted in memory for printing
        (
            spinach_by_group
            | 'SortResults' >> beam.transforms.combiners.ToList()
            | 'FormatAndPrint' >> beam.Map(print_report)
        )

# Run the pipeline
spinach_purchases_by_age_group()





Total Spinach purchases by age group:
- [16-26): 9155
- [26-36): 13109
- [36-46): 12493
- [46-56): 11335


In [16]:
# Question 7: Write a pipeline which calculates the average number of orders over seven and thirty day rolling windows
# (where the period of each window is one day).
# Step 1: Install Apache Beam in the Colab environment
!pip install apache-beam

# Step 2: Import libraries and run the analysis pipeline
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.transforms.window import SlidingWindows, GlobalWindows
import datetime

def is_valid_date(element):
    """Return True when element[0] parses as a YYYY-MM-DD date, else False.

    Both a malformed string (ValueError) and a non-string value (TypeError)
    count as invalid.
    """
    try:
        datetime.datetime.strptime(element[0], "%Y-%m-%d")
    except (ValueError, TypeError):
        return False
    else:
        return True

def calculate_mean(elements):
    """Yield the mean of the values in a grouped (key, values) pair.

    Written as a generator so it can be used with ``beam.FlatMap``: a group
    with no values yields nothing instead of dividing by zero.

    Args:
        elements: A ``(key, values)`` pair; the key is ignored and ``values``
            may be any iterable of numbers.

    Yields:
        The arithmetic mean of ``values``, at most once.
    """
    _, values = elements
    # Grouped values are only guaranteed to be iterable, not sized, so
    # materialize them before testing for emptiness and taking the length.
    values = list(values)
    if values:
        yield sum(values) / len(values)

def rolling_order_averages():
    """
    Print 7-day and 30-day rolling averages of the daily order count.

    Orders from ``orders_v_2022.csv`` are counted per purchase date, stamped
    with that date's midnight in UTC and assigned to sliding windows of 7 and
    30 days that advance one day at a time. Each window's mean is keyed by
    the window's end date, the two series are joined on that date, and every
    date that has both averages is printed in chronological order.

    The mean is taken over the dates that actually appear in the file: a day
    without any order contributes nothing rather than a zero. Nothing is
    returned; all results are printed.
    """
    seconds_per_day = 24 * 60 * 60
    options = PipelineOptions(runner='DirectRunner')

    def to_timestamped_count(kv):
        """Attach the date's UTC midnight as event time to a (date, count) pair's count."""
        day, count = kv
        # Interpret the date as UTC explicitly. A naive datetime would be
        # converted using the machine's local timezone, which disagrees with
        # the UTC-based window labels produced further down.
        midnight = datetime.datetime.strptime(day, "%Y-%m-%d").replace(
            tzinfo=datetime.timezone.utc
        )
        return beam.window.TimestampedValue(count, int(midnight.timestamp()))

    def print_results(results):
        """Print every date that has both a 7-day and a 30-day average, oldest first."""
        # Sort on the date only; YYYY-MM-DD strings order chronologically.
        for date, vals in sorted(results, key=lambda kv: kv[0]):
            if vals['7-day'] and vals['30-day']:
                print(f"As of {date}: {vals['7-day'][0]} | {vals['30-day'][0]}")

    with beam.Pipeline(options=options) as p:
        # 1. Read and prepare daily order counts: (date_string, orders_that_day)
        daily_orders = (
            p
            | 'ReadOrders' >> beam.io.ReadFromText('orders_v_2022.csv', skip_header_lines=1)
            | 'SplitOrders' >> beam.Map(lambda line: line.split(';'))
            | 'FilterBlankOrders' >> beam.Filter(lambda fields: len(fields) > 3)
            | 'MapDateToOne' >> beam.Map(lambda fields: (fields[3].strip(), 1))
            | 'CountPerDay' >> beam.CombinePerKey(sum)
            | 'FilterValidDates' >> beam.Filter(is_valid_date)
        )

        # 2. Add timestamps for windowing
        timestamped_orders = (
            daily_orders
            | 'AddTimestamps' >> beam.Map(to_timestamped_count)
        )

        def rolling_average(days):
            """Return (window_end_date, formatted mean) for sliding windows of the given length."""
            def label_with_window_end(avg, window=beam.DoFn.WindowParam):
                end_date = window.end.to_utc_datetime().strftime("%Y-%m-%d")
                return (end_date, f"{days}-Day Rolling Avg: {avg:.2f}")

            return (
                timestamped_orders
                | f'{days}DayWindow' >> beam.WindowInto(
                    SlidingWindows(size=days * seconds_per_day, period=seconds_per_day)
                )
                # A single dummy key puts all daily counts of a window in one group.
                | f'AddDummyKey{days}' >> beam.Map(lambda x: (None, x))
                | f'GroupByKey{days}' >> beam.GroupByKey()
                | f'{days}DayMean' >> beam.FlatMap(calculate_mean)
                | f'Format{days}DayOutput' >> beam.Map(label_with_window_end)
                # Reset to the global window so the CoGroupByKey below can
                # match the 7-day and 30-day results on their date keys.
                | f'ResetToGlobalWindow{days}' >> beam.WindowInto(GlobalWindows())
            )

        # 3. Calculate 7-day rolling average
        seven_day_average = rolling_average(7)

        # 4. Calculate 30-day rolling average
        thirty_day_average = rolling_average(30)

        # 5. CoGroup and print results
        (
            {'7-day': seven_day_average, '30-day': thirty_day_average}
            | 'CoGroup by Date' >> beam.CoGroupByKey()
            | 'FilterAndSort' >> beam.transforms.combiners.ToList()
            | 'PrintResults' >> beam.Map(print_results)
        )

# Run the pipeline
rolling_order_averages()





As of 2000-01-02: 7-Day Rolling Avg: 147.00 | 30-Day Rolling Avg: 147.00
As of 2000-01-03: 7-Day Rolling Avg: 80.50 | 30-Day Rolling Avg: 80.50
As of 2000-01-04: 7-Day Rolling Avg: 59.00 | 30-Day Rolling Avg: 59.00
As of 2000-01-05: 7-Day Rolling Avg: 124.75 | 30-Day Rolling Avg: 124.75
As of 2000-01-06: 7-Day Rolling Avg: 185.60 | 30-Day Rolling Avg: 185.60
As of 2000-01-07: 7-Day Rolling Avg: 176.50 | 30-Day Rolling Avg: 176.50
As of 2000-01-08: 7-Day Rolling Avg: 176.50 | 30-Day Rolling Avg: 176.50
As of 2000-01-09: 7-Day Rolling Avg: 154.33 | 30-Day Rolling Avg: 153.29
As of 2000-01-10: 7-Day Rolling Avg: 170.50 | 30-Day Rolling Avg: 148.00
As of 2000-01-11: 7-Day Rolling Avg: 171.33 | 30-Day Rolling Avg: 133.89
As of 2000-01-12: 7-Day Rolling Avg: 170.67 | 30-Day Rolling Avg: 152.30
As of 2000-01-13: 7-Day Rolling Avg: 153.00 | 30-Day Rolling Avg: 167.82
As of 2000-01-14: 7-Day Rolling Avg: 230.17 | 30-Day Rolling Avg: 203.33
As of 2000-01-15: 7-Day Rolling Avg: 232.57 | 30-Day Ro

In [17]:
# Question 8: Write a pipeline which calculates the average number of orders over seven and thirty day rolling windows
# (where the period of each window is one day) and writes the result to a Parquet file.
# Step 1: Install Apache Beam and PyArrow
!pip install apache-beam pyarrow

# Step 2: Import libraries and run the analysis pipeline
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.transforms.window import SlidingWindows, GlobalWindows
import datetime
import pyarrow

def is_valid_date(element):
    """Report whether ``element[0]`` parses as a ``YYYY-MM-DD`` date.

    Used as a ``beam.Filter`` predicate on ``(date_string, count)`` pairs.
    Values that fail to parse, or that are not strings at all, produce
    ``False`` instead of an exception.
    """
    try:
        # Only the success/failure of the parse matters; the result is unused.
        datetime.datetime.strptime(element[0], "%Y-%m-%d")
    except (TypeError, ValueError):
        return False
    return True

def calculate_mean(elements):
    """Yield the arithmetic mean of the values in a grouped ``(key, values)`` pair.

    Written as a generator so it can be used with ``beam.FlatMap``: an empty
    group simply produces no output element.

    Args:
        elements: A ``(key, values)`` 2-tuple. The key is ignored; ``values``
            is any iterable of numbers (it does not need to support
            ``len()``), or ``None``.

    Yields:
        The mean of ``values`` as a float, or nothing when there are no values.
    """
    _, values = elements
    # Materialize before inspecting: grouped values may arrive as a lazy,
    # unsized iterable, for which ``len()`` fails and truthiness says nothing
    # about emptiness.
    numbers = [] if values is None else list(values)
    if not numbers:
        return
    yield sum(numbers) / len(numbers)

_SECONDS_PER_DAY = 24 * 60 * 60


def _utc_epoch_seconds(date_text):
    """Return the Unix timestamp of midnight UTC for a ``YYYY-MM-DD`` string."""
    parsed = datetime.datetime.strptime(date_text, "%Y-%m-%d")
    # Pin the naive datetime to UTC. Calling ``.timestamp()`` on a naive value
    # interprets it in the machine's local zone, whereas the window ends are
    # rendered with ``to_utc_datetime()``; mixing the two shifts every order
    # into the previous UTC day on hosts whose zone is ahead of UTC.
    return int(parsed.replace(tzinfo=datetime.timezone.utc).timestamp())


def _rolling_average(timestamped_counts, days):
    """Return ``(window_end_date, mean)`` pairs for a *days*-long sliding window.

    The window advances one day at a time. The result is re-windowed into the
    global window so it can be joined with other series by date.
    """
    return (
        timestamped_counts
        | f'{days}DayWindow' >> beam.WindowInto(
            SlidingWindows(size=days * _SECONDS_PER_DAY, period=_SECONDS_PER_DAY))
        # A single dummy key gathers every daily count of a window into one group.
        | f'AddDummyKey{days}' >> beam.Map(lambda x: (None, x))
        | f'GroupByKey{days}' >> beam.GroupByKey()
        | f'{days}DayMean' >> beam.FlatMap(calculate_mean)
        # Label each mean with the end timestamp of the window it came from.
        | f'Format{days}DayOutput' >> beam.Map(
            lambda avg, window=beam.DoFn.WindowParam:
            (window.end.to_utc_datetime().strftime("%Y-%m-%d"), float(avg)))
        | f'ResetToGlobalWindow{days}' >> beam.WindowInto(GlobalWindows())
    )


def rolling_order_averages_to_parquet():
    """
    Calculates 7-day and 30-day rolling order averages and writes the output to a Parquet file.

    Reads 'orders_v_2022.csv' (semicolon-separated, one header line), counts
    orders per purchase date (fourth field), averages those daily counts over
    sliding 7-day and 30-day windows that advance one day at a time, joins the
    two series by date and writes rows of ``date``, ``seven_day_avg`` and
    ``thirty_day_avg`` to a single Parquet shard named with the
    'rolling_averages.parquet' prefix.

    NOTE: only dates that appear in the input contribute a daily count, so
    each mean is taken over the days with at least one order in the window,
    not over every calendar day of the window.
    """
    options = PipelineOptions(runner='DirectRunner')

    with beam.Pipeline(options=options) as p:
        # 1. Read and prepare daily order counts
        daily_orders = (
            p
            | 'ReadOrders' >> beam.io.ReadFromText('orders_v_2022.csv', skip_header_lines=1)
            | 'SplitOrders' >> beam.Map(lambda line: line.split(';'))
            | 'FilterBlankOrders' >> beam.Filter(lambda fields: len(fields) > 3)
            | 'MapDateToOne' >> beam.Map(lambda fields: (fields[3].strip(), 1))
            | 'CountPerDay' >> beam.CombinePerKey(sum)
            | 'FilterValidDates' >> beam.Filter(is_valid_date)
        )

        # 2. Add timestamps for windowing (midnight UTC of each order date)
        timestamped_orders = (
            daily_orders
            | 'AddTimestamps' >> beam.Map(
                lambda kv: beam.window.TimestampedValue(kv[1], _utc_epoch_seconds(kv[0])))
        )

        # 3. Calculate 7-day rolling average
        seven_day_average = _rolling_average(timestamped_orders, 7)

        # 4. Calculate 30-day rolling average
        thirty_day_average = _rolling_average(timestamped_orders, 30)

        # 5. CoGroup the results and format for Parquet
        formatted_for_parquet = (
            {'7-day': seven_day_average, '30-day': thirty_day_average}
            | 'CoGroup by Date' >> beam.CoGroupByKey()
            # Keep only dates for which both series produced a value.
            | 'FilterValidJoins' >> beam.Filter(lambda item: item[1]['7-day'] and item[1]['30-day'])
            | 'FormatToDict' >> beam.Map(lambda item: {
                'date': item[0],
                'seven_day_avg': item[1]['7-day'][0],
                'thirty_day_avg': item[1]['30-day'][0]
            })
        )

        # 6. Define the Parquet schema
        schema = pyarrow.schema([
            pyarrow.field('date', pyarrow.string()),
            pyarrow.field('seven_day_avg', pyarrow.float64()),
            pyarrow.field('thirty_day_avg', pyarrow.float64())
        ])

        # 7. Write the formatted PCollection to a Parquet file
        (
            formatted_for_parquet
            | 'WriteToParquet' >> beam.io.WriteToParquet(
                'rolling_averages.parquet',
                schema,
                file_name_suffix='.parquet',
                num_shards=1 # Use 1 to get a single output file
            )
        )

# Run the pipeline
# The call returns once the `with beam.Pipeline(...)` block inside the function
# has exited, so the message below is printed after the pipeline has run.
rolling_order_averages_to_parquet()

# The file name in the message is the 'rolling_averages.parquet' prefix passed
# to WriteToParquet, followed by the shard marker for the single shard
# (num_shards=1) and the '.parquet' suffix.
print("\nPipeline finished. Check for 'rolling_averages.parquet-00000-of-00001.parquet' in the file explorer.")






Pipeline finished. Check for 'rolling_averages.parquet-00000-of-00001.parquet' in the file explorer.
