In [2]:
# Question 4.1 Join the input files as streams using CoGroupByKey
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions

def analyze_purchases():
    """
    This pipeline analyzes customer purchasing behavior by combining user and order data.
    """
    options = PipelineOptions(runner='DirectRunner')

    with beam.Pipeline(options=options) as p:
        # 1. Read and parse users_v.csv
        users = (
            p
            | 'ReadUsers' >> beam.io.ReadFromText('users_v.csv', skip_header_lines=1)
            | 'SplitUsers' >> beam.Map(lambda line: line.split(','))
            | 'FilterBlankUsers' >> beam.Filter(lambda fields: len(fields) > 5)
            | 'FormatUsers' >> beam.Map(lambda fields: (fields[0], {
                'name': fields[1],
                'gender': fields[2],
                'age': int(fields[3]),
                'address': fields[4],
                'date_joined': fields[5]
            }))
        )

        # 2. Read and parse orders_v_2022.csv
        orders = (
            p
            | 'ReadOrders' >> beam.io.ReadFromText('orders_v_2022.csv', skip_header_lines=1)
            | 'SplitOrders' >> beam.Map(lambda line: line.split(';'))
            | 'FilterBlankOrders' >> beam.Filter(lambda fields: len(fields) > 3)
            | 'FormatOrders' >> beam.Map(lambda fields: (fields[1], {
                'order_no': fields[0],
                'product_list': [product.strip() for product in fields[2].split(',')],
                'date_purchased': fields[3]
            }))
        )

        # 3. Join the PCollections
        combined_data = (
            {'users': users, 'orders': orders}
            | 'GroupByUser' >> beam.CoGroupByKey()
            | 'FilterEmptyOrders' >> beam.Filter(lambda element: len(element[1]['orders']) > 0 and len(element[1]['users']) > 0)
        )

        # 4. Perform Analytics

        # Insight 1: Popular Products
        product_counts = (
            combined_data
            | 'ExtractProducts' >> beam.FlatMap(
                lambda element: [(product, 1) for order in element[1]['orders'] for product in order['product_list']]
            )
            | 'CountProducts' >> beam.CombinePerKey(sum)
            | 'SortProducts' >> beam.transforms.combiners.Top.Of(10, key=lambda x: x[1])
        )

        # Insight 2: Purchase behavior by gender
        gender_purchases = (
            combined_data
            | 'ExtractGenderProducts' >> beam.FlatMap(
                lambda element: [
                    ((element[1]['users'][0]['gender'], product), 1)
                    for order in element[1]['orders']
                    for product in order['product_list']
                ]
            )
            | 'CountGenderProducts' >> beam.CombinePerKey(sum)
            | 'RegroupByGender' >> beam.Map(lambda element: (element[0][0], (element[0][1], element[1])))
            | 'GroupProductsByGender' >> beam.GroupByKey()
            | 'Top5ProductsByGender' >> beam.Map(
                lambda element: (element[0], sorted(element[1], key=lambda x: x[1], reverse=True)[:5])
            )
        )

        # Insight 3: Purchase behavior by age group
        def get_age_group(age):
            if 18 <= age <= 24: return '18-24'
            elif 25 <= age <= 34: return '25-34'
            elif 35 <= age <= 44: return '35-44'
            elif 45 <= age <= 54: return '45-54'
            elif 55 <= age <= 64: return '55-64'
            else: return '65+'

        age_group_purchases = (
            combined_data
            | 'ExtractAgeGroupProducts' >> beam.FlatMap(
                lambda element: [
                    ((get_age_group(element[1]['users'][0]['age']), product), 1)
                    for order in element[1]['orders']
                    for product in order['product_list']
                ]
            )
            | 'CountAgeGroupProducts' >> beam.CombinePerKey(sum)
            | 'RegroupByAgeGroup' >> beam.Map(lambda element: (element[0][0], (element[0][1], element[1])))
            | 'GroupProductsByAgeGroup' >> beam.GroupByKey()
            | 'Top5ProductsByAgeGroup' >> beam.Map(
                lambda element: (element[0], sorted(element[1], key=lambda x: x[1], reverse=True)[:5])
            )
        )

        # Insight 4: Location-based analysis (by State)
        # --- THIS IS THE CORRECTED FUNCTION ---
        def get_state(address):
            try:
                # This will attempt to get the state as before
                parts = address.split('-')
                return parts[-2]
            except IndexError:
                # If the address format is wrong, it will return 'Unknown'
                # instead of crashing the program.
                return 'Unknown'

        state_purchases = (
            combined_data
            | 'ExtractStateProducts' >> beam.FlatMap(
                lambda element: [
                    ((get_state(element[1]['users'][0]['address']), product), 1)
                    for order in element[1]['orders']
                    for product in order['product_list']
                ]
            )
            | 'CountStateProducts' >> beam.CombinePerKey(sum)
            | 'RegroupByState' >> beam.Map(lambda element: (element[0][0], (element[0][1], element[1])))
            | 'GroupProductsByState' >> beam.GroupByKey()
            | 'Top5ProductsByState' >> beam.Map(
                lambda element: (element[0], sorted(element[1], key=lambda x: x[1], reverse=True)[:5])
            )
        )

        # 5. Print results to the console
        product_counts | 'FormatProductCounts' >> beam.Map(lambda top_list: "Top 10 most popular products:\n" + "\n".join(f"- {p}: {c}" for p, c in top_list[0])) | 'PrintProductCounts' >> beam.Map(print)
        gender_purchases | 'FormatGenderPurchases' >> beam.Map(lambda x: f"\nTop 5 products for {x[0]}:\n" + "\n".join(f"- {p}: {c}" for p, c in x[1])) | 'PrintGenderPurchases' >> beam.Map(print)
        age_group_purchases | 'FormatAgeGroupPurchases' >> beam.Map(lambda x: f"\nTop 5 products for age group {x[0]}:\n" + "\n".join(f"- {p}: {c}" for p, c in x[1])) | 'PrintAgeGroupPurchases' >> beam.Map(print)
        state_purchases | 'FormatStatePurchases' >> beam.Map(lambda x: f"\nTop 5 products for state {x[0]}:\n" + "\n".join(f"- {p}: {c}" for p, c in x[1])) | 'PrintStatePurchases' >> beam.Map(print)

# Run the pipeline
analyze_purchases()

ERROR:apache_beam.runners.common:too many values to unpack (expected 2) [while running 'FormatProductCounts']
Traceback (most recent call last):
  File "apache_beam/runners/common.py", line 1498, in apache_beam.runners.common.DoFnRunner.process
  File "apache_beam/runners/common.py", line 685, in apache_beam.runners.common.SimpleInvoker.invoke_process
  File "/usr/local/lib/python3.12/dist-packages/apache_beam/transforms/core.py", line 2098, in <lambda>
    wrapper = lambda x: [fn(x)]
                         ^^^^^
  File "/tmp/ipython-input-1492020752.py", line 132, in <lambda>
    product_counts | 'FormatProductCounts' >> beam.Map(lambda top_list: "Top 10 most popular products:\n" + "\n".join(f"- {p}: {c}" for p, c in top_list[0])) | 'PrintProductCounts' >> beam.Map(print)
                                                                                                            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/tmp/ipython-input-1492020752.py", line 132, in <


Top 5 products for female:
- Swiss Chard: 37868
- Kale: 37655
- Spinach: 37532
- Arugula: 19470
- Tomato: 19208

Top 5 products for male:
- Swiss Chard: 39951
- Kale: 39746
- Spinach: 39524
- Parsnip: 20147
- Cabbage: 20123

Top 5 products for age group 55-64:
- Swiss Chard: 12630
- Kale: 12323
- Spinach: 12220
- Radicchio: 6352
- Jícama: 6334

Top 5 products for age group 65+:
- Swiss Chard: 20021
- Spinach: 19861
- Kale: 19854
- Tomatillo: 10186
- String Beans: 10175

Top 5 products for age group 35-44:
- Kale: 13167
- Swiss Chard: 12954
- Spinach: 12892
- Cassava: 6738
- Arugula: 6671

Top 5 products for age group 25-34:
- Kale: 12928
- Swiss Chard: 12926
- Spinach: 12918
- Green Beans: 6702
- Olive: 6675

Top 5 products for age group 45-54:
- Swiss Chard: 11389
- Spinach: 11362
- Kale: 11339
- Beans: 5843
- Bell Pepper: 5837

Top 5 products for age group 18-24:
- Swiss Chard: 7899
- Spinach: 7803
- Kale: 7790
- Shallots: 4171
- Pumpkin: 4131

Top 5 products for state MN:
- Swiss C

ValueError: too many values to unpack (expected 2) [while running 'FormatProductCounts']