<a href="https://colab.research.google.com/github/18708064/postblock1-774/blob/main/PostBlock1_Big_Data_Technologies.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Apache Beam Analytics**

Installing the Apache Beam package

In [59]:
!pip install apache-beam




Importing Libraries

In [60]:
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions
import pandas as pd

Accessing Files from GitHub

In [61]:
import pandas as pd

# Raw-content GitHub URLs of the two input CSV files
orders_url = 'https://raw.githubusercontent.com/18708064/postblock1-774/main/orders.csv'
users_url = 'https://raw.githubusercontent.com/18708064/postblock1-774/main/users.csv'

# Download both files into the working directory with wget.
# IPython substitutes {users_url} / {orders_url} with the Python variables above
# before handing the command to the shell; -O sets the local file names
# (users.csv, orders.csv) that the following cells read.
!wget -O users.csv {users_url}
!wget -O orders.csv {orders_url}

--2024-09-28 14:20:30--  https://raw.githubusercontent.com/18708064/postblock1-774/main/users.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 143675 (140K) [text/plain]
Saving to: ‘users.csv’


2024-09-28 14:20:30 (4.03 MB/s) - ‘users.csv’ saved [143675/143675]

--2024-09-28 14:20:30--  https://raw.githubusercontent.com/18708064/postblock1-774/main/orders.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 73312907 (70M) [text/plain]
Saving to: ‘orders.csv’


2024-09-28 14:20:31 (171 MB/s) - ‘orders.csv’ saved [73312907/733129

Inspect the Data

In [62]:
# Load both CSVs with pandas: users.csv is comma-separated, orders.csv uses semicolons
users_df = pd.read_csv('users.csv', sep=',')
orders_df = pd.read_csv('orders.csv', sep=';')

# Preview the first rows of each frame, each under its own heading
print("Users DataFrame:", users_df.head(), sep="\n")
print("\nOrders DataFrame:", orders_df.head(), sep="\n")

Users DataFrame:
   user_id             name  gender  age                     address  \
0        1     Anthony Wolf    male   73    New Rachelburgh-VA-49583   
1        2  James Armstrong    male   56  North Jillianfort-UT-86454   
2        3        Cody Shaw    male   75         North Anne-SC-53799   
3        4  Sierra Hamilton  female   76     New Angelafurt-ME-46190   
4        5      Chase Davis    male   31    South Bethmouth-WI-18562   

  date_joined  
0  2019/03/13  
1  2020/11/06  
2  2004/05/29  
3  2005/08/26  
4  2018/04/30  

Orders DataFrame:
   order_no  user_id             product_list date_purchased
0      1000     1887                  Cassava     2000-01-01
1      1001      838  Calabash, Water Spinach     2000-01-01
2      1002     2032            Onion, Rapini     2000-01-01
3      1003     1482   Swiss Chard, Artichoke     2000-01-01
4      1004      475  Turnip Greens, Plantain     2000-01-01


Defining Parsing Functions

In [63]:
def parse_users(line):
    """Parse one comma-separated line of users.csv into a keyed record.

    Args:
        line: A single text line holding the six fields
            user_id, name, gender, age, address, date_joined.

    Returns:
        A ``(user_id, user_info)`` tuple where ``user_id`` is an int and
        ``user_info`` is a dict with the keys ``name``, ``gender``,
        ``age`` (int), ``address`` and ``date_joined``.
        ``None`` when the line is blank, does not have exactly six fields,
        or carries a non-integer ``user_id`` or ``age``.
    """
    # Kept function-local so the function carries its own dependencies
    # (presumably so it stays self-contained when Beam serializes it).
    import csv
    from io import StringIO

    # A blank line produces no CSV row at all; defaulting to an empty list
    # lets the length check reject it instead of raising StopIteration.
    fields = next(csv.reader(StringIO(line), delimiter=','), [])

    # Check if the line has the expected number of fields
    if len(fields) != 6:
        return None

    user_id, name, gender, age, address, date_joined = fields

    # Malformed numeric fields are treated like any other invalid line:
    # return None so the caller can filter the record out.
    try:
        user_key = int(user_id)
        age_value = int(age)
    except ValueError:
        return None

    user_info = {
        'name': name,
        'gender': gender,
        'age': age_value,
        'address': address,
        'date_joined': date_joined
    }
    return (user_key, user_info)




Orders parsing function

In [64]:
def parse_orders(line):
    """Parse one semicolon-separated line of orders.csv into a keyed record.

    Args:
        line: A single text line holding the four fields
            order_no, user_id, product_list, date_purchased.

    Returns:
        A ``(user_id, order_info)`` tuple where ``user_id`` is an int and
        ``order_info`` is a dict with the keys ``order_no``,
        ``product_list`` and ``date_purchased`` (all kept as strings).
        ``None`` when the line is blank, does not have exactly four fields,
        or carries a non-integer ``user_id``.
    """
    # Kept function-local so the function carries its own dependencies
    # (presumably so it stays self-contained when Beam serializes it).
    import csv
    from io import StringIO

    # A blank line produces no CSV row at all; defaulting to an empty list
    # lets the length check reject it instead of raising StopIteration.
    fields = next(csv.reader(StringIO(line), delimiter=';'), [])

    # Check if the line has the expected number of fields
    if len(fields) != 4:
        return None

    order_no, user_id, product_list, date_purchased = fields

    # A non-integer user_id cannot be used as the join key; treat the line
    # as invalid and return None so the caller can filter it out.
    try:
        user_key = int(user_id)
    except ValueError:
        return None

    order_info = {
        'order_no': order_no,
        'product_list': product_list,
        'date_purchased': date_purchased
    }
    return (user_key, order_info)



Create the Beam Pipeline

In [65]:
def format_result(element):
    """Flatten one CoGroupByKey result into a plain dict for output.

    Args:
        element: A ``(user_id, grouped_data)`` pair, where ``grouped_data``
            maps the tags ``'users'`` and ``'orders'`` to the values that
            were grouped under that ``user_id``.

    Returns:
        A dict with ``user_id``, ``user_info`` (the first user record, or
        ``{}`` when no user record exists for this id) and ``orders``
        (a list of all order records for this id).
    """
    user_id, grouped_data = element
    # next(iter(...), {}) works whether the grouped values arrive as a list
    # or as a lazy iterable, and falls back to {} for ids with no user row.
    user_info = next(iter(grouped_data['users']), {})
    return {
        'user_id': user_id,
        'user_info': user_info,
        'orders': list(grouped_data['orders'])
    }


options = PipelineOptions()
with beam.Pipeline(options=options) as p:
    # Read and parse the users data; unparseable lines come back as None
    # and are dropped by the filter.
    users = (
        p
        | 'ReadUsers' >> beam.io.ReadFromText('users.csv', skip_header_lines=1)
        | 'ParseUsers' >> beam.Map(parse_users)
        | 'FilterValidUsers' >> beam.Filter(lambda x: x is not None)
    )

    # Read and parse the orders data; unparseable lines come back as None
    # and are dropped by the filter.
    orders = (
        p
        | 'ReadOrders' >> beam.io.ReadFromText('orders.csv', skip_header_lines=1)
        | 'ParseOrders' >> beam.Map(parse_orders)
        | 'FilterValidOrders' >> beam.Filter(lambda x: x is not None)
    )

    # Join the two keyed collections on user_id using CoGroupByKey
    joined_data = ({'users': users, 'orders': orders}
                   | 'GroupByUserID' >> beam.CoGroupByKey())

    # Format and write the joined records.
    # num_shards=1 forces a single output file so the result is always
    # named joined_output.txt-00000-of-00001, which is the exact name the
    # display cell opens; without it the shard count is left to the runner.
    output = (
        joined_data
        | 'FormatResult' >> beam.Map(format_result)
        | 'WriteOutput' >> beam.io.WriteToText('joined_output.txt', num_shards=1)
    )






Read and display the joined output

In [66]:
# Show a short preview of the joined output: at most the first 5 lines
print("\nJoined Data (First 5 Lines):")
with open('joined_output.txt-00000-of-00001') as joined_file:
    # zip() stops as soon as range(5) is exhausted or the file runs out
    for _, record in zip(range(5), joined_file):
        print(record.strip())


Joined Data (First 5 Lines):
{'user_id': 1, 'user_info': {'name': 'Anthony Wolf', 'gender': 'male', 'age': 73, 'address': 'New Rachelburgh-VA-49583', 'date_joined': '2019/03/13'}, 'orders': [{'order_no': '4667', 'product_list': 'Burdock Root, Chayote', 'date_purchased': '2000-01-20'}, {'order_no': '6141', 'product_list': 'Bean Sprouts, Peas, Peppers, Horseradish', 'date_purchased': '2000-01-31'}, {'order_no': '9166', 'product_list': 'Mustard Greens, Mushrooms', 'date_purchased': '2000-02-11'}, {'order_no': '10593', 'product_list': 'Edamame, Lemongrass', 'date_purchased': '2000-02-18'}, {'order_no': '11973', 'product_list': 'Amaranth Leaves, Turnip Greens', 'date_purchased': '2000-02-24'}, {'order_no': '12062', 'product_list': 'Bamboo Shoots', 'date_purchased': '2000-02-24'}, {'order_no': '13740', 'product_list': 'Bamboo Shoots, Radish', 'date_purchased': '2000-03-03'}, {'order_no': '15763', 'product_list': 'Taro', 'date_purchased': '2000-03-15'}, {'order_no': '18539', 'product_list': 