In [None]:
# Install Java and Spark (updated version)
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://archive.apache.org/dist/spark/spark-3.5.2/spark-3.5.2-bin-hadoop3.tgz
!tar -xvf spark-3.5.2-bin-hadoop3.tgz
!pip install -q findspark pyspark

import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.5.2-bin-hadoop3"

import findspark
findspark.init()

# Create Spark Session (updated config)
spark = SparkSession.builder \
    .appName("Retail Dataset Creation") \
    .config("spark.sql.shuffle.partitions", "4") \
    .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:3.3.4") \
    .getOrCreate()


spark-3.5.2-bin-hadoop3/
spark-3.5.2-bin-hadoop3/bin/
spark-3.5.2-bin-hadoop3/bin/sparkR2.cmd
spark-3.5.2-bin-hadoop3/bin/sparkR
spark-3.5.2-bin-hadoop3/bin/load-spark-env.sh
spark-3.5.2-bin-hadoop3/bin/find-spark-home.cmd
spark-3.5.2-bin-hadoop3/bin/pyspark2.cmd
spark-3.5.2-bin-hadoop3/bin/spark-sql.cmd
spark-3.5.2-bin-hadoop3/bin/spark-sql2.cmd
spark-3.5.2-bin-hadoop3/bin/spark-submit2.cmd
spark-3.5.2-bin-hadoop3/bin/spark-class.cmd
spark-3.5.2-bin-hadoop3/bin/spark-submit
spark-3.5.2-bin-hadoop3/bin/spark-shell
spark-3.5.2-bin-hadoop3/bin/spark-sql
spark-3.5.2-bin-hadoop3/bin/load-spark-env.cmd
spark-3.5.2-bin-hadoop3/bin/spark-class
spark-3.5.2-bin-hadoop3/bin/spark-shell2.cmd
spark-3.5.2-bin-hadoop3/bin/spark-submit.cmd
spark-3.5.2-bin-hadoop3/bin/pyspark.cmd
spark-3.5.2-bin-hadoop3/bin/pyspark
spark-3.5.2-bin-hadoop3/bin/sparkR.cmd
spark-3.5.2-bin-hadoop3/bin/docker-image-tool.sh
spark-3.5.2-bin-hadoop3/bin/run-example
spark-3.5.2-bin-hadoop3/bin/spark-class2.cmd
spark-3.5.2-bin-

In [None]:
print(spark.version)

3.5.2


In [None]:
# Helper functions for data generation
def random_date(start_date, end_date):
    time_between_dates = end_date - start_date
    days_between_dates = time_between_dates.days
    random_number_of_days = random.randrange(days_between_dates)
    return start_date + timedelta(days=random_number_of_days)

def generate_random_price():
    return round(random.uniform(10.0, 1000.0), 2)

def generate_random_status():
    statuses = ['PENDING', 'SHIPPED', 'DELIVERED', 'CANCELLED', 'RETURNED']
    return random.choice(statuses)

def generate_random_category():
    categories = ['ELECTRONICS', 'CLOTHING', 'HOME GOODS', 'FOOD', 'SPORTS']
    return random.choice(categories)

# Generate customer data
customer_data = []
countries = ['USA', 'Canada', 'Mexico', 'UK', 'Germany', 'France', 'Japan', 'Australia']
for cust_id in range(1, 1001):
    customer_data.append({
        'customer_id': cust_id,
        'name': f'Customer_{cust_id}',
        'email': f'customer_{cust_id}@example.com',
        'country': random.choice(countries),
        'join_date': random_date(datetime(2022, 1, 1), datetime(2024, 3, 17)),
        'is_active': random.random() > 0.1,  # 90% active customers
        'loyalty_points': random.randint(0, 10000)
    })

# Generate product data
product_data = []
for prod_id in range(1, 501):
    product_data.append({
        'product_id': prod_id,
        'name': f'Product_{prod_id}',
        'category': generate_random_category(),
        'price': generate_random_price(),
        'stock_quantity': random.randint(0, 1000),
        'description': f'Description for Product_{prod_id}',
        'created_date': random_date(datetime(2022, 1, 1), datetime(2024, 3, 17))
    })

# Generate order data
order_data = []
start_date = datetime(2023, 1, 1)
end_date = datetime(2024, 3, 17)
for order_id in range(1, 5001):
    customer_id = random.randint(1, 1000)
    order_date = random_date(start_date, end_date)
    order_data.append({
        'order_id': order_id,
        'customer_id': customer_id,
        'order_date': order_date,
        'total_amount': generate_random_price() * random.randint(1, 5),
        'status': generate_random_status(),
        'shipping_address': f"{random.randint(1, 1000)} Street_{random.randint(1, 100)}",
        'order_year': order_date.year,
        'order_month': order_date.month
    })

# Generate order items data
order_items_data = []
for order_id in range(1, 5001):
    num_items = random.randint(1, 5)
    for item in range(num_items):
        product_id = random.randint(1, 500)
        order_items_data.append({
            'order_id': order_id,
            'product_id': product_id,
            'quantity': random.randint(1, 10),
            'unit_price': generate_random_price()
        })

# Create DataFrames
customers_df = spark.createDataFrame(customer_data)
products_df = spark.createDataFrame(product_data)
orders_df = spark.createDataFrame(order_data)
order_items_df = spark.createDataFrame(order_items_data)

# Display sample data
print("Customers Sample:")
customers_df.show(5)
print("\nProducts Sample:")
products_df.show(5)
print("\nOrders Sample:")
orders_df.show(5)
print("\nOrder Items Sample:")
order_items_df.show(5)


Customers Sample:
+---------+-----------+--------------------+---------+-------------------+--------------+----------+
|  country|customer_id|               email|is_active|          join_date|loyalty_points|      name|
+---------+-----------+--------------------+---------+-------------------+--------------+----------+
|   France|          1|customer_1@exampl...|     true|2023-01-08 00:00:00|          8776|Customer_1|
|       UK|          2|customer_2@exampl...|     true|2023-09-09 00:00:00|          8707|Customer_2|
|  Germany|          3|customer_3@exampl...|     true|2022-08-27 00:00:00|          4992|Customer_3|
|       UK|          4|customer_4@exampl...|     true|2023-07-20 00:00:00|          2002|Customer_4|
|Australia|          5|customer_5@exampl...|     true|2022-09-14 00:00:00|          8684|Customer_5|
+---------+-----------+--------------------+---------+-------------------+--------------+----------+
only showing top 5 rows


Products Sample:
+----------+------------------

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
import random
from datetime import datetime, timedelta, date
from pyspark.sql.functions import col, lit, concat_ws, to_timestamp, expr, explode, size, count, format_number, regexp_replace
from pyspark.sql.types import StructType, StructField, IntegerType, ArrayType, StringType


### Basic Operations (Questions 1-10)


In [None]:
# 1. Time Series Analysis
daily_sales = order_items_df.join(orders_df, "order_id") \
    .join(products_df, "product_id") \
    .join(customers_df, "customer_id") \
    .groupBy(
        date_format("order_date", "yyyy-MM-dd").alias("date"),
        "category",
        "country"
    ) \
    .agg(sum(col("quantity") * col("unit_price")).alias("sales_amount")) \
    .withColumn("formatted_sales", format_number("sales_amount", 2)) \
    .orderBy("date")

daily_sales.show(10, truncate=False)

+----------+-----------+---------+------------------+---------------+
|date      |category   |country  |sales_amount      |formatted_sales|
+----------+-----------+---------+------------------+---------------+
|2023-01-01|ELECTRONICS|France   |285.21            |285.21         |
|2023-01-01|SPORTS     |Australia|5440.120000000001 |5,440.12       |
|2023-01-01|CLOTHING   |France   |738.03            |738.03         |
|2023-01-01|CLOTHING   |Australia|2636.9            |2,636.90       |
|2023-01-01|ELECTRONICS|Canada   |17446.66          |17,446.66      |
|2023-01-01|CLOTHING   |UK       |3865.0200000000004|3,865.02       |
|2023-01-01|FOOD       |Canada   |8320.27           |8,320.27       |
|2023-01-01|FOOD       |Japan    |3824.8500000000004|3,824.85       |
|2023-01-01|HOME GOODS |Canada   |98.6              |98.60          |
|2023-01-01|ELECTRONICS|UK       |19203.72          |19,203.72      |
+----------+-----------+---------+------------------+---------------+
only showing top 10 

In [None]:
# 2. Data Quality Check
# Create employee data with potential quality issues
employee_data = []
for emp_id in range(1, 1001):
    employee_data.append({
        'employee_id': emp_id,
        'email': f'employee_{emp_id}@example.com' if random.random() > 0.1 else 'invalid_email',
        'department_code': f'DPT{random.randint(100, 999)}' if random.random() > 0.05 else None,
        'salary': random.randint(30000, 150000) if random.random() > 0.03 else 999999
    })

employees_df = spark.createDataFrame(employee_data)

# Check data quality issues
invalid_emails = employees_df.filter(~regexp_extract('email', r'^[A-Za-z0-9+_.-]+@(.+)$', 0).isNotNull())
missing_depts = employees_df.filter(employees_df.department_code.isNull())
duplicate_ids = employees_df.groupBy('employee_id').count().filter('count > 1')
invalid_salaries = employees_df.filter(employees_df.salary > 200000)

print("Data Quality Issues:")
print(f"Invalid emails: {invalid_emails.count()}")
print(f"Missing department codes: {missing_depts.count()}")
print(f"Duplicate employee IDs: {duplicate_ids.count()}")
print(f"Invalid salaries: {invalid_salaries.count()}")

Data Quality Issues:
Invalid emails: 0
Missing department codes: 46
Duplicate employee IDs: 0
Invalid salaries: 28


In [None]:
# 3. Schema Evolution
# Create historical customer interactions with evolving schema
schema_v1 = StructType([
    StructField('customer_id', IntegerType(), True),
    StructField('interaction_date', DateType(), True),
    StructField('action', StringType(), True)
])

schema_v2 = StructType([
    StructField('customer_id', IntegerType(), True),
    StructField('interaction_date', DateType(), True),
    StructField('action', StringType(), True),
    StructField('device_type', StringType(), True),  # New field
    StructField('location', StringType(), True)      # New field
])

# Generate data with both schemas
data_v1 = [(1, date(2023, 1, 1), 'view'),
           (2, date(2023, 1, 2), 'purchase')]
data_v2 = [(3, date(2024, 1, 1), 'view', 'mobile', 'NY'),
           (4, date(2024, 1, 2), 'purchase', 'desktop', 'CA')]

df_v1 = spark.createDataFrame(data_v1, schema=schema_v1)
df_v2 = spark.createDataFrame(data_v2, schema=schema_v2)

# Combine dataframes and handle schema evolution
combined_df = df_v1.unionByName(df_v2, allowMissingColumns=True)
combined_df.show()

+-----------+----------------+--------+-----------+--------+
|customer_id|interaction_date|  action|device_type|location|
+-----------+----------------+--------+-----------+--------+
|          1|      2023-01-01|    view|       NULL|    NULL|
|          2|      2023-01-02|purchase|       NULL|    NULL|
|          3|      2024-01-01|    view|     mobile|      NY|
|          4|      2024-01-02|purchase|    desktop|      CA|
+-----------+----------------+--------+-----------+--------+



In [None]:
# 4. Hierarchical Data Processing
# Create organizational structure data
org_data = [
    (1, 'CEO', None),
    (2, 'CTO', 1),
    (3, 'CFO', 1),
    (4, 'Engineering Manager', 2),
    (5, 'Dev Lead', 4),
    (6, 'Developer', 5),
    (7, 'Finance Manager', 3),
    (8, 'Accountant', 7)
]

org_df = spark.createDataFrame(org_data, ['employee_id', 'title', 'manager_id'])

# Initialize the hierarchy DataFrame with top-level employees (manager_id is NULL)
hierarchy_df = org_df.filter(col("manager_id").isNull()) \
    .withColumn("level", lit(0)) \
    .withColumn("path", col("title"))

# Iteratively process the hierarchy
for i in range(1, 10):  # Limit iterations to prevent infinite loops
    next_level_df = org_df.alias("e").join(
        hierarchy_df.alias("h"),
        col("e.manager_id") == col("h.employee_id")
    ).select(
        col("e.employee_id"),
        col("e.title"),
        col("e.manager_id"),
        (col("h.level") + 1).alias("level"),
        concat_ws(" -> ", col("h.path"), col("e.title")).alias("path")
    )

    # Append the next level to the hierarchy DataFrame
    hierarchy_df = hierarchy_df.union(next_level_df)

# Display the hierarchical structure
hierarchy_df.orderBy("level").show(truncate=False)


+-----------+---------------+----------+-----+-----------------------------+
|employee_id|title          |manager_id|level|path                         |
+-----------+---------------+----------+-----+-----------------------------+
|1          |CEO            |NULL      |0    |CEO                          |
|3          |CFO            |1         |1    |CEO -> CFO                   |
|3          |CFO            |1         |1    |CEO -> CFO                   |
|2          |CTO            |1         |1    |CEO -> CTO                   |
|3          |CFO            |1         |1    |CEO -> CFO                   |
|2          |CTO            |1         |1    |CEO -> CTO                   |
|3          |CFO            |1         |1    |CEO -> CFO                   |
|2          |CTO            |1         |1    |CEO -> CTO                   |
|3          |CFO            |1         |1    |CEO -> CFO                   |
|2          |CTO            |1         |1    |CEO -> CTO                   |

In [None]:
# 5. Data Masking
# Create sensitive customer data
sensitive_data = [
    (
        '123-45-6789',
        'john.doe@example.com',
        '123-456-7890',
        '123 Main St, Apt 101, New York, NY 10001'
    ),
    (
        '987-65-4321',
        'jane.smith@example.com',
        '987-654-3210',
        '456 Oak Ave, San Francisco, CA 94105'
    )
]

schema = StructType([
    StructField('ssn', StringType(), True),
    StructField('email', StringType(), True),
    StructField('phone', StringType(), True),
    StructField('address', StringType(), True)
])

sensitive_df = spark.createDataFrame(sensitive_data, schema)

# Mask sensitive data with error handling
try:
    masked_df = sensitive_df.select(
        regexp_replace('ssn', r'(\d{3})-(\d{2})-(\d{4})', 'XXX-XX-$3').alias('masked_ssn'),
        regexp_replace('email', r'(.+)@(.+)', 'masked@$2').alias('masked_email'),
        regexp_replace('phone', r'(\d{3})-(\d{3})-(\d{4})', 'XXX-XXX-$3').alias('masked_phone'),
        regexp_replace('address', r'(.+), (.+), ([A-Z]{2}) \d{5}', 'XXXXX, XXXXX, $3 10001').alias('masked_address')
    )

    print("Original Data:")
    sensitive_df.show(truncate=False)

    print("\nMasked Data:")
    masked_df.show(truncate=False)

except Exception as e:
    print(f"An error occurred: {e}")


Original Data:
+-----------+----------------------+------------+----------------------------------------+
|ssn        |email                 |phone       |address                                 |
+-----------+----------------------+------------+----------------------------------------+
|123-45-6789|john.doe@example.com  |123-456-7890|123 Main St, Apt 101, New York, NY 10001|
|987-65-4321|jane.smith@example.com|987-654-3210|456 Oak Ave, San Francisco, CA 94105    |
+-----------+----------------------+------------+----------------------------------------+


Masked Data:
+-----------+------------------+------------+----------------------+
|masked_ssn |masked_email      |masked_phone|masked_address        |
+-----------+------------------+------------+----------------------+
|XXX-XX-6789|masked@example.com|XXX-XXX-7890|XXXXX, XXXXX, NY 10001|
|XXX-XX-4321|masked@example.com|XXX-XXX-3210|XXXXX, XXXXX, CA 10001|
+-----------+------------------+------------+----------------------+



In [None]:
# 6. Event Timeline Analysis
# Create event log data
event_data = [
    (1, 'login', datetime(2024, 3, 17, 9, 0, 0)),
    (1, 'view_product', datetime(2024, 3, 17, 9, 1, 0)),
    (1, 'add_to_cart', datetime(2024, 3, 17, 9, 2, 0)),
    (1, 'checkout', datetime(2024, 3, 17, 9, 3, 0)),
    (2, 'login', datetime(2024, 3, 17, 9, 4, 0)),
    (2, 'search', datetime(2024, 3, 17, 9, 5, 0)),
    (2, 'view_product', datetime(2024, 3, 17, 9, 6, 0))
]

schema = StructType([
    StructField('user_id', IntegerType(), True),
    StructField('event_type', StringType(), True),
    StructField('timestamp', TimestampType(), True)
])

events_df = spark.createDataFrame(event_data, schema)

# Calculate time between events
window_spec = Window.partitionBy('user_id').orderBy('timestamp')
events_with_duration = events_df.withColumn(
    'time_spent',
    datediff(lead('timestamp').over(window_spec), 'timestamp')
).withColumn(
    'next_event',
    lead('event_type').over(window_spec)
)

print("Event Timeline Analysis:")
events_with_duration.show()

Event Timeline Analysis:
+-------+------------+-------------------+----------+------------+
|user_id|  event_type|          timestamp|time_spent|  next_event|
+-------+------------+-------------------+----------+------------+
|      1|       login|2024-03-17 09:00:00|         0|view_product|
|      1|view_product|2024-03-17 09:01:00|         0| add_to_cart|
|      1| add_to_cart|2024-03-17 09:02:00|         0|    checkout|
|      1|    checkout|2024-03-17 09:03:00|      NULL|        NULL|
|      2|       login|2024-03-17 09:04:00|         0|      search|
|      2|      search|2024-03-17 09:05:00|         0|view_product|
|      2|view_product|2024-03-17 09:06:00|      NULL|        NULL|
+-------+------------+-------------------+----------+------------+



In [None]:
# 7. Geographic Data Processing
# Create location-based sales data
sales_data = [
    (1, 40.7128, -74.0060, 1000.0),  # New York
    (2, 34.0522, -118.2437, 2000.0), # Los Angeles
    (3, 41.8781, -87.6298, 1500.0),  # Chicago
    (4, 29.7604, -95.3698, 1200.0),  # Houston
    (5, 37.7749, -122.4194, 2500.0)   # San Francisco
]

schema = StructType([
    StructField('region_id', IntegerType(), True),
    StructField('latitude', DoubleType(), True),
    StructField('longitude', DoubleType(), True),
    StructField('sales_amount', DoubleType(), True)
])

geo_df = spark.createDataFrame(sales_data, schema)

# Calculate distances and aggregate by region
results = geo_df.select(
    'region_id',
    'sales_amount',
    round('latitude', 2).alias('latitude'),
    round('longitude', 2).alias('longitude'),
    format_number('sales_amount', 2).alias('formatted_sales')
).orderBy('sales_amount', ascending=False)

print("Geographic Sales Analysis:")
results.show()

Geographic Sales Analysis:
+---------+------------+--------+---------+---------------+
|region_id|sales_amount|latitude|longitude|formatted_sales|
+---------+------------+--------+---------+---------------+
|        5|      2500.0|   37.77|  -122.42|       2,500.00|
|        2|      2000.0|   34.05|  -118.24|       2,000.00|
|        3|      1500.0|   41.88|   -87.63|       1,500.00|
|        4|      1200.0|   29.76|   -95.37|       1,200.00|
|        1|      1000.0|   40.71|   -74.01|       1,000.00|
+---------+------------+--------+---------+---------------+



In [None]:
# 8. Audit Trail Analysis
# Create transaction history data
transaction_data = [
    (1, 'order', 1000.0, 'created', datetime(2024, 3, 17, 9, 0, 0)),
    (1, 'order', 1100.0, 'updated', datetime(2024, 3, 17, 9, 1, 0)),
    (2, 'payment', 1000.0, 'created', datetime(2024, 3, 17, 9, 2, 0)),
    (1, 'order', 1200.0, 'updated', datetime(2024, 3, 17, 9, 3, 0)),
    (3, 'refund', 500.0, 'created', datetime(2024, 3, 17, 9, 4, 0))
]

schema = StructType([
    StructField('transaction_id', IntegerType(), True),
    StructField('type', StringType(), True),
    StructField('amount', DoubleType(), True),
    StructField('action', StringType(), True),
    StructField('timestamp', TimestampType(), True)
])

transactions_df = spark.createDataFrame(transaction_data, schema)

# Analyze transaction patterns
transaction_patterns = transactions_df.groupBy(
    'transaction_id',
    'type'
).agg(
    collect_list('action').alias('actions'),
    collect_list('timestamp').alias('timestamps'),
    collect_list('amount').alias('amounts')
).orderBy('transaction_id')

print("Transaction Patterns:")
transaction_patterns.show()

Transaction Patterns:
+--------------+-------+--------------------+--------------------+--------------------+
|transaction_id|   type|             actions|          timestamps|             amounts|
+--------------+-------+--------------------+--------------------+--------------------+
|             1|  order|[created, updated...|[2024-03-17 09:00...|[1000.0, 1100.0, ...|
|             2|payment|           [created]|[2024-03-17 09:02...|            [1000.0]|
|             3| refund|           [created]|[2024-03-17 09:04...|             [500.0]|
+--------------+-------+--------------------+--------------------+--------------------+



In [None]:
# 9. Data Lineage Tracking
# Create Spark session
spark = SparkSession.builder \
    .appName("Data Lineage Tracking") \
    .getOrCreate()

# Create data transformation pipeline
lineage_data = [
    (1, 'source_data', 'raw_data.csv', '2024-03-17 09:00:00', None),
    (2, 'clean_data', 'source_data', '2024-03-17 09:30:00', 1),
    (3, 'transformed_data', 'clean_data', '2024-03-17 10:00:00', 2),
    (4, 'aggregated_data', 'transformed_data', '2024-03-17 10:30:00', 3)
]

schema = StructType([
    StructField('data_id', IntegerType(), True),
    StructField('data_name', StringType(), True),
    StructField('source', StringType(), True),
    StructField('timestamp', StringType(), True),
    StructField('parent_id', IntegerType(), True)
])

lineage_df = spark.createDataFrame(lineage_data, schema)

# Initialize the lineage DataFrame with top-level data (parent_id is None)
lineage_result = lineage_df.filter(col("parent_id").isNull()) \
    .withColumn("level", lit(0)) \
    .withColumn("path", col("data_name"))

# Iteratively process the lineage
for i in range(1, 10):  # Limit iterations to prevent infinite loops
    next_level_df = lineage_df.alias("l").join(
        lineage_result.alias("r"),
        col("l.parent_id") == col("r.data_id")
    ).select(
        col("l.data_id"),
        col("l.data_name"),
        col("l.source"),
        col("l.timestamp"),
        col("l.parent_id"),
        (col("r.level") + 1).alias("level"),
        concat_ws(" -> ", col("r.path"), col("l.data_name")).alias("path")
    )

    # Append the next level to the lineage result DataFrame
    lineage_result = lineage_result.union(next_level_df)

# Display the complete lineage tree
print("Data Lineage:")
lineage_result.orderBy("level").show(truncate=False)


Data Lineage:
+-------+----------------+------------+-------------------+---------+-----+---------------------------------------------+
|data_id|data_name       |source      |timestamp          |parent_id|level|path                                         |
+-------+----------------+------------+-------------------+---------+-----+---------------------------------------------+
|1      |source_data     |raw_data.csv|2024-03-17 09:00:00|NULL     |0    |source_data                                  |
|2      |clean_data      |source_data |2024-03-17 09:30:00|1        |1    |source_data -> clean_data                    |
|2      |clean_data      |source_data |2024-03-17 09:30:00|1        |1    |source_data -> clean_data                    |
|2      |clean_data      |source_data |2024-03-17 09:30:00|1        |1    |source_data -> clean_data                    |
|2      |clean_data      |source_data |2024-03-17 09:30:00|1        |1    |source_data -> clean_data                    |
|2      |c

In [None]:
# 10. Compliance Reporting
# Create compliance data
compliance_data = [
    (1, 'GDPR', 'personal_data', '2024-03-17', True, '2024-03-17 09:00:00'),
    (2, 'HIPAA', 'medical_records', '2024-03-17', False, '2024-03-17 09:30:00'),
    (3, 'PCI-DSS', 'payment_data', '2024-03-17', True, '2024-03-17 10:00:00')
]

schema = StructType([
    StructField('check_id', IntegerType(), True),
    StructField('regulation', StringType(), True),
    StructField('data_type', StringType(), True),
    StructField('date', StringType(), True),
    StructField('compliant', BooleanType(), True),
    StructField('timestamp', StringType(), True)
])

compliance_df = spark.createDataFrame(compliance_data, schema)

# Generate compliance report
compliance_report = compliance_df.groupBy('regulation').agg(
    count('*').alias('total_checks'),
    sum(when(col('compliant') == True, 1).otherwise(0)).alias('compliant_checks'),
    format_number(
        sum(when(col('compliant') == True, 1).otherwise(0)) /
        count('*') * 100,
        2
    ).alias('compliance_rate')
).orderBy('regulation')

print("Compliance Report:")
compliance_report.show()

Compliance Report:
+----------+------------+----------------+---------------+
|regulation|total_checks|compliant_checks|compliance_rate|
+----------+------------+----------------+---------------+
|      GDPR|           1|               1|         100.00|
|     HIPAA|           1|               0|           0.00|
|   PCI-DSS|           1|               1|         100.00|
+----------+------------+----------------+---------------+



### Intermediate Operations (Questions 11-20)


In [None]:
# 1. Session Window Analysis
# Create session data
session_data = [
    (1, 'login', datetime(2024, 3, 17, 9, 0, 0)),
    (1, 'view_product', datetime(2024, 3, 17, 9, 5, 0)),
    (1, 'add_to_cart', datetime(2024, 3, 17, 9, 10, 0)),
    (2, 'login', datetime(2024, 3, 17, 9, 15, 0)),
    (2, 'search', datetime(2024, 3, 17, 9, 20, 0)),
    (2, 'view_product', datetime(2024, 3, 17, 9, 25, 0))
]

schema = StructType([
    StructField('user_id', IntegerType(), True),
    StructField('action', StringType(), True),
    StructField('timestamp', TimestampType(), True)
])

sessions_df = spark.createDataFrame(session_data, schema)

# Analyze session patterns
window_spec = Window.partitionBy('user_id').orderBy('timestamp')
session_analysis = sessions_df.withColumn(
    'session_duration',
    datediff(lead('timestamp').over(window_spec), 'timestamp')
).withColumn(
    'next_action',
    lead('action').over(window_spec)
)

print("Session Analysis:")
session_analysis.show()

Session Analysis:
+-------+------------+-------------------+----------------+------------+
|user_id|      action|          timestamp|session_duration| next_action|
+-------+------------+-------------------+----------------+------------+
|      1|       login|2024-03-17 09:00:00|               0|view_product|
|      1|view_product|2024-03-17 09:05:00|               0| add_to_cart|
|      1| add_to_cart|2024-03-17 09:10:00|            NULL|        NULL|
|      2|       login|2024-03-17 09:15:00|               0|      search|
|      2|      search|2024-03-17 09:20:00|               0|view_product|
|      2|view_product|2024-03-17 09:25:00|            NULL|        NULL|
+-------+------------+-------------------+----------------+------------+



In [None]:
# 2. Resource Utilization
# Create resource metrics data
resource_data = [
    (1, 'CPU', 75.5, datetime(2024, 3, 17, 9, 0, 0)),
    (1, 'Memory', 82.3, datetime(2024, 3, 17, 9, 0, 0)),
    (1, 'Disk', 65.7, datetime(2024, 3, 17, 9, 0, 0)),
    (2, 'CPU', 45.2, datetime(2024, 3, 17, 9, 5, 0)),
    (2, 'Memory', 67.8, datetime(2024, 3, 17, 9, 5, 0)),
    (2, 'Disk', 58.9, datetime(2024, 3, 17, 9, 5, 0))
]

schema = StructType([
    StructField('server_id', IntegerType(), True),
    StructField('resource_type', StringType(), True),
    StructField('usage_percent', DoubleType(), True),
    StructField('timestamp', TimestampType(), True)
])

resource_df = spark.createDataFrame(resource_data, schema)

# Analyze resource utilization
resource_analysis = resource_df.groupBy(
    'resource_type'
).agg(
    avg('usage_percent').alias('avg_usage'),
    max('usage_percent').alias('peak_usage'),
    min('usage_percent').alias('min_usage')
).orderBy('resource_type')

print("Resource Utilization Analysis:")
resource_analysis.show()

Resource Utilization Analysis:
+-------------+---------+----------+---------+
|resource_type|avg_usage|peak_usage|min_usage|
+-------------+---------+----------+---------+
|          CPU|    60.35|      75.5|     45.2|
|         Disk|     62.3|      65.7|     58.9|
|       Memory|    75.05|      82.3|     67.8|
+-------------+---------+----------+---------+



In [None]:
# 3. **Pattern Recognition**
# Create Spark session
spark = SparkSession.builder \
    .appName("Pattern Recognition") \
    .getOrCreate()

# Correctly formatted transaction sequence data
sequence_data = [
    (1, ['login', 'view_product', 'add_to_cart', 'checkout']),
    (2, ['login', 'search', 'view_product', 'purchase']),
    (3, ['login', 'view_product', 'view_product', 'add_to_cart', 'checkout']),
    (4, ['login', 'search', 'search', 'view_product', 'purchase'])
]

# Define schema
schema = StructType([
    StructField('user_id', IntegerType(), True),
    StructField('sequence', ArrayType(StringType()), True)
])

# Create DataFrame
sequences_df = spark.createDataFrame(sequence_data, schema)

# Analyze patterns
pattern_analysis = sequences_df.select(
    explode(col('sequence')).alias('action'),
    size(col('sequence')).alias('sequence_length')
).groupBy('action').agg(
    count('*').alias('frequency'),
    format_number((count('*') / sequences_df.count()) * 100, 2).alias('percentage')
).orderBy('frequency', ascending=False)

print("Pattern Analysis:")
pattern_analysis.show(truncate=False)


Pattern Analysis:
+------------+---------+----------+
|action      |frequency|percentage|
+------------+---------+----------+
|view_product|5        |125.00    |
|login       |4        |100.00    |
|search      |3        |75.00     |
|checkout    |2        |50.00     |
|purchase    |2        |50.00     |
|add_to_cart |2        |50.00     |
+------------+---------+----------+



In [None]:
# 4. Network Analysis
# Create network data
network_data = [
    (1, 2, 0.8),  # user_id, connected_user_id, strength
    (1, 3, 0.6),
    (2, 3, 0.9),
    (2, 4, 0.7),
    (3, 4, 0.5)
]

schema = StructType([
    StructField('user_id', IntegerType(), True),
    StructField('connected_user_id', IntegerType(), True),
    StructField('strength', DoubleType(), True)
])

network_df = spark.createDataFrame(network_data, schema)

# Calculate centrality measures
def calculate_centrality(df):
    # Calculate degree centrality (number of connections)
    degree_df = df.groupBy('user_id').count().withColumnRenamed('count', 'degree')

    # Calculate strength centrality (sum of connection strengths)
    strength_df = df.groupBy('user_id').agg(sum('strength').alias('strength'))

    # Calculate betweenness centrality (number of shortest paths passing through)
    # This is a simplified version for demonstration
    betweenness_df = df.groupBy('user_id').agg(
        count(when(col('strength') > 0.7, 1)).alias('betweenness')
    )

    return degree_df.join(strength_df, 'user_id').join(betweenness_df, 'user_id')

network_metrics = calculate_centrality(network_df)
print("Network Analysis:")
network_metrics.show()

Network Analysis:
+-------+------+--------+-----------+
|user_id|degree|strength|betweenness|
+-------+------+--------+-----------+
|      1|     2|     1.4|          1|
|      2|     2|     1.6|          1|
|      3|     1|     0.5|          0|
+-------+------+--------+-----------+



In [None]:
# 5. Text Processing
# Create text data
text_data = [
    (1, "Great product! Excellent quality and fast shipping."),
    (2, "Poor customer service. Delivery was delayed."),
    (3, "Amazing product! Highly recommended!"),
    (4, "Average experience. Nothing special.")
]

schema = StructType([
    StructField('id', IntegerType(), True),
    StructField('text', StringType(), True)
])

text_df = spark.createDataFrame(text_data, schema)

# Process text data
from pyspark.ml.feature import Tokenizer, StopWordsRemover
from pyspark.ml.feature import CountVectorizer
from pyspark.ml.feature import HashingTF

# Tokenize text
tokenizer = Tokenizer(inputCol="text", outputCol="words")
tokenized_df = tokenizer.transform(text_df)

# Remove stop words
stop_words = StopWordsRemover.loadDefaultStopWords("english")
stop_words_remover = StopWordsRemover(inputCol="words", outputCol="filtered_words")
filtered_df = stop_words_remover.transform(tokenized_df)

# Calculate term frequencies
cv = CountVectorizer(inputCol="filtered_words", outputCol="features", minDF=2.0)
tf_df = cv.fit(filtered_df).transform(filtered_df)

print("Text Processing Results:")
tf_df.select("id", "filtered_words", "features").show(truncate=False)


Text Processing Results:
+---+------------------------------------------------------+-------------+
|id |filtered_words                                        |features     |
+---+------------------------------------------------------+-------------+
|1  |[great, product!, excellent, quality, fast, shipping.]|(1,[0],[1.0])|
|2  |[poor, customer, service., delivery, delayed.]        |(1,[],[])    |
|3  |[amazing, product!, highly, recommended!]             |(1,[0],[1.0])|
|4  |[average, experience., nothing, special.]             |(1,[],[])    |
+---+------------------------------------------------------+-------------+



In [None]:
# 6. Event Timeline Analysis
# Create global event data
event_data = [
    (1, "2024-03-17 09:00:00", "America/New_York"),
    (2, "2024-03-17 10:00:00", "Europe/London"),
    (3, "2024-03-17 11:00:00", "Asia/Tokyo")
]

schema = StructType([
    StructField('event_id', IntegerType(), True),
    StructField('timestamp', StringType(), True),
    StructField('timezone', StringType(), True)
])

events_df = spark.createDataFrame(event_data, schema)

# Convert timestamps to local time
events_df = events_df.withColumn(
    'local_time',
    expr("to_utc_timestamp(to_timestamp(timestamp), timezone)")
)

print("Time Zone Handling Results:")
events_df.show(truncate=False)


Time Zone Handling Results:
+--------+-------------------+----------------+-------------------+
|event_id|timestamp          |timezone        |local_time         |
+--------+-------------------+----------------+-------------------+
|1       |2024-03-17 09:00:00|America/New_York|2024-03-17 13:00:00|
|2       |2024-03-17 10:00:00|Europe/London   |2024-03-17 10:00:00|
|3       |2024-03-17 11:00:00|Asia/Tokyo      |2024-03-17 02:00:00|
+--------+-------------------+----------------+-------------------+



In [None]:
# 7. Data Standardization
# Create data with different formats
data = [
    ("1,234.56", "2024-03-17", "1,000", "123.45"),
    ("2,345.78", "2024/03/18", "2,000", "234.56"),
    ("3,456.90", "2024-03-19", "3,000", "345.67")
]

schema = StructType([
    StructField('amount1', StringType(), True),
    StructField('date', StringType(), True),
    StructField('amount2', StringType(), True),
    StructField('amount3', StringType(), True)
])

df = spark.createDataFrame(data, schema)

# Standardize data formats
from pyspark.sql.functions import regexp_replace, to_date, cast

standardized_df = df.select(
    regexp_replace('amount1', ',', '').cast('double').alias('amount1'),
    to_date('date', 'yyyy-MM-dd').alias('date'),
    regexp_replace('amount2', ',', '').cast('double').alias('amount2'),
    regexp_replace('amount3', ',', '').cast('double').alias('amount3')
)

print("Data Standardization Results:")
standardized_df.show()

Data Standardization Results:
+-------+----------+-------+-------+
|amount1|      date|amount2|amount3|
+-------+----------+-------+-------+
|1234.56|2024-03-17| 1000.0| 123.45|
|2345.78|      NULL| 2000.0| 234.56|
| 3456.9|2024-03-19| 3000.0| 345.67|
+-------+----------+-------+-------+



In [None]:
# 8. Cache Optimization
# Create sample data
from pyspark import StorageLevel
data = [(i, f"data_{i}") for i in range(100000)]
df = spark.createDataFrame(data, ["id", "value"])

# Different caching strategies
from pyspark.sql.functions import *

# 1. Basic caching
df.cache()
print("1. Basic caching applied")

# 2. Memory only caching
df.persist(StorageLevel.MEMORY_ONLY)
print("2. Memory only caching applied")

# 3. Memory and disk caching
df.persist(StorageLevel.MEMORY_AND_DISK)
print("3. Memory and disk caching applied")

# 4. Memory only with 2 replication
df.rdd.persist(StorageLevel.MEMORY_ONLY_2)
print("4. Memory only with 2 replication applied")

1. Basic caching applied
2. Memory only caching applied
3. Memory and disk caching applied
4. Memory only with 2 replication applied


In [None]:
# 9. Query Optimization
# Create sample data
data = [(i, f"data_{i}") for i in range(100000)]
df = spark.createDataFrame(data, ["id", "value"])

# 1. Using broadcast variables for joins
small_df = spark.createDataFrame([(1, "small_data")], ["id", "value"])
broadcast_df = broadcast(small_df)
optimized_join = df.join(broadcast_df, "id")

# 2. Using predicate pushdown
filtered_df = df.filter(col("id") > 50000).select("id", "value")

# 3. Using partition pruning
df.write.partitionBy("id").parquet("partitioned_data")

# # 4. Using bucketing
df.write.bucketBy(10, "id").sortBy("id").saveAsTable("bucketed_table")

In [None]:

# 10. Data Validation Rules
# Create sample data
data = [
    (1, "invalid.email", 50000),
    (2, "valid@email.com", 150000),
    (3, "another.invalid", 250000),
    (4, "valid2@email.com", 30000)
]

schema = StructType([
    StructField('id', IntegerType(), True),
    StructField('email', StringType(), True),
    StructField('salary', IntegerType(), True)
])

df = spark.createDataFrame(data, schema)

# Define validation rules
from pyspark.sql.functions import *

validation_rules = {
    'email_format': lambda x: x.email.rlike(r'^[A-Za-z0-9+_.-]+@(.+)$'),
    'salary_range': lambda x: x.salary.between(30000, 150000)
}

# Apply validation rules
validation_results = df.select(
    '*',
    *[when(validation_rules[rule](df), 'valid').otherwise('invalid').alias(rule)
      for rule in validation_rules]
)

print("Validation Results:")
validation_results.show()

Validation Results:
+---+----------------+------+------------+------------+
| id|           email|salary|email_format|salary_range|
+---+----------------+------+------------+------------+
|  1|   invalid.email| 50000|     invalid|       valid|
|  2| valid@email.com|150000|       valid|       valid|
|  3| another.invalid|250000|     invalid|     invalid|
|  4|valid2@email.com| 30000|       valid|       valid|
+---+----------------+------+------------+------------+

