#### ❗ 1. How do you manage and process unstructured data in your data pipelines?

In [0]:
from pyspark.sql.functions import *

# Raw log events, one JSON document per row. The records are heterogeneous:
# login/logout events carry an "ip" key, the purchase event carries "amount".
json_data = [
    ('{"timestamp":"2024-02-25 12:00:01", "user":"Alice", "event":"login", "ip":"192.168.1.1"}',),
    ('{"timestamp":"2024-02-25 12:05:10", "user":"Bob", "event":"purchase", "amount": 250.50}',),
    ('{"timestamp":"2024-02-25 12:10:20", "user":"Charlie", "event":"logout", "ip":"192.168.1.2"}',)
]
columns = ["raw_json"]

# Single-column DataFrame holding the raw JSON text
df = spark.createDataFrame(json_data, columns)

# Step 1: Pull the keys of interest out of each JSON string.
# The same list drives both the extraction and the output column names,
# so the two can never drift out of order.
log_fields = ["timestamp", "user", "event", "ip", "amount"]
extracted_df = df.select(
    json_tuple(col("raw_json"), *log_fields).alias(*log_fields)
)

# Step 2: Give the numeric field a numeric type
amount_as_double = col("amount").cast("double")
structured_df = extracted_df.withColumn("amount", amount_as_double)

# Display the structured result
structured_df.show()

#### ❗2. Can you discuss your experience with performance tuning in big data frameworks like Spark or Hadoop?

In [0]:
# Orders stand in for the large side of the join.
orders_columns = ["order_id", "customer_id", "order_date", "amount"]
orders_data = [
    (1, 101, "2024-02-25", 500),
    (2, 102, "2024-02-24", 300),
    (3, 103, "2024-02-23", 700),
    (4, 104, "2024-02-22", 150),
]

# Customers stand in for the small lookup side of the join.
customers_columns = ["customer_id", "customer_name"]
customers_data = [
    (101, "Alice"),
    (102, "Bob"),
    (103, "Charlie"),
    (104, "David"),
]

# Materialise both datasets as DataFrames
orders_df = spark.createDataFrame(orders_data, orders_columns)
customers_df = spark.createDataFrame(customers_data, customers_columns)

# Step 1: Hint Spark to broadcast the small table so the join can avoid a shuffle
optimized_join_df = orders_df.join(
    broadcast(customers_df),
    on="customer_id",
)

# Step 2: Repartition orders by order_date.
# Note: partitioned_df is built here for illustration only; nothing below reads it.
partitioned_df = orders_df.repartition("order_date")

# Display the joined result
optimized_join_df.show()

#### ❗3. What strategies do you use to ensure data quality in your ETL processes?

In [0]:
# Input rows deliberately containing a missing age, an out-of-range age
# and an exact duplicate.
columns = ["id", "name", "age", "email"]
data = [
    (1, "Alice", 25, "alice@example.com"),
    (2, "Bob", None, "bob@example.com"),  # Missing age
    (3, "Charlie", 200, "charlie@example.com"),  # Invalid age
    (4, "David", 35, "david@example.com"),
    (4, "David", 35, "david@example.com"),  # Duplicate record
]

df = spark.createDataFrame(data, columns)

# Step 1: Drop rows whose age is missing
cleaned_df = df.where(col("age").isNotNull())

# Step 2: Null out ages outside the accepted 18-100 range
age_out_of_range = (col("age") < 18) | (col("age") > 100)
validated_df = cleaned_df.withColumn(
    "age",
    when(age_out_of_range, lit(None)).otherwise(col("age")),
)

# Step 3: Collapse fully identical rows
deduplicated_df = validated_df.dropDuplicates()

# Step 4: Flag rows by age validity.
# Rows with a missing age were already removed in Step 1, so any null age
# remaining here was nulled by the range check in Step 2.
quality_flag = when(col("age").isNotNull(), lit("Valid")).otherwise(lit("Invalid Age"))
flagged_df = deduplicated_df.withColumn("data_quality_flag", quality_flag)

# Display the quality-checked result
flagged_df.show()


#### ❗4. How do you design scalable data architectures that can handle increasing data volumes? Consider a simple example and implement it in practice.

In [0]:
# Raw JSON events, one document per row; only the purchase event has "amount".
json_data = [
    ('{"timestamp":"2024-02-25 12:00:01", "user":"Alice", "event":"login", "location":"US"}',),
    ('{"timestamp":"2024-02-25 12:05:10", "user":"Bob", "event":"purchase", "amount": 250.50, "location":"UK"}',),
    ('{"timestamp":"2024-02-25 12:10:20", "user":"Charlie", "event":"logout", "location":"IN"}',)
]
columns = ["raw_json"]

df = spark.createDataFrame(json_data, columns)

# Steps 1 + 2: Extract the event fields into named columns, then cast
# "amount" to double so it can be used in numeric analysis.
event_fields = ["timestamp", "user", "event", "amount", "location"]
structured_df = (
    df.select(json_tuple(col("raw_json"), *event_fields).alias(*event_fields))
    .withColumn("amount", col("amount").cast("double"))
)

# Step 3: Persist as Parquet, replacing any previous output at this path
parquet_path = "/mnt/data/scalable_data.parquet"
structured_df.write.mode("overwrite").parquet(parquet_path)

# Step 4: Load the data back from the Parquet location
optimized_df = spark.read.parquet(parquet_path)

# Display what was read back
optimized_df.show()

#### ❗5. How do you implement data lineage tracking in your data workflows?

In [0]:
# Sample Data (Simulating Raw Ingestion from a Source File)
raw_data = [
    (1, "Alice", 25, "alice@example.com"),
    (2, "Bob", 30, "bob@example.com"),
    (3, "Charlie", None, "charlie@example.com"),  # Missing age
]

columns = ["id", "name", "age", "email"]

# Creating DataFrame (Raw Data) and stamping every row with where it came
# from and when it was ingested.
raw_df = (
    spark.createDataFrame(raw_data, columns)
    .withColumn("source_file", lit("s3://data-lake/raw/customers.csv"))
    .withColumn("ingested_at", current_timestamp())
)

# Step 1: Data Cleaning (Remove NULLs).
# This starts the "transformation" history column with its first entry.
cleaned_df = (
    raw_df.filter(col("age").isNotNull())
    .withColumn("transformation", lit("Removed NULLs in age column"))
    .withColumn("processed_at", current_timestamp())
)

# Step 2: Masking Emails for Privacy (Data Masking).
# The email value is replaced with a constant placeholder (it is not hashed).
# The step description is APPENDED to the existing history rather than
# assigned over it; overwriting would erase the record of Step 1 and defeat
# the purpose of a lineage column.
masked_df = cleaned_df.withColumn("email", lit("[MASKED]")).withColumn(
    "transformation",
    concat_ws(" -> ", col("transformation"), lit("Masked email column")),
)

# Step 3: Select the business columns together with their lineage metadata
lineage_df = masked_df.select(
    "id", "name", "age", "email", "source_file", "ingested_at", "processed_at", "transformation"
)

# Show Data Lineage Table
lineage_df.show()

#### ❗6. What factors do you consider when choosing an ETL tool for a project?

In [0]:
# Rows standing in for a MySQL extract; one record has no age.
columns = ["id", "name", "age", "email"]
mysql_data = [
    (1, "Alice", 25, "alice@example.com"),
    (2, "Bob", None, "bob@example.com"),  # Missing age
    (3, "Charlie", 40, "charlie@example.com"),
]

# Extract: build the DataFrame from the sample rows
df = spark.createDataFrame(mysql_data, columns)

# Step 1 (Transform): keep only rows that have an age
has_age = col("age").isNotNull()
transformed_df = df.where(has_age)

# Step 2 (Load): write the result as Parquet, replacing earlier output
s3_path = "s3://abduldbtlearn/processed-data/customers.parquet"
(
    transformed_df.write
    .mode("overwrite")
    .parquet(s3_path)
)

print(f"✅ Data successfully saved to {s3_path}")


#### ❗7. How do you handle late-arriving data in your processing pipelines?

In [0]:
from pyspark.sql.types import *

# Event schema: three non-nullable string fields. event_time is loaded as
# text and parsed into a timestamp afterwards.
schema = (
    StructType()
    .add("event_id", StringType(), False)
    .add("event_type", StringType(), False)
    .add("event_time", StringType(), False)
)

# Sample events used to illustrate on-time vs. late arrival
event_data = [
    ("E101", "login", "2024-02-25 12:00:01"),  # On-time event
    ("E102", "purchase", "2024-02-25 12:00:10"),  # On-time event
    ("E103", "logout", "2024-02-25 12:05:00"),  # Late event (arriving after the expected window)
]

# Build the DataFrame, then:
#   Step 1 - parse event_time from string into a timestamp
#   Step 2 - stamp each row with the current processing time
df = (
    spark.createDataFrame(event_data, schema=schema)
    .withColumn("event_time", to_timestamp(col("event_time"), "yyyy-MM-dd HH:mm:ss"))
    .withColumn("processing_time", current_timestamp())
)

# Step 3: Count events per 5-minute event-time window and event type,
# with a 5-minute watermark declared on event_time.
# NOTE(review): df is a static (non-streaming) DataFrame here; the Structured
# Streaming guide describes withWatermark as ignored for batch queries, so this
# shows the API shape rather than real late-data handling - confirm.
five_minute_window = window(col("event_time"), "5 minutes")
windowed_df = (
    df.withWatermark("event_time", "5 minutes")
    .groupBy(five_minute_window, col("event_type"))
    .count()
)

# Display the windowed counts
windowed_df.show()

#### ❗8. Can you explain your approach to partitioning data in storage systems?

In [0]:
# Order rows; two of them share the same order_date.
columns = ["order_id", "customer_name", "order_date", "amount"]
orders_data = [
    (1, "Alice", "2024-02-25", 500),
    (2, "Bob", "2024-02-24", 300),
    (3, "Charlie", "2024-02-24", 700),
    (4, "David", "2024-02-23", 150),
]

df = spark.createDataFrame(orders_data, columns)

# Step 1: Write Parquet partitioned by order_date, replacing earlier output
partitioned_path = "s3://abduldbtlearn/partitioned-orders/"
(
    df.write
    .mode("overwrite")
    .partitionBy("order_date")
    .parquet(partitioned_path)
)

print(f"✅ Data successfully partitioned and saved to {partitioned_path}")

# Step 2: Load the partitioned dataset back from the same location
partitioned_df = spark.read.parquet(partitioned_path)

# Display what was read back
partitioned_df.show()

#### ❗9. How do you manage and utilize metadata in your data engineering projects?

In [0]:
from datetime import datetime 

# Sample Orders Data
orders_data = [
    (1, "Alice", "2024-02-25", 500),
    (2, "Bob", "2024-02-24", 300),
    (3, "Charlie", "2024-02-23", 700),
]

columns = ["order_id", "customer_name", "order_date", "amount"]

# Creating DataFrame
df = spark.createDataFrame(orders_data, columns)

# Step 1: Define Schema for Metadata.
# row_count is a LongType (64-bit): DataFrame.count() yields a 64-bit count,
# and a 32-bit IntegerType field would fail schema verification for any table
# with more than 2,147,483,647 rows.
metadata_schema = StructType([
    StructField("table_name", StringType(), False),
    StructField("schema", StringType(), False),
    StructField("row_count", LongType(), False),
    StructField("storage_location", StringType(), False),
    StructField("extracted_at", TimestampType(), False)  # Explicitly define timestamp type
])

# Step 2: Extract Metadata.
# The row below is a plain Python tuple, so it needs a concrete Python value
# for the timestamp; current_timestamp() is a Column expression and cannot be
# placed inside local row data, hence datetime.now().
metadata_values = [(
    "orders",
    str(df.schema),
    df.count(),
    "s3://your-bucket/orders.parquet",
    datetime.now()
)]

# Step 3: Create Metadata DataFrame
metadata_df = spark.createDataFrame(metadata_values, schema=metadata_schema)

# Step 4: Add Spark `current_timestamp()` as a column, evaluated when the query runs
metadata_df = metadata_df.withColumn("processed_at", current_timestamp())


display(metadata_df)


#### ❗10. Describe a scenario where you integrated data from multiple disparate sources. What challenges did you face, and how did you overcome them?

In [0]:
# Step 1: Extract Data from MySQL (Simulated)
mysql_data = [
    (1, "Alice", 25, "alice@example.com"),
    (2, "Bob", 30, "bob@example.com")
]
mysql_columns = ["id", "name", "age", "email"]
mysql_df = spark.createDataFrame(mysql_data, mysql_columns).withColumn("source", lit("MySQL"))

# Step 2: Read JSON Data from AWS S3 (Simulated)
s3_data = [
    (3, "Charlie", 28, "charlie@example.com"),
    (4, "David", 35, "david@example.com")
]
s3_columns = ["id", "name", "age", "email"]
s3_df = spark.createDataFrame(s3_data, s3_columns).withColumn("source", lit("AWS S3"))

# Step 3: Simulate Real-Time Kafka Events (User Logs)
kafka_data = [
    (5, "Eve", 27, "eve@example.com"),
    (6, "Frank", 32, "frank@example.com")
]
kafka_columns = ["id", "name", "age", "email"]
kafka_df = spark.createDataFrame(kafka_data, kafka_columns).withColumn("source", lit("Kafka"))

# Step 4: Merge All Data Sources into a Unified Dataset.
# unionByName matches columns by NAME. A plain union() matches by position,
# so a source that delivered the same columns in a different order would be
# merged without error but with values in the wrong columns - a real risk
# when combining independently produced feeds.
merged_df = mysql_df.unionByName(s3_df).unionByName(kafka_df)

# Step 5: Remove Duplicates Based on ID.
# The sample ids are all distinct, so no rows are dropped here.
cleaned_df = merged_df.dropDuplicates(["id"])

# Show Integrated Data
cleaned_df.show()
