In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, to_timestamp
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType, DateType

# =============================================================================
# SPARK SESSION INITIALIZATION
# =============================================================================

import os

# MinIO (S3-compatible) connection settings. Each value can be overridden with an
# environment variable so that real credentials never need to be committed with the
# notebook; the fallbacks are the values this notebook originally hard-coded.
minio_endpoint = os.environ.get("MINIO_ENDPOINT", "http://coms-minio:9000")
minio_access_key = os.environ.get("MINIO_ACCESS_KEY", "minio_user")
minio_secret_key = os.environ.get("MINIO_SECRET_KEY", "minio_password")

# Build (or reuse) the session against the standalone Spark master and configure the
# S3A connector so that `s3a://...` paths resolve to MinIO.
spark = (
    SparkSession.builder
    .appName("COMS_Project_Docker")
    .master("spark://coms-spark-master:7077")
    .config("spark.hadoop.fs.s3a.endpoint", minio_endpoint)
    .config("spark.hadoop.fs.s3a.access.key", minio_access_key)
    .config("spark.hadoop.fs.s3a.secret.key", minio_secret_key)
    # Path-style access (http://host/bucket/key) instead of virtual-hosted buckets
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    # Jars that provide the S3A filesystem and the AWS SDK it depends on
    .config(
        "spark.jars.packages",
        "org.apache.hadoop:hadoop-aws:3.3.6,"
        "org.apache.hadoop:hadoop-client:3.3.6,"
        "com.amazonaws:aws-java-sdk-bundle:1.12.367"
    )
    .getOrCreate()
)

print("SparkSession created and connected to MinIO!")

SparkSession created and connected to MinIO!


---
# Data Verification
Raw input files are stored as CSV files in the `raw` bucket (read via `s3a://raw/`).

In [20]:
# Location of the raw zone that holds the source CSV files
raw_base_path = "s3a://raw"


def _load_raw_csv(name):
    """Announce and read one raw CSV file, using its first row as the header."""
    print(f"Reading {name}...")
    return spark.read.csv(f"{raw_base_path}/{name}.csv", header=True)


# Load the four source files (no schema is supplied here)
raw_customers_df = _load_raw_csv("customers_csv")
raw_orders_df = _load_raw_csv("orders_csv")
raw_order_items_df = _load_raw_csv("order_items_csv")
raw_payments_df = _load_raw_csv("payments_csv")

print("--- Verification of Raw DataFrames ---")

# Print the schema and the first five rows of every raw DataFrame
for frame_name, frame in [
    ("raw_customers_df", raw_customers_df),
    ("raw_orders_df", raw_orders_df),
    ("raw_order_items_df", raw_order_items_df),
    ("raw_payments_df", raw_payments_df),
]:
    print(f"Schema and preview for '{frame_name}':")
    frame.printSchema()
    frame.show(5)

Reading customers_csv...
Reading orders_csv...
Reading order_items_csv...
Reading payments_csv...
--- Verification of Raw DataFrames ---
Schema and preview for 'raw_customers_df':
root
 |-- customer_id: string (nullable = true)
 |-- full_name: string (nullable = true)
 |-- email: string (nullable = true)
 |-- signup_date: string (nullable = true)
 |-- phone: string (nullable = true)
 |-- region: string (nullable = true)

+-----------+----------+--------------------+-----------+----------+------+
|customer_id| full_name|               email|signup_date|     phone|region|
+-----------+----------+--------------------+-----------+----------+------+
|   CUST1000|Customer 0|customer0@example...| 2025-04-12|0900770487| North|
|   CUST1001|Customer 1|customer1@example...| 2025-04-13|0900216739|  West|
|   CUST1002|Customer 2|customer2@example...| 2025-04-14|0900126225| North|
|   CUST1003|Customer 3|customer3@example...| 2025-04-15|0900877572| North|
|   CUST1004|Customer 4|customer4@example..

---
# Raw → Processed Zone
- Read and normalize CSVs into structured DataFrames.
- Convert all dates into consistent timestamp format.
- Deduplicate based on primary keys (e.g., order_id, order_item_id).
- Filter out invalid records:
  - Orders with total_amount <= 0
  - Payments with status = 'failed' or 'cancelled'

In [21]:
# =============================================================================
# DEFINE PATHS & SCHEMAS
# =============================================================================
raw_base_path = "s3a://raw"
processed_base_path = "s3a://processed"

# Explicit schemas for the raw CSV files. The first column of each schema is the
# key later used for de-duplication and is the only one declared non-nullable.

customers_schema = (
    StructType()
    .add("customer_id", StringType(), False)
    .add("full_name", StringType(), True)
    .add("email", StringType(), True)
    .add("signup_date", DateType(), True)
    .add("phone", StringType(), True)
    .add("region", StringType(), True)
)

orders_schema = (
    StructType()
    .add("order_id", StringType(), False)
    .add("customer_id", StringType(), True)
    .add("order_date", DateType(), True)
    .add("status", StringType(), True)
    .add("channel", StringType(), True)
    .add("total_amount", DoubleType(), True)
    .add("currency", StringType(), True)
)

order_items_schema = (
    StructType()
    .add("order_item_id", StringType(), False)
    .add("order_id", StringType(), True)
    .add("product_id", StringType(), True)
    .add("product_name", StringType(), True)
    .add("category", StringType(), True)
    .add("quantity", IntegerType(), True)
    .add("price_per_unit", DoubleType(), True)
    .add("discount", DoubleType(), True)
)

payments_schema = (
    StructType()
    .add("payment_id", StringType(), False)
    .add("order_id", StringType(), True)
    .add("payment_date", DateType(), True)
    .add("amount", DoubleType(), True)
    .add("payment_method", StringType(), True)
    .add("payment_status", StringType(), True)
)


# =============================================================================
# PRE-PROCESSING
# =============================================================================

# def process_table(table_name, schema, primary_key, date_columns=[], filter_condition=None):
#     """
#     Generic function to read, clean, and write a table.
#     """
#     try:
#         print(f"Processing table: {table_name}...")
        
#         # Read from raw zone
#         input_path = f"{raw_base_path}/{table_name}.csv"
#         df = spark.read.csv(input_path, header=True, schema=schema)
        
#         # Convert date columns to timestamp format
#         for date_col in date_columns:
#             df = df.withColumn(date_col, to_timestamp(col(date_col)))
        
#         # Apply filter condition if provided
#         if filter_condition is not None:
#             df = df.filter(filter_condition)
            
#         # Deduplicate based on primary key
#         df = df.dropDuplicates([primary_key])

#     except Exception as e:
#         print(f"Error processing table {table_name}: {e}")

def read_table(table_name, schema, primary_key, date_columns=None):
    """
    Read one raw CSV file into a cleaned, structured DataFrame.

    - Read and normalize CSVs into structured DataFrames.
    - Convert all dates into consistent timestamp format.
    - Deduplicate based on primary keys (e.g., order_id, order_item_id).

    Args:
        table_name: File name without extension; the file is read from
            ``{raw_base_path}/{table_name}.csv`` with a header row.
        schema: Schema handed to the CSV reader.
        primary_key: Column name, or a list/tuple of column names for a
            composite key, used to drop duplicate rows.
        date_columns: Optional iterable of column names to convert with
            ``to_timestamp``. ``None`` (the default) converts nothing.

    Returns:
        The resulting DataFrame, or ``None`` if an exception was raised while
        building it (the error is printed, not re-raised). Spark evaluates
        lazily, so problems that only surface when the data is actually
        scanned are not caught here.
    """
    try:
        print(f"Processing table: {table_name}...")

        # Read from raw zone
        input_path = f"{raw_base_path}/{table_name}.csv"
        df = spark.read.csv(input_path, header=True, schema=schema)

        # Convert date columns to timestamp format
        for date_col in date_columns or ():
            df = df.withColumn(date_col, to_timestamp(col(date_col)))

        # Deduplicate based on primary key (a single column or a composite key)
        if isinstance(primary_key, str):
            key_columns = [primary_key]
        else:
            key_columns = list(primary_key)
        df = df.dropDuplicates(key_columns)

        return df

    except Exception as e:
        print(f"Error processing table {table_name}: {e}")
        return None

def write_table(table_name, df):
    """
    Write a DataFrame to the processed zone as Parquet and print a preview.

    Args:
        table_name: Name of the output directory under ``processed_base_path``.
        df: DataFrame to persist. Any existing output at the target path is
            overwritten.

    Returns:
        None. Exceptions raised while writing or previewing are printed
        rather than re-raised.
    """
    try:
        print(f"Writing table: {table_name}...")

        # Write to processed zone in Parquet format, replacing any previous output
        output_path = f"{processed_base_path}/{table_name}"
        df.write.mode("overwrite").parquet(output_path)

        # The message names the format that is actually written (it used to say CSV)
        print(f"Successfully processed and saved '{table_name}' as Parquet to '{output_path}'.")

        # For verification, show the schema and a few rows
        df.printSchema()
        df.show(5, truncate=False)

    except Exception as e:
        print(f"Error writing table {table_name}: {e}")

# Each table is read and cleaned with read_table(), filtered where the requirements
# ask for it, and then written with write_table(). read_table() returns None on
# failure, so that is tested explicitly instead of relying on DataFrame truthiness.

### customers_csv
processed_customers_df = read_table(table_name="customers_csv",
                                    schema=customers_schema,
                                    primary_key="customer_id",
                                    date_columns=["signup_date"])
if processed_customers_df is not None:
    write_table("customers_csv", processed_customers_df)


### orders_csv
processed_orders_df = read_table(table_name="orders_csv",
                                 schema=orders_schema,
                                 primary_key="order_id",
                                 date_columns=["order_date"])
if processed_orders_df is not None:
    # Keep only orders with a positive total. Rows whose total_amount is NULL are
    # dropped too, because the predicate is not true for them.
    valid_orders_df = processed_orders_df.filter("total_amount > 0")
    write_table("orders_csv", valid_orders_df)

### order_items_csv
processed_order_items_df = read_table(table_name="order_items_csv",
                                      schema=order_items_schema,
                                      primary_key="order_item_id")
if processed_order_items_df is not None:
    write_table("order_items_csv", processed_order_items_df)

### payments_csv
processed_payments_df = read_table(table_name="payments_csv",
                                   schema=payments_schema,
                                   primary_key="payment_id",
                                   date_columns=["payment_date"])
if processed_payments_df is not None:
    # Remove failed and cancelled payments. The match is case-sensitive, and rows with
    # a NULL payment_status are dropped as well (the negated isin evaluates to NULL).
    valid_payments_df = processed_payments_df.filter(
        ~processed_payments_df.payment_status.isin(["failed", "cancelled"])
    )
    write_table("payments_csv", valid_payments_df)


# Process each table according to the requirements
# process_table(
#     table_name="customers_csv",
#     schema=customers_schema,
#     primary_key="customer_id",
#     date_columns=["signup_date"]
# )

# process_table(
#     table_name="orders_csv",
#     schema=orders_schema,
#     primary_key="order_id",
#     date_columns=["order_date"],
#     filter_condition="total_amount > 0"  # Filter out orders with total_amount <= 0
# )

# process_table(
#     table_name="order_items_csv",
#     schema=order_items_schema,
#     primary_key="order_item_id"
# )

# process_table(
#     table_name="payments_csv",
#     schema=payments_schema,
#     primary_key="payment_id",
#     date_columns=["payment_date"],
#     filter_condition=~col("payment_status").isin(["failed", "cancelled"]) # Filter out failed or cancelled payments
# )

Processing table: customers_csv...
Writing table: customers_csv...
Successfully processed and saved 'customers_csv' as CSV to 's3a://processed/customers_csv'.
root
 |-- customer_id: string (nullable = true)
 |-- full_name: string (nullable = true)
 |-- email: string (nullable = true)
 |-- signup_date: timestamp (nullable = true)
 |-- phone: string (nullable = true)
 |-- region: string (nullable = true)

+-----------+----------+---------------------+-------------------+----------+------+
|customer_id|full_name |email                |signup_date        |phone     |region|
+-----------+----------+---------------------+-------------------+----------+------+
|CUST1000   |Customer 0|customer0@example.com|2025-04-12 00:00:00|0900770487|North |
|CUST1001   |Customer 1|customer1@example.com|2025-04-13 00:00:00|0900216739|West  |
|CUST1002   |Customer 2|customer2@example.com|2025-04-14 00:00:00|0900126225|North |
|CUST1003   |Customer 3|customer3@example.com|2025-04-15 00:00:00|0900877572|North 

In [22]:
processed_base_path = "s3a://processed"

# Each processed table is a Parquet directory; pointing Spark at the directory
# is enough, it handles the part-files automatically.
customers_df, orders_df, order_items_df, payments_df = [
    spark.read.parquet(f"{processed_base_path}/{table}")
    for table in ("customers_csv", "orders_csv", "order_items_csv", "payments_csv")
]

# Data validation: print every schema. For orders and payments, show only the rows
# that the processing step was supposed to remove, so an empty table is the
# expected result for those two.
validation_views = [
    ("customers", customers_df, lambda df: df),
    ("orders", orders_df, lambda df: df.filter("total_amount <= 0")),
    ("order items", order_items_df, lambda df: df),
    ("payments", payments_df,
     lambda df: df.filter(df.payment_status.isin(["failed", "cancelled"]))),
]

for label, table_df, make_preview in validation_views:
    print(f"Loaded processed {label} data:")
    table_df.printSchema()
    make_preview(table_df).show()

Loaded processed customers data:
root
 |-- customer_id: string (nullable = true)
 |-- full_name: string (nullable = true)
 |-- email: string (nullable = true)
 |-- signup_date: timestamp (nullable = true)
 |-- phone: string (nullable = true)
 |-- region: string (nullable = true)

+-----------+----------+--------------------+-------------------+----------+------+
|customer_id| full_name|               email|        signup_date|     phone|region|
+-----------+----------+--------------------+-------------------+----------+------+
|   CUST1000|Customer 0|customer0@example...|2025-04-12 00:00:00|0900770487| North|
|   CUST1001|Customer 1|customer1@example...|2025-04-13 00:00:00|0900216739|  West|
|   CUST1002|Customer 2|customer2@example...|2025-04-14 00:00:00|0900126225| North|
|   CUST1003|Customer 3|customer3@example...|2025-04-15 00:00:00|0900877572| North|
|   CUST1004|Customer 4|customer4@example...|2025-04-16 00:00:00|0900388389| North|
|   CUST1005|Customer 5|customer5@example...|20

---
# Processed → Curated Zone
Generate the following curated datasets:

## `customer_orders_summary`
- Total number of orders per customer
- Total amount spent
- Average order value
- First and last order dates
- Customer active status (last order within 90 days)

## `order_facts`
- Join orders, items, payments
- Compute net revenue = (quantity × price - discount)
- Enrich with customer and region info

## `daily_sales_aggregates`
- Group by order_date, region, and channel
- Metrics:
  - Total sales
  - Order count
  - Unique customers
  - Most used payment method

In [23]:
from pyspark.sql.functions import (
    col, count, sum, avg, min, max, date_sub, current_date, when, lit,
    row_number, rank, first
)
from pyspark.sql.window import Window

# =============================================================================
# LOAD PROCESSED (SILVER) DATA
# =============================================================================

# Zone locations: input (processed) and output (curated)
processed_base_path = "s3a://processed"
curated_base_path = "s3a://curated"

# Load the four processed tables from their Parquet directories
customers_df, orders_df, order_items_df, payments_df = [
    spark.read.parquet(f"{processed_base_path}/{table}")
    for table in ("customers_csv", "orders_csv", "order_items_csv", "payments_csv")
]

print("All processed tables loaded.")


# =============================================================================
# CREATE customer_orders_summary
# =============================================================================
print("--- Creating customer_orders_summary ---")

# Per-customer order metrics. `sum`, `min` and `max` are the Spark SQL functions
# imported at the top of this cell, not the Python builtins.
order_metrics = [
    count("order_id").alias("total_orders"),
    sum("total_amount").alias("total_amount_spent"),
    avg("total_amount").alias("average_order_value"),
    min("order_date").alias("first_order_date"),
    max("order_date").alias("last_order_date"),
]
per_customer_orders = orders_df.groupBy("customer_id").agg(*order_metrics)

# A customer is "active" when the last order is no older than 90 days before today
activity_cutoff = date_sub(current_date(), 90)
is_active = col("last_order_date") >= activity_cutoff

customer_orders_summary = per_customer_orders.withColumn(
    "active_status",
    when(is_active, lit("active")).otherwise(lit("inactive")),
)

# Persist to the curated zone, replacing any previous result
summary_path = f"{curated_base_path}/customer_orders_summary"
customer_orders_summary.write.mode("overwrite").parquet(summary_path)

print("Created and saved customer_orders_summary.")
customer_orders_summary.show(5, truncate=False)


# =============================================================================
# CREATE order_facts
# =============================================================================
print("--- Creating order_facts ---")

# Attach every order item to its order, then to the ordering customer.
# Both joins are inner joins, so items without a matching order or customer are dropped.
# Payments are not part of this join.
items_with_orders = order_items_df.join(
    orders_df,
    order_items_df["order_id"] == orders_df["order_id"],
    "inner",
)
items_orders_customers = items_with_orders.join(
    customers_df,
    orders_df["customer_id"] == customers_df["customer_id"],
    "inner",
)

# net revenue = quantity x unit price - discount
net_revenue_expr = (col("quantity") * col("price_per_unit")) - col("discount")

# Output columns, each taken explicitly from its source table to avoid ambiguity
fact_columns = [
    orders_df["order_date"],
    orders_df["order_id"],
    order_items_df["order_item_id"],
    customers_df["customer_id"],
    customers_df["full_name"].alias("customer_name"),
    customers_df["region"],
    order_items_df["product_id"],
    order_items_df["product_name"],
    order_items_df["category"],
    order_items_df["quantity"],
    order_items_df["price_per_unit"],
    order_items_df["discount"],
    "net_revenue",
    orders_df["channel"],
]

order_facts = (
    items_orders_customers
    .withColumn("net_revenue", net_revenue_expr)
    .select(*fact_columns)
)

# Persist to the curated zone, one partition directory per order_date value
order_facts_path = f"{curated_base_path}/order_facts"
order_facts.write.mode("overwrite").partitionBy("order_date").parquet(order_facts_path)

print("Created and saved order_facts.")
order_facts.show(5, truncate=False)

All processed tables loaded.
--- Creating customer_orders_summary ---
Created and saved customer_orders_summary.
+-----------+------------+------------------+-------------------+-------------------+-------------------+-------------+
|customer_id|total_orders|total_amount_spent|average_order_value|first_order_date   |last_order_date    |active_status|
+-----------+------------+------------------+-------------------+-------------------+-------------------+-------------+
|CUST1004   |6           |1926.18           |321.03000000000003 |2025-03-26 00:00:00|2025-04-19 00:00:00|inactive     |
|CUST1009   |10          |2471.38           |247.138            |2025-03-23 00:00:00|2025-04-18 00:00:00|inactive     |
|CUST1006   |8           |1306.81           |163.35125          |2025-03-23 00:00:00|2025-04-18 00:00:00|inactive     |
|CUST1005   |1           |231.23            |231.23             |2025-04-06 00:00:00|2025-04-06 00:00:00|inactive     |
|CUST1007   |4           |1318.61           |32

In [24]:
from pyspark.sql.functions import countDistinct

# =============================================================================
# CREATE daily_sales_aggregates
# =============================================================================
print("--- Creating daily_sales_aggregates ---")

# Grouping keys shared by every aggregate in this cell
group_columns = ["order_date", "region", "channel"]

# Join the tables: one row per (order, payment), enriched with the customer's region.
# Both joins are inner joins, so orders without a customer or without a payment are excluded.
orders_payments_customers_df = orders_df.join(customers_df, "customer_id", "inner") \
                                        .join(payments_df, "order_id", "inner")

# Rank payment methods within each group. row_number() with payment_method as a
# tie-breaker selects exactly one method per group; rank() would return every tied
# method and thereby duplicate the group's row in the final join.
window_spec = Window.partitionBy(*group_columns).orderBy(
    col("payment_method_count").desc(),
    col("payment_method").asc()
)

# Count each payment method per group and keep the most frequent one
most_used_payment_method_df = orders_payments_customers_df.groupBy(*group_columns, "payment_method") \
    .count().withColumnRenamed("count", "payment_method_count") \
    .withColumn("rank", row_number().over(window_spec)) \
    .filter(col("rank") == 1) \
    .select(
        col("order_date").alias("mu_order_date"),
        col("region").alias("mu_region"),
        col("channel").alias("mu_channel"),
        col("payment_method").alias("most_used_payment_method")
    )

# Collapse to one row per order before summing: an order with several payments
# appears once per payment in the joined frame, which would otherwise add its
# total_amount to total_sales multiple times.
paid_orders_df = orders_payments_customers_df \
    .select(*group_columns, "order_id", "customer_id", "total_amount") \
    .dropDuplicates(["order_id"])

daily_aggregates_df = paid_orders_df.groupBy(*group_columns).agg(
    sum("total_amount").alias("total_sales"),
    countDistinct("order_id").alias("order_count"),
    countDistinct("customer_id").alias("unique_customers")
)

# Join the aggregates with the most used payment method. eqNullSafe treats NULL keys
# as equal, so groups with a NULL region or channel are not silently dropped.
daily_sales_aggregates = daily_aggregates_df.join(
    most_used_payment_method_df,
    daily_aggregates_df["order_date"].eqNullSafe(most_used_payment_method_df["mu_order_date"]) &
    daily_aggregates_df["region"].eqNullSafe(most_used_payment_method_df["mu_region"]) &
    daily_aggregates_df["channel"].eqNullSafe(most_used_payment_method_df["mu_channel"]),
    "inner"
).select(
    "order_date",
    "region",
    "channel",
    "total_sales",
    "order_count",
    "unique_customers",
    "most_used_payment_method"
)

# Write to the curated zone, one partition directory per order_date value
daily_sales_aggregates.write.mode("overwrite").partitionBy("order_date").parquet(f"{curated_base_path}/daily_sales_aggregates")

print("Created and saved daily_sales_aggregates.")
daily_sales_aggregates.show(10, truncate=False)

--- Creating daily_sales_aggregates ---
Created and saved daily_sales_aggregates.
+-------------------+------+-------+-----------+-----------+----------------+------------------------+
|order_date         |region|channel|total_sales|order_count|unique_customers|most_used_payment_method|
+-------------------+------+-------+-----------+-----------+----------------+------------------------+
|2025-04-14 00:00:00|North |online |90.67      |1          |1               |cash                    |
|2025-04-19 00:00:00|North |retail |74.13      |1          |1               |paypal                  |
|2025-04-06 00:00:00|North |retail |205.51     |1          |1               |credit_card             |
|2025-03-30 00:00:00|South |retail |80.01      |1          |1               |credit_card             |
|2025-04-03 00:00:00|South |mobile |168.84     |1          |1               |paypal                  |
|2025-04-12 00:00:00|South |retail |236.69     |1          |1               |paypal           

---
# Advanced Features (Optional)

- Use window functions to rank:
  - Top 3 customers by revenue in each region
  - First-time buyers this week
- Add alert for delayed payments (>2 days after order)
- Apply SCD Type 2 tracking on customer dimension

In [25]:
from pyspark.sql.functions import rank
from pyspark.sql.window import Window

print("--- Top 3 Customers by Revenue per Region ---")

# Sum net revenue per customer, using the order_facts DataFrame built earlier
customer_revenue = (
    order_facts
    .groupBy("customer_id", "customer_name", "region")
    .agg(sum("net_revenue").alias("total_revenue"))
)

# Within each region, order customers from highest to lowest revenue
window_spec = Window.partitionBy("region").orderBy(col("total_revenue").desc())

# Attach the in-region rank; rank() gives tied customers the same position
ranked_customers = customer_revenue.withColumn("rank", rank().over(window_spec))

# Keep ranks 1 to 3 per region
top_3_customers_by_region = ranked_customers.where(col("rank") <= 3)

print("Top 3 Customers by Revenue in Each Region:")
top_3_customers_by_region.show(truncate=False)

--- Top 3 Customers by Revenue per Region ---
Top 3 Customers by Revenue in Each Region:
+-----------+-------------+------+------------------+----+
|customer_id|customer_name|region|total_revenue     |rank|
+-----------+-------------+------+------------------+----+
|CUST1003   |Customer 3   |North |2296.3300000000004|1   |
|CUST1004   |Customer 4   |North |2209.3700000000003|2   |
|CUST1002   |Customer 2   |North |2141.23           |3   |
|CUST1008   |Customer 8   |South |2514.05           |1   |
|CUST1006   |Customer 6   |South |2156.1599999999994|2   |
|CUST1005   |Customer 5   |South |345.43            |3   |
|CUST1009   |Customer 9   |West  |3516.83           |1   |
|CUST1001   |Customer 1   |West  |2865.44           |2   |
+-----------+-------------+------+------------------+----+



In [26]:
from pyspark.sql.functions import date_sub, current_date

print("--- Identifying First-Time Buyers This Week ---")

# Find the first order date for every customer
# (`min` is the Spark SQL aggregate imported earlier, not the Python builtin)
first_order_dates = orders_df.groupBy("customer_id") \
    .agg(min("order_date").alias("first_order_date"))

# Define the start of the week (7 days ago) as a Spark column expression
start_of_week = date_sub(current_date(), 7)

# Evaluate the expression once to obtain a concrete date for the message below.
# Formatting the Column itself would print its repr,
# e.g. "Column<'date_sub(current_date(), 7)'>", instead of a date.
start_of_week_date = spark.range(1) \
    .select(start_of_week.alias("start_of_week")) \
    .first()["start_of_week"]

# Keep customers whose first order falls within this week
first_time_buyers_this_week = first_order_dates.filter(
    col("first_order_date") >= start_of_week
)

print(f"First-Time Buyers Since {start_of_week_date}:")
first_time_buyers_this_week.show()

--- Identifying First-Time Buyers This Week ---
First-Time Buyers Since Column<'date_sub(current_date(), 7)'>:
+-----------+----------------+
|customer_id|first_order_date|
+-----------+----------------+
+-----------+----------------+



In [27]:
from pyspark.sql.functions import datediff

print("--- Alert for Payments Delayed > 2 Days ---")

# Pair every payment with its order (orders without a payment drop out)
order_payment_dates = orders_df.join(payments_df, on="order_id", how="inner")

# Whole days elapsed between placing the order and paying for it
days_to_pay_expr = datediff(col("payment_date"), col("order_date"))
payment_delays = order_payment_dates.withColumn("days_to_pay", days_to_pay_expr)

# Keep payments made more than two days after the order, with the columns needed for the alert
alert_columns = ["order_id", "customer_id", "order_date", "payment_date", "days_to_pay"]
delayed_payment_alerts = (
    payment_delays
    .where(col("days_to_pay") > 2)
    .select(*alert_columns)
)

print("Alerts for Delayed Payments:")
delayed_payment_alerts.show()

--- Alert for Payments Delayed > 2 Days ---
Alerts for Delayed Payments:
+-----------+-----------+-------------------+-------------------+-----------+
|   order_id|customer_id|         order_date|       payment_date|days_to_pay|
+-----------+-----------+-------------------+-------------------+-----------+
|ORD11bdc76c|   CUST1006|2025-04-12 00:00:00|2025-04-15 00:00:00|          3|
|ORD1faef967|   CUST1009|2025-03-23 00:00:00|2025-03-26 00:00:00|          3|
|ORD3c63bceb|   CUST1001|2025-04-04 00:00:00|2025-04-07 00:00:00|          3|
|ORD469940d8|   CUST1008|2025-04-06 00:00:00|2025-04-09 00:00:00|          3|
|ORD5aa5e9d6|   CUST1009|2025-04-09 00:00:00|2025-04-12 00:00:00|          3|
|ORD75d04717|   CUST1008|2025-04-14 00:00:00|2025-04-17 00:00:00|          3|
|ORD9c9304e7|   CUST1000|2025-04-01 00:00:00|2025-04-04 00:00:00|          3|
|ORDdadea663|   CUST1006|2025-03-30 00:00:00|2025-04-02 00:00:00|          3|
+-----------+-----------+-------------------+-------------------+----

In [None]:
# Stop the SparkSession created at the top of the notebook once all processing is done.
spark.stop()