E-Commerce Transactions + Returns + Inventory

In [2]:
from pyspark.sql import SparkSession
spark = SparkSession.builder \
                    .appName("E_Commerce_Transactions") \
                    .getOrCreate()
spark

#### PySpark + Delta

Loading Data

In [3]:
order_df = spark.read.format("csv") \
                     .option("inferschema", "true") \
                     .option("header", "true") \
                     .load("/content/orders.csv")

customer_df = spark.read.format("csv") \
                     .option("inferschema", "true") \
                     .option("header", "true") \
                     .load("/content/customers.csv")

product_df = spark.read.format("csv") \
                     .option("inferschema", "true") \
                     .option("header", "true") \
                     .load("/content/products.csv")


Removing tailing space

In [4]:
def strip_column_names(df):
    cleaned_columns = []
    for col_name in df.columns:
        cleaned_columns.append(col_name.strip())
    return df.toDF(*cleaned_columns)

order_df = strip_column_names(order_df)
customer_df = strip_column_names(customer_df)
product_df = strip_column_names(product_df)

1. Ingest all 3 CSVs as Delta Tables

In [5]:
spark.sql("CREATE DATABASE IF NOT EXISTS ECommerce")
spark.sql("USE ECommerce")

DataFrame[]

In [None]:
order_df.write.format("delta") \
              .mode("overwrite") \
              .saveAsTable("ECommerce.orders")

customer_df.write.format("delta") \
              .mode("overwrite") \
              .saveAsTable("ECommerce.customers")

product_df.write.format("delta") \
              .mode("overwrite") \
              .saveAsTable("ECommerce.products")

2.  Write SQL to get the total revenue per Product.

In [7]:
order_df.createOrReplaceTempView("orders")
customer_df.createOrReplaceTempView("customers")
product_df.createOrReplaceTempView("products")

In [8]:
spark.sql("""
            SELECT
                ProductID,
                SUM(Quantity * Price) AS TotalRevenue
            FROM orders
            GROUP BY ProductID
          """).show()

+---------+------------+
|ProductID|TotalRevenue|
+---------+------------+
|    P1001|       75000|
|    P1002|      150000|
|    P1004|       30000|
|    P1003|       30000|
+---------+------------+



3. Join Orders + Customers to find revenue by Region.

In [9]:
from pyspark.sql.functions import col, sum
order_df = order_df.join(customer_df, on=customer_df.CustomerID == order_df.CustomerID, how="inner") \
                   .drop(customer_df.CustomerID)

order_df.show()

+-------+----------+---------+--------+-----+----------+----------+------------+------+-------------------+
|OrderID|CustomerID|ProductID|Quantity|Price| OrderDate|    Status|CustomerName|Region|         SignupDate|
+-------+----------+---------+--------+-----+----------+----------+------------+------+-------------------+
|   3001|      C001|    P1001|       1|75000|2024-05-01|Delivered |        Amit| North|2023-11-12 00:00:00|
|   3002|      C002|    P1002|       2|50000|2024-05-02| Returned |        Sara| South|2024-01-08 00:00:00|
|   3003|      C003|    P1003|       1|30000|2024-05-03|Delivered |        John|  West|2023-06-20 00:00:00|
|   3004|      C001|    P1002|       1|50000|2024-05-04|Delivered |        Amit| North|2023-11-12 00:00:00|
|   3005|      C004|    P1004|       3|10000|2024-05-05|   Pending|       Priya|  East|2024-03-15 00:00:00|
+-------+----------+---------+--------+-----+----------+----------+------------+------+-------------------+



In [10]:
order_df.groupBy('Region') \
         .agg(sum(col('Price') * col('Quantity')).alias('TotalRevenue')) \
         .show()

+------+------------+
|Region|TotalRevenue|
+------+------------+
| South|      100000|
|  East|       30000|
|  West|       30000|
| North|      125000|
+------+------------+



4. Update the Status of Pending orders to 'Cancelled'.

In [11]:
from pyspark.sql.functions import when
order_df = order_df.withColumn('Status',
                               when(col('Status') == 'Pending', 'Cancelled')
                               .otherwise(col('Status')))

order_df.select(
                    'CustomerID',
                    'OrderID',
                    'ProductID',
                    'OrderDate',
                    'Status'
).show()

+----------+-------+---------+----------+----------+
|CustomerID|OrderID|ProductID| OrderDate|    Status|
+----------+-------+---------+----------+----------+
|      C001|   3001|    P1001|2024-05-01|Delivered |
|      C002|   3002|    P1002|2024-05-02| Returned |
|      C003|   3003|    P1003|2024-05-03|Delivered |
|      C001|   3004|    P1002|2024-05-04|Delivered |
|      C004|   3005|    P1004|2024-05-05| Cancelled|
+----------+-------+---------+----------+----------+



#### DLT Pipeline

6. Create raw → cleaned → aggregated tables: \
 Clean: Remove rows with NULLs \
 Aggregated: Total revenue per Category

In [None]:
import dlt
from pyspark.sql.functions import col, sum as _sum

# loading raw data
@dlt.table(
  comment="Raw orders"
)

def raw_orders():
    return spark.read.format("delta").load("shared.default.orders")

# cleaning the data
@dlt.table(
  comment="Cleaned orders with no NULLs"
)
def cleaned_orders():
    return dlt.read("raw_orders").na.drop()

#total revenue per category
@dlt.table(
  comment="Total revenue per product category"
)
def revenue_per_category():
    orders = dlt.read("cleaned_orders")

    return orders.groupBy("Category") \
                 .agg(_sum(col("Quantity") * col("Price")).alias("TotalRevenue"))


#### Time Travel

7. View data before the Status update.


In [None]:
previous_version_df = spark.read.format("delta") \
    .option("versionAsOf", 0) \
    .table("shared.default.orders")

previous_version_df.show()

+-------+----------+---------+--------+-----+----------+----------+
|OrderID|CustomerID|ProductID|Quantity|Price| OrderDate|    Status|
+-------+----------+---------+--------+-----+----------+----------+
|   3001|      C001|    P1001|       1|75000|2024-05-01|Delivered |
|   3002|      C002|    P1002|       2|50000|2024-05-02| Returned |
|   3003|      C003|    P1003|       1|30000|2024-05-03|Delivered |
|   3004|      C001|    P1002|       1|50000|2024-05-04|Delivered |
|   3005|      C004|    P1004|       3|10000|2024-05-05|   Pending|
+-------+----------+---------+--------+-----+----------+----------+



In [None]:
from delta.tables import DeltaTable

delta_table_path = "/mnt/delta/shared/default/orders"

delta_table = DeltaTable.forPath(spark, delta_table_path)

delta_table.restoreToVersion(0)

#### Vacuum + Retention


9. Run VACUUM after changing default retention.

In [None]:
# Disable retention check
spark.sql("SET spark.databricks.delta.retentionDurationCheck.enabled = false")

# Run VACUUM with 1 hour retention (or any value you want)
spark.sql("VACUUM shared.default.orders RETAIN 1 HOURS")

# Re-enable retention check
spark.sql("SET spark.databricks.delta.retentionDurationCheck.enabled = true")

Expectations

10.
Quantity > 0 ,
price > O
orderDate is not null

In [None]:

order_df.filter(
                (col('Quantity') > 0) &
                (col('Price') > 0) &
                (col('OrderDate').isNotNull())
).show()

+-------+----------+---------+--------+-----+----------+----------+------------+------+-------------------+
|OrderID|CustomerID|ProductID|Quantity|Price| OrderDate|    Status|CustomerName|Region|         SignupDate|
+-------+----------+---------+--------+-----+----------+----------+------------+------+-------------------+
|   3001|      C001|    P1001|       1|75000|2024-05-01|Delivered |        Amit| North|2023-11-12 00:00:00|
|   3002|      C002|    P1002|       2|50000|2024-05-02| Returned |        Sara| South|2024-01-08 00:00:00|
|   3003|      C003|    P1003|       1|30000|2024-05-03|Delivered |        John|  West|2023-06-20 00:00:00|
|   3004|      C001|    P1002|       1|50000|2024-05-04|Delivered |        Amit| North|2023-11-12 00:00:00|
|   3005|      C004|    P1004|       3|10000|2024-05-05| Cancelled|       Priya|  East|2024-03-15 00:00:00|
+-------+----------+---------+--------+-----+----------+----------+------------+------+-------------------+



#### Bonus

11. Use when-otherwise to create a new column: \
Order Type == 'Return'
if Status = 'Returned'

In [None]:
order_df = order_df.withColumn('OrderType',
                                when(col('Status') == 'Returned', 'Return')
                                .otherwise(col('Status'))
)

order_df.select(
                    'OrderID',
                    'CustomerID',
                    'ProductID',
                    'Quantity',
                    'Price',
                    'OrderDate',
                    'Status',
                    'OrderType'
).show()

+-------+----------+---------+--------+-----+----------+----------+----------+
|OrderID|CustomerID|ProductID|Quantity|Price| OrderDate|    Status| OrderType|
+-------+----------+---------+--------+-----+----------+----------+----------+
|   3001|      C001|    P1001|       1|75000|2024-05-01|Delivered |Delivered |
|   3002|      C002|    P1002|       2|50000|2024-05-02| Returned | Returned |
|   3003|      C003|    P1003|       1|30000|2024-05-03|Delivered |Delivered |
|   3004|      C001|    P1002|       1|50000|2024-05-04|Delivered |Delivered |
|   3005|      C004|    P1004|       3|10000|2024-05-05| Cancelled| Cancelled|
+-------+----------+---------+--------+-----+----------+----------+----------+

