In [0]:
%sql
-- Create the gold schema that a later cell writes gold.customer_analysis into.
-- IF NOT EXISTS keeps this cell re-runnable: without it a second run fails
-- because the schema already exists.
create schema if not exists gold

In [0]:
%sql
-- Make silver the current schema so the unqualified table names used in the
-- following cells (orders, order_items, products, ...) resolve against it.
USE SCHEMA silver

In [0]:
%sql
-- Bill per order: sum of Quantity * Discounted_Price over the order's items.
SELECT
    o.OrderID,
    SUM(ot.Quantity * p.Discounted_Price) AS Bill
FROM orders AS o
INNER JOIN order_items AS ot
    ON o.OrderID = ot.OrderID
INNER JOIN products AS p
    ON ot.ProductID = p.Product_ID
GROUP BY o.OrderID

In [0]:
%sql
-- Per-customer order behaviour: number of orders, average basket size and
-- average number of days between orders.
WITH customer_orders AS (
    -- One row per customer. The join yields one row per matching order_items
    -- row, so an order can appear several times; COUNT(DISTINCT ...) keeps the
    -- order count correct.
    SELECT
        o.CustomerID,
        COUNT(DISTINCT o.OrderID) AS NumOrders,
        SUM(oi.Quantity) AS BasketSize,
        MIN(o.OrderDate) AS FirstOrderDate,
        MAX(o.OrderDate) AS LastOrderDate
    FROM orders AS o
    INNER JOIN order_items AS oi
        ON o.OrderID = oi.OrderID
    GROUP BY o.CustomerID
)
SELECT
    co.CustomerID,
    c.Email,
    co.NumOrders,
    ROUND(co.BasketSize / co.NumOrders, 0) AS AvgBasketSize,
    -- Days between first and last order divided by the number of gaps between
    -- orders; NULLIF turns a zero divisor (single-order customer) into NULL.
    ROUND(
        DATEDIFF(co.LastOrderDate, co.FirstOrderDate) / NULLIF(co.NumOrders - 1, 0),
        0
    ) AS PurchaseFrequency
FROM customer_orders AS co
INNER JOIN customers AS c
    ON co.CustomerID = c.CustomerID
ORDER BY co.NumOrders DESC;


In [0]:
# Customer analysis built with the DataFrame API: per customer, the number of
# orders, the average basket size and the average days between orders.
# NOTE: this import rebinds Python's built-in sum/min/max/round to the Spark
# column functions for every cell that runs after it.
from pyspark.sql.functions import col, countDistinct, sum, min, max, datediff, round, when

# Load data (unqualified names resolve against the session's current schema)
orders = spark.table("orders")
order_items = spark.table("order_items")
customers = spark.table("customers")

# Compute Order Totals: one row per customer. The join yields one row per
# matching order_items row, so orders are counted with countDistinct.
order_totals = orders.join(order_items, orders.OrderID == order_items.OrderID) \
    .groupBy("CustomerID") \
    .agg(
        countDistinct("orders.OrderID").alias("NumOrders"),
        sum("order_items.Quantity").alias("BasketSize"),
        min("orders.OrderDate").alias("FirstOrderDate"),
        max("orders.OrderDate").alias("LastOrderDate")
    )

# Number of gaps between a customer's orders, or NULL when there are none.
# This is the DataFrame equivalent of NULLIF(NumOrders - 1, 0): a customer with
# a single order gets a NULL PurchaseFrequency instead of a division by zero
# (which raises an error when Spark's ANSI mode is enabled).
order_gaps = when(col("NumOrders") - 1 != 0, col("NumOrders") - 1)

# Compute Customer Analysis
customer_analysis = order_totals.join(customers, "CustomerID") \
    .select(
        col("CustomerID"),
        col("Email"),
        col("NumOrders"),
        round(col("BasketSize") / col("NumOrders"), 0).alias("AvgBasketSize"),
        round(datediff(col("LastOrderDate"), col("FirstOrderDate")) / order_gaps, 0).alias("PurchaseFrequency")
    )

# Sort and display results (customers with the most orders first)
display(customer_analysis.orderBy(col("NumOrders").desc()))

In [0]:
# Save the customer analysis as the Delta table gold.customer_analysis,
# overwriting whatever the table currently holds.
(
    customer_analysis.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("gold.customer_analysis")
)

In [0]:
%sql
-- Read back the table written by the notebook, listing its columns explicitly
-- so the result shape does not silently change if the table gains columns.
select
    CustomerID,
    Email,
    NumOrders,
    AvgBasketSize,
    PurchaseFrequency
from gold.customer_analysis
order by Email

In [0]:
%sql
-- Per-vendor order behaviour: number of orders, average items per order and
-- average number of days between orders.
WITH vendor_orders AS (
    -- One row per vendor. The join yields one row per matching Order_Items
    -- row, so an order can appear several times; COUNT(DISTINCT ...) keeps the
    -- order count correct.
    SELECT
        o.VendorID,
        COUNT(DISTINCT o.OrderID) AS NumOrders,
        SUM(oi.Quantity) AS TotalItems,
        MIN(o.OrderDate) AS FirstOrderDate,
        MAX(o.OrderDate) AS LastOrderDate
    FROM Orders AS o
    INNER JOIN Order_Items AS oi
        ON o.OrderID = oi.OrderID
    GROUP BY o.VendorID
)
SELECT
    vo.VendorID,
    v.VendorName,
    vo.NumOrders,
    vo.TotalItems / vo.NumOrders AS AvgItemsPerOrder,
    -- Days between first and last order divided by the number of gaps between
    -- orders; NULLIF turns a zero divisor (single-order vendor) into NULL.
    DATEDIFF(vo.LastOrderDate, vo.FirstOrderDate) / NULLIF(vo.NumOrders - 1, 0) AS OrderFrequency
FROM vendor_orders AS vo
INNER JOIN Vendors AS v
    ON vo.VendorID = v.VendorID
ORDER BY vo.NumOrders DESC;