In [0]:
%sql
-- Create the gold schema that the analysis tables in this notebook are saved into.
-- IF NOT EXISTS keeps this cell re-runnable once the schema has already been created.
CREATE SCHEMA IF NOT EXISTS gold

In [0]:
%sql
-- Make silver the current schema so that the unqualified table names used by
-- the queries in this notebook are resolved against it.
USE SCHEMA silver

In [0]:
# The customer_analysis DataFrame is only built further down in this notebook,
# so writing it from this cell would raise a NameError when the notebook is run
# top to bottom in a fresh session. The write to gold.customer_analysis is
# performed by the cell that follows the DataFrame's definition, so this cell
# intentionally contains no code.

In [0]:
%sql
-- Per-vendor ordering profile: number of orders, average items per order and
-- the average number of days between orders.
WITH vendor_totals AS (
    -- One row per vendor, aggregated over every order line of that vendor.
    SELECT
        ord.VendorID,
        COUNT(DISTINCT ord.OrderID) AS NumOrders,
        SUM(itm.Quantity)           AS TotalItems,
        MIN(ord.OrderDate)          AS FirstOrderDate,
        MAX(ord.OrderDate)          AS LastOrderDate
    FROM Orders AS ord
    INNER JOIN Order_Items AS itm
        ON itm.OrderID = ord.OrderID
    GROUP BY ord.VendorID
)
SELECT
    vt.VendorID,
    ven.VendorName,
    vt.NumOrders,
    vt.TotalItems / vt.NumOrders AS AvgItemsPerOrder,
    -- Days from first to last order divided by the number of gaps between
    -- orders; NULLIF turns a zero divisor (single-order vendor) into NULL.
    DATEDIFF(vt.LastOrderDate, vt.FirstOrderDate) / NULLIF(vt.NumOrders - 1, 0) AS OrderFrequency
FROM vendor_totals AS vt
INNER JOIN Vendors AS ven
    ON ven.VendorID = vt.VendorID
ORDER BY vt.NumOrders DESC;

In [0]:
%sql
-- Per-customer purchasing profile: order count, average basket size and value,
-- total purchase value and the average number of days between orders.
WITH customer_totals AS (
    -- One row per customer, aggregated over every order line of that customer.
    SELECT
        ord.CustomerID,
        COUNT(DISTINCT ord.OrderID)              AS NumOrders,
        SUM(itm.Quantity)                        AS BasketSize,
        MIN(ord.OrderDate)                       AS FirstOrderDate,
        MAX(ord.OrderDate)                       AS LastOrderDate,
        -- Line value = quantity times the product's discounted price.
        SUM(itm.Quantity * prd.Discounted_Price) AS TotalPurchaseValue
    FROM orders AS ord
    INNER JOIN order_items AS itm
        ON itm.OrderID = ord.OrderID
    INNER JOIN products AS prd
        ON prd.Product_ID = itm.ProductID
    GROUP BY ord.CustomerID
)
SELECT
    ct.CustomerID,
    cust.Email,
    ct.NumOrders,
    ROUND(ct.BasketSize / ct.NumOrders, 0) AS AvgBasketSize,
    -- NULLIF turns a zero divisor (single-order customer) into NULL.
    ROUND(DATEDIFF(ct.LastOrderDate, ct.FirstOrderDate) / NULLIF(ct.NumOrders - 1, 0), 0) AS PurchaseFrequency,
    ct.TotalPurchaseValue,
    ROUND(ct.TotalPurchaseValue / ct.NumOrders, 2) AS AvgBasketValue
FROM customer_totals AS ct
INNER JOIN customers AS cust
    ON cust.CustomerID = ct.CustomerID
ORDER BY ct.NumOrders DESC;


In [0]:
# NOTE: this import rebinds the Python built-ins sum, min, max and round to the
# Spark column functions of the same name for every cell that runs afterwards.
from pyspark.sql.functions import col, countDistinct, sum, min, max, datediff, round, expr

# Load data
orders = spark.table("orders")
order_items = spark.table("order_items")
customers = spark.table("customers")
products = spark.table("products")

# Compute Order Totals: one row per customer, aggregated over all order lines.
order_totals = (
    orders.join(order_items, orders.OrderID == order_items.OrderID)
    .join(products, order_items.ProductID == products.Product_ID)
    .groupBy("CustomerID")
    .agg(
        countDistinct("orders.OrderID").alias("NumOrders"),
        sum("order_items.Quantity").alias("BasketSize"),
        min("orders.OrderDate").alias("FirstOrderDate"),
        max("orders.OrderDate").alias("LastOrderDate"),
        # Line value = quantity times the product's discounted price.
        sum(order_items.Quantity * products.Discounted_Price).alias("TotalPurchaseValue"),
    )
)

# Compute Customer Analysis: attach the customer's email and derive the
# per-order averages.
customer_analysis = order_totals.join(customers, "CustomerID").select(
    col("CustomerID"),
    col("Email"),
    col("NumOrders"),
    round(col("BasketSize") / col("NumOrders"), 0).alias("AvgBasketSize"),
    # A customer with a single order has NumOrders - 1 == 0. NULLIF turns that
    # divisor into NULL, matching the SQL version of this analysis, so the
    # result is NULL instead of a divide-by-zero error when ANSI mode is on.
    round(
        datediff(col("LastOrderDate"), col("FirstOrderDate")) / expr("NULLIF(NumOrders - 1, 0)"),
        0,
    ).alias("PurchaseFrequency"),
    col("TotalPurchaseValue"),
    round(col("TotalPurchaseValue") / col("NumOrders"), 2).alias("AvgBasketValue"),
)

# Sort and display results
display(customer_analysis.orderBy(col("NumOrders").desc()))

In [0]:
# Save the customer metrics as a Delta table in the gold schema, overwriting
# whatever the table previously contained.
(
    customer_analysis.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("gold.customer_analysis")
)

In [0]:
%sql
-- Regional (state / city) summary: customers, orders, products sold and
-- returned, return rate, revenue, average delivery time and the vendor with
-- the highest revenue in each city.
WITH CityStateTotals AS (
    -- NOTE(review): assumes at most one returns row per order and one addresses
    -- row per customer; otherwise these joins repeat order lines and inflate
    -- the counts and sums below -- confirm against the table definitions.
    SELECT
        a.State,
        a.City,
        COUNT(DISTINCT o.CustomerID) AS TotalCustomers,
        COUNT(DISTINCT o.OrderID) AS TotalOrders,
        -- NOTE(review): this counts order lines, whereas TotalProductsReturned
        -- sums quantities -- confirm which unit is intended.
        COUNT(oi.ProductID) AS TotalProductsSold,
        -- Quantity on lines whose order has a matching returns row.
        SUM(CASE WHEN r.OrderID IS NOT NULL THEN oi.Quantity ELSE 0 END) AS TotalProductsReturned,
        -- Percentage of distinct orders that have a matching returns row.
        COUNT(DISTINCT r.OrderID) * 100.0 / COUNT(DISTINCT o.OrderID) AS ReturnRate,
        SUM(oi.Quantity * p.Discounted_Price) AS TotalRevenue,
        -- Days between shipping date and actual delivery date.
        AVG(DATEDIFF(o.ActualDeliveryDate, o.ShippingDate)) AS AvgDeliveryTime
    FROM
        orders o
        JOIN order_items oi ON o.OrderID = oi.OrderID
        LEFT JOIN returns r ON o.OrderID = r.OrderID
        JOIN products p ON oi.ProductID = p.Product_ID
        JOIN addresses a ON o.CustomerID = a.CustomerID
    GROUP BY
        a.State, a.City
),
TopVendorByCityState AS (
    SELECT
        a.State,
        a.City,
        v.VendorName,
        SUM(oi.Quantity * p.Discounted_Price) AS VendorRevenue,
        -- ROW_NUMBER (rather than RANK) gives exactly one vendor position 1 per
        -- city even when revenues tie, so the join below cannot duplicate a
        -- city's row. Ties are broken alphabetically by vendor name.
        ROW_NUMBER() OVER (
            PARTITION BY a.State, a.City
            ORDER BY SUM(oi.Quantity * p.Discounted_Price) DESC, v.VendorName
        ) AS RevenueRank
    FROM
        orders o
        JOIN order_items oi ON o.OrderID = oi.OrderID
        JOIN products p ON oi.ProductID = p.Product_ID
        JOIN vendors v ON o.VendorID = v.VendorID
        JOIN addresses a ON o.CustomerID = a.CustomerID
    GROUP BY
        a.State, a.City, v.VendorName
)
SELECT
    cst.State,
    cst.City,
    cst.TotalCustomers,
    cst.TotalOrders,
    cst.TotalProductsSold AS TotalProductsSold,
    cst.TotalProductsReturned,
    ROUND(cst.ReturnRate, 2) AS ReturnRate,
    cst.TotalRevenue,
    tvb.VendorName AS TopVendorByRevenue,
    ROUND(cst.AvgDeliveryTime, 2) AS AvgDeliveryTime
FROM
    CityStateTotals cst
    -- The position filter sits in the ON clause so that a city without any
    -- vendor row is still kept, with a NULL TopVendorByRevenue.
    LEFT JOIN TopVendorByCityState tvb
        ON cst.State = tvb.State AND cst.City = tvb.City AND tvb.RevenueRank = 1
ORDER BY
    cst.State, cst.City;


In [0]:

from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Load tables
orders = spark.table("orders")
order_items = spark.table("order_items")
returns = spark.table("returns")
products = spark.table("products")
addresses = spark.table("addresses")
vendors = spark.table("vendors")

# Compute CityStateTotals: one row per (State, City).
# NOTE(review): assumes at most one returns row per order and one addresses row
# per customer; otherwise the joins repeat order lines and inflate the counts
# and sums below -- confirm against the table definitions.
city_state_totals = (
    orders
    .join(order_items, "OrderID")
    .join(products, order_items["ProductID"] == products["Product_ID"])
    .join(addresses, "CustomerID")
    .join(returns, "OrderID", "left")
    .groupBy("State", "City")
    .agg(
        F.countDistinct("orders.CustomerID").alias("TotalCustomers"),
        F.countDistinct("orders.OrderID").alias("TotalOrders"),
        # NOTE(review): this counts order lines, whereas TotalProductsReturned
        # sums quantities -- confirm which unit is intended.
        F.count("order_items.ProductID").alias("TotalProductsSold"),
        # Quantity on lines whose order has a matching returns row.
        F.sum(F.when(returns["OrderID"].isNotNull(), order_items["Quantity"]).otherwise(0)).alias("TotalProductsReturned"),
        # Percentage of distinct orders that have a matching returns row.
        (F.countDistinct("returns.OrderID") * 100.0 / F.countDistinct("orders.OrderID")).alias("ReturnRate"),
        F.sum(order_items["Quantity"] * products["Discounted_Price"]).alias("TotalRevenue"),
        # Days between shipping date and actual delivery date.
        F.avg(F.datediff("orders.ActualDeliveryDate", "orders.ShippingDate")).alias("AvgDeliveryTime")
    )
)

# Compute TopVendorByCityState.
# row_number (rather than rank) gives exactly one vendor position 1 per city
# even when revenues tie, so the join into the final result cannot duplicate a
# city's row. Ties are broken alphabetically by vendor name.
window_spec = Window.partitionBy("State", "City").orderBy(
    F.col("VendorRevenue").desc(), F.col("VendorName").asc()
)
top_vendor_by_city_state = (
    orders
    .join(order_items, "OrderID")
    .join(products, order_items["ProductID"] == products["Product_ID"])
    .join(vendors, "VendorID")
    .join(addresses, "CustomerID")
    .groupBy("State", "City", "VendorName")
    .agg(F.sum(order_items["Quantity"] * products["Discounted_Price"]).alias("VendorRevenue"))
    .withColumn("Rank", F.row_number().over(window_spec))
    .filter(F.col("Rank") == 1)
)

# Join results: a left join keeps cities that have no vendor row, with a null
# TopVendorByRevenue.
final_result = (
    city_state_totals
    .join(top_vendor_by_city_state, ["State", "City"], "left")
    .select(
        "State",
        "City",
        "TotalCustomers",
        "TotalOrders",
        "TotalProductsSold",
        "TotalProductsReturned",
        F.round("ReturnRate", 2).alias("ReturnRate"),
        "TotalRevenue",
        F.col("VendorName").alias("TopVendorByRevenue"),
        F.round("AvgDeliveryTime", 2).alias("AvgDeliveryTime")
    )
    .orderBy("State", "City")
)

# Display the final result
display(final_result)

# Save as Delta table, overwriting the table's previous contents
final_result.write.format("delta").mode("overwrite").saveAsTable("gold.Regional_analysis")

In [0]:
%sql
-- Read back the saved regional analysis table.
SELECT *
FROM gold.Regional_analysis

In [0]:
%sql
-- Per-payment-method summary: orders paid, coupon and gift-card amounts, and
-- how many payments used a coupon or a gift card.
WITH PaymentTotals AS (
    SELECT
        pm.PaymentMethodID,
        pm.MethodName,
        COUNT(DISTINCT p.OrderID) AS TotalOrdersPaid,
        -- Each amount is coalesced before adding: NULL + x is NULL, so a row
        -- with only one of the two amounts populated would otherwise be
        -- dropped from this total while still counting in the sums below.
        SUM(COALESCE(p.CouponAmount, 0) + COALESCE(p.GiftCardAmount, 0)) AS TotalPaymentAmount,
        SUM(p.CouponAmount) AS TotalCouponAmount,
        SUM(p.GiftCardAmount) AS TotalGiftCardAmount
    FROM
        payment_methods pm
        JOIN payments p ON pm.PaymentMethodID = p.PaymentMethodID
    GROUP BY
        pm.PaymentMethodID, pm.MethodName
),
CouponUsage AS (
    SELECT
        pm.PaymentMethodID,
        -- COUNT skips NULLs, so only payments flagged 'Yes' are counted.
        COUNT(CASE WHEN p.CouponUsage = 'Yes' THEN 1 ELSE NULL END) AS TotalCouponUsage
    FROM
        payment_methods pm
        JOIN payments p ON pm.PaymentMethodID = p.PaymentMethodID
    GROUP BY
        pm.PaymentMethodID
),
GiftCardUsage AS (
    SELECT
        pm.PaymentMethodID,
        -- COUNT skips NULLs, so only payments flagged 'Yes' are counted.
        COUNT(CASE WHEN p.GiftCardUsage = 'Yes' THEN 1 ELSE NULL END) AS TotalGiftCardUsage
    FROM
        payment_methods pm
        JOIN payments p ON pm.PaymentMethodID = p.PaymentMethodID
    GROUP BY
        pm.PaymentMethodID
),
PaymentAnalysis AS (
    -- Attach the usage counts to the per-method totals.
    SELECT
        pt.PaymentMethodID,
        pt.MethodName,
        pt.TotalOrdersPaid,
        pt.TotalPaymentAmount,
        pt.TotalCouponAmount,
        pt.TotalGiftCardAmount,
        cu.TotalCouponUsage,
        gcu.TotalGiftCardUsage
    FROM
        PaymentTotals pt
        LEFT JOIN CouponUsage cu ON pt.PaymentMethodID = cu.PaymentMethodID
        LEFT JOIN GiftCardUsage gcu ON pt.PaymentMethodID = gcu.PaymentMethodID
)
SELECT
    PaymentMethodID,
    MethodName,
    TotalOrdersPaid,
    TotalPaymentAmount,
    COALESCE(TotalCouponUsage, 0) AS TotalCouponUsed,
    TotalCouponAmount,
    COALESCE(TotalGiftCardUsage, 0) AS TotalGiftCardUsed,
    TotalGiftCardAmount
FROM
    PaymentAnalysis
ORDER BY
    TotalOrdersPaid DESC;


In [0]:
# Load tables
payment_methods = spark.table("payment_methods")
payments = spark.table("payments")

# Compute PaymentTotals: one row per payment method.
payment_totals = (
    payment_methods
    .join(payments, "PaymentMethodID")
    .groupBy("PaymentMethodID", "MethodName")
    .agg(
        F.countDistinct("OrderID").alias("TotalOrdersPaid"),
        # Each amount is coalesced before adding: null + x is null, so a row
        # with only one of the two amounts populated would otherwise be dropped
        # from this total while still counting in the two sums below.
        F.sum(
            F.coalesce(F.col("CouponAmount"), F.lit(0))
            + F.coalesce(F.col("GiftCardAmount"), F.lit(0))
        ).alias("TotalPaymentAmount"),
        F.sum("CouponAmount").alias("TotalCouponAmount"),
        F.sum("GiftCardAmount").alias("TotalGiftCardAmount")
    )
)

# Compute CouponUsage: when() without otherwise() yields null for non-matching
# rows and count() skips nulls, so only payments flagged 'Yes' are counted.
coupon_usage = (
    payment_methods
    .join(payments, "PaymentMethodID")
    .groupBy("PaymentMethodID")
    .agg(
        F.count(F.when(F.col("CouponUsage") == 'Yes', 1)).alias("TotalCouponUsage")
    )
)

# Compute GiftCardUsage: same counting approach as for coupons.
gift_card_usage = (
    payment_methods
    .join(payments, "PaymentMethodID")
    .groupBy("PaymentMethodID")
    .agg(
        F.count(F.when(F.col("GiftCardUsage") == 'Yes', 1)).alias("TotalGiftCardUsage")
    )
)

# Compute PaymentAnalysis: attach the usage counts to the per-method totals.
payment_analysis = (
    payment_totals
    .join(coupon_usage, "PaymentMethodID", "left")
    .join(gift_card_usage, "PaymentMethodID", "left")
    .select(
        "PaymentMethodID",
        "MethodName",
        "TotalOrdersPaid",
        "TotalPaymentAmount",
        "TotalCouponAmount",
        "TotalGiftCardAmount",
        F.coalesce("TotalCouponUsage", F.lit(0)).alias("TotalCouponUsed"),
        F.coalesce("TotalGiftCardUsage", F.lit(0)).alias("TotalGiftCardUsed")
    )
    .orderBy(F.col("TotalOrdersPaid").desc())
)

# Display the final result
display(payment_analysis)

# Save as Delta table, overwriting the table's previous contents
payment_analysis.write.format("delta").mode("overwrite").saveAsTable("gold.payment_analysis")

In [0]:
%sql
-- Read back the saved payment analysis table.
SELECT *
FROM gold.payment_analysis