<h4>Monthly revenue trend for the entire dataset.</h4>

In [0]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window

In [0]:
# Read the Parquet data stored under the silver volume path; the analyses
# in the cells below all start from this DataFrame.
df_Silver = (
    spark.read
    .format("parquet")
    .load("/Volumes/workspace/default/abc/silver/")
)

In [0]:
# Show the column names of the loaded DataFrame as the cell output.
df_Silver.columns

In [0]:
# Revenue per (InvoiceYear, InvoiceMonth): sum TotalPrice within each pair,
# then order the rows by year and month before rendering them.
monthly_revenue_trend = (
    df_Silver
    .groupBy(F.col("InvoiceYear"), F.col("InvoiceMonth"))
    .agg(F.sum(F.col("TotalPrice")).alias("MonthlyRevenue"))
    .orderBy(F.col("InvoiceYear").asc(), F.col("InvoiceMonth").asc())
)
display(monthly_revenue_trend)

<h4>Most popular product (by Description) per month, ranked by quantity sold.</h4>

In [0]:
# Add a "yyyy-MM" key formatted from InvoiceDate so rows can be grouped by
# calendar month using a single column.
df_monthly = df_Silver.withColumn(
    "YearMonth",
    F.date_format(F.col("InvoiceDate"), "yyyy-MM"),
)


In [0]:
# Units sold for each (YearMonth, Description) pair.
monthly_product_sales = (
    df_monthly
    .groupBy(F.col("YearMonth"), F.col("Description"))
    .agg(F.sum(F.col("Quantity")).alias("QuantitySold"))
)

In [0]:
# Rank products inside each month by units sold, highest first.
window_spec = Window.partitionBy("YearMonth").orderBy(F.col("QuantitySold").desc())

# Keep only the top-ranked product(s) of every month.
# rank() assigns the same rank to equal QuantitySold values, so a month with
# a tie for first place contributes more than one row.
top_products = (
    monthly_product_sales
    .withColumn("rank", F.rank().over(window_spec))
    .filter(F.col("rank") == 1)
    .select("YearMonth", "Description", "QuantitySold")
    # Sort explicitly so the table has a deterministic order (by month, then
    # by product for tied months) instead of whatever order the window
    # partitions happen to be emitted in.
    .orderBy("YearMonth", "Description")
)
display(top_products)

<h4>Average order value per customer.</h4>

In [0]:
# Per-customer totals: summed TotalPrice and the number of distinct invoices.
customer_purchase_count = (
    df_Silver
    .groupBy("CustomerID")
    .agg(
        F.sum(F.col("TotalPrice")).alias("TotalSpend"),
        F.countDistinct(F.col("InvoiceNo")).alias("TotalPurchases"),
    )
)




In [0]:
# Average order value = total spend divided by the number of distinct
# invoices for the customer.
customer_average_order = customer_purchase_count.withColumn(
    "AverageOrderValue",
    F.col("TotalSpend") / F.col("TotalPurchases"),
)
# Render with display(), like the other result cells in this notebook,
# rather than the plain-text .show() output.
display(customer_average_order)

<h4>Identify products with declining sales trends month-over-month.</h4>

In [0]:
# Units sold per product per month; input to the month-over-month comparison.
monthly_sales = (
    df_monthly
    .groupBy(F.col("Description"), F.col("YearMonth"))
    .agg(F.sum(F.col("Quantity")).alias("TotalQuantity"))
)


In [0]:
# One window per product, rows ordered by ascending YearMonth, so a lag
# over this window looks at the product's preceding row.
windowSpec = (
    Window
    .partitionBy(F.col("Description"))
    .orderBy(F.col("YearMonth").asc())
)


In [0]:
# Attach the preceding row's quantity and the change relative to it.
# NOTE(review): lag() returns the previous *row* in the window, so if a
# product has no row for some month the comparison is made against the last
# month that does have one, not the calendar month before - confirm this is
# the intended definition of "month-over-month".
monthly_sales_change = (
    monthly_sales
    .withColumn("PrevMonthSales", F.lag(F.col("TotalQuantity")).over(windowSpec))
    # Subtracting a null yields null, so the first row of each product
    # (no previous row) gets a null MoM_Change without an explicit guard.
    .withColumn("MoM_Change", F.col("TotalQuantity") - F.col("PrevMonthSales"))
)


In [0]:
# Keep only the rows where the quantity dropped relative to the previous row;
# rows with a null MoM_Change do not satisfy the comparison and are excluded.
declining_sales = monthly_sales_change.where(F.col("MoM_Change") < 0)


In [0]:
# Show the first 20 declining rows.
# DataFrame.show() prints to stdout and returns None, so wrapping it in
# display() ended up calling display(None); pass a limited DataFrame to
# display() instead so the rows render as a table with untruncated values.
display(declining_sales.limit(20))

In [0]:
# Customer summary table: distinct invoice count, total spend, and the
# average order value rounded to 2 decimal places.
df_gold = (
    df_Silver
    .groupBy(F.col("CustomerID"))
    .agg(
        F.countDistinct(F.col("InvoiceNo")).alias("TotalOrders"),
        F.sum(F.col("TotalPrice")).alias("TotalSpend"),
    )
    .withColumn(
        "AvgOrderValue",
        F.round(F.col("TotalSpend") / F.col("TotalOrders"), 2),
    )
)


In [0]:
# Persist the customer summary as a Delta table at the gold path,
# replacing whatever was written there before.
gold_path = "/Volumes/workspace/default/abc/gold/customer_summary"
(
    df_gold.write
    .format("delta")
    .mode("overwrite")
    .save(gold_path)
)