In [0]:
# Load the dataset into a DataFrame
df = spark.read.format("delta").load("/FileStore/delta/superstore_transformed")

# Display schema to understand the data structure
df.printSchema()



root
 |-- Row_ID: integer (nullable = true)
 |-- Order_ID: string (nullable = true)
 |-- Order_Date: date (nullable = true)
 |-- Ship_Date: date (nullable = true)
 |-- Ship_Mode: string (nullable = true)
 |-- Customer_ID: string (nullable = true)
 |-- Customer_Name: string (nullable = true)
 |-- Segment: string (nullable = true)
 |-- Country: string (nullable = true)
 |-- City: string (nullable = true)
 |-- State: string (nullable = true)
 |-- Postal_Code: integer (nullable = true)
 |-- Region: string (nullable = true)
 |-- Product_ID: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Sub-Category: string (nullable = true)
 |-- Product_Name: string (nullable = true)
 |-- Sales: float (nullable = true)
 |-- Order_Month: integer (nullable = true)
 |-- Order_Year: integer (nullable = true)



In [0]:
####Aggregating Total Sales by Product Category####

from pyspark.sql.functions import sum

# Aggregating total sales by category
total_sales_by_category = df.groupBy("Category").agg(
    sum("Sales").alias("Total_Sales")
)

# Show the aggregated result
total_sales_by_category.show()

# Storing aggregated data (e.g., total sales by category) in a Delta table
total_sales_by_category.write.format("delta").mode("overwrite").save("/FileStore/delta/aggregated_sales_by_category")



+---------------+-----------------+
|       Category|      Total_Sales|
+---------------+-----------------+
|Office Supplies|690139.7993411422|
|      Furniture|719791.4541658163|
|     Technology|827201.9049543142|
+---------------+-----------------+



In [0]:
######Aggregating Total Sales by Region#######

# Aggregating total sales by region
total_sales_by_region = df.groupBy("Region").agg(
    sum("Sales").alias("Total_Sales")
)

# Show the aggregated result
total_sales_by_region.show()

# Storing aggregated data (e.g., total sales by region) in a Delta table
total_sales_by_region.write.format("delta").mode("overwrite").save("/FileStore/delta/total_sales_by_region")


+-------+------------------+
| Region|       Total_Sales|
+-------+------------------+
|  South|386413.13934862614|
|Central|489321.39007872343|
|   East| 663043.8557248116|
|   West| 698354.7733091116|
+-------+------------------+



In [0]:
######Average Sales by Customer Segment#####

from pyspark.sql.functions import avg

# Aggregating average sales by segment
average_sales_by_segment = df.groupBy("Segment").agg(
    avg("Sales").alias("Average_Sales")
)

# Show the result
average_sales_by_segment.show()


# Storing aggregated data (e.g., average sales by segment) in a Delta table
average_sales_by_segment.write.format("delta").mode("overwrite").save("/FileStore/delta/average_sales_by_segment")




+-----------+------------------+
|    Segment|     Average_Sales|
+-----------+------------------+
|   Consumer| 229.2131426626457|
|Home Office|248.55639851867377|
|  Corporate| 237.9783451178598|
+-----------+------------------+



In [0]:
########Count of Orders per Customer#####
from pyspark.sql.functions import count
# Aggregating count of orders by customer
orders_per_customer = df.groupBy("Customer_ID").agg(
    count("Order_ID").alias("Order_Count")
)

# Show the result
orders_per_customer.show()


# Storing aggregated data (e.g., orders_per_customer) in a Delta table
orders_per_customer.write.format("delta").mode("overwrite").save("/FileStore/delta/orders_per_customer")


+-----------+-----------+
|Customer_ID|Order_Count|
+-----------+-----------+
|   VW-21775|         18|
|   RR-19315|          4|
|   PB-19210|          2|
|   MY-17380|         13|
|   MS-17530|          7|
|   EM-13960|          6|
|   AH-10690|         23|
|   SW-20275|          7|
|   KH-16630|         17|
|   BD-11500|         10|
|   JF-15490|         14|
|   PH-18790|          2|
|   JF-15415|         13|
|   PW-19240|         12|
|   IM-15070|         21|
|   KM-16225|         19|
|   NW-18400|         22|
|   KF-16285|         18|
|   JH-15985|         14|
|   OT-18730|         10|
+-----------+-----------+
only showing top 20 rows



In [0]:
##########Multiple Aggregations (e.g., Total Sales, Average Sales)##########


# Aggregating both total and average sales by category
sales_by_category = df.groupBy("Category").agg(
    sum("Sales").alias("Total_Sales"),
    avg("Sales").alias("Average_Sales")
)

# Show the result
sales_by_category.show()

# Storing aggregated data (e.g., sales_by_category) in a Delta table
sales_by_category.write.format("delta").mode("overwrite").save("/FileStore/delta/sales_by_category")

+---------------+-----------------+------------------+
|       Category|      Total_Sales|     Average_Sales|
+---------------+-----------------+------------------+
|Office Supplies|690139.7993411422| 121.7177776615771|
|      Furniture|719791.4541658163|354.05383874363815|
|     Technology|827201.9049543142|458.28360385280564|
+---------------+-----------------+------------------+



In [0]:
###############Grouping and Aggregating Using SQL###########

# Register the DataFrame as a temporary view for SQL queries
df.createOrReplaceTempView("superstore_view")

# Run SQL query to aggregate total sales by category
Group_aggregate_result = spark.sql("""
    SELECT Category, SUM(Sales) AS Total_Sales
    FROM superstore_view
    GROUP BY Category
""")

# Show the result
Group_aggregate_result.show()

Group_aggregate_result.write.format("delta").mode("overwrite").save("/FileStore/delta/Group_aggregate_result")


+---------------+-----------------+
|       Category|      Total_Sales|
+---------------+-----------------+
|Office Supplies|690139.7993411422|
|      Furniture|719791.4541658163|
|     Technology|827201.9049543142|
+---------------+-----------------+

