In [0]:
######## Sales Over Time ########
## Analyze sales trends by month and year ##

# Spark's `sum` is imported under an alias so it does not shadow Python's
# builtin `sum` for the rest of the notebook session.
from pyspark.sql.functions import month, sum as spark_sum, year

# Load the source Delta table. `spark` is the session object supplied by the
# notebook runtime; it is not created in this cell.
df = spark.read.format("delta").load("/FileStore/delta/superstore_transformed")

# Derive Order_Year / Order_Month from Order_Date. withColumn replaces a column
# of the same name if one already exists, so both are always recomputed here.
df = (
    df.withColumn('Order_Year', year(df['Order_Date']))
      .withColumn('Order_Month', month(df['Order_Date']))
)

# Keep only rows that have both a sales amount and an order date.
# Later cells reuse this filtered `df`.
df = df.filter(df['Sales'].isNotNull() & df['Order_Date'].isNotNull())

# Total sales per (year, month), sorted chronologically for display.
monthly_sales = (
    df.groupBy('Order_Year', 'Order_Month')
      .agg(spark_sum('Sales').alias('Total_Sales'))
      .orderBy('Order_Year', 'Order_Month')
)

# Preview the result (show() prints only the first 20 rows by default).
monthly_sales.show()

# Persist the aggregate as a Delta table, replacing any previous contents.
monthly_sales.write.format("delta").mode("overwrite").save("/FileStore/delta/monthly_sales")



+----------+-----------+------------------+
|Order_Year|Order_Month|       Total_Sales|
+----------+-----------+------------------+
|      2015|          1|14130.160984992981|
|      2015|          2|4119.8159548044205|
|      2015|          3|55040.987458229065|
|      2015|          4| 27751.07089471817|
|      2015|          5| 23630.68287038803|
|      2015|          6| 34298.34748792648|
|      2015|          7|33336.022790551186|
|      2015|          8|26811.580191135406|
|      2015|          9| 81342.98330289125|
|      2015|         10| 31394.94074845314|
|      2015|         11| 77622.53452861309|
|      2015|         12| 68001.27428495884|
|      2016|          1|17977.997522592545|
|      2016|          2|11924.271917104721|
|      2016|          3|32234.358570575714|
|      2016|          4| 32599.74250805378|
|      2016|          5|29209.414824724197|
|      2016|          6|23461.765960991383|
|      2016|          7|   28377.822920084|
|      2016|          8| 36300.9

In [0]:
########### b. Top Products by Sales ###########
# Find the top 10 products by total sales.

# Spark's `sum` is imported under an alias so it does not shadow Python's
# builtin `sum` for the rest of the notebook session.
from pyspark.sql.functions import col, sum as spark_sum

# `df` is expected to exist already from an earlier cell of this notebook.
# Sum sales per product, rank from highest to lowest, keep the first ten.
top_products = (
    df.groupBy('Product_Name')
      .agg(spark_sum('Sales').alias('Total_Sales'))
      .orderBy(col('Total_Sales').desc())
      .limit(10)
)

# Preview the result. show() truncates long strings by default, so product
# names appear shortened in the printed table only; the data is unchanged.
top_products.show()

# Persist the ranking as a Delta table, replacing any previous contents.
top_products.write.format("delta").mode("overwrite").save("/FileStore/delta/top_products")



+--------------------+------------------+
|        Product_Name|       Total_Sales|
+--------------------+------------------+
|Canon imageCLASS ...|   61599.822265625|
|Fellowes PB500 El...|27453.384033203125|
|Cisco TelePresenc...|    22638.48046875|
|HON 5400 Series T...| 21870.57550048828|
|GBC DocuBind TL30...|   19823.478515625|
|GBC Ibimaster 500...|19024.500244140625|
|Hewlett Packard L...|18839.685913085938|
|"HP Designjet T52...|18374.895263671875|
|GBC DocuBind P400...| 17965.06787109375|
|High Speed Automa...|17030.311767578125|
+--------------------+------------------+



In [0]:
# Inspect the column names of the working DataFrame `df`.
df.columns

Out[14]: ['Row_ID',
 'Order_ID',
 'Order_Date',
 'Ship_Date',
 'Ship_Mode',
 'Customer_ID',
 'Customer_Name',
 'Segment',
 'Country',
 'City',
 'State',
 'Postal_Code',
 'Region',
 'Product_ID',
 'Category',
 'Sub-Category',
 'Product_Name',
 'Sales',
 'Order_Month',
 'Order_Year']

In [0]:
################ c. Regional Analysis ################

# Spark's `sum` is imported under an alias so it does not shadow Python's
# builtin `sum` for the rest of the notebook session. `col` is not used in
# this cell; its import is kept so the names available to later cells are
# unchanged.
from pyspark.sql.functions import col, sum as spark_sum

# `df` is expected to exist already from an earlier cell of this notebook.
# Total sales per region.
regional_performance = df.groupBy('Region').agg(
    spark_sum('Sales').alias('Total_Sales')
)

# Preview the result.
regional_performance.show()

# Persist the aggregate as a Delta table, replacing any previous contents.
regional_performance.write.format("delta").mode("overwrite").save("/FileStore/delta/regional_performance")



+-------+------------------+
| Region|       Total_Sales|
+-------+------------------+
|  South|386413.13934862614|
|Central|489321.39007872343|
|   East| 663043.8557248116|
|   West| 698354.7733091116|
+-------+------------------+



In [0]:
# Visualize monthly sales trends.
# `monthly_sales` is expected to exist already from the "Sales Over Time" cell.
# `display` is not imported in the visible cells; presumably it is the notebook
# runtime's built-in renderer (Databricks-style) — TODO confirm.
display(monthly_sales)


Order_Year,Order_Month,Total_Sales
2015,1,14130.16098499298
2015,2,4119.8159548044205
2015,3,55040.987458229065
2015,4,27751.07089471817
2015,5,23630.68287038803
2015,6,34298.34748792648
2015,7,33336.022790551186
2015,8,26811.580191135406
2015,9,81342.98330289125
2015,10,31394.94074845314
