Part 1: Environment Setup

In [None]:
from pyspark.sql import SparkSession
# Name the application on the builder, then obtain the session from it.
session_builder = SparkSession.builder.appName("ProductSalesAnalysis")
spark = session_builder.getOrCreate()
# Leaving the session as the last expression displays it as the cell output.
spark

Part 2: Load Sales Data from CSV

In [None]:
# Inline sample of eight orders; the first row is the column header.
csv_data = """OrderID,Product,Category,Quantity,UnitPrice,Region
1001,Mobile,Electronics,2,15000,North
1002,Laptop,Electronics,1,55000,South
1003,T-Shirt,Apparel,3,500,East
1004,Jeans,Apparel,2,1200,North
1005,TV,Electronics,1,40000,West
1006,Shoes,Footwear,4,2000,South
1007,Watch,Accessories,2,3000,East
1008,Headphones,Electronics,3,2500,North
"""

# Write the sample to disk so Spark can read it back as a CSV file.
with open('sales.csv', 'w') as csv_file:
    csv_file.write(csv_data)

# Treat the first line as column names and let Spark infer the column types.
csv_reader = spark.read.option("header", True).option("inferSchema", True)
df = csv_reader.csv('sales.csv')

# Preview the first five rows, then print the inferred schema.
df.show(5)
df.printSchema()

Part 3: Business Questions
1. Add a new column TotalPrice = Quantity × UnitPrice

In [None]:
# Per-order total: quantity multiplied by unit price.
line_total = df.Quantity * df.UnitPrice
df = df.withColumn("TotalPrice", line_total)
df.show()

2. Total revenue generated across all regions.

In [None]:
from pyspark.sql.functions import sum as _sum
# Sum TotalPrice over the whole DataFrame (an empty groupBy means no grouping keys).
revenue_expr = _sum("TotalPrice").alias("TotalRevenue")
total_revenue = df.groupBy().agg(revenue_expr)
total_revenue.show()

3. Category-wise revenue sorted in descending order.

In [None]:
# Revenue per category, largest first. "sum(TotalPrice)" is the column name
# produced by the grouped sum, so the sort refers to it by that name.
category_df = (
    df.groupBy("Category")
    .sum("TotalPrice")
    .sort("sum(TotalPrice)", ascending=False)
)
category_df.show()


4. Region with the highest number of orders

In [None]:
# Count orders per region, then rank the regions from most to fewest orders.
orders_per_region = df.groupBy("Region").count()
reg_df = orders_per_region.sort("count", ascending=False)
# Only the top row is printed; any region tied for first place is not shown.
reg_df.show(1)


5. Average Unit Price per Category

In [None]:
# Mean UnitPrice within each product category.
orders_by_category = df.groupBy("Category")
avg = orders_by_category.avg("UnitPrice")
avg.show()


6. All orders where TotalPrice is more than 30,000

In [None]:
# Keep only the orders whose TotalPrice is strictly greater than 30000.
exceeds_30000 = df.TotalPrice > 30000
df.where(exceeds_30000).show()


Part 4: Data Transformations
1. Create a new column HighValueOrder which is "Yes" if TotalPrice > 20,000, else "No".

In [None]:
from pyspark.sql.functions import when, col
# Flag each order "Yes" when TotalPrice is strictly above 20000, otherwise "No".
is_high_value = col("TotalPrice") > 20000
high_value_flag = when(is_high_value, "Yes").otherwise("No")
df = df.withColumn("HighValueOrder", high_value_flag)
df.show()

2. Filter and display all high-value orders in the North region.

In [None]:
# Apply the two conditions as successive filters: flagged "Yes", then Region "North".
flagged_orders = df.filter(col("HighValueOrder") == "Yes")
high_value = flagged_orders.filter(col("Region") == "North")
high_value.show()

3. Count how many high-value orders exist per region.

In [None]:
# Number of orders flagged "Yes" in each region.
high_value_counts = (
    df.where(col("HighValueOrder") == "Yes")
    .groupBy("Region")
    .count()
)
high_value_counts.show()

Part 5: Save Results


In [33]:
# Collapse to one partition before writing, then write CSV with a header row,
# replacing any existing output at this path.
# NOTE(review): this writes every row of df, not only the rows flagged as
# high-value, even though the output path is "high_value_orders" -- confirm intent.
single_partition = df.coalesce(1)
csv_writer = single_partition.write.mode("overwrite").option("header", True)
csv_writer.csv("high_value_orders")