In [1]:
# Part 1: start a Spark session for this notebook
from pyspark.sql import SparkSession

# Build the session with a parenthesised chain instead of line continuations.
spark = (
    SparkSession.builder
    .appName("PySparkBasics")
    .getOrCreate()
)

# Leaving the session as the last expression makes the notebook display it,
# which confirms that Spark is running.
spark

In [2]:
# Part 2: write the sample sales data to a local CSV file
# The backslash right after the opening quotes suppresses the newline that
# would otherwise precede the header, so the header row is the very first
# line of the file rather than following a blank line.
csv_data = """\
OrderID,Product,Category,Quantity,UnitPrice,Region
1001,Mobile,Electronics,2,15000,North
1002,Laptop,Electronics,1,55000,South
1003,T-Shirt,Apparel,3,500,East
1004,Jeans,Apparel,2,1200,North
1005,TV,Electronics,1,40000,West
1006,Shoes,Footwear,4,2000,South
1007,Watch,Accessories,2,3000,East
1008,Headphones,Electronics,3,2500,North
"""

# An explicit encoding keeps the written bytes independent of the platform default.
with open('sales.csv', 'w', encoding='utf-8') as csv_file:
    csv_file.write(csv_data)

In [3]:
# Load sales.csv into a DataFrame: the first row supplies the column names
# and the column types are inferred from the data.
sales_reader = spark.read
df = sales_reader.csv(
    "sales.csv",
    inferSchema=True,
    header=True,
)

# Preview the first five rows.
df.show(5)

+-------+-------+-----------+--------+---------+------+
|OrderID|Product|   Category|Quantity|UnitPrice|Region|
+-------+-------+-----------+--------+---------+------+
|   1001| Mobile|Electronics|       2|    15000| North|
|   1002| Laptop|Electronics|       1|    55000| South|
|   1003|T-Shirt|    Apparel|       3|      500|  East|
|   1004|  Jeans|    Apparel|       2|     1200| North|
|   1005|     TV|Electronics|       1|    40000|  West|
+-------+-------+-----------+--------+---------+------+
only showing top 5 rows



In [10]:
# Part 3
# 3.1 Add a TotalPrice column (Quantity * UnitPrice) to every order
from pyspark.sql.functions import col

total_price = col("Quantity") * col("UnitPrice")
df1 = df.withColumn("TotalPrice", total_price)
df1.show()

+-------+----------+-----------+--------+---------+------+----------+
|OrderID|   Product|   Category|Quantity|UnitPrice|Region|TotalPrice|
+-------+----------+-----------+--------+---------+------+----------+
|   1001|    Mobile|Electronics|       2|    15000| North|     30000|
|   1002|    Laptop|Electronics|       1|    55000| South|     55000|
|   1003|   T-Shirt|    Apparel|       3|      500|  East|      1500|
|   1004|     Jeans|    Apparel|       2|     1200| North|      2400|
|   1005|        TV|Electronics|       1|    40000|  West|     40000|
|   1006|     Shoes|   Footwear|       4|     2000| South|      8000|
|   1007|     Watch|Accessories|       2|     3000|  East|      6000|
|   1008|Headphones|Electronics|       3|     2500| North|      7500|
+-------+----------+-----------+--------+---------+------+----------+



In [14]:
# 3.2 Total sales per region
# NOTE: this import rebinds the built-in name `sum` to Spark's aggregate function.
from pyspark.sql.functions import sum,avg

# No alias is given, so the result column keeps the generated name "sum(TotalPrice)".
sales_by_region = df1.groupBy("Region")
Sale_per_region = sales_by_region.agg(sum("TotalPrice"))
Sale_per_region.show()

+------+---------------+
|Region|sum(TotalPrice)|
+------+---------------+
| South|          63000|
|  East|           7500|
|  West|          40000|
| North|          39900|
+------+---------------+



In [19]:
# 3.3 Category-wise revenue, sorted in descending order.
# Grouping on its own is not enough: it has to be paired with an aggregate
# function (here the sum of TotalPrice) to produce a result.
category_totals = df1.groupby("Category").agg(sum("TotalPrice").alias("revenue"))
revenue_per_category = category_totals.sort("revenue", ascending=False)
revenue_per_category.show()

+-----------+-------+
|   Category|revenue|
+-----------+-------+
|Electronics| 132500|
|   Footwear|   8000|
|Accessories|   6000|
|    Apparel|   3900|
+-----------+-------+



In [31]:
# 3.4. Region with the highest number of orders
from pyspark.sql.functions import count, desc

# Count the orders in each region ...
order_count = count("OrderID").alias("OrderCount")
# ... then sort with the largest count first and display only the top row.
df1.groupBy("Region").agg(order_count).orderBy(desc("OrderCount")).show(1)


+------+----------+
|Region|OrderCount|
+------+----------+
| North|         3|
+------+----------+
only showing top 1 row



In [33]:
# 3.5. Average Unit Price per Category
from pyspark.sql.functions import avg

average_unit_price = avg("UnitPrice").alias("AverageUnitPrice")
df1.groupBy("Category").agg(average_unit_price).show()

+-----------+----------------+
|   Category|AverageUnitPrice|
+-----------+----------------+
|    Apparel|           850.0|
|Electronics|         28125.0|
|   Footwear|          2000.0|
|Accessories|          3000.0|
+-----------+----------------+



In [32]:
# 3.6 Orders whose TotalPrice is strictly greater than 30000
above_30000 = df1["TotalPrice"] > 30000
df1.filter(above_30000).show()

+-------+-------+-----------+--------+---------+------+----------+
|OrderID|Product|   Category|Quantity|UnitPrice|Region|TotalPrice|
+-------+-------+-----------+--------+---------+------+----------+
|   1002| Laptop|Electronics|       1|    55000| South|     55000|
|   1005|     TV|Electronics|       1|    40000|  West|     40000|
+-------+-------+-----------+--------+---------+------+----------+



In [34]:
from pyspark.sql.functions import when

# Flag every order: "Yes" when its TotalPrice is above 20000, otherwise "No".
is_high_value = df1["TotalPrice"] > 20000
high_value = df1.withColumn(
    "HighValueOrder",
    when(is_high_value, "Yes").otherwise("No"),
)
high_value.show()


+-------+----------+-----------+--------+---------+------+----------+--------------+
|OrderID|   Product|   Category|Quantity|UnitPrice|Region|TotalPrice|HighValueOrder|
+-------+----------+-----------+--------+---------+------+----------+--------------+
|   1001|    Mobile|Electronics|       2|    15000| North|     30000|           Yes|
|   1002|    Laptop|Electronics|       1|    55000| South|     55000|           Yes|
|   1003|   T-Shirt|    Apparel|       3|      500|  East|      1500|            No|
|   1004|     Jeans|    Apparel|       2|     1200| North|      2400|            No|
|   1005|        TV|Electronics|       1|    40000|  West|     40000|           Yes|
|   1006|     Shoes|   Footwear|       4|     2000| South|      8000|            No|
|   1007|     Watch|Accessories|       2|     3000|  East|      6000|            No|
|   1008|Headphones|Electronics|       3|     2500| North|      7500|            No|
+-------+----------+-----------+--------+---------+------+----------+--------------+

In [36]:
# Show the orders that are both flagged high-value and placed in the North region.
is_flagged = high_value["HighValueOrder"] == "Yes"
is_north = high_value["Region"] == "North"
high_north = high_value.filter(is_flagged & is_north)
high_north.show()

+-------+-------+-----------+--------+---------+------+----------+--------------+
|OrderID|Product|   Category|Quantity|UnitPrice|Region|TotalPrice|HighValueOrder|
+-------+-------+-----------+--------+---------+------+----------+--------------+
|   1001| Mobile|Electronics|       2|    15000| North|     30000|           Yes|
+-------+-------+-----------+--------+---------+------+----------+--------------+



In [37]:
# Number of high-value orders in each region.
flagged_orders = high_value.filter(high_value["HighValueOrder"] == "Yes")
high_order = flagged_orders.groupBy("Region").agg(count("*").alias("HighCount"))
high_order.show()

+------+---------+
|Region|HighCount|
+------+---------+
| South|        1|
|  West|        1|
| North|        1|
+------+---------+



In [39]:
# Save every order together with its HighValueOrder flag as CSV with a header row.
# mode("overwrite") replaces the output of a previous run; with the default
# save mode the write fails once the path already exists, which breaks
# re-running the notebook from a fresh kernel.
high_value.write.mode("overwrite").csv("High_Value_Order.csv", header=True)
