**Intialize the SparkSession**

In [0]:
from pyspark.sql import SparkSession
spark=SparkSession.builder\
      .appName("june-13 exercise")\
      .getOrCreate()
spark

**Spark Tasks**

**Load the Date**

In [0]:
#1.read the csv files
customers=spark.read.option("header",True).option("inferSchema",True)\
    .csv("file:/Workspace/Shared/customers.csv")
customers.show()
orders=spark.read.option("header",True).option("inferSchema",True)\
    .csv("file:/Workspace/Shared/orders.csv")
orders.show()

+----------+-----+---------+---+
|CustomerID| Name|     City|Age|
+----------+-----+---------+---+
|       101|Aditi|   Mumbai| 28|
|       102|Rohan|    Delhi| 35|
|       103|Meena|Hyderabad| 30|
|       104|Kabir|Bangalore| 41|
|       105| Zoya|Hyderabad| 25|
+----------+-----+---------+---+

+-------+----------+---------+-------+--------+-----+----------+
|OrderID|CustomerID|     City|Product|Quantity|Price| OrderDate|
+-------+----------+---------+-------+--------+-----+----------+
|   1001|       101|  Chennai| Laptop|       1|70000|2024-01-05|
|   1002|       102|  Chennai| Mobile|       2|25000|2024-02-10|
|   1003|       103|    Delhi|   Desk|       1|10000|2024-03-15|
|   1004|       101|   Mumbai|  Mouse|       3| 1000|2024-04-01|
|   1005|       104|Hyderabad|Monitor|       1|12000|2024-04-25|
+-------+----------+---------+-------+--------+-----+----------+



In [0]:
#2.display the schema
customers.printSchema()
orders.printSchema()

root
 |-- CustomerID: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- City: string (nullable = true)
 |-- Age: integer (nullable = true)

root
 |-- OrderID: integer (nullable = true)
 |-- CustomerID: integer (nullable = true)
 |-- City: string (nullable = true)
 |-- Product: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- Price: integer (nullable = true)
 |-- OrderDate: date (nullable = true)



In [0]:
#3.Add a column TotalAmount = Quantity * Price to orders
from pyspark.sql.functions import col
orders=orders.withColumn("TotalAmount", col("Quantity") * col("Price"))
print('orders after adding TotalAmount column:')
orders.show()

orders after adding TotalAmount column:
+-------+----------+---------+-------+--------+-----+----------+-----------+
|OrderID|CustomerID|     City|Product|Quantity|Price| OrderDate|TotalAmount|
+-------+----------+---------+-------+--------+-----+----------+-----------+
|   1001|       101|  Chennai| Laptop|       1|70000|2024-01-05|      70000|
|   1002|       102|  Chennai| Mobile|       2|25000|2024-02-10|      50000|
|   1003|       103|    Delhi|   Desk|       1|10000|2024-03-15|      10000|
|   1004|       101|   Mumbai|  Mouse|       3| 1000|2024-04-01|       3000|
|   1005|       104|Hyderabad|Monitor|       1|12000|2024-04-25|      12000|
+-------+----------+---------+-------+--------+-----+----------+-----------+



In [0]:
#4.Join both DataFrames on CustomerID
j=orders.join(customers, on="CustomerID", how="inner")
print('Joined DataFrames:')
j.select("CustomerID", "Name", "Product", "TotalAmount").show()
j.show()

Joined DataFrames:
+----------+-----+-------+-----------+
|CustomerID| Name|Product|TotalAmount|
+----------+-----+-------+-----------+
|       101|Aditi| Laptop|      70000|
|       102|Rohan| Mobile|      50000|
|       103|Meena|   Desk|      10000|
|       101|Aditi|  Mouse|       3000|
|       104|Kabir|Monitor|      12000|
+----------+-----+-------+-----------+

+----------+-------+---------+-------+--------+-----+----------+-----------+-----+---------+---+
|CustomerID|OrderID|     City|Product|Quantity|Price| OrderDate|TotalAmount| Name|     City|Age|
+----------+-------+---------+-------+--------+-----+----------+-----------+-----+---------+---+
|       101|   1001|  Chennai| Laptop|       1|70000|2024-01-05|      70000|Aditi|   Mumbai| 28|
|       102|   1002|  Chennai| Mobile|       2|25000|2024-02-10|      50000|Rohan|    Delhi| 35|
|       103|   1003|    Delhi|   Desk|       1|10000|2024-03-15|      10000|Meena|Hyderabad| 30|
|       101|   1004|   Mumbai|  Mouse|       3|

In [0]:
#5. Filter orders where TotalAmount > 20000
a=orders.filter(col("TotalAmount")>20000)
print('orders where TotalAmount > 20000:')
a.show()

orders where TotalAmount > 20000:
+-------+----------+-------+-------+--------+-----+----------+-----------+
|OrderID|CustomerID|   City|Product|Quantity|Price| OrderDate|TotalAmount|
+-------+----------+-------+-------+--------+-----+----------+-----------+
|   1001|       101|Chennai| Laptop|       1|70000|2024-01-05|      70000|
|   1002|       102|Chennai| Mobile|       2|25000|2024-02-10|      50000|
+-------+----------+-------+-------+--------+-----+----------+-----------+



In [0]:
#6.Show customers who placed more than 1 order
from pyspark.sql.functions import countDistinct
print('customers who placed more than 1 order:')
j.groupBy("CustomerID").agg(countDistinct("OrderID")).filter(col("count(DISTINCT OrderID)")>1).show()

customers who placed more than 1 order:
+----------+-----------------------+
|CustomerID|count(DISTINCT OrderID)|
+----------+-----------------------+
|       101|                      2|
+----------+-----------------------+



In [0]:
from pyspark.sql.functions import count
print('customers who placed more than 1 order:')
j.groupBy("CustomerID", "Name").agg(count("OrderID").alias("OrderCount")) \
    .filter("OrderCount > 1").show()


customers who placed more than 1 order:
+----------+-----+----------+
|CustomerID| Name|OrderCount|
+----------+-----+----------+
|       101|Aditi|         2|
+----------+-----+----------+



In [0]:
#7. Group orders by City and get average order value
from pyspark.sql.functions import avg
print('average order value by City:')
orders.groupBy("City").agg(avg("TotalAmount")).show()

average order value by City:
+---------+----------------+
|     City|avg(TotalAmount)|
+---------+----------------+
|  Chennai|         60000.0|
|   Mumbai|          3000.0|
|    Delhi|         10000.0|
|Hyderabad|         12000.0|
+---------+----------------+



In [0]:
#8.Sort orders by OrderDate in descending order
from pyspark.sql.functions import desc
print('orders sorted by OrderDate in descending order:')
orders.sort(desc("OrderDate")).show()

orders sorted by OrderDate in descending order:
+-------+----------+---------+-------+--------+-----+----------+-----------+
|OrderID|CustomerID|     City|Product|Quantity|Price| OrderDate|TotalAmount|
+-------+----------+---------+-------+--------+-----+----------+-----------+
|   1005|       104|Hyderabad|Monitor|       1|12000|2024-04-25|      12000|
|   1004|       101|   Mumbai|  Mouse|       3| 1000|2024-04-01|       3000|
|   1003|       103|    Delhi|   Desk|       1|10000|2024-03-15|      10000|
|   1002|       102|  Chennai| Mobile|       2|25000|2024-02-10|      50000|
|   1001|       101|  Chennai| Laptop|       1|70000|2024-01-05|      70000|
+-------+----------+---------+-------+--------+-----+----------+-----------+



In [0]:
#9.Write the final result as a Parquet file partitioned by city
a.write.partitionBy("City").mode("overwrite").parquet("file:/Workspace/Shared/customer_orders")

In [0]:
# Rename city columns to avoid ambiguity
orders = orders.withColumnRenamed("City", "OrderCity")
customers = customers.withColumnRenamed("City", "CustomerCity")
j = orders.join(customers, on="CustomerID", how="inner")
j.printSchema()
j.show()

root
 |-- CustomerID: integer (nullable = true)
 |-- OrderID: integer (nullable = true)
 |-- OrderCity: string (nullable = true)
 |-- Product: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- Price: integer (nullable = true)
 |-- OrderDate: date (nullable = true)
 |-- TotalAmount: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- CustomerCity: string (nullable = true)
 |-- Age: integer (nullable = true)

+----------+-------+---------+-------+--------+-----+----------+-----------+-----+------------+---+
|CustomerID|OrderID|OrderCity|Product|Quantity|Price| OrderDate|TotalAmount| Name|CustomerCity|Age|
+----------+-------+---------+-------+--------+-----+----------+-----------+-----+------------+---+
|       101|   1001|  Chennai| Laptop|       1|70000|2024-01-05|      70000|Aditi|      Mumbai| 28|
|       102|   1002|  Chennai| Mobile|       2|25000|2024-02-10|      50000|Rohan|       Delhi| 35|
|       103|   1003|    Delhi|   Desk|       1|10000|

In [0]:
# Create the temp view
j.createOrReplaceTempView("orders_view")
# Total sales by customer
print('Total sales by customer:')
spark.sql("""
SELECT CustomerID, Name, SUM(TotalAmount) AS TotalSales
FROM orders_view
GROUP BY CustomerID, Name
""").show()
#Count of products per city
print('Count of products per order city:')
spark.sql("""
SELECT OrderCity, COUNT(*) AS ProductCount
FROM orders_view
GROUP BY OrderCity
""").show()
# Top 2 order cities by revenue
print('Top 2 order cities by revenue:')
spark.sql("""
SELECT OrderCity, SUM(TotalAmount) AS Revenue
FROM orders_view
GROUP BY OrderCity
ORDER BY Revenue DESC
LIMIT 2
""").show()

Total sales by customer:
+----------+-----+----------+
|CustomerID| Name|TotalSales|
+----------+-----+----------+
|       101|Aditi|     73000|
|       102|Rohan|     50000|
|       103|Meena|     10000|
|       104|Kabir|     12000|
+----------+-----+----------+

Count of products per order city:
+---------+------------+
|OrderCity|ProductCount|
+---------+------------+
|  Chennai|           2|
|   Mumbai|           1|
|    Delhi|           1|
|Hyderabad|           1|
+---------+------------+

Top 2 order cities by revenue:
+---------+-------+
|OrderCity|Revenue|
+---------+-------+
|  Chennai| 120000|
|Hyderabad|  12000|
+---------+-------+

