In [0]:
spark

In [0]:
from pyspark.sql.functions import *
# Part 2: Spark Tasks
# 1. Load the customers and orders CSV files into PySpark DataFrames.
#    Both files are read with the same options: first row is the header and
#    column types are inferred from the data.
csv_opts = {"header": True, "inferSchema": True}
customersdf = spark.read.csv("file:/Workspace/Shared/customers.csv", **csv_opts)
ordersdf = spark.read.csv("file:/Workspace/Shared/orders.csv", **csv_opts)
for frame in (customersdf, ordersdf):
    frame.show()

# 2. Print the inferred schema of each DataFrame (customers first, then orders).
for frame in (customersdf, ordersdf):
    frame.printSchema()

# 3. Extend orders with TotalAmount, the product of Quantity and Price.
line_total = col("Quantity") * col("Price")
ordersdf = ordersdf.withColumn("TotalAmount", line_total)

# 4. Join both DataFrames on CustomerID
# Joining by column name (rather than by an equality expression) leaves a
# single CustomerID column in the result.
joined = ordersdf.join(customersdf, on="CustomerID", how="inner")
joined.show()

# 5. Filter orders where TotalAmount > 20000
joined.filter(col("TotalAmount") > 20000).show()

# 6. Show customers who placed more than 1 order
# Group on the joined frame's own CustomerID column, referenced by name.
# A reference such as customersdf.CustomerID is bound to the customers frame,
# whose key column is merged away by the name-based join above, so it is not
# guaranteed to resolve against `joined` on every Spark version.
joined.groupBy("CustomerID").count().filter(col("count") > 1).show()

# 7. Group orders by City and get average order value
# Columns are referenced by name here too, for consistency with step 6.
joined.groupBy("City").agg(avg("TotalAmount")).show()

# 8. Sort orders by OrderDate in descending order
# OrderDate is converted with to_date before sorting so the ordering is done
# on a date value; the most recent orders come first.
sor = (
    joined
    .withColumn("OrderDate", to_date(col("OrderDate")))
    .orderBy(col("OrderDate").desc())
)
sor.show()

# 9. Write the final result as a Parquet file partitioned by City
# mode("overwrite") replaces any output already present at the target path.
sor.write.mode("overwrite").partitionBy("City").parquet("file:/Workspace/Shared/parquet_by_city")

# 10. Expose the sorted result as a temporary view and query it with Spark SQL.
sor.createOrReplaceTempView("Customerdata")

# The reports below are executed and displayed one after another, in the
# order they are listed.
report_queries = (
    # Full contents of the view
    "select * from Customerdata",
    # Total sales by customer
    """
SELECT CustomerID, Name, SUM(TotalAmount) AS TotalSales
FROM Customerdata
GROUP BY CustomerID, Name
""",
    # Count of products per city
    """
SELECT City, COUNT(DISTINCT Product) AS ProductCount
FROM Customerdata
GROUP BY City
""",
    # Top 2 cities by revenue
    """
SELECT City, SUM(TotalAmount) AS TotalRevenue
FROM Customerdata
GROUP BY City
ORDER BY TotalRevenue DESC
LIMIT 2
""",
)
for query in report_queries:
    spark.sql(query).show()



+----------+-----+---------+---+
|CustomerID| Name|     City|Age|
+----------+-----+---------+---+
|       101|Aditi|   Mumbai| 28|
|       102|Rohan|    Delhi| 35|
|       103|Meena|Bangalore| 41|
|       104|Kabir|Hyderabad| 30|
|       105| Zoya|  Chennai| 25|
+----------+-----+---------+---+

+-------+----------+-------+--------+-----+----------+
|OrderID|CustomerID|Product|Quantity|Price| OrderDate|
+-------+----------+-------+--------+-----+----------+
|   1001|       101| Laptop|       1|70000|2024-01-05|
|   1002|       102| Mobile|       2|25000|2024-02-10|
|   1003|       103|   Desk|       1|10000|2024-03-15|
|   1004|       101|  Mouse|       3| 1000|2024-04-01|
|   1005|       104|Monitor|       1|12000|2024-04-25|
+-------+----------+-------+--------+-----+----------+

root
 |-- CustomerID: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- City: string (nullable = true)
 |-- Age: integer (nullable = true)

root
 |-- OrderID: integer (nullable = true)
 |--