In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, expr, count, avg, sum as _sum


# 1️⃣ Start Spark session


In [2]:
# Get the Spark session for this notebook under a fixed application name;
# getOrCreate() hands back an already-running session when there is one.
spark = (
    SparkSession.builder
    .appName("CustomerOrdersAnalysis")
    .getOrCreate()
)


# 2️⃣ Read CSV into DataFrames


In [4]:
# Load the customers CSV: the first row supplies the column names and
# Spark infers each column's type from the data.
customers_df = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .csv("/content/customers.csv")
)


In [5]:
# Load the orders CSV with the same reader settings as the customers file:
# header row used for column names, column types inferred from the data.
orders_df = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .csv("/content/orders.csv")
)



# 3️⃣ Print schema


In [6]:
# Print the customers schema to check the column types inferred from the CSV.
customers_df.printSchema()


root
 |-- CustomerID: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- City: string (nullable = true)
 |-- Age: integer (nullable = true)



In [7]:
# Print the orders schema to check the column types inferred from the CSV
# (Quantity and Price feed the TotalAmount computation later on).
orders_df.printSchema()


root
 |-- OrderID: integer (nullable = true)
 |-- CustomerID: integer (nullable = true)
 |-- Product: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- Price: integer (nullable = true)
 |-- OrderDate: date (nullable = true)



# 4️⃣ Add TotalAmount column


In [8]:
# Line total for each order: Quantity * Price.
# Both inputs are inferred as 32-bit integers (see the printed orders schema),
# so an int * int product would overflow once it exceeds 2,147,483,647.
# Casting Quantity to long makes Spark evaluate the product in 64-bit.
# Re-running this cell is safe: withColumn replaces an existing TotalAmount.
orders_df = orders_df.withColumn(
    "TotalAmount",
    col("Quantity").cast("long") * col("Price"),
)


# 5️⃣ Join DataFrames


In [9]:
# Attach customer attributes to every order by matching on CustomerID.
# This is an inner join (Spark's default when no join type is given), so
# orders without a matching customer row are not carried into joined_df.
joined_df = orders_df.join(customers_df, on="CustomerID", how="inner")


# 6️⃣ Filter TotalAmount > 20000


In [10]:
# Keep only the orders whose total is strictly greater than 20000 and
# display them.
is_high_value = col("TotalAmount") > 20000
high_value_orders = joined_df.where(is_high_value)
high_value_orders.show()


+----------+-------+-------+--------+-----+----------+-----------+-----+------+---+
|CustomerID|OrderID|Product|Quantity|Price| OrderDate|TotalAmount| Name|  City|Age|
+----------+-------+-------+--------+-----+----------+-----------+-----+------+---+
|       101|   1001| Laptop|       1|70000|2024-01-05|      70000|Aditi|Mumbai| 28|
|       102|   1002| Mobile|       2|25000|2024-02-10|      50000|Rohan| Delhi| 35|
+----------+-------+-------+--------+-----+----------+-----------+-----+------+---+



# 7️⃣ Customers with >1 order


In [11]:
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number


In [12]:
# Count the orders placed by each customer, then keep only the customers
# who placed more than one.
orders_per_customer = joined_df.groupBy("CustomerID", "Name").count()
order_counts = orders_per_customer.where(col("count") > 1)
order_counts.show()


+----------+-----+-----+
|CustomerID| Name|count|
+----------+-----+-----+
|       101|Aditi|    2|
+----------+-----+-----+



# 8️⃣ Average order value by City


In [13]:
# Mean TotalAmount of the orders belonging to each city.
mean_total = avg(col("TotalAmount")).alias("AvgOrderValue")
avg_order_by_city = joined_df.groupBy("City").agg(mean_total)
avg_order_by_city.show()


+---------+-------------+
|     City|AvgOrderValue|
+---------+-------------+
|Bangalore|      10000.0|
|   Mumbai|      36500.0|
|    Delhi|      50000.0|
|Hyderabad|      12000.0|
+---------+-------------+



# 9️⃣ Sort by OrderDate desc


In [14]:
# List the joined orders from the most recent OrderDate to the oldest.
newest_first = col("OrderDate").desc()
sorted_orders = joined_df.sort(newest_first)
sorted_orders.show()


+----------+-------+-------+--------+-----+----------+-----------+-----+---------+---+
|CustomerID|OrderID|Product|Quantity|Price| OrderDate|TotalAmount| Name|     City|Age|
+----------+-------+-------+--------+-----+----------+-----------+-----+---------+---+
|       104|   1005|Monitor|       1|12000|2024-04-25|      12000|Kabir|Hyderabad| 30|
|       101|   1004|  Mouse|       3| 1000|2024-04-01|       3000|Aditi|   Mumbai| 28|
|       103|   1003|   Desk|       1|10000|2024-03-15|      10000|Meena|Bangalore| 41|
|       102|   1002| Mobile|       2|25000|2024-02-10|      50000|Rohan|    Delhi| 35|
|       101|   1001| Laptop|       1|70000|2024-01-05|      70000|Aditi|   Mumbai| 28|
+----------+-------+-------+--------+-----+----------+-----------+-----+---------+---+



# 🔟 Write as Parquet partitioned by City


In [15]:
# Persist the joined data as Parquet, one sub-directory per City value.
# "overwrite" mode replaces whatever already exists at the target path.
(
    joined_df.write
    .mode("overwrite")
    .partitionBy("City")
    .parquet("/tmp/final_orders")
)


# 1️⃣1️⃣ Create temp view


In [16]:
# Expose the joined DataFrame to Spark SQL under the name "orders_view";
# the SQL cells below query this view. Re-running replaces the existing view.
joined_df.createOrReplaceTempView("orders_view")


# Total sales by customer


In [17]:
# Total spend per customer, computed with Spark SQL over orders_view.
total_sales_by_customer = spark.sql("""
SELECT CustomerID, Name, SUM(TotalAmount) as TotalSales
FROM orders_view
GROUP BY CustomerID, Name
""")
total_sales_by_customer.show()


+----------+-----+----------+
|CustomerID| Name|TotalSales|
+----------+-----+----------+
|       101|Aditi|     73000|
|       102|Rohan|     50000|
|       103|Meena|     10000|
|       104|Kabir|     12000|
+----------+-----+----------+



# Count of products per city


In [18]:
# Number of ordered products per city. COUNT(Product) counts every row with
# a non-null Product, so a product ordered twice in a city is counted twice.
product_count_by_city = spark.sql("""
SELECT City, COUNT(Product) as ProductCount
FROM orders_view
GROUP BY City
""")
product_count_by_city.show()


+---------+------------+
|     City|ProductCount|
+---------+------------+
|Bangalore|           1|
|   Mumbai|           2|
|    Delhi|           1|
|Hyderabad|           1|
+---------+------------+



# Top 2 cities by revenue


In [19]:
# The two cities with the highest summed TotalAmount.
# NOTE(review): the ORDER BY uses Revenue only, so cities with equal revenue
# have no defined relative order at the LIMIT cut-off.
top_cities_by_revenue = spark.sql("""
SELECT City, SUM(TotalAmount) as Revenue
FROM orders_view
GROUP BY City
ORDER BY Revenue DESC
LIMIT 2
""")
top_cities_by_revenue.show()


+------+-------+
|  City|Revenue|
+------+-------+
|Mumbai|  73000|
| Delhi|  50000|
+------+-------+

