In [1]:

!rm -rf /content/spark-3.4.1-bin-hadoop3
!rm -rf /content/spark-3.3.0-bin-hadoop3


!apt-get install openjdk-8-jdk-headless -qq > /dev/null

!wget -q https://archive.apache.org/dist/spark/spark-3.3.0/spark-3.3.0-bin-hadoop3.tgz
!tar xf spark-3.3.0-bin-hadoop3.tgz

!pip install -q findspark


In [2]:
# Session setup: point the process at the JDK and the unpacked Spark
# distribution, make PySpark importable, then create the Spark session.
import os

os.environ.update(
    {
        "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
        "SPARK_HOME": "/content/spark-3.3.0-bin-hadoop3",
    }
)

# Statement order is kept deliberately: findspark.init() runs after the
# environment variables are set and before the pyspark imports below.
import findspark
findspark.init()

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, avg, desc, sum as _sum

# getOrCreate() hands back an already-running session when one exists.
spark = (
    SparkSession.builder
    .appName("Dataset assignemnt")
    .getOrCreate()
)


In [3]:

# Create the two sample input files in the current working directory.
# Each list entry becomes one newline-terminated line; the first entry of
# each list is the CSV header row.
customer_rows = [
    "CustomerID,Name,City,Age",
    "101,Aditi,Mumbai,28",
    "102,Rohan,Delhi,35",
    "103,Meena,Bangalore,41",
    "104,Kabir,Hyderabad,30",
    "105,Zoya,Chennai,25",
]

order_rows = [
    "OrderID,CustomerID,Product,Quantity,Price,OrderDate",
    "1001,101,Laptop,1,70000,2024-01-05",
    "1002,102,Mobile,2,25000,2024-02-10",
    "1003,103,Desk,1,10000,2024-03-15",
    "1004,101,Mouse,3,1000,2024-04-01",
    "1005,104,Monitor,1,12000,2024-04-25",
]

# "w" mode truncates any existing file, so re-running the cell rewrites both.
for csv_name, csv_rows in (("customers.csv", customer_rows), ("orders.csv", order_rows)):
    with open(csv_name, "w") as f:
        f.writelines(row + "\n" for row in csv_rows)


In [4]:
# Read both sample files into DataFrames. The first line of each file is
# treated as the header and column types are inferred from the data.
csv_options = {"header": True, "inferSchema": True}

customers_df = spark.read.csv("customers.csv", **csv_options)
orders_df = spark.read.csv("orders.csv", **csv_options)

# Print the inferred schemas, customers first, then orders.
for loaded_df in (customers_df, orders_df):
    loaded_df.printSchema()


root
 |-- CustomerID: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- City: string (nullable = true)
 |-- Age: integer (nullable = true)

root
 |-- OrderID: integer (nullable = true)
 |-- CustomerID: integer (nullable = true)
 |-- Product: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- Price: integer (nullable = true)
 |-- OrderDate: timestamp (nullable = true)



In [6]:

# Derive the order-level DataFrames used by the rest of the notebook.
# Nothing in this cell calls an action (show/collect/write); it only defines
# the DataFrames.

# Line total for each order: quantity times unit price.
orders_df = orders_df.withColumn("TotalAmount", col("Quantity") * col("Price"))

# Inner join on CustomerID: only orders with a matching customer row (and
# customers with at least one order) appear in the result.
joined_df = orders_df.join(customers_df, "CustomerID", "inner")

# Orders whose line total exceeds 20000.
filtered_orders = joined_df.where(col("TotalAmount") > 20000)

# Customers that placed more than one order.
multi_orders = (
    joined_df
    .groupBy("CustomerID")
    .agg(count("OrderID").alias("OrderCount"))
    .where(col("OrderCount") > 1)
)

# Mean order total per city.
avg_order_by_city = (
    joined_df
    .groupBy("City")
    .agg(avg("TotalAmount").alias("AvgOrderValue"))
)

# All joined orders, most recent OrderDate first.
sorted_orders = joined_df.orderBy(desc("OrderDate"))


In [7]:
# Persist the joined orders as Parquet, partitioned by the City column.
# "overwrite" mode replaces whatever already exists at the target path.
(
    joined_df.write
    .mode("overwrite")
    .partitionBy("City")
    .parquet("/content/final_orders_parquet")
)


In [8]:

# Expose the joined orders to Spark SQL under the name `orders_view`
# (replacing any existing temp view of that name).
joined_df.createOrReplaceTempView("orders_view")

# Total sales per customer.
total_sales_sql = """
    SELECT CustomerID, Name, SUM(TotalAmount) AS TotalSales
    FROM orders_view
    GROUP BY CustomerID, Name
"""

# Number of non-null Product values (one per order row) in each city.
product_count_sql = """
    SELECT City, COUNT(Product) AS ProductCount
    FROM orders_view
    GROUP BY City
"""

# The two cities with the highest summed order totals.
top_cities_sql = """
    SELECT City, SUM(TotalAmount) AS CityRevenue
    FROM orders_view
    GROUP BY City
    ORDER BY CityRevenue DESC
    LIMIT 2
"""

# Run the reports in order and print each result table.
for report_sql in (total_sales_sql, product_count_sql, top_cities_sql):
    spark.sql(report_sql).show()


+----------+-----+----------+
|CustomerID| Name|TotalSales|
+----------+-----+----------+
|       101|Aditi|     73000|
|       102|Rohan|     50000|
|       103|Meena|     10000|
|       104|Kabir|     12000|
+----------+-----+----------+

+---------+------------+
|     City|ProductCount|
+---------+------------+
|Bangalore|           1|
|   Mumbai|           2|
|    Delhi|           1|
|Hyderabad|           1|
+---------+------------+

+------+-----------+
|  City|CityRevenue|
+------+-----------+
|Mumbai|      73000|
| Delhi|      50000|
+------+-----------+

