In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import Window

spark=SparkSession.builder.appName("Milestone_Assessment").getOrCreate()

PHASE 1 — DATA INGESTION & SCHEMA

Tasks
1. Define an explicit schema
2. Create a DataFrame using the schema
3. Print schema and validate data types

DATASET — ONLINE ORDERS

In [2]:
orders_data = [
("O001","Delhi ","Laptop","45000","2024-01-05","Completed"),
("O002","Mumbai","Mobile ","32000","05/01/2024","Completed"),
("O003","Bangalore","Tablet","30000","2024/01/06","Completed"),
("O004","Delhi","Laptop","","2024-01-07","Cancelled"),
("O005","Mumbai","Mobile","invalid","2024-01-08","Completed"),
("O006","Chennai","Tablet",None,"2024-01-08","Completed"),
("O007","Delhi","Laptop","47000","09-01-2024","Completed"),
("O008","Bangalore","Mobile","28000","2024-01-09","Completed"),
("O009","Mumbai","Laptop","55000","2024-01-10","Completed"),
("O009","Mumbai","Laptop","55000","2024-01-10","Completed")
]

schema = StructType([
    StructField("order_id", StringType(), True),
    StructField("city", StringType(), True),
    StructField("product", StringType(), True),
    StructField("amount", StringType(), True),
    StructField("order_date", StringType(), True),
    StructField("status", StringType(), True)
])

orders_df = spark.createDataFrame(data=orders_data, schema=schema)
orders_df.show()
orders_df.printSchema()


+--------+---------+-------+-------+----------+---------+
|order_id|     city|product| amount|order_date|   status|
+--------+---------+-------+-------+----------+---------+
|    O001|   Delhi | Laptop|  45000|2024-01-05|Completed|
|    O002|   Mumbai|Mobile |  32000|05/01/2024|Completed|
|    O003|Bangalore| Tablet|  30000|2024/01/06|Completed|
|    O004|    Delhi| Laptop|       |2024-01-07|Cancelled|
|    O005|   Mumbai| Mobile|invalid|2024-01-08|Completed|
|    O006|  Chennai| Tablet|   NULL|2024-01-08|Completed|
|    O007|    Delhi| Laptop|  47000|09-01-2024|Completed|
|    O008|Bangalore| Mobile|  28000|2024-01-09|Completed|
|    O009|   Mumbai| Laptop|  55000|2024-01-10|Completed|
|    O009|   Mumbai| Laptop|  55000|2024-01-10|Completed|
+--------+---------+-------+-------+----------+---------+

root
 |-- order_id: string (nullable = true)
 |-- city: string (nullable = true)
 |-- product: string (nullable = true)
 |-- amount: string (nullable = true)
 |-- order_date: string (null

PHASE 2 — DATA CLEANING

Tasks

4. Trim all string columns
5. Standardize city and product values
6. Convert amount to IntegerType
7. Handle invalid and null amount values
8. Remove duplicate orders
9. Keep only Completed order

In [4]:
#4
for c in orders_df.columns:
    df = orders_df.withColumn(c, trim(col(c)))
df.show()

+--------+---------+-------+-------+----------+---------+
|order_id|     city|product| amount|order_date|   status|
+--------+---------+-------+-------+----------+---------+
|    O001|   Delhi | Laptop|  45000|2024-01-05|Completed|
|    O002|   Mumbai|Mobile |  32000|05/01/2024|Completed|
|    O003|Bangalore| Tablet|  30000|2024/01/06|Completed|
|    O004|    Delhi| Laptop|       |2024-01-07|Cancelled|
|    O005|   Mumbai| Mobile|invalid|2024-01-08|Completed|
|    O006|  Chennai| Tablet|   NULL|2024-01-08|Completed|
|    O007|    Delhi| Laptop|  47000|09-01-2024|Completed|
|    O008|Bangalore| Mobile|  28000|2024-01-09|Completed|
|    O009|   Mumbai| Laptop|  55000|2024-01-10|Completed|
|    O009|   Mumbai| Laptop|  55000|2024-01-10|Completed|
+--------+---------+-------+-------+----------+---------+



In [5]:
#5
df = df.withColumn("city", initcap(lower(col("city")))).withColumn("product", initcap(lower(col("product"))))
df.show()

+--------+---------+-------+-------+----------+---------+
|order_id|     city|product| amount|order_date|   status|
+--------+---------+-------+-------+----------+---------+
|    O001|   Delhi | Laptop|  45000|2024-01-05|Completed|
|    O002|   Mumbai|Mobile |  32000|05/01/2024|Completed|
|    O003|Bangalore| Tablet|  30000|2024/01/06|Completed|
|    O004|    Delhi| Laptop|       |2024-01-07|Cancelled|
|    O005|   Mumbai| Mobile|invalid|2024-01-08|Completed|
|    O006|  Chennai| Tablet|   NULL|2024-01-08|Completed|
|    O007|    Delhi| Laptop|  47000|09-01-2024|Completed|
|    O008|Bangalore| Mobile|  28000|2024-01-09|Completed|
|    O009|   Mumbai| Laptop|  55000|2024-01-10|Completed|
|    O009|   Mumbai| Laptop|  55000|2024-01-10|Completed|
+--------+---------+-------+-------+----------+---------+



In [6]:
#6
df = df.withColumn("amount",when(col("amount").rlike("^[0-9]+$"), col("amount").cast(IntegerType())))
df.show()

+--------+---------+-------+------+----------+---------+
|order_id|     city|product|amount|order_date|   status|
+--------+---------+-------+------+----------+---------+
|    O001|   Delhi | Laptop| 45000|2024-01-05|Completed|
|    O002|   Mumbai|Mobile | 32000|05/01/2024|Completed|
|    O003|Bangalore| Tablet| 30000|2024/01/06|Completed|
|    O004|    Delhi| Laptop|  NULL|2024-01-07|Cancelled|
|    O005|   Mumbai| Mobile|  NULL|2024-01-08|Completed|
|    O006|  Chennai| Tablet|  NULL|2024-01-08|Completed|
|    O007|    Delhi| Laptop| 47000|09-01-2024|Completed|
|    O008|Bangalore| Mobile| 28000|2024-01-09|Completed|
|    O009|   Mumbai| Laptop| 55000|2024-01-10|Completed|
|    O009|   Mumbai| Laptop| 55000|2024-01-10|Completed|
+--------+---------+-------+------+----------+---------+



In [7]:
#7
df = df.filter(col("amount").isNotNull())
df.show()

+--------+---------+-------+------+----------+---------+
|order_id|     city|product|amount|order_date|   status|
+--------+---------+-------+------+----------+---------+
|    O001|   Delhi | Laptop| 45000|2024-01-05|Completed|
|    O002|   Mumbai|Mobile | 32000|05/01/2024|Completed|
|    O003|Bangalore| Tablet| 30000|2024/01/06|Completed|
|    O007|    Delhi| Laptop| 47000|09-01-2024|Completed|
|    O008|Bangalore| Mobile| 28000|2024-01-09|Completed|
|    O009|   Mumbai| Laptop| 55000|2024-01-10|Completed|
|    O009|   Mumbai| Laptop| 55000|2024-01-10|Completed|
+--------+---------+-------+------+----------+---------+



In [8]:
#8
df = df.dropDuplicates(["order_id"])
df.show()

+--------+---------+-------+------+----------+---------+
|order_id|     city|product|amount|order_date|   status|
+--------+---------+-------+------+----------+---------+
|    O001|   Delhi | Laptop| 45000|2024-01-05|Completed|
|    O002|   Mumbai|Mobile | 32000|05/01/2024|Completed|
|    O003|Bangalore| Tablet| 30000|2024/01/06|Completed|
|    O007|    Delhi| Laptop| 47000|09-01-2024|Completed|
|    O008|Bangalore| Mobile| 28000|2024-01-09|Completed|
|    O009|   Mumbai| Laptop| 55000|2024-01-10|Completed|
+--------+---------+-------+------+----------+---------+



In [9]:
#9
df_clean = df.filter(col("status") == "Completed")
df_clean.show()

+--------+---------+-------+------+----------+---------+
|order_id|     city|product|amount|order_date|   status|
+--------+---------+-------+------+----------+---------+
|    O001|   Delhi | Laptop| 45000|2024-01-05|Completed|
|    O002|   Mumbai|Mobile | 32000|05/01/2024|Completed|
|    O003|Bangalore| Tablet| 30000|2024/01/06|Completed|
|    O007|    Delhi| Laptop| 47000|09-01-2024|Completed|
|    O008|Bangalore| Mobile| 28000|2024-01-09|Completed|
|    O009|   Mumbai| Laptop| 55000|2024-01-10|Completed|
+--------+---------+-------+------+----------+---------+



PHASE 3 — BASIC ANALYTICS

Tasks

10. Total revenue per city
11. Total revenue per product
12. Average order value per city

In [10]:
#10
df_clean.groupBy("city").agg(sum("amount").alias("total_revenue")).show()

+---------+-------------+
|     city|total_revenue|
+---------+-------------+
|Bangalore|        58000|
|   Mumbai|        87000|
|   Delhi |        45000|
|    Delhi|        47000|
+---------+-------------+



In [11]:
#11
df_clean.groupBy("product").agg(sum("amount").alias("total_revenue")).show()

+-------+-------------+
|product|total_revenue|
+-------+-------------+
| Laptop|       147000|
|Mobile |        32000|
| Mobile|        28000|
| Tablet|        30000|
+-------+-------------+



In [12]:
#12
df_clean.groupBy("city").agg(avg("amount").alias("avg_order_value")).show()

+---------+---------------+
|     city|avg_order_value|
+---------+---------------+
|Bangalore|        29000.0|
|   Mumbai|        43500.0|
|   Delhi |        45000.0|
|    Delhi|        47000.0|
+---------+---------------+



PHASE 4 — WINDOW FUNCTION

Tasks

13. Rank cities by total revenue
14. Identify top-performing city

In [13]:
#13
from pyspark.sql.window import Window
from pyspark.sql.functions import rank

city_revenue = df_clean.groupBy("city") \
    .agg(sum("amount").alias("total_revenue"))

window_spec = Window.orderBy(col("total_revenue").desc())

city_ranked = city_revenue.withColumn(
    "rank", rank().over(window_spec)
)

city_ranked.show()

+---------+-------------+----+
|     city|total_revenue|rank|
+---------+-------------+----+
|   Mumbai|        87000|   1|
|Bangalore|        58000|   2|
|    Delhi|        47000|   3|
|   Delhi |        45000|   4|
+---------+-------------+----+



In [14]:
#14
city_ranked.filter(col("rank") == 1).show()

+------+-------------+----+
|  city|total_revenue|rank|
+------+-------------+----+
|Mumbai|        87000|   1|
+------+-------------+----+



PHASE 5 — PERFORMANCE AWARENESS

Tasks

15. Cache the cleaned DataFrame
16. Run two aggregations and observe behavior
17. Use explain(True) to inspect the plan

In [15]:
#15
df_clean.cache()
df_clean.count()

6

In [17]:
#16
df_clean.groupBy("city").sum("amount").show()
df_clean.groupBy("product").sum("amount").show()

+---------+-----------+
|     city|sum(amount)|
+---------+-----------+
|   Mumbai|      87000|
|   Delhi |      45000|
|Bangalore|      58000|
|    Delhi|      47000|
+---------+-----------+

+-------+-----------+
|product|sum(amount)|
+-------+-----------+
| Laptop|     147000|
|Mobile |      32000|
| Mobile|      28000|
| Tablet|      30000|
+-------+-----------+



In [18]:
#17
df_clean.explain(True)

== Parsed Logical Plan ==
'Filter '`=`('status, Completed)
+- Deduplicate [order_id#0]
   +- Filter isnotnull(amount#71)
      +- Project [order_id#0, city#50, product#51, CASE WHEN RLIKE(amount#3, ^[0-9]+$) THEN cast(amount#3 as int) END AS amount#71, order_date#4, status#30]
         +- Project [order_id#0, city#50, initcap(lower(product#2)) AS product#51, amount#3, order_date#4, status#30]
            +- Project [order_id#0, initcap(lower(city#1)) AS city#50, product#2, amount#3, order_date#4, status#30]
               +- Project [order_id#0, city#1, product#2, amount#3, order_date#4, trim(status#5, None) AS status#30]
                  +- LogicalRDD [order_id#0, city#1, product#2, amount#3, order_date#4, status#5], false

== Analyzed Logical Plan ==
order_id: string, city: string, product: string, amount: int, order_date: string, status: string
Filter (status#30 = Completed)
+- Deduplicate [order_id#0]
   +- Filter isnotnull(amount#71)
      +- Project [order_id#0, city#50, product