In [2]:
from google.colab import files

# Open a file picker to upload
uploaded = files.upload()




Saving orders.csv to orders.csv


In [3]:

from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("OrdersPipeline") \
    .getOrCreate()

df_raw = spark.read.csv("orders.csv", header=True, inferSchema=False)


In [4]:
df_raw.printSchema()

root
 |-- order_id: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- city: string (nullable = true)
 |-- category: string (nullable = true)
 |-- product: string (nullable = true)
 |-- amount: string (nullable = true)
 |-- order_date: string (nullable = true)
 |-- status: string (nullable = true)



In [5]:

raw_count = df_raw.count()
print("Total Records =", raw_count)


Total Records = 300000


In [6]:
df_raw.show(10, truncate=False)

+-----------+-----------+-----------+-----------+-----------+-------+----------+---------+
|order_id   |customer_id|city       |category   |product    |amount |order_date|status   |
+-----------+-----------+-----------+-----------+-----------+-------+----------+---------+
|ORD00000000|C000000    | hyderabad | grocery   |Oil        |invalid|01/01/2024|Cancelled|
|ORD00000001|C000001    |Pune       |Grocery    |Sugar      |35430  |2024-01-02|Completed|
|ORD00000002|C000002    |Pune       |Electronics|Mobile     |65358  |2024-01-03|Completed|
|ORD00000003|C000003    |Bangalore  |Electronics|Laptop     |5558   |2024-01-04|Completed|
|ORD00000004|C000004    |Pune       |Home       |AirPurifier|33659  |2024-01-05|Completed|
|ORD00000005|C000005    |Delhi      |Fashion    |Jeans      |8521   |2024-01-06|Completed|
|ORD00000006|C000006    |Delhi      |Grocery    |Sugar      |42383  |2024-01-07|Completed|
|ORD00000007|C000007    |Pune       |Grocery    |Rice       |45362  |2024-01-08|Completed|

PHASE 2 — DATA CLEANING

In [7]:

from pyspark.sql.functions import trim, col

df = df_raw.withColumn("city", trim(col("city"))) \
           .withColumn("category", trim(col("category"))) \
           .withColumn("product", trim(col("product")))


In [8]:

from pyspark.sql.functions import initcap

df = df.withColumn("city_clean", initcap(col("city"))) \
       .withColumn("category_clean", initcap(col("category"))) \
       .withColumn("product_clean", initcap(col("product")))


In [9]:

from pyspark.sql.functions import regexp_replace, when

df = df.withColumn(
    "amount_clean",
    regexp_replace("amount", ",", "")
)

df = df.withColumn(
    "amount_clean",
    when(col("amount_clean").rlike("^[0-9]+$"), col("amount_clean")).otherwise(None)
)

df = df.withColumn("amount_int", col("amount_clean").cast("int"))


In [17]:
from pyspark.sql.functions import coalesce, to_date, try_to_timestamp, col, lit

df = df.withColumn(
    "order_date_clean",
    coalesce(
        try_to_timestamp(col("order_date"), lit("yyyy-MM-dd")),
        try_to_timestamp(col("order_date"), lit("dd/MM/yyyy")),
        try_to_timestamp(col("order_date"), lit("yyyy/MM/dd"))
    ).cast("date")
)

PHASE 3 — DATA VALIDATION

In [11]:
invalid_amounts = df.filter(col("amount_int").isNull()).count()

In [18]:
from pyspark.sql.functions import to_timestamp

# The previous line 'df = df.withColumn("parsed_date", to_timestamp("date_column", "dd/MM/yyyy"))' was likely a leftover or a test and not intended to be part of the final flow here,
# as it refers to a non-existent 'date_column' and would override the cleaned date column if it were named 'order_date_clean'.
# I'm removing it to avoid introducing new issues and to ensure we're using the 'order_date_clean' column generated in the previous step.
# df = df.withColumn("parsed_date", to_timestamp("date_column", "dd/MM/yyyy"))

invalid_dates = df.filter(col("order_date_clean").isNull()).count()

In [19]:

from pyspark.sql.functions import count

df.groupBy("order_id").agg(count("*").alias("cnt")).filter("cnt > 1").show()


+--------+---+
|order_id|cnt|
+--------+---+
+--------+---+



In [20]:
df = df.dropDuplicates(["order_id"])

In [21]:
df_completed = df.filter(col("status") == "Completed")

In [22]:

print("After cleaning:", df.count())
print("After filtering completed:", df_completed.count())


After cleaning: 300000
After filtering completed: 285000


PHASE 4 — PERFORMANCE ENGINEERING

In [23]:

print(df.rdd.getNumPartitions())

2


In [24]:
city_rev = df.groupBy("city_clean").sum("amount_int")

In [25]:
city_rev.explain(True)

== Parsed Logical Plan ==
'Aggregate ['city_clean], ['city_clean, unresolvedalias('sum(amount_int#79))]
+- Deduplicate [order_id#17]
   +- Project [order_id#17, customer_id#18, city#71, category#72, product#73, amount#22, order_date#23, status#24, city_clean#74, category_clean#75, product_clean#76, amount_clean#78, amount_int#79, cast(coalesce(try_to_timestamp(order_date#23, Some(yyyy-MM-dd), TimestampType, Some(Etc/UTC), false), try_to_timestamp(order_date#23, Some(dd/MM/yyyy), TimestampType, Some(Etc/UTC), false), try_to_timestamp(order_date#23, Some(yyyy/MM/dd), TimestampType, Some(Etc/UTC), false)) as date) AS order_date_clean#127]
      +- Project [order_id#17, customer_id#18, city#71, category#72, product#73, amount#22, order_date#23, status#24, city_clean#74, category_clean#75, product_clean#76, amount_clean#78, amount_int#79, CASE WHEN isnull(order_date_clean#123) THEN to_date(order_date#23, Some(yyyy/MM/dd), Some(Etc/UTC), true) END AS order_date_clean#124]
         +- Project

4. Identify shuffle

GroupBy always causes a shuffle because it requires same-key data to be co-located.

In [26]:
df_rep = df.repartition("city_clean")

In [27]:
df_rep.groupBy("city_clean").sum("amount_int").explain(True)

== Parsed Logical Plan ==
'Aggregate ['city_clean], ['city_clean, unresolvedalias('sum(amount_int#79))]
+- RepartitionByExpression [city_clean#74]
   +- Deduplicate [order_id#17]
      +- Project [order_id#17, customer_id#18, city#71, category#72, product#73, amount#22, order_date#23, status#24, city_clean#74, category_clean#75, product_clean#76, amount_clean#78, amount_int#79, cast(coalesce(try_to_timestamp(order_date#23, Some(yyyy-MM-dd), TimestampType, Some(Etc/UTC), false), try_to_timestamp(order_date#23, Some(dd/MM/yyyy), TimestampType, Some(Etc/UTC), false), try_to_timestamp(order_date#23, Some(yyyy/MM/dd), TimestampType, Some(Etc/UTC), false)) as date) AS order_date_clean#127]
         +- Project [order_id#17, customer_id#18, city#71, category#72, product#73, amount#22, order_date#23, status#24, city_clean#74, category_clean#75, product_clean#76, amount_clean#78, amount_int#79, CASE WHEN isnull(order_date_clean#123) THEN to_date(order_date#23, Some(yyyy/MM/dd), Some(Etc/UTC), tr

PHASE 5 — ANALYTICS

In [28]:

rev_city = df.groupBy("city_clean").sum("amount_int")


In [29]:
rev_cat = df.groupBy("category_clean").sum("amount_int")

In [30]:
avg_city = df.groupBy("city_clean").avg("amount_int")

In [31]:

top_products = df.groupBy("product_clean").sum("amount_int")\
                 .orderBy(col("sum(amount_int)").desc())\
                 .limit(10)


In [32]:
rev_city.orderBy(col("sum(amount_int)").desc())

DataFrame[city_clean: string, sum(amount_int): bigint]

PHASE 6 — WINDOW FUNCTIONS

In [33]:

from pyspark.sql.window import Window
from pyspark.sql.functions import rank, dense_rank


In [34]:

w = Window.orderBy(col("sum(amount_int)").desc())
city_rank = rev_city.withColumn("rank", rank().over(w))


In [35]:

wcat = Window.partitionBy("category_clean").orderBy(col("sum(amount_int)").desc())
prod_rank = df.groupBy("category_clean","product_clean").sum("amount_int") \
              .withColumn("rank", dense_rank().over(wcat))


In [36]:
top_per_cat = prod_rank.filter("rank = 1")

In [37]:
top3_cities = city_rank.filter(col("rank") <= 3)

PHASE 7 — BROADCAST JOIN

In [38]:

data = [
 ("Delhi","North"), ("Mumbai","West"), ("Bangalore","South"),
 ("Hyderabad","South"), ("Pune","West"), ("Chennai","South"), ("Kolkata","East")
]

region_df = spark.createDataFrame(data, ["city","region"])


In [39]:

from pyspark.sql.functions import broadcast

df_joined = df.join(broadcast(region_df), df.city_clean == region_df.city, "left")


In [40]:
df_joined.explain(True)

== Parsed Logical Plan ==
Join LeftOuter, (city_clean#74 = city#616)
:- Deduplicate [order_id#17]
:  +- Project [order_id#17, customer_id#18, city#71, category#72, product#73, amount#22, order_date#23, status#24, city_clean#74, category_clean#75, product_clean#76, amount_clean#78, amount_int#79, cast(coalesce(try_to_timestamp(order_date#23, Some(yyyy-MM-dd), TimestampType, Some(Etc/UTC), false), try_to_timestamp(order_date#23, Some(dd/MM/yyyy), TimestampType, Some(Etc/UTC), false), try_to_timestamp(order_date#23, Some(yyyy/MM/dd), TimestampType, Some(Etc/UTC), false)) as date) AS order_date_clean#127]
:     +- Project [order_id#17, customer_id#18, city#71, category#72, product#73, amount#22, order_date#23, status#24, city_clean#74, category_clean#75, product_clean#76, amount_clean#78, amount_int#79, CASE WHEN isnull(order_date_clean#123) THEN to_date(order_date#23, Some(yyyy/MM/dd), Some(Etc/UTC), true) END AS order_date_clean#124]
:        +- Project [order_id#17, customer_id#18, city

4. Why broadcast join is efficient?

* Lookup table is small
* Sent to every executor once
* Avoids shuffling large orders dataset
* Fastest join method in Spark for small reference tables

PHASE 8 — UDF

In [41]:

from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

def classify(v):
    if v is None: return "Low"
    if v >= 80000: return "High"
    if v >= 40000: return "Medium"
    return "Low"

classify_udf = udf(classify, StringType())

df = df.withColumn("order_value_category", classify_udf(col("amount_int")))


In [42]:
df.groupBy("order_value_category").count().show()

+--------------------+------+
|order_value_category| count|
+--------------------+------+
|                High| 29371|
|                 Low|153451|
|              Medium|117178|
+--------------------+------+



PHASE 9 — RDD

In [43]:
rdd = df.rdd

In [44]:
total_rev = rdd.map(lambda r: r["amount_int"] or 0).reduce(lambda x,y: x+y)

In [46]:

city_orders = rdd.map(lambda r: (r["city_clean"], 1)).reduceByKey(lambda a,b: a+b)



4. Why DataFrames are better than RDDs?

* Catalyst optimizer
* Tungsten execution engine
* Predicate pushdown
* Columnar optimization
* Much faster
* Safer, less code
* More memory-efficient

PHASE 10 — CACHING

In [47]:
df.cache()

DataFrame[order_id: string, customer_id: string, city: string, category: string, product: string, amount: string, order_date: string, status: string, city_clean: string, category_clean: string, product_clean: string, amount_clean: string, amount_int: int, order_date_clean: date, order_value_category: string]

In [48]:

df.groupBy("city_clean").sum("amount_int").show()
df.groupBy("category_clean").avg("amount_int").show()


+----------+---------------+
|city_clean|sum(amount_int)|
+----------+---------------+
| Bangalore|     1713862477|
|   Chennai|     1714214871|
|    Mumbai|     1712702171|
|   Kolkata|     1708896076|
|      Pune|     1733418439|
|     Delhi|     1719458186|
| Hyderabad|     1731794799|
+----------+---------------+

+--------------+------------------+
|category_clean|   avg(amount_int)|
+--------------+------------------+
|          Home|  43861.3925077651|
|       Fashion| 43661.82383875079|
|       Grocery|43865.714036058605|
|   Electronics| 43759.72683138903|
+--------------+------------------+



In [49]:
df.unpersist()

DataFrame[order_id: string, customer_id: string, city: string, category: string, product: string, amount: string, order_date: string, status: string, city_clean: string, category_clean: string, product_clean: string, amount_clean: string, amount_int: int, order_date_clean: date, order_value_category: string]

PHASE 11 — STORAGE FORMATS

In [50]:
df.write.mode("overwrite").partitionBy("city_clean").parquet("clean_orders_parquet")

In [51]:
rev_city.write.orc("rev_city_orc")

In [52]:

p = spark.read.parquet("clean_orders_parquet")
o = spark.read.orc("rev_city_orc")
p.printSchema()
o.printSchema()


root
 |-- order_id: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- city: string (nullable = true)
 |-- category: string (nullable = true)
 |-- product: string (nullable = true)
 |-- amount: string (nullable = true)
 |-- order_date: string (nullable = true)
 |-- status: string (nullable = true)
 |-- category_clean: string (nullable = true)
 |-- product_clean: string (nullable = true)
 |-- amount_clean: string (nullable = true)
 |-- amount_int: integer (nullable = true)
 |-- order_date_clean: date (nullable = true)
 |-- order_value_category: string (nullable = true)
 |-- city_clean: string (nullable = true)

root
 |-- city_clean: string (nullable = true)
 |-- sum(amount_int): long (nullable = true)



4. Parquet/ORC vs CSV

* Columnar
* Compressed
* Much smaller size
* Predicate pushdown
* Way faster for analytics



PHASE 12 — DEBUGGING

Why does this fail?

In [53]:
df = df.filter(df.amount > 50000).show()

{"ts": "2026-01-15 11:19:35.152", "level": "ERROR", "logger": "DataFrameQueryContextLogger", "msg": "[CAST_INVALID_INPUT] The value 'invalid' of the type \"STRING\" cannot be cast to \"BIGINT\" because it is malformed. Correct the value as per the syntax, or change its target type. Use `try_cast` to tolerate malformed input and return NULL instead. SQLSTATE: 22018", "context": {"file": "line 1 in cell [53]", "line": "", "fragment": "__gt__", "errorClass": "CAST_INVALID_INPUT"}, "exception": {"class": "Py4JJavaError", "msg": "An error occurred while calling o512.showString.\n: org.apache.spark.SparkNumberFormatException: [CAST_INVALID_INPUT] The value 'invalid' of the type \"STRING\" cannot be cast to \"BIGINT\" because it is malformed. Correct the value as per the syntax, or change its target type. Use `try_cast` to tolerate malformed input and return NULL instead. SQLSTATE: 22018\n== DataFrame ==\n\"__gt__\" was called from\nline 1 in cell [53]\n\n\tat org.apache.spark.sql.errors.Quer

NumberFormatException: [CAST_INVALID_INPUT] The value 'invalid' of the type "STRING" cannot be cast to "BIGINT" because it is malformed. Correct the value as per the syntax, or change its target type. Use `try_cast` to tolerate malformed input and return NULL instead. SQLSTATE: 22018
== DataFrame ==
"__gt__" was called from
line 1 in cell [53]


Because:

* .show() returns None
* You assign None back to df
* df is no longer a DataFrame

Correct usage:

In [57]:
from pyspark.sql.functions import trim, col, initcap, regexp_replace, when, coalesce, try_to_timestamp, lit

# Re-apply cleaning steps to ensure df is in a consistent state with amount_int correctly processed
df = df_raw.withColumn("city", trim(col("city"))) \
           .withColumn("category", trim(col("category"))) \
           .withColumn("product", trim(col("product")))

df = df.withColumn("city_clean", initcap(col("city"))) \
       .withColumn("category_clean", initcap(col("category"))) \
       .withColumn("product_clean", initcap(col("product")))

df = df.withColumn(
    "amount_clean",
    regexp_replace("amount", ",", "")
)

df = df.withColumn(
    "amount_clean",
    when(col("amount_clean").rlike("^[0-9]+$"), col("amount_clean")).otherwise(None)
)

df = df.withColumn("amount_int", col("amount_clean").cast("int"))

df = df.withColumn(
    "order_date_clean",
    coalesce(
        try_to_timestamp(col("order_date"), lit("yyyy-MM-dd")),
        try_to_timestamp(col("order_date"), lit("dd/MM/yyyy")),
        try_to_timestamp(col("order_date"), lit("yyyy/MM/dd"))
    ).cast("date")
)

df = df.dropDuplicates(["order_id"])

# Now perform the filter operation
df = df.filter(df.amount_int > 50000)
df.show()

+-----------+-----------+---------+-----------+-----------+------+----------+---------+----------+--------------+-------------+------------+----------+----------------+
|   order_id|customer_id|     city|   category|    product|amount|order_date|   status|city_clean|category_clean|product_clean|amount_clean|amount_int|order_date_clean|
+-----------+-----------+---------+-----------+-----------+------+----------+---------+----------+--------------+-------------+------------+----------+----------------+
|ORD00000010|    C000010|Bangalore|    Grocery|      Sugar| 66576|2024-01-11|Completed| Bangalore|       Grocery|        Sugar|       66576|     66576|      2024-01-11|
|ORD00000011|    C000011|  Kolkata|Electronics|     Tablet| 50318|12/01/2024|Completed|   Kolkata|   Electronics|       Tablet|       50318|     50318|      2024-01-12|
|ORD00000012|    C000012|Bangalore|    Grocery|      Sugar| 84768|2024-01-13|Completed| Bangalore|       Grocery|        Sugar|       84768|     84768|    

PHASE 13 — FINAL VALIDATION

In [58]:
df.printSchema()

root
 |-- order_id: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- city: string (nullable = true)
 |-- category: string (nullable = true)
 |-- product: string (nullable = true)
 |-- amount: string (nullable = true)
 |-- order_date: string (nullable = true)
 |-- status: string (nullable = true)
 |-- city_clean: string (nullable = true)
 |-- category_clean: string (nullable = true)
 |-- product_clean: string (nullable = true)
 |-- amount_clean: string (nullable = true)
 |-- amount_int: integer (nullable = true)
 |-- order_date_clean: date (nullable = true)



In [59]:
df.filter("city_clean IS NULL or category_clean IS NULL or product_clean IS NULL").count

3. Documentation Summary

Cleaning Strategy

* Keep original columns
* Trim + proper case
* Clean amounts safely
* Parse multi-format dates
* Handle invalid rows gently

Performance Strategy

* Partitioning
* Broadcast join
* Cache only when needed
* Avoid shuffles
* File formats Parquet/ORC

Debugging Learnings

* .show() destroys DataFrames
* Always separate transformations and actions
* Schema inference can corrupt data

