In [4]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

spark = SparkSession.builder.appName("Customer Transactions").getOrCreate()

In [5]:
raw_orders = [
("ORD001","C001","Ravi"," Delhi ","Laptop","Electronics","45000","2024-01-05","Completed"),
("ORD002","C002","Sneha","Mumbai"," Mobile ","Electronics","32000","05/01/2024","Completed"),
("ORD003","C003","Aman","Bangalore","Laptop","Electronics","55000","2024/01/06","Completed"),
("ORD004","C004","Pooja","Delhi","Tablet"," Electronics ","","2024-01-07","Cancelled"),
("ORD005","C005","Neha","Chennai","Laptop","Electronics","48000","invalid_date","Completed"),
("ORD006","C006","Rahul","Mumbai","Mobile","Electronics",None,"2024-01-08","Completed"),
("ORD007","C007","Kiran","Bangalore","Tablet","Electronics","30000","2024-01-08","Completed"),
("ORD008","C008","Amit","Delhi","Laptop","electronics","45000","2024-01-09","Completed"),
("ORD009","C009","Priya"," Pune","Mobile","Electronics","28000","09-01-2024","Completed"),
("ORD010","C010","Suresh","Mumbai","Laptop","Electronics","55000","2024-01-10","Completed"),
("ORD010","C010","Suresh","Mumbai","Laptop","Electronics","55000","2024-01-10","Completed"),  #duplicate
("ORD011","C011","Meena","Chennai","Tablet","Electronics","31000","2024-01-11","Completed"),
("ORD012","C012","Arjun","Delhi","Mobile","Electronics","27000","2024/01/11","Completed"),
("ORD013","C013","Nikhil","Bangalore","Laptop","Electronics","60000","2024-01-12","Completed"),
("ORD014","C014","Rohit","Mumbai","Mobile","Electronics","invalid_price","2024-01-12","Completed"),
("ORD015","C015","Anita","Delhi","Tablet","Electronics","29000","2024-01-13","Completed"),
("ORD016","C016","Vikas","Chennai","Laptop","Electronics","52000","2024-01-13","Completed"),
("ORD017","C017","Sunita","Mumbai","Mobile","Electronics","33000","2024-01-14","Completed"),
("ORD018","C018","Deepak","Bangalore","Laptop","Electronics","58000","2024-01-14","Completed"),
("ORD019","C019","Pallavi","Delhi","Mobile","Electronics","26000","2024-01-15","Completed"),
("ORD020","C020","Manish","Mumbai","Tablet","Electronics","34000","2024-01-15","Completed")
]

In [6]:
schema = StructType([
    StructField("order_id", StringType(), True),
    StructField("customer_id", StringType(), True),
    StructField("customer_name", StringType(), True),
    StructField("city", StringType(), True),
    StructField("product", StringType(), True),
    StructField("category", StringType(), True),
    StructField("price", StringType(), True),
    StructField("order_date", StringType(), True),
    StructField("order_status", StringType(), True)
])

In [8]:
df = spark.createDataFrame(raw_orders, schema)
df.show(truncate=False)

+--------+-----------+-------------+---------+--------+-------------+-------------+------------+------------+
|order_id|customer_id|customer_name|city     |product |category     |price        |order_date  |order_status|
+--------+-----------+-------------+---------+--------+-------------+-------------+------------+------------+
|ORD001  |C001       |Ravi         | Delhi   |Laptop  |Electronics  |45000        |2024-01-05  |Completed   |
|ORD002  |C002       |Sneha        |Mumbai   | Mobile |Electronics  |32000        |05/01/2024  |Completed   |
|ORD003  |C003       |Aman         |Bangalore|Laptop  |Electronics  |55000        |2024/01/06  |Completed   |
|ORD004  |C004       |Pooja        |Delhi    |Tablet  | Electronics |             |2024-01-07  |Cancelled   |
|ORD005  |C005       |Neha         |Chennai  |Laptop  |Electronics  |48000        |invalid_date|Completed   |
|ORD006  |C006       |Rahul        |Mumbai   |Mobile  |Electronics  |NULL         |2024-01-08  |Completed   |
|ORD007  |

Column Operations
1. Rename all columns to snake_case
2. Add a column price_with_tax (18%)
3. Add a column price_category (Low / Medium / High)

Data Cleaning

4. Trim and standardize city , product , category
5. Convert price to integer
6. Handle invalid and null prices
7. Normalize all dates into DateType
8. Remove duplicate orders
9. Filter only Completed orders

Data Transformation

10. Create order_year , order_month
11. Aggregate total revenue per city
12. Aggregate total revenue per product
13. Identify top 3 cities by revenue
14. Identify products with average price above threshold

File Format Operations

15. Write cleaned data to Parquet
16. Read Parquet back and verify schema
17. Write the same data to ORC
18. (Optional) Write to Avro

Performance & Validation

19. Check number of partitions
20. Repartition before writing
21. Compare file counts between Parquet and ORC
22. Run explain(True) on final pipeline

In [32]:
columns = [
    "order_id",
    "customer_id",
    "customer_name",
    "city",
    "product",
    "category",
    "price",
    "order_date",
    "order_status"
]

df_chg = df.toDF(*columns)
df_chg.show()
df_chg.printSchema()

+--------+-----------+-------------+---------+--------+-------------+-------------+------------+------------+
|order_id|customer_id|customer_name|     city| product|     category|        price|  order_date|order_status|
+--------+-----------+-------------+---------+--------+-------------+-------------+------------+------------+
|  ORD001|       C001|         Ravi|   Delhi |  Laptop|  Electronics|        45000|  2024-01-05|   Completed|
|  ORD002|       C002|        Sneha|   Mumbai| Mobile |  Electronics|        32000|  05/01/2024|   Completed|
|  ORD003|       C003|         Aman|Bangalore|  Laptop|  Electronics|        55000|  2024/01/06|   Completed|
|  ORD004|       C004|        Pooja|    Delhi|  Tablet| Electronics |             |  2024-01-07|   Cancelled|
|  ORD005|       C005|         Neha|  Chennai|  Laptop|  Electronics|        48000|invalid_date|   Completed|
|  ORD006|       C006|        Rahul|   Mumbai|  Mobile|  Electronics|         NULL|  2024-01-08|   Completed|
|  ORD007|

In [52]:
#2
from pyspark.sql.functions import when, col
from pyspark.sql.types import DoubleType

df_chg = df_chg.withColumn("price",
                 when(col("price").rlike("^[0-9]+$"),col("price").cast(DoubleType())).otherwise(None))

if "price_with_tax" in df.columns:
    df_chg = df_chg.drop("price_with_tax")

df_chg = df_chg.withColumn("price_with_tax", col("price") * 1.18)

df_chg.show(5)

+--------+-----------+-------------+---------+-------+-----------+-------+------------+------------+--------------+
|order_id|customer_id|customer_name|     city|product|   category|  price|  order_date|order_status|price_with_tax|
+--------+-----------+-------------+---------+-------+-----------+-------+------------+------------+--------------+
|  ORD001|       C001|         Ravi|    Delhi| Laptop|Electronics|45000.0|  2024-01-05|   Completed|       53100.0|
|  ORD002|       C002|        Sneha|   Mumbai| Mobile|Electronics|32000.0|  05/01/2024|   Completed|       37760.0|
|  ORD003|       C003|         Aman|Bangalore| Laptop|Electronics|55000.0|  2024/01/06|   Completed|       64900.0|
|  ORD004|       C004|        Pooja|    Delhi| Tablet|Electronics|   NULL|  2024-01-07|   Cancelled|          NULL|
|  ORD005|       C005|         Neha|  Chennai| Laptop|Electronics|48000.0|invalid_date|   Completed|       56640.0|
+--------+-----------+-------------+---------+-------+-----------+------

In [53]:
#3
df_chg=df_chg.withColumn("price_category",
                 when(col("price")<30000,"Low")
                 .when(col("price").between(30000,50000),"Medium")
                 .otherwise("High")
                 )
df_chg.show()

+--------+-----------+-------------+---------+-------+-----------+-------+------------+------------+--------------+--------------+
|order_id|customer_id|customer_name|     city|product|   category|  price|  order_date|order_status|price_with_tax|price_category|
+--------+-----------+-------------+---------+-------+-----------+-------+------------+------------+--------------+--------------+
|  ORD001|       C001|         Ravi|    Delhi| Laptop|Electronics|45000.0|  2024-01-05|   Completed|       53100.0|        Medium|
|  ORD002|       C002|        Sneha|   Mumbai| Mobile|Electronics|32000.0|  05/01/2024|   Completed|       37760.0|        Medium|
|  ORD003|       C003|         Aman|Bangalore| Laptop|Electronics|55000.0|  2024/01/06|   Completed|       64900.0|          High|
|  ORD004|       C004|        Pooja|    Delhi| Tablet|Electronics|   NULL|  2024-01-07|   Cancelled|          NULL|          High|
|  ORD005|       C005|         Neha|  Chennai| Laptop|Electronics|48000.0|invalid_d

In [54]:
#4
df_chg = df_chg.withColumn("city", trim(initcap(col("city")))) \
       .withColumn("product", trim(initcap(col("product")))) \
       .withColumn("category", trim(initcap(col("category")))) \
       .withColumn("order_status", trim(initcap(col("order_status"))))
df_chg.show()

+--------+-----------+-------------+---------+-------+-----------+-------+------------+------------+--------------+--------------+
|order_id|customer_id|customer_name|     city|product|   category|  price|  order_date|order_status|price_with_tax|price_category|
+--------+-----------+-------------+---------+-------+-----------+-------+------------+------------+--------------+--------------+
|  ORD001|       C001|         Ravi|    Delhi| Laptop|Electronics|45000.0|  2024-01-05|   Completed|       53100.0|        Medium|
|  ORD002|       C002|        Sneha|   Mumbai| Mobile|Electronics|32000.0|  05/01/2024|   Completed|       37760.0|        Medium|
|  ORD003|       C003|         Aman|Bangalore| Laptop|Electronics|55000.0|  2024/01/06|   Completed|       64900.0|          High|
|  ORD004|       C004|        Pooja|    Delhi| Tablet|Electronics|   NULL|  2024-01-07|   Cancelled|          NULL|          High|
|  ORD005|       C005|         Neha|  Chennai| Laptop|Electronics|48000.0|invalid_d

In [55]:
#5
from pyspark.sql.types import IntegerType

df_chg = df_chg.withColumn("price", col("price").cast(IntegerType()))
df_chg.show(5)
df_chg.printSchema()

+--------+-----------+-------------+---------+-------+-----------+-----+------------+------------+--------------+--------------+
|order_id|customer_id|customer_name|     city|product|   category|price|  order_date|order_status|price_with_tax|price_category|
+--------+-----------+-------------+---------+-------+-----------+-----+------------+------------+--------------+--------------+
|  ORD001|       C001|         Ravi|    Delhi| Laptop|Electronics|45000|  2024-01-05|   Completed|       53100.0|        Medium|
|  ORD002|       C002|        Sneha|   Mumbai| Mobile|Electronics|32000|  05/01/2024|   Completed|       37760.0|        Medium|
|  ORD003|       C003|         Aman|Bangalore| Laptop|Electronics|55000|  2024/01/06|   Completed|       64900.0|          High|
|  ORD004|       C004|        Pooja|    Delhi| Tablet|Electronics| NULL|  2024-01-07|   Cancelled|          NULL|          High|
|  ORD005|       C005|         Neha|  Chennai| Laptop|Electronics|48000|invalid_date|   Completed

In [56]:
#6
from pyspark.sql.functions import mean, col, when
from pyspark.sql.types import IntegerType, DoubleType

mean_price = df_chg.agg(mean(col("price"))).collect()[0][0]

df_chg = df_chg.withColumn("price", when(col("price").isNull(), mean_price).otherwise(col("price")))

df_chg = df_chg.withColumn("price", col("price").cast(IntegerType()))

df_chg = df_chg.withColumn("price_with_tax", col("price").cast(DoubleType()) * 1.18)

df_chg = df_chg.withColumn("price_category",
                 when(col("price") < 30000, "Low")
                 .when(col("price").between(30000, 50000), "Medium")
                 .otherwise("High"))

df_chg.show(5)
df_chg.printSchema()

+--------+-----------+-------------+---------+-------+-----------+-----+------------+------------+--------------+--------------+
|order_id|customer_id|customer_name|     city|product|   category|price|  order_date|order_status|price_with_tax|price_category|
+--------+-----------+-------------+---------+-------+-----------+-----+------------+------------+--------------+--------------+
|  ORD001|       C001|         Ravi|    Delhi| Laptop|Electronics|45000|  2024-01-05|   Completed|       53100.0|        Medium|
|  ORD002|       C002|        Sneha|   Mumbai| Mobile|Electronics|32000|  05/01/2024|   Completed|       37760.0|        Medium|
|  ORD003|       C003|         Aman|Bangalore| Laptop|Electronics|55000|  2024/01/06|   Completed|       64900.0|          High|
|  ORD004|       C004|        Pooja|    Delhi| Tablet|Electronics|41277|  2024-01-07|   Cancelled|      48706.86|        Medium|
|  ORD005|       C005|         Neha|  Chennai| Laptop|Electronics|48000|invalid_date|   Completed

In [57]:
#7
from pyspark.sql.functions import udf
from pyspark.sql.types import DateType
from datetime import datetime


from pyspark.sql.functions import coalesce, to_date


from pyspark.sql.functions import udf, col, trim
from pyspark.sql.types import DateType
from datetime import datetime

def parse_date_py(date_str):
    if date_str is None:
        return None
    date_str = date_str.strip()
    formats = ["%Y-%m-%d", "%d-%m-%Y", "%m/%d/%Y", "%Y/%m/%d"]
    for fmt in formats:
        try:
            return datetime.strptime(date_str, fmt).date()
        except ValueError:
            continue
    return None

parse_date_udf = udf(parse_date_py, DateType())

df_chg = df_chg.withColumn("order_date", parse_date_udf(col("order_date")))

df_chg.show(truncate=False)
df_chg.printSchema()

+--------+-----------+-------------+---------+-------+-----------+-----+----------+------------+--------------+--------------+
|order_id|customer_id|customer_name|city     |product|category   |price|order_date|order_status|price_with_tax|price_category|
+--------+-----------+-------------+---------+-------+-----------+-----+----------+------------+--------------+--------------+
|ORD001  |C001       |Ravi         |Delhi    |Laptop |Electronics|45000|2024-01-05|Completed   |53100.0       |Medium        |
|ORD002  |C002       |Sneha        |Mumbai   |Mobile |Electronics|32000|2024-05-01|Completed   |37760.0       |Medium        |
|ORD003  |C003       |Aman         |Bangalore|Laptop |Electronics|55000|2024-01-06|Completed   |64900.0       |High          |
|ORD004  |C004       |Pooja        |Delhi    |Tablet |Electronics|41277|2024-01-07|Cancelled   |48706.86      |Medium        |
|ORD005  |C005       |Neha         |Chennai  |Laptop |Electronics|48000|NULL      |Completed   |56640.0       |

In [58]:
#8
df_chg = df_chg.dropDuplicates(["order_id"])
df_chg.show()

+--------+-----------+-------------+---------+-------+-----------+-----+----------+------------+--------------+--------------+
|order_id|customer_id|customer_name|     city|product|   category|price|order_date|order_status|price_with_tax|price_category|
+--------+-----------+-------------+---------+-------+-----------+-----+----------+------------+--------------+--------------+
|  ORD001|       C001|         Ravi|    Delhi| Laptop|Electronics|45000|2024-01-05|   Completed|       53100.0|        Medium|
|  ORD002|       C002|        Sneha|   Mumbai| Mobile|Electronics|32000|2024-05-01|   Completed|       37760.0|        Medium|
|  ORD003|       C003|         Aman|Bangalore| Laptop|Electronics|55000|2024-01-06|   Completed|       64900.0|          High|
|  ORD004|       C004|        Pooja|    Delhi| Tablet|Electronics|41277|2024-01-07|   Cancelled|      48706.86|        Medium|
|  ORD005|       C005|         Neha|  Chennai| Laptop|Electronics|48000|      NULL|   Completed|       56640.0|

In [59]:
#9
df_chg=df_chg.filter(col("order_status")=="Completed")
df_chg.show()

+--------+-----------+-------------+---------+-------+-----------+-----+----------+------------+--------------+--------------+
|order_id|customer_id|customer_name|     city|product|   category|price|order_date|order_status|price_with_tax|price_category|
+--------+-----------+-------------+---------+-------+-----------+-----+----------+------------+--------------+--------------+
|  ORD001|       C001|         Ravi|    Delhi| Laptop|Electronics|45000|2024-01-05|   Completed|       53100.0|        Medium|
|  ORD002|       C002|        Sneha|   Mumbai| Mobile|Electronics|32000|2024-05-01|   Completed|       37760.0|        Medium|
|  ORD003|       C003|         Aman|Bangalore| Laptop|Electronics|55000|2024-01-06|   Completed|       64900.0|          High|
|  ORD005|       C005|         Neha|  Chennai| Laptop|Electronics|48000|      NULL|   Completed|       56640.0|        Medium|
|  ORD006|       C006|        Rahul|   Mumbai| Mobile|Electronics|41277|2024-01-08|   Completed|      48706.86|

In [60]:
#10
from pyspark.sql.functions import year,month
df_chg=df_chg.withColumn("order_year",year(col("order_date"))).withColumn("order_month",month(col("order_date")))
df_chg.show()

+--------+-----------+-------------+---------+-------+-----------+-----+----------+------------+--------------+--------------+----------+-----------+
|order_id|customer_id|customer_name|     city|product|   category|price|order_date|order_status|price_with_tax|price_category|order_year|order_month|
+--------+-----------+-------------+---------+-------+-----------+-----+----------+------------+--------------+--------------+----------+-----------+
|  ORD001|       C001|         Ravi|    Delhi| Laptop|Electronics|45000|2024-01-05|   Completed|       53100.0|        Medium|      2024|          1|
|  ORD002|       C002|        Sneha|   Mumbai| Mobile|Electronics|32000|2024-05-01|   Completed|       37760.0|        Medium|      2024|          5|
|  ORD003|       C003|         Aman|Bangalore| Laptop|Electronics|55000|2024-01-06|   Completed|       64900.0|          High|      2024|          1|
|  ORD005|       C005|         Neha|  Chennai| Laptop|Electronics|48000|      NULL|   Completed|    

In [61]:
#11
from pyspark.sql.functions import sum
revenue_city=df_chg.groupBy("city").agg(sum("price").alias("total_revenue"))
revenue_city.show()

+---------+-------------+
|     city|total_revenue|
+---------+-------------+
|Bangalore|       203000|
|  Chennai|       131000|
|   Mumbai|       236554|
|     Pune|        28000|
|    Delhi|       172000|
+---------+-------------+



In [62]:
#12
revenue_product=df_chg.groupBy("product").agg(sum("price").alias("total_revenue"))
revenue_product.show()

+-------+-------------+
|product|total_revenue|
+-------+-------------+
| Laptop|       418000|
| Mobile|       228554|
| Tablet|       124000|
+-------+-------------+



In [63]:
#13
top_cities=revenue_city.orderBy(col("total_revenue").desc()).limit(3)
top_cities.show()

+---------+-------------+
|     city|total_revenue|
+---------+-------------+
|   Mumbai|       236554|
|Bangalore|       203000|
|    Delhi|       172000|
+---------+-------------+



In [64]:
#14
avg_price_threshold=40000
premium_products=df_chg.groupBy("product").agg(avg("price").alias("average_price")).filter(col("average_price")> avg_price_threshold)
premium_products=premium_products.orderBy(col("average_price").desc())
premium_products.show()

+-------+-------------+
|product|average_price|
+-------+-------------+
| Laptop|      52250.0|
+-------+-------------+



In [66]:
#15
df_chg=df_chg.repartition(2)
df_chg.write.mode("overwrite").parquet("cleaned_data.parquet")

In [67]:
#16
parquet_df=spark.read.parquet("cleaned_data.parquet")
parquet_df.printSchema()

root
 |-- order_id: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- customer_name: string (nullable = true)
 |-- city: string (nullable = true)
 |-- product: string (nullable = true)
 |-- category: string (nullable = true)
 |-- price: integer (nullable = true)
 |-- order_date: date (nullable = true)
 |-- order_status: string (nullable = true)
 |-- price_with_tax: double (nullable = true)
 |-- price_category: string (nullable = true)
 |-- order_year: integer (nullable = true)
 |-- order_month: integer (nullable = true)



In [70]:
#17
df_chg.write.mode("overwrite").orc("cleaned_data.orc")

In [71]:
#19
df_chg.rdd.getNumPartitions()

2

In [72]:
#20
df_repartitioned = df_chg.repartition(4)
df_repartitioned.write.mode("overwrite").orc("cleaned_data_repartitioned.orc")
print(f"DataFrame repartitioned to {df_repartitioned.rdd.getNumPartitions()} partitions and written to 'cleaned_data_repartitioned.orc'")

DataFrame repartitioned to 4 partitions and written to 'cleaned_data_repartitioned.orc'


In [73]:
#21
spark.read.parquet("cleaned_data.parquet").inputFiles()
spark.read.orc("cleaned_data.orc").inputFiles()

['file:///content/cleaned_data.orc/part-00000-ef54b117-374d-43aa-90b2-b3a2ec389c00-c000.zstd.orc',
 'file:///content/cleaned_data.orc/part-00001-ef54b117-374d-43aa-90b2-b3a2ec389c00-c000.zstd.orc']

In [74]:
#22
df_chg.explain(True)

== Parsed Logical Plan ==
Repartition 2, true
+- Repartition 2, true
   +- Project [order_id#1089, customer_id#1090, customer_name#1091, city#2806, product#2807, category#2808, price#2899, order_date#2937, order_status#2809, price_with_tax#2900, price_category#2901, order_year#3235, month(order_date#2937) AS order_month#3236]
      +- Project [order_id#1089, customer_id#1090, customer_name#1091, city#2806, product#2807, category#2808, price#2899, order_date#2937, order_status#2809, price_with_tax#2900, price_category#2901, year(order_date#2937) AS order_year#3235]
         +- Filter (order_status#2809 = Completed)
            +- Deduplicate [order_id#1089]
               +- Project [order_id#1089, customer_id#1090, customer_name#1091, city#2806, product#2807, category#2808, price#2899, parse_date_py(order_date#1096)#2936 AS order_date#2937, order_status#2809, price_with_tax#2900, price_category#2901]
                  +- Project [order_id#1089, customer_id#1090, customer_name#1091, cit