In [None]:
from pyspark.sql import SparkSession
# Create (or reuse) the Spark session that the rest of the notebook reads from.
spark = (
    SparkSession.builder
    .appName("Read CSV Example")
    .getOrCreate()
)

In [None]:
# Read orders.csv using its header row for column names. Schema inference is
# switched off, so every column arrives as a string.
df = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "false")
    .csv("orders.csv")
)

In [None]:
# Show the column names and types Spark assigned on load.
df.printSchema()

root
 |-- order_id: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- city: string (nullable = true)
 |-- category: string (nullable = true)
 |-- product: string (nullable = true)
 |-- amount: string (nullable = true)
 |-- order_date: string (nullable = true)
 |-- status: string (nullable = true)



In [None]:
# Row count of the raw load; count() is an action, so this triggers a read of the CSV.
print("Total Records:",df.count())

Total Records: 300000


In [None]:
# Preview the first five raw rows to spot formatting problems before cleaning.
df.show(5)

+-----------+-----------+-----------+-----------+-----------+-------+----------+---------+
|   order_id|customer_id|       city|   category|    product| amount|order_date|   status|
+-----------+-----------+-----------+-----------+-----------+-------+----------+---------+
|ORD00000000|    C000000| hyderabad |   grocery |       Oil |invalid|01/01/2024|Cancelled|
|ORD00000001|    C000001|       Pune|    Grocery|      Sugar|  35430|2024-01-02|Completed|
|ORD00000002|    C000002|       Pune|Electronics|     Mobile|  65358|2024-01-03|Completed|
|ORD00000003|    C000003|  Bangalore|Electronics|     Laptop|   5558|2024-01-04|Completed|
|ORD00000004|    C000004|       Pune|       Home|AirPurifier|  33659|2024-01-05|Completed|
+-----------+-----------+-----------+-----------+-----------+-------+----------+---------+
only showing top 5 rows


In [None]:
from pyspark.sql.functions import trim,initcap,regexp_replace,col,to_date,when
# Normalise the free-text columns: strip surrounding whitespace, then
# capitalise the first letter of each word.
df = (
    df
    .withColumn("city", initcap(trim(col("city"))))
    .withColumn("category", initcap(trim(col("category"))))
    .withColumn("product", initcap(trim(col("product"))))
)

In [None]:
# Build amount_clean in two passes:
#   1. drop thousands separators (commas) from the raw amount string;
#   2. keep the value only if it is all digits, otherwise null, then cast to int.
df = (
    df
    .withColumn("amount_clean", regexp_replace(col("amount"), ",", ""))
    .withColumn(
        "amount_clean",
        when(col("amount_clean").rlike("^[0-9]+$"), col("amount_clean"))
        .otherwise(None)
        .cast("int"),
    )
)

In [None]:
from pyspark.sql.functions import to_date, coalesce, col
# Parse order_date by trying each accepted layout in turn; coalesce() keeps the
# first attempt that yields a non-null date.
df = df.withColumn(
    "order_date_clean",
    coalesce(
        *[
            to_date(col("order_date"), date_format)
            for date_format in ("yyyy-MM-dd", "dd-MM-yyyy", "yyyy/MM/dd", "dd/MM/yyyy")
        ]
    ),
)

Data validation

In [None]:
# Number of rows whose amount_clean is null.
df.filter(col("amount_clean").isNull()).count()

23905

In [None]:
# Number of rows whose order_date_clean is null.
df.filter(col("order_date_clean").isNull()).count()

In [None]:
#duplicate orders
# List every order_id that occurs more than once. show() is the action that
# makes the check run; without it the cell only echoes the lazy DataFrame's
# schema and no rows are inspected.
df.groupBy("order_id").count().filter("count>1").show()

DataFrame[order_id: string, count: bigint]

In [None]:
# Keep a single row per order_id.
# NOTE(review): no ordering is specified, so which of the duplicate rows
# survives is not controlled here.
df=df.dropDuplicates(["order_id"])

In [None]:
# Keep only orders whose status is exactly "Completed"; all later cells work
# on this subset.
df = df.where(col("status") == "Completed")

In [None]:
#performance check
# Number of partitions backing df at this point in the pipeline.
df.rdd.getNumPartitions()

2

In [None]:
# Revenue per city: print the extended plan (logical and physical) for the
# aggregation instead of running it.
(
    df
    .groupBy("city")
    .sum("amount_clean")
    .explain(True)
)

== Parsed Logical Plan ==
'Aggregate ['city], ['city, unresolvedalias('sum(amount_clean#75))]
+- Filter (status#24 = Completed)
   +- Deduplicate [order_id#17]
      +- Project [order_id#17, customer_id#18, city#71, category#72, product#73, amount#22, order_date#23, status#24, amount_clean#75, coalesce(to_date(order_date#23, Some(yyyy-MM-dd), Some(Etc/UTC), true), to_date(order_date#23, Some(dd-MM-yyyy), Some(Etc/UTC), true), to_date(order_date#23, Some(yyyy/MM/dd), Some(Etc/UTC), true)) AS order_date_clean#112]
         +- Project [order_id#17, customer_id#18, city#71, category#72, product#73, amount#22, order_date#23, status#24, amount_clean#75, CASE WHEN isnull(order_date_clean#110) THEN to_date(order_date#23, Some(yyyy/MM/dd), Some(Etc/UTC), true) ELSE order_date_clean#110 END AS order_date_clean#111]
            +- Project [order_id#17, customer_id#18, city#71, category#72, product#73, amount#22, order_date#23, status#24, amount_clean#75, CASE WHEN isnull(order_date_clean#109) THE

In [None]:
#repartition
# Shuffle the data so that all rows sharing a city value end up in the same partition.
df=df.repartition("city")

In [None]:

# Analytics: each aggregation below is computed over amount_clean and shown.

# Total revenue per city
(
    df.groupBy("city")
    .sum("amount_clean")
    .show()
)

# Total revenue per category
(
    df.groupBy("category")
    .sum("amount_clean")
    .show()
)

# Average order value (per city)
(
    df.groupBy("city")
    .avg("amount_clean")
    .show()
)

# Top 10 products by total revenue
(
    df.groupBy("product")
    .sum("amount_clean")
    .orderBy("sum(amount_clean)", ascending=False)
    .show(10)
)

+---------+-----------------+
|     city|sum(amount_clean)|
+---------+-----------------+
|Bangalore|       1628527093|
|  Chennai|       1629865247|
|   Mumbai|       1625518096|
|  Kolkata|       1624300497|
|     Pune|       1646196535|
|    Delhi|       1639639916|
|Hyderabad|       1642443340|
+---------+-----------------+

+-----------+-----------------+
|   category|sum(amount_clean)|
+-----------+-----------------+
|       Home|       2868467576|
|    Fashion|       2834182172|
|    Grocery|       2866272106|
|Electronics|       2867568870|
+-----------+-----------------+

+---------+------------------+
|     city| avg(amount_clean)|
+---------+------------------+
|Bangalore|44098.867908689645|
|  Chennai| 43628.27900315863|
|   Mumbai| 43723.75651612556|
|  Kolkata|43709.816662630175|
|     Pune|43930.204013556424|
|    Delhi| 43817.20780331374|
|Hyderabad| 43708.74045293664|
+---------+------------------+

+-----------+-----------------+
|    product|sum(amount_clean)|
+-----

In [None]:
#window function
from pyspark.sql.window import Window
from pyspark.sql.functions import rank, col

# Total revenue per city, then rank the cities from highest to lowest revenue.
city_revenue=df.groupBy("city").sum("amount_clean")
# No partitionBy: the ranking is global across all cities, so the window
# covers the whole (small, one-row-per-city) aggregate.
w=Window.orderBy(col("sum(amount_clean)").desc())
# show() is the action that computes and prints the ranking; without it the
# cell only echoes the lazy DataFrame's schema.
city_revenue.withColumn("rank",rank().over(w)).show()

DataFrame[city: string, sum(amount_clean): bigint, rank: int]

In [None]:
#Broadcast join
# Small city -> region lookup table. Region names use consistent title case
# ("East", not "east") so that they group and compare uniformly.
city_data=[
    ("Delhi","North"),("Mumbai","West"),("Bangalore","South"),
    ("Hyderabad","South"),("Pune","West"),
    ("Chennai","South"),("Kolkata","East")
]
city_df=spark.createDataFrame(city_data,["city","region"])
from pyspark.sql.functions import broadcast
# Left join keeps every order even if its city is missing from the lookup;
# broadcast() marks the small lookup table for a broadcast join.
df.join(broadcast(city_df),"city","left").show()

+---------+-----------+-----------+-----------+-----------+-------+----------+---------+------------+----------------+------+
|     city|   order_id|customer_id|   category|    product| amount|order_date|   status|amount_clean|order_date_clean|region|
+---------+-----------+-----------+-----------+-----------+-------+----------+---------+------------+----------------+------+
|Bangalore|ORD00000008|    C000008|    Fashion|      Jeans|  10563|2024-01-09|Completed|       10563|      2024-01-09| South|
|Bangalore|ORD00000010|    C000010|    Grocery|      Sugar|  66576|2024-01-11|Completed|       66576|      2024-01-11| South|
|Bangalore|ORD00000012|    C000012|    Grocery|      Sugar|  84768|2024-01-13|Completed|       84768|      2024-01-13| South|
|Bangalore|ORD00000017|    C000017|    Grocery|        Oil|  69582|2024-01-18|Completed|       69582|      2024-01-18| South|
|Bangalore|ORD00000024|    C000024|       Home|      Mixer|  18082|2024-01-25|Completed|       18082|      2024-01-25|

In [None]:
# Print the extended plan (logical plans followed by the physical plan) for df.
df.explain(True)

== Parsed Logical Plan ==
'Project [unresolvedstarwithcolumns(order_date_clean, 'coalesce('to_date('order_date, yyyy-MM-dd), 'to_date('order_date, dd-MM-yyyy), 'to_date('order_date, yyyy/MM/dd), 'to_date('order_date, dd/MM/yyyy)), None)]
+- Project [order_id#17, customer_id#18, city#71, category#72, product#73, amount#22, order_date#23, status#24, amount_clean#75, coalesce(to_date(order_date#23, Some(yyyy-MM-dd), Some(Etc/UTC), true), to_date(order_date#23, Some(dd-MM-yyyy), Some(Etc/UTC), true), to_date(order_date#23, Some(yyyy/MM/dd), Some(Etc/UTC), true), to_date(order_date#23, Some(dd/MM/yyyy), Some(Etc/UTC), true)) AS order_date_clean#2618]
   +- RepartitionByExpression [city#71]
      +- Filter (status#24 = Completed)
         +- Deduplicate [order_id#17]
            +- Project [order_id#17, customer_id#18, city#71, category#72, product#73, amount#22, order_date#23, status#24, amount_clean#75, coalesce(to_date(order_date#23, Some(yyyy-MM-dd), Some(Etc/UTC), true), to_date(order_d

In [None]:
#UDF
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
def classify(amount):
  """Bucket an order amount into a value category.

  Returns None when amount is None, "High" for amounts of at least 80000,
  "Medium" for amounts of at least 40000, and "Low" for anything smaller.
  """
  # Missing amounts stay missing rather than being forced into a bucket.
  if amount is None:
    return None
  if amount >= 80000:
    return "High"
  return "Medium" if amount >= 40000 else "Low"
# Wrap classify as a Spark UDF that returns a string column.
classify_udf=udf(classify,StringType())
# Derive order_value_category from amount_clean, row by row.
# NOTE(review): the same bucketing could presumably be expressed with built-in
# when()/otherwise() column expressions instead of a Python UDF - confirm
# whether the UDF is intentional here.
df=df.withColumn("order_value_category",classify_udf(col("amount_clean")))

In [None]:
#RDD
rdd = df.rdd

# Total revenue across all rows. amount_clean is null for rows whose raw amount
# was not purely numeric, so those are dropped first; otherwise the addition
# would fail with `None + int`. sum() also yields 0 on an empty RDD, where
# reduce() would raise.
total_revenue = (
    rdd.map(lambda row: row.amount_clean)
    .filter(lambda amount: amount is not None)
    .sum()
)

# (city, order count) pairs. This is still a lazy RDD; call collect() or
# take() to see the values.
orders_per_city = (
    rdd.map(lambda row: (row.city, 1))
    .reduceByKey(lambda a, b: a + b)
)

In [None]:
#caching
# Mark df for caching so the two aggregations below can reuse it.
df.cache()
df.groupBy("city").sum("amount_clean").show()
df.groupBy("category").sum("amount_clean").show()
# Release the cached data once both aggregations have run.
df.unpersist()

In [None]:
# Persist the cleaned orders as Parquet, one sub-directory per city,
# replacing any previous output at the same path.
(
    df.write
    .mode("overwrite")
    .partitionBy("city")
    .parquet("clean_orders_parquet")
)

In [None]:
# Debugging: narrow df to orders with amount_clean above 50000 and display them.
# Note that this rebinds df to the filtered subset.
df = df.filter(df["amount_clean"] > 50000)
df.show()