In [1]:
from pyspark.sql import SparkSession

# Build the session used by every cell below; getOrCreate() hands back an
# already-active session instead of starting a second one.
spark = (
    SparkSession.builder
    .appName("Partition Example")
    .getOrCreate()
)

In [16]:
# Sample order records, one tuple per order:
# (order_id, city, category, order_amount, status).
data = [
    ("O001", "Hyderabad", "Electronics", 1200, "Delivered"),
    ("O002", "Delhi", "Clothing", 800, "Delivered"),
    ("O003", "Mumbai", "Electronics", 1500, "Cancelled"),
    ("O004", "Bangalore", "Grocery", 400, "Delivered"),
    ("O005", "Hyderabad", "Grocery", 300, "Delivered"),
    ("O006", "Delhi", "Electronics", 2000, "Delivered"),
    ("O007", "Mumbai", "Clothing", 700, "Delivered"),
    ("O008", "Bangalore", "Electronics", 1800, "Delivered"),
    ("O009", "Delhi", "Grocery", 350, "Cancelled"),
    ("O010", "Hyderabad", "Clothing", 900, "Delivered"),
]

# Only column names are supplied, so Spark infers each column's type from the data.
columns = ["order_id", "city", "category", "order_amount", "status"]
df = spark.createDataFrame(data, columns)

df.show()
# printSchema is a method and must be called; a bare `df.printSchema` merely
# references the bound method and prints no schema.
df.printSchema()

+--------+---------+-----------+------------+---------+
|order_id|     city|   category|order_amount|   status|
+--------+---------+-----------+------------+---------+
|    O001|Hyderabad|Electronics|        1200|Delivered|
|    O002|    Delhi|   Clothing|         800|Delivered|
|    O003|   Mumbai|Electronics|        1500|Cancelled|
|    O004|Bangalore|    Grocery|         400|Delivered|
|    O005|Hyderabad|    Grocery|         300|Delivered|
|    O006|    Delhi|Electronics|        2000|Delivered|
|    O007|   Mumbai|   Clothing|         700|Delivered|
|    O008|Bangalore|Electronics|        1800|Delivered|
|    O009|    Delhi|    Grocery|         350|Cancelled|
|    O010|Hyderabad|   Clothing|         900|Delivered|
+--------+---------+-----------+------------+---------+



In [3]:
# Report how many partitions back df's underlying RDD.
df.rdd.getNumPartitions()

2

In [4]:
# Redistribute df into four partitions, then confirm the resulting count.
df_repart = df.repartition(numPartitions=4)
df_repart.rdd.getNumPartitions()


4

In [7]:
# Collapse the repartitioned DataFrame down to a single partition and
# confirm the resulting count.
df_coalesce = df_repart.coalesce(numPartitions=1)
df_coalesce.rdd.getNumPartitions()

1

In [17]:
# Keep only the Delhi orders, then narrow the result to the order id and amount.
# Both steps are transformations; nothing is displayed by this cell.
filtered_df = df.where(df["city"] == "Delhi")
selected_df = filtered_df.select("order_id", "order_amount")

In [18]:
# Display the Delhi orders (order_id and order_amount only).
selected_df.show()

+--------+------------+
|order_id|order_amount|
+--------+------------+
|    O002|         800|
|    O006|        2000|
|    O009|         350|
+--------+------------+



In [20]:
# Chain of transformations used to inspect the query plan: delivered orders,
# with an amount above 500, projected to city and amount. The two filters are
# kept as separate steps so each one appears as its own node in the parsed plan.
df_lineage = (
    df
    .filter(df["status"] == "Delivered")
    .filter(df["order_amount"] > 500)
    .select("city", "order_amount")
)

In [21]:
# Count the rows that survive both filters; this action runs the chain defined for df_lineage.
df_lineage.count()

6

In [22]:
# Print the parsed, analyzed and optimized logical plans plus the physical
# plan for the untransformed DataFrame, as a baseline.
df.explain(extended=True)

== Parsed Logical Plan ==
LogicalRDD [order_id#49, city#50, category#51, order_amount#52L, status#53], false

== Analyzed Logical Plan ==
order_id: string, city: string, category: string, order_amount: bigint, status: string
LogicalRDD [order_id#49, city#50, category#51, order_amount#52L, status#53], false

== Optimized Logical Plan ==
LogicalRDD [order_id#49, city#50, category#51, order_amount#52L, status#53], false

== Physical Plan ==
*(1) Scan ExistingRDD[order_id#49,city#50,category#51,order_amount#52L,status#53]



In [23]:
# Print all plan stages for the filtered/projected DataFrame to see how the
# chained filters are represented at each stage.
df_lineage.explain(extended=True)

== Parsed Logical Plan ==
'Project ['city, 'order_amount]
+- Filter (order_amount#52L > cast(500 as bigint))
   +- Filter (status#53 = Delivered)
      +- LogicalRDD [order_id#49, city#50, category#51, order_amount#52L, status#53], false

== Analyzed Logical Plan ==
city: string, order_amount: bigint
Project [city#50, order_amount#52L]
+- Filter (order_amount#52L > cast(500 as bigint))
   +- Filter (status#53 = Delivered)
      +- LogicalRDD [order_id#49, city#50, category#51, order_amount#52L, status#53], false

== Optimized Logical Plan ==
Project [city#50, order_amount#52L]
+- Filter ((isnotnull(status#53) AND isnotnull(order_amount#52L)) AND ((status#53 = Delivered) AND (order_amount#52L > 500)))
   +- LogicalRDD [order_id#49, city#50, category#51, order_amount#52L, status#53], false

== Physical Plan ==
*(1) Project [city#50, order_amount#52L]
+- *(1) Filter ((isnotnull(status#53) AND isnotnull(order_amount#52L)) AND ((status#53 = Delivered) AND (order_amount#52L > 500)))
   +- *(