In [1]:
from pyspark.sql import SparkSession
# Create the Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName('Spark Core Concepts')
    .getOrCreate()
)

In [16]:
# Sample order records, one tuple per order:
# (order_id, city, category, order_amount, status)
data = [
    ("O001", "Hyderabad", "Electronics", 1200, "Delivered"),
    ("O002", "Delhi", "Clothing", 800, "Delivered"),
    ("O003", "Mumbai", "Electronics", 1500, "Cancelled"),
    ("O004", "Bangalore", "Grocery", 400, "Delivered"),
    ("O005", "Hyderabad", "Grocery", 300, "Delivered"),
    ("O006", "Delhi", "Electronics", 2000, "Delivered"),
    ("O007", "Mumbai", "Clothing", 700, "Delivered"),
    ("O008", "Bangalore", "Electronics", 1800, "Delivered"),
    ("O009", "Delhi", "Grocery", 350, "Cancelled"),
    ("O010", "Hyderabad", "Clothing", 900, "Delivered"),
]
columns = ["order_id", "city", "category", "order_amount", "status"]

# Only column names are passed, so the column types are inferred from the values.
df = spark.createDataFrame(data, schema=columns)

df.show()         # print the rows as a text table
df.printSchema()  # print the inferred column names and types

+--------+---------+-----------+------------+---------+
|order_id|     city|   category|order_amount|   status|
+--------+---------+-----------+------------+---------+
|    O001|Hyderabad|Electronics|        1200|Delivered|
|    O002|    Delhi|   Clothing|         800|Delivered|
|    O003|   Mumbai|Electronics|        1500|Cancelled|
|    O004|Bangalore|    Grocery|         400|Delivered|
|    O005|Hyderabad|    Grocery|         300|Delivered|
|    O006|    Delhi|Electronics|        2000|Delivered|
|    O007|   Mumbai|   Clothing|         700|Delivered|
|    O008|Bangalore|Electronics|        1800|Delivered|
|    O009|    Delhi|    Grocery|         350|Cancelled|
|    O010|Hyderabad|   Clothing|         900|Delivered|
+--------+---------+-----------+------------+---------+

root
 |-- order_id: string (nullable = true)
 |-- city: string (nullable = true)
 |-- category: string (nullable = true)
 |-- order_amount: long (nullable = true)
 |-- status: string (nullable = true)



In [19]:
# Number of partitions of the RDD that backs df.
df.rdd.getNumPartitions()

2

In [20]:
# Redistribute the rows of df across 4 partitions, then confirm the new count.
df_repart = df.repartition(numPartitions=4)
df_repart.rdd.getNumPartitions()

4

In [21]:
# Collapse the 4 partitions of df_repart into a single one, then confirm the count.
df_coalesce = df_repart.coalesce(numPartitions=1)
df_coalesce.rdd.getNumPartitions()

1

In [24]:
# Two transformations: keep only the Delhi orders, then keep only two columns.
# No output is produced here; the rows are displayed later by selected_df.show().
filtered_df = df.where(df["city"] == "Delhi")
selected_df = filtered_df.select("order_id", "order_amount")

In [11]:
selected_df.show()  # Spark is lazy: the filter/select transformations are not computed until an action such as show() is called.
# When the action runs, the pending transformations are planned and executed together, which saves computation.

+--------+------------+
|order_id|order_amount|
+--------+------------+
|    O002|         800|
|    O006|        2000|
|    O009|         350|
+--------+------------+



In [25]:
# Lineage: the chain of transformations recorded for a DataFrame.
# Several transformations are chained here into a single definition:
# keep delivered orders, keep amounts above 500, then keep two columns.
df_lineage = (
    df.where(df["status"] == "Delivered")
    .where(df["order_amount"] > 500)
    .select("city", "order_amount")
)

In [26]:
# count() is an action: it runs the transformations recorded in df_lineage and returns the number of rows.
df_lineage.count()

6

In [30]:
# Print the logical plans (parsed, analyzed, optimized) and the physical plan for df.
df.explain(extended=True)

== Parsed Logical Plan ==
LogicalRDD [order_id#41, city#42, category#43, order_amount#44L, status#45], false

== Analyzed Logical Plan ==
order_id: string, city: string, category: string, order_amount: bigint, status: string
LogicalRDD [order_id#41, city#42, category#43, order_amount#44L, status#45], false

== Optimized Logical Plan ==
LogicalRDD [order_id#41, city#42, category#43, order_amount#44L, status#45], false

== Physical Plan ==
*(1) Scan ExistingRDD[order_id#41,city#42,category#43,order_amount#44L,status#45]



`explain(True)` prints the plans Spark builds for a DataFrame, i.e. what Spark is going to do.
The 4 planning phases of a query in PySpark:
1. Parsed Logical Plan: Spark's initial understanding of your code, before validation.
2. Analyzed Logical Plan: The plan after validating the query against the data schema (checks that the columns/tables exist).
3. Optimized Logical Plan: The plan after Spark applies optimization rules to make the query run faster (e.g., filtering early).
4. Physical Plan: The detailed, step-by-step instructions Spark uses to actually run the query on your data.
Here `df` has no transformations, so all four plans are the same scan of the existing RDD.


In [29]:
# Print the logical plans (parsed, analyzed, optimized) and the physical plan for df_lineage.
df_lineage.explain(extended=True)

== Parsed Logical Plan ==
'Project ['city, 'order_amount]
+- Filter (order_amount#44L > cast(500 as bigint))
   +- Filter (status#45 = Delivered)
      +- LogicalRDD [order_id#41, city#42, category#43, order_amount#44L, status#45], false

== Analyzed Logical Plan ==
city: string, order_amount: bigint
Project [city#42, order_amount#44L]
+- Filter (order_amount#44L > cast(500 as bigint))
   +- Filter (status#45 = Delivered)
      +- LogicalRDD [order_id#41, city#42, category#43, order_amount#44L, status#45], false

== Optimized Logical Plan ==
Project [city#42, order_amount#44L]
+- Filter ((isnotnull(status#45) AND isnotnull(order_amount#44L)) AND ((status#45 = Delivered) AND (order_amount#44L > 500)))
   +- LogicalRDD [order_id#41, city#42, category#43, order_amount#44L, status#45], false

== Physical Plan ==
*(1) Project [city#42, order_amount#44L]
+- *(1) Filter ((isnotnull(status#45) AND isnotnull(order_amount#44L)) AND ((status#45 = Delivered) AND (order_amount#44L > 500)))
   +- *(1) Scan ExistingRDD[order_id#41,city#42,category#43,order_amount#44L,status#45]

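1. Parsed Logical Plan: Your code's operations (two filters, then a select) are recorded in order; the selected columns are not resolved yet (they appear with a leading ').
2. Analyzed Logical Plan: Spark confirms the columns and operations are valid and resolves the column names and types.
3. Optimized Logical Plan: Spark combines and reorders operations for efficiency (here the two filters are merged into one and null checks are added).
4. Physical Plan: The final, step-by-step instructions for execution, including the merged filter and the column selection.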