In [1]:
from pyspark.sql import SparkSession
# Create (or reuse) the SparkSession that every cell below relies on.
spark = (
    SparkSession.builder
    .appName("Read CSV Example")
    .getOrCreate()
)

In [3]:
# Sample orders; each tuple is (order_id, city, category, order_amount, status).
data = [
    ("O001", "Hyderabad", "Electronics", 1200, "Delivered"),
    ("O002", "Delhi", "Clothing", 800, "Delivered"),
    ("O003", "Mumbai", "Electronics", 1500, "Cancelled"),
    ("O004", "Bangalore", "Grocery", 400, "Delivered"),
    ("O005", "Hyderabad", "Grocery", 300, "Delivered"),
    ("O006", "Delhi", "Electronics", 2000, "Delivered"),
    ("O007", "Mumbai", "Clothing", 700, "Delivered"),
    ("O008", "Bangalore", "Electronics", 1800, "Delivered"),
    ("O009", "Delhi", "Grocery", 350, "Cancelled"),
    ("O010", "Hyderabad", "Clothing", 900, "Delivered"),
]
columns = ["order_id", "city", "category", "order_amount", "status"]

# Only column names are supplied, so the column types are inferred from the data.
df = spark.createDataFrame(data, schema=columns)

# Display the rows, then the inferred schema.
df.show()
df.printSchema()


+--------+---------+-----------+------------+---------+
|order_id|     city|   category|order_amount|   status|
+--------+---------+-----------+------------+---------+
|    O001|Hyderabad|Electronics|        1200|Delivered|
|    O002|    Delhi|   Clothing|         800|Delivered|
|    O003|   Mumbai|Electronics|        1500|Cancelled|
|    O004|Bangalore|    Grocery|         400|Delivered|
|    O005|Hyderabad|    Grocery|         300|Delivered|
|    O006|    Delhi|Electronics|        2000|Delivered|
|    O007|   Mumbai|   Clothing|         700|Delivered|
|    O008|Bangalore|Electronics|        1800|Delivered|
|    O009|    Delhi|    Grocery|         350|Cancelled|
|    O010|Hyderabad|   Clothing|         900|Delivered|
+--------+---------+-----------+------------+---------+

root
 |-- order_id: string (nullable = true)
 |-- city: string (nullable = true)
 |-- category: string (nullable = true)
 |-- order_amount: long (nullable = true)
 |-- status: string (nullable = true)



In [4]:
# Number of partitions of the RDD underlying `df`.
df.rdd.getNumPartitions()

2

In [5]:
# Redistribute the rows of `df` across 4 partitions, then confirm the new count.
df_repeat = df.repartition(numPartitions=4)
df_repeat.rdd.getNumPartitions()

4

In [6]:
# Reduce `df_repeat` to a single partition, then confirm the new count.
df_coalesce = df_repeat.coalesce(numPartitions=1)
df_coalesce.rdd.getNumPartitions()

1

In [8]:
# Keep only the Delhi orders, then project the id and amount columns.
# Both steps are transformations; nothing is displayed by this cell.
filtered_df = df.where(df["city"] == "Delhi")
selected_df = filtered_df.select("order_id", "order_amount")

In [9]:
# Display the rows of `selected_df` as a text table.
selected_df.show()

+--------+------------+
|order_id|order_amount|
+--------+------------+
|    O002|         800|
|    O006|        2000|
|    O009|         350|
+--------+------------+



In [10]:
# Build a small transformation chain on `df`:
#   1. keep delivered orders,
#   2. keep orders above 500,
#   3. project the city and amount columns.
df_lineage = (
    df
    .where(df["status"] == "Delivered")
    .where(df["order_amount"] > 500)
    .select("city", "order_amount")
)

In [11]:
# Count the rows produced by the `df_lineage` chain.
df_lineage.count()

6

In [12]:
# Print the parsed, analyzed, and optimized logical plans plus the physical
# plan for the unfiltered base DataFrame `df`.
df.explain(True)

== Parsed Logical Plan ==
LogicalRDD [order_id#0, city#1, category#2, order_amount#3L, status#4], false

== Analyzed Logical Plan ==
order_id: string, city: string, category: string, order_amount: bigint, status: string
LogicalRDD [order_id#0, city#1, category#2, order_amount#3L, status#4], false

== Optimized Logical Plan ==
LogicalRDD [order_id#0, city#1, category#2, order_amount#3L, status#4], false

== Physical Plan ==
*(1) Scan ExistingRDD[order_id#0,city#1,category#2,order_amount#3L,status#4]



Parsed Logical Plan: `LogicalRDD` means the DataFrame was created from an existing RDD.
Analyzed Logical Plan: Confirms the schema; `order_amount` is `bigint` because Spark infers Python `int` values as `LongType`.
Optimized Logical Plan: Spark applies optimizations here (e.g., constant folding, predicate pushdown).
In this case it is the same as the parsed plan because `df` has no filter or aggregation yet.
Physical Plan: `Scan ExistingRDD` means Spark will read from the in-memory RDD.
The `*` prefix marks an operator that runs under whole-stage code generation, and `(1)` is its codegen stage id (it is not a count of execution stages).

In [13]:
# Orders fact data; each tuple is (order_id, city, order_amount).
orders_data = [
    ("O001", "Hyderabad", 1200),
    ("O002", "Delhi", 800),
    ("O003", "Mumbai", 1500),
    ("O004", "Bangalore", 400),
    ("O005", "Hyderabad", 300),
    ("O006", "Delhi", 2000),
    ("O007", "Mumbai", 700),
    ("O008", "Bangalore", 1800),
    ("O009", "Delhi", 350),
    ("O010", "Hyderabad", 900),
]
orders_cols = ["order_id", "city", "order_amount"]

orders_df = spark.createDataFrame(orders_data, schema=orders_cols)
orders_df.show()

# City lookup data; each tuple is (city, city_category), one row per city.
city_data = [
    ("Hyderabad", "Tier-1"),
    ("Delhi", "Tier-1"),
    ("Mumbai", "Tier-1"),
    ("Bangalore", "Tier-1"),
]
city_cols = ["city", "city_category"]

city_df = spark.createDataFrame(city_data, schema=city_cols)
city_df.show()

+--------+---------+------------+
|order_id|     city|order_amount|
+--------+---------+------------+
|    O001|Hyderabad|        1200|
|    O002|    Delhi|         800|
|    O003|   Mumbai|        1500|
|    O004|Bangalore|         400|
|    O005|Hyderabad|         300|
|    O006|    Delhi|        2000|
|    O007|   Mumbai|         700|
|    O008|Bangalore|        1800|
|    O009|    Delhi|         350|
|    O010|Hyderabad|         900|
+--------+---------+------------+

+---------+-------------+
|     city|city_category|
+---------+-------------+
|Hyderabad|       Tier-1|
|    Delhi|       Tier-1|
|   Mumbai|       Tier-1|
|Bangalore|       Tier-1|
+---------+-------------+



In [16]:
from pyspark.sql.functions import col
# Keep only the orders whose amount exceeds 500.
filtered_orders = orders_df.where(col("order_amount") > 500)

# Attach each order's city category via an inner join on the shared `city` column.
joined_df = filtered_orders.join(city_df, on="city", how="inner")

# Final projection, in presentation order.
final_df = joined_df.select("order_id", "city", "city_category", "order_amount")


In [17]:
# Print the logical plans and the physical plan for the filter + join + select pipeline.
final_df.explain(True)

== Parsed Logical Plan ==
'Project ['order_id, 'city, 'city_category, 'order_amount]
+- Project [city#35, order_id#34, order_amount#36L, city_category#48]
   +- Join Inner, (city#35 = city#47)
      :- Filter (order_amount#36L > cast(500 as bigint))
      :  +- LogicalRDD [order_id#34, city#35, order_amount#36L], false
      +- LogicalRDD [city#47, city_category#48], false

== Analyzed Logical Plan ==
order_id: string, city: string, city_category: string, order_amount: bigint
Project [order_id#34, city#35, city_category#48, order_amount#36L]
+- Project [city#35, order_id#34, order_amount#36L, city_category#48]
   +- Join Inner, (city#35 = city#47)
      :- Filter (order_amount#36L > cast(500 as bigint))
      :  +- LogicalRDD [order_id#34, city#35, order_amount#36L], false
      +- LogicalRDD [city#47, city_category#48], false

== Optimized Logical Plan ==
Project [order_id#34, city#35, city_category#48, order_amount#36L]
+- Join Inner, (city#35 = city#47)
   :- Filter ((isnotnull(orde

In [20]:
from pyspark.sql.functions import broadcast
# Same inner join on `city` as before, but with a broadcast hint on the
# city lookup side.
broadcast_join_df = filtered_orders.join(broadcast(city_df), on="city", how="inner")

# Final projection, in presentation order.
final_broadcast_df = broadcast_join_df.select(
    "order_id", "city", "city_category", "order_amount"
)

In [21]:
# Print the logical plans and the physical plan for the broadcast-hinted join pipeline.
final_broadcast_df.explain(True)

== Parsed Logical Plan ==
'Project ['order_id, 'city, 'city_category, 'order_amount]
+- Project [city#35, order_id#34, order_amount#36L, city_category#48]
   +- Join Inner, (city#35 = city#47)
      :- Filter (order_amount#36L > cast(500 as bigint))
      :  +- LogicalRDD [order_id#34, city#35, order_amount#36L], false
      +- ResolvedHint (strategy=broadcast)
         +- LogicalRDD [city#47, city_category#48], false

== Analyzed Logical Plan ==
order_id: string, city: string, city_category: string, order_amount: bigint
Project [order_id#34, city#35, city_category#48, order_amount#36L]
+- Project [city#35, order_id#34, order_amount#36L, city_category#48]
   +- Join Inner, (city#35 = city#47)
      :- Filter (order_amount#36L > cast(500 as bigint))
      :  +- LogicalRDD [order_id#34, city#35, order_amount#36L], false
      +- ResolvedHint (strategy=broadcast)
         +- LogicalRDD [city#47, city_category#48], false

== Optimized Logical Plan ==
Project [order_id#34, city#35, city_cat

In [25]:
# Dataset 1 — rides.
# Each tuple is (ride_id, user_id, city, distance_km, duration_seconds, status).
rides_data = [
    ("R001", "U001", "Hyderabad", 12.5, 240, "Completed"),
    ("R002", "U002", "Delhi", 8.2, 180, "Completed"),
    ("R003", "U003", "Mumbai", 15.0, 300, "Cancelled"),
    ("R004", "U004", "Bangalore", 5.5, 120, "Completed"),
    ("R005", "U005", "Hyderabad", 20.0, 360, "Completed"),
    ("R006", "U006", "Delhi", 25.0, 420, "Completed"),
    ("R007", "U007", "Mumbai", 7.5, 150, "Completed"),
    ("R008", "U008", "Bangalore", 18.0, 330, "Completed"),
    ("R009", "U009", "Delhi", 6.0, 140, "Cancelled"),
    ("R010", "U010", "Hyderabad", 10.0, 200, "Completed"),
]
rides_cols = ["ride_id", "user_id", "city", "distance_km", "duration_seconds", "status"]

rides_df = spark.createDataFrame(rides_data, schema=rides_cols)

# Dataset 2 — city surge multipliers (small lookup, one row per city).
surge_data = [
    ("Hyderabad", 1.2),
    ("Delhi", 1.5),
    ("Mumbai", 1.8),
    ("Bangalore", 1.3),
]
surge_cols = ["city", "surge_multiplier"]

surge_df = spark.createDataFrame(surge_data, schema=surge_cols)

In [27]:

from pyspark.sql.functions import col
# Keep completed rides only; all columns are retained in `filtered_df`.
filtered_df = rides_df.where(col("status") == "Completed")
# Narrow the completed rides to the id, city, and distance columns.
selected_df = filtered_df.select("ride_id", "city", "distance_km")

In [28]:
# Display the rows of `filtered_df` as a text table.
filtered_df.show()

+-------+-------+---------+-----------+----------------+---------+
|ride_id|user_id|     city|distance_km|duration_seconds|   status|
+-------+-------+---------+-----------+----------------+---------+
|   R001|   U001|Hyderabad|       12.5|             240|Completed|
|   R002|   U002|    Delhi|        8.2|             180|Completed|
|   R004|   U004|Bangalore|        5.5|             120|Completed|
|   R005|   U005|Hyderabad|       20.0|             360|Completed|
|   R006|   U006|    Delhi|       25.0|             420|Completed|
|   R007|   U007|   Mumbai|        7.5|             150|Completed|
|   R008|   U008|Bangalore|       18.0|             330|Completed|
|   R010|   U010|Hyderabad|       10.0|             200|Completed|
+-------+-------+---------+-----------+----------------+---------+



In [33]:

# Chain of narrow transformations on the rides data:
#   1. keep completed rides,
#   2. keep rides longer than 5 km,
#   3. project the id, city, and distance columns.
filtered_df = (
    rides_df
    .where(rides_df["status"] == "Completed")
    .where(rides_df["distance_km"] > 5)
    .select("ride_id", "city", "distance_km")
)


In [32]:
# Per-city lookup to join the rides against.
# NOTE: this must be a one-row-per-city table. The orders DataFrame `df` is
# not suitable: it has several rows per city (so the join would multiply ride
# rows) and its own `status` column (so the result would carry two `status`
# columns). `surge_df` has exactly one row per city.
cities_df = surge_df

# Attach each ride's city-level attributes, then keep only completed rides.
# The filter refers to `rides_df.status`, the ride's own status column.
joined_df = (
    rides_df
    .join(cities_df, on="city", how="inner")
    .filter(rides_df.status == "Completed")
)

