# EXPLAIN PLAN

In [1]:
import findspark
findspark.init()

In [2]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/08/27 23:46:51 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
data_employee = [(1, "Carl Mike", "m", 170000, 1),
                 (2, "Mikel Clark", "m", 254300, 2),
                 (3, "Bob Smith", "m", 220000, 3),
                 (4, "Mary Scala", "f", 230000, 1),
                 (5, "Susan Liam", "f", 150000, 1),
                 (6, "Xi Wuan", "f", 150000, 2),
                 (7, "Kyla Stewart", "f", 185000, 2),
                 (8, "Mia Lebrin", "f", 242000, 1)]


schema_employee = "id INTEGER, name STRING, gender STRING, salary INTEGER, dept_id INTEGER"

In [4]:
df_employee = spark.createDataFrame(data=data_employee, schema=schema_employee)

df_employee.show()

                                                                                

+---+------------+------+------+-------+
| id|        name|gender|salary|dept_id|
+---+------------+------+------+-------+
|  1|   Carl Mike|     m|170000|      1|
|  2| Mikel Clark|     m|254300|      2|
|  3|   Bob Smith|     m|220000|      3|
|  4|  Mary Scala|     f|230000|      1|
|  5|  Susan Liam|     f|150000|      1|
|  6|     Xi Wuan|     f|150000|      2|
|  7|Kyla Stewart|     f|185000|      2|
|  8|  Mia Lebrin|     f|242000|      1|
+---+------------+------+------+-------+



In [5]:
data_department = [(1, "IT", 1),
                 (2, "Sales", 2),
                 (3, "HR", 2)]


schema_department = "id INTEGER, dept STRING, floor INTEGER"

In [6]:
df_department = spark.createDataFrame(data=data_department, schema=schema_department)

df_department.show()

+---+-----+-----+
| id| dept|floor|
+---+-----+-----+
|  1|   IT|    1|
|  2|Sales|    2|
|  3|   HR|    2|
+---+-----+-----+



In [7]:
from pyspark.sql.functions import col, round, avg

In [8]:
df_join = df_employee.join(df_department, df_employee.dept_id == df_department.id, "inner") \
    .withColumn("bonus",col("salary")*0.1) \
    .groupBy("dept").agg(round(avg("salary"),2)).alias("average_salary")

df_join.show()

[Stage 5:>                                                          (0 + 4) / 4]

+-----+---------------------+
| dept|round(avg(salary), 2)|
+-----+---------------------+
|Sales|            196433.33|
|   HR|             220000.0|
|   IT|             198000.0|
+-----+---------------------+



                                                                                

## Explain Plan

In [9]:
df_join.explain()

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- HashAggregate(keys=[dept#32], functions=[avg(salary#3)])
   +- Exchange hashpartitioning(dept#32, 200), ENSURE_REQUIREMENTS, [plan_id=287]
      +- HashAggregate(keys=[dept#32], functions=[partial_avg(salary#3)])
         +- Project [salary#3, dept#32]
            +- SortMergeJoin [dept_id#4], [id#31], Inner
               :- Sort [dept_id#4 ASC NULLS FIRST], false, 0
               :  +- Exchange hashpartitioning(dept_id#4, 200), ENSURE_REQUIREMENTS, [plan_id=279]
               :     +- Project [salary#3, dept_id#4]
               :        +- Filter isnotnull(dept_id#4)
               :           +- Scan ExistingRDD[id#0,name#1,gender#2,salary#3,dept_id#4]
               +- Sort [id#31 ASC NULLS FIRST], false, 0
                  +- Exchange hashpartitioning(id#31, 200), ENSURE_REQUIREMENTS, [plan_id=280]
                     +- Project [id#31, dept#32]
                        +- Filter isnotnull(id#31)
                      

## Plan Extended

In [10]:
df_join.explain(extended=True)

== Parsed Logical Plan ==
SubqueryAlias average_salary
+- Aggregate [dept#32], [dept#32, round(avg(salary#3), 2) AS round(avg(salary), 2)#86]
   +- Project [id#0, name#1, gender#2, salary#3, dept_id#4, id#31, dept#32, floor#33, (cast(salary#3 as double) * 0.1) AS bonus#66]
      +- Join Inner, (dept_id#4 = id#31)
         :- LogicalRDD [id#0, name#1, gender#2, salary#3, dept_id#4], false
         +- LogicalRDD [id#31, dept#32, floor#33], false

== Analyzed Logical Plan ==
dept: string, round(avg(salary), 2): double
SubqueryAlias average_salary
+- Aggregate [dept#32], [dept#32, round(avg(salary#3), 2) AS round(avg(salary), 2)#86]
   +- Project [id#0, name#1, gender#2, salary#3, dept_id#4, id#31, dept#32, floor#33, (cast(salary#3 as double) * 0.1) AS bonus#66]
      +- Join Inner, (dept_id#4 = id#31)
         :- LogicalRDD [id#0, name#1, gender#2, salary#3, dept_id#4], false
         +- LogicalRDD [id#31, dept#32, floor#33], false

== Optimized Logical Plan ==
Aggregate [dept#32], [dept#

## Simple Plan

In [11]:
df_join.explain(mode="simple")

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- HashAggregate(keys=[dept#32], functions=[avg(salary#3)])
   +- Exchange hashpartitioning(dept#32, 200), ENSURE_REQUIREMENTS, [plan_id=287]
      +- HashAggregate(keys=[dept#32], functions=[partial_avg(salary#3)])
         +- Project [salary#3, dept#32]
            +- SortMergeJoin [dept_id#4], [id#31], Inner
               :- Sort [dept_id#4 ASC NULLS FIRST], false, 0
               :  +- Exchange hashpartitioning(dept_id#4, 200), ENSURE_REQUIREMENTS, [plan_id=279]
               :     +- Project [salary#3, dept_id#4]
               :        +- Filter isnotnull(dept_id#4)
               :           +- Scan ExistingRDD[id#0,name#1,gender#2,salary#3,dept_id#4]
               +- Sort [id#31 ASC NULLS FIRST], false, 0
                  +- Exchange hashpartitioning(id#31, 200), ENSURE_REQUIREMENTS, [plan_id=280]
                     +- Project [id#31, dept#32]
                        +- Filter isnotnull(id#31)
                      

## Simple Plan

In [12]:
df_join.explain(mode="extended")

== Parsed Logical Plan ==
SubqueryAlias average_salary
+- Aggregate [dept#32], [dept#32, round(avg(salary#3), 2) AS round(avg(salary), 2)#86]
   +- Project [id#0, name#1, gender#2, salary#3, dept_id#4, id#31, dept#32, floor#33, (cast(salary#3 as double) * 0.1) AS bonus#66]
      +- Join Inner, (dept_id#4 = id#31)
         :- LogicalRDD [id#0, name#1, gender#2, salary#3, dept_id#4], false
         +- LogicalRDD [id#31, dept#32, floor#33], false

== Analyzed Logical Plan ==
dept: string, round(avg(salary), 2): double
SubqueryAlias average_salary
+- Aggregate [dept#32], [dept#32, round(avg(salary#3), 2) AS round(avg(salary), 2)#86]
   +- Project [id#0, name#1, gender#2, salary#3, dept_id#4, id#31, dept#32, floor#33, (cast(salary#3 as double) * 0.1) AS bonus#66]
      +- Join Inner, (dept_id#4 = id#31)
         :- LogicalRDD [id#0, name#1, gender#2, salary#3, dept_id#4], false
         +- LogicalRDD [id#31, dept#32, floor#33], false

== Optimized Logical Plan ==
Aggregate [dept#32], [dept#

## Formatted Plan

In [13]:
df_join.explain(mode="formatted")

== Physical Plan ==
AdaptiveSparkPlan (16)
+- HashAggregate (15)
   +- Exchange (14)
      +- HashAggregate (13)
         +- Project (12)
            +- SortMergeJoin Inner (11)
               :- Sort (5)
               :  +- Exchange (4)
               :     +- Project (3)
               :        +- Filter (2)
               :           +- Scan ExistingRDD (1)
               +- Sort (10)
                  +- Exchange (9)
                     +- Project (8)
                        +- Filter (7)
                           +- Scan ExistingRDD (6)


(1) Scan ExistingRDD
Output [5]: [id#0, name#1, gender#2, salary#3, dept_id#4]
Arguments: [id#0, name#1, gender#2, salary#3, dept_id#4], MapPartitionsRDD[4] at applySchemaToPythonRDD at NativeMethodAccessorImpl.java:0, ExistingRDD, UnknownPartitioning(0)

(2) Filter
Input [5]: [id#0, name#1, gender#2, salary#3, dept_id#4]
Condition : isnotnull(dept_id#4)

(3) Project
Output [2]: [salary#3, dept_id#4]
Input [5]: [id#0, name#1, gender#2, salary#

## Cost Plan

In [14]:
df_join.explain(mode="cost")

== Optimized Logical Plan ==
Aggregate [dept#32], [dept#32, round(avg(salary#3), 2) AS round(avg(salary), 2)#86], Statistics(sizeInBytes=1.81E+37 B)
+- Project [salary#3, dept#32], Statistics(sizeInBytes=1.61E+37 B)
   +- Join Inner, (dept_id#4 = id#31), Statistics(sizeInBytes=2.02E+37 B)
      :- Project [salary#3, dept_id#4], Statistics(sizeInBytes=2.1 EiB)
      :  +- Filter isnotnull(dept_id#4), Statistics(sizeInBytes=8.0 EiB)
      :     +- LogicalRDD [id#0, name#1, gender#2, salary#3, dept_id#4], false, Statistics(sizeInBytes=8.0 EiB)
      +- Project [id#31, dept#32], Statistics(sizeInBytes=7.1 EiB)
         +- Filter isnotnull(id#31), Statistics(sizeInBytes=8.0 EiB)
            +- LogicalRDD [id#31, dept#32, floor#33], false, Statistics(sizeInBytes=8.0 EiB)

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- HashAggregate(keys=[dept#32], functions=[avg(salary#3)], output=[dept#32, round(avg(salary), 2)#86])
   +- Exchange hashpartitioning(dept#32, 200), ENSURE_REQUIREME

----------------------------------------
Exception occurred during processing of request from ('127.0.0.1', 60606)
Traceback (most recent call last):
  File "/home/andresmunozpampillon/anaconda3/lib/python3.11/socketserver.py", line 317, in _handle_request_noblock
    self.process_request(request, client_address)
  File "/home/andresmunozpampillon/anaconda3/lib/python3.11/socketserver.py", line 348, in process_request
    self.finish_request(request, client_address)
  File "/home/andresmunozpampillon/anaconda3/lib/python3.11/socketserver.py", line 361, in finish_request
    self.RequestHandlerClass(request, client_address, self)
  File "/home/andresmunozpampillon/anaconda3/lib/python3.11/socketserver.py", line 755, in __init__
    self.handle()
  File "/home/andresmunozpampillon/spark-3.5.2-bin-hadoop3/python/pyspark/accumulators.py", line 295, in handle
    poll(accum_updates)
  File "/home/andresmunozpampillon/spark-3.5.2-bin-hadoop3/python/pyspark/accumulators.py", line 267, in poll