In [1]:
from pyspark.sql import SparkSession


# Start (or reuse) a Spark session for the Catalyst optimizer demo.
spark = (
    SparkSession.builder
    .appName("Catalyst Optimizer Example")
    .getOrCreate()
)

# Tiny in-memory dataset of (name, age) pairs.
data = [("Alice", 34), ("Bob", 45), ("Cathy", 29)]
df = spark.createDataFrame(data, schema=["Name", "Age"])

# Keep rows with Age > 30, then project and sort by name.
# The select() lists every column of df, so it is redundant on purpose:
# it gives the optimizer a Project node to work with.
older_than_30 = df["Age"] > 30
result = (
    df.filter(older_than_30)
    .select("Name", "Age")
    .orderBy("Name")
)

# Show the logical and physical plans
result.explain(extended=True)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/07/29 14:44:24 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


== Parsed Logical Plan ==
'Sort ['Name ASC NULLS FIRST], true
+- Project [Name#0, Age#1L]
   +- Filter (Age#1L > cast(30 as bigint))
      +- LogicalRDD [Name#0, Age#1L], false

== Analyzed Logical Plan ==
Name: string, Age: bigint
Sort [Name#0 ASC NULLS FIRST], true
+- Project [Name#0, Age#1L]
   +- Filter (Age#1L > cast(30 as bigint))
      +- LogicalRDD [Name#0, Age#1L], false

== Optimized Logical Plan ==
Sort [Name#0 ASC NULLS FIRST], true
+- Filter (isnotnull(Age#1L) AND (Age#1L > 30))
   +- LogicalRDD [Name#0, Age#1L], false

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- Sort [Name#0 ASC NULLS FIRST], true, 0
   +- Exchange rangepartitioning(Name#0 ASC NULLS FIRST, 200), ENSURE_REQUIREMENTS, [plan_id=12]
      +- Filter (isnotnull(Age#1L) AND (Age#1L > 30))
         +- Scan ExistingRDD[Name#0,Age#1L]



In [2]:
# Stop the Spark session now that the plans have been printed.
spark.stop()

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Session with whole-stage code generation explicitly switched on.
spark = (
    SparkSession.builder
    .appName("Tungsten Example")
    .config("spark.sql.codegen.wholeStage", "true")
    .getOrCreate()
)

# One million rows: id, a name cycling through five values, id % 3 and id % 2.
data = [(n, f"Name_{n % 5}", n % 3, n % 2) for n in range(1, 1000001)]
df = spark.createDataFrame(data, schema=["id", "name", "mod3", "mod2"])

# Apply a filter and an aggregation to see Tungsten at work
odd_rows = df.filter(col("mod2") == 1)
result = (
    odd_rows
    .groupBy("mod3")
    .agg({"id": "sum"})
    .orderBy("mod3")
)

# Print the parsed, analyzed and optimized logical plans plus the physical plan.
result.explain(extended=True)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/07/29 18:55:41 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


== Parsed Logical Plan ==
'Sort ['mod3 ASC NULLS FIRST], true
+- Aggregate [mod3#2L], [mod3#2L, sum(id#0L) AS sum(id)#13L]
   +- Filter (mod2#3L = cast(1 as bigint))
      +- LogicalRDD [id#0L, name#1, mod3#2L, mod2#3L], false

== Analyzed Logical Plan ==
mod3: bigint, sum(id): bigint
Sort [mod3#2L ASC NULLS FIRST], true
+- Aggregate [mod3#2L], [mod3#2L, sum(id#0L) AS sum(id)#13L]
   +- Filter (mod2#3L = cast(1 as bigint))
      +- LogicalRDD [id#0L, name#1, mod3#2L, mod2#3L], false

== Optimized Logical Plan ==
Sort [mod3#2L ASC NULLS FIRST], true
+- Aggregate [mod3#2L], [mod3#2L, sum(id#0L) AS sum(id)#13L]
   +- Project [id#0L, mod3#2L]
      +- Filter (isnotnull(mod2#3L) AND (mod2#3L = 1))
         +- LogicalRDD [id#0L, name#1, mod3#2L, mod2#3L], false

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- Sort [mod3#2L ASC NULLS FIRST], true, 0
   +- Exchange rangepartitioning(mod3#2L ASC NULLS FIRST, 200), ENSURE_REQUIREMENTS, [plan_id=26]
      +- HashAggregate(keys=[mod3#2L

In [2]:
# Stop the Spark session now that the example is finished.
spark.stop()