In [1]:
import findspark
# Point findspark at the local Spark installation. This must run before the
# pyspark import below so that the pyspark package can be located.
findspark.init("/opt/manual/spark")
from pyspark.sql import SparkSession, functions as F

In [2]:
# Create (or reuse, if one already exists) the notebook's SparkSession:
# master is YARN, the application is named for this demo, and Hive support is
# switched on so that metastore tables can be queried through spark.sql.
spark = (
    SparkSession.builder
    .master("yarn")
    .appName("Adaptive Query Execution")
    .enableHiveSupport()
    .getOrCreate()
)

# Read data

In [3]:
# Define the market sales DataFrame from its Parquet directory and attach a
# random global ordering. F.rand() is called without a seed, so the row order
# (and therefore any small preview) is not reproducible between runs.
# NOTE(review): the DataFrame is lazy and never cached here, so presumably the
# read + random sort are re-planned for every action below -- confirm whether
# that is intended for the timing comparison.
market5 = (
    spark.read
    .format("parquet")
    .load("/user/train/datasets/market5mil_parquet")
    .orderBy(F.rand())
)

In [4]:
# Bring just three rows to the driver as a pandas frame for a quick look.
# Because market5 is randomly ordered, the rows shown differ from run to run.
market5.limit(3).toPandas()

Unnamed: 0,LOGICALREF,COUNT_,ITEMCODE,ITEMNAME,FICHENO,DATE_,AMOUNT,PRICE,LINENETTOTAL,LINENET,...,CLIENTNAME,BRANDCODE,BRAND,CATEGORY_NAME1,CATEGORY_NAME2,CATEGORY_NAME3,STARTDATE,ENDDATE,SPECODE,CAPIBLOCK_CREADEDDATE
0,787487,1,20867,YERLI BALDO PIRINC,199688,2017-04-24,,5.75,5.78,5.35,...,Muhammed Yusuf KENDİR,167,BAKLİYAT,GIDA,BAKLİYAT,AÇIK BAKLİYAT,2017-04-25 13:12:17,2017-04-25 13:12:54,E,2018-07-14 02:21:02
1,2001083,1,6966,ETI BROWNI KAKAO KEK 40GR,481794,2017-08-28,1.0,0.5,0.5,0.46,...,,44,ETİ,GIDA,BÜSKİVİ ÇEREZ,KEK,2017-08-29 10:12:21,2017-08-29 10:12:31,,NaT
2,1885564,1,5705,DOLMALIK BIBER,454131,2017-08-24,,2.6,1.79,1.66,...,Semra DİLLİ,A25,HAL,MEYVE SEBZE,SEBZE,,2017-08-25 20:16:57,2017-08-25 20:18:20,K,2018-07-14 02:01:30


In [5]:
# Show column names, Spark data types and nullability for market5.
market5.printSchema()

root
 |-- LOGICALREF: integer (nullable = true)
 |-- COUNT_: integer (nullable = true)
 |-- ITEMCODE: string (nullable = true)
 |-- ITEMNAME: string (nullable = true)
 |-- FICHENO: string (nullable = true)
 |-- DATE_: timestamp (nullable = true)
 |-- AMOUNT: integer (nullable = true)
 |-- PRICE: float (nullable = true)
 |-- LINENETTOTAL: float (nullable = true)
 |-- LINENET: float (nullable = true)
 |-- BRANCHNR: string (nullable = true)
 |-- BRANCH: string (nullable = true)
 |-- SALESMAN: string (nullable = true)
 |-- CITY: string (nullable = true)
 |-- REGION: string (nullable = true)
 |-- LATITUDE: float (nullable = true)
 |-- LONGITUDE: float (nullable = true)
 |-- CLIENTCODE: string (nullable = true)
 |-- CLIENTNAME: string (nullable = true)
 |-- BRANDCODE: string (nullable = true)
 |-- BRAND: string (nullable = true)
 |-- CATEGORY_NAME1: string (nullable = true)
 |-- CATEGORY_NAME2: string (nullable = true)
 |-- CATEGORY_NAME3: string (nullable = true)
 |-- STARTDATE: timestamp (

# spark.sql.adaptive.enabled

In [6]:
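# Tuning the number of shuffle partitions (spark.sql.shuffle.partitions)
# is one of the most popular tuning tips.
# The default is 200.
# If too small: GC pressure, disk spilling.
# If too large: inefficient I/O, scheduler pressure.
# A single value is hard to tune for the whole query plan.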

In [6]:
# Check the session's current Adaptive Query Execution setting before changing it.
spark.conf.get("spark.sql.adaptive.enabled")

'false'

## With Adaptive Query Execution enabled

In [7]:
# Turn Adaptive Query Execution on for this session (runtime SQL config).
spark.conf.set("spark.sql.adaptive.enabled", True)

In [8]:
# Read the setting back to confirm the change took effect.
spark.conf.get("spark.sql.adaptive.enabled")

'true'

In [9]:
import time

# Time the "total net revenue per (CITY, BRANCH), top 20" query end to end
# under the session's current spark.sql.adaptive.enabled setting.
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# intervals; time.time() is a wall clock and can jump if the system clock is
# adjusted while the job runs.
start_time = time.perf_counter()

# toPandas() is the action that actually triggers the Spark job. The returned
# frame is deliberately discarded: only the elapsed time is of interest here.
(
    market5
    .groupBy("CITY", "BRANCH")
    .agg(F.sum(F.col("LINENETTOTAL")).alias("Total"))
    .orderBy(F.desc("Total"))
    .limit(20)
    .toPandas()
)

print("----- %s secs -----" %(time.perf_counter() - start_time))

----- 23.076776266098022 secs -----


## Without Adaptive Query Execution enabled

In [10]:
# Turn Adaptive Query Execution off for the baseline run, then read the value
# back so the cell output confirms the change.
spark.conf.set("spark.sql.adaptive.enabled", False)
spark.conf.get("spark.sql.adaptive.enabled")

'false'

In [11]:
import time

# Time the same "total net revenue per (CITY, BRANCH), top 20" query under the
# session's current spark.sql.adaptive.enabled setting, for comparison.
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# intervals; time.time() is a wall clock and can jump if the system clock is
# adjusted while the job runs.
start_time = time.perf_counter()

# toPandas() is the action that actually triggers the Spark job. The returned
# frame is deliberately discarded: only the elapsed time is of interest here.
(
    market5
    .groupBy("CITY", "BRANCH")
    .agg(F.sum(F.col("LINENETTOTAL")).alias("Total"))
    .orderBy(F.desc("Total"))
    .limit(20)
    .toPandas()
)

print("----- %s secs -----" %(time.perf_counter() - start_time))

----- 31.766934633255005 secs -----


## Execute AQE with bucketing + partitioning and 8 shuffle partitions

In [12]:
# Switch Adaptive Query Execution back on for the final experiment and read
# the value back to confirm.
spark.conf.set("spark.sql.adaptive.enabled", True)
spark.conf.get("spark.sql.adaptive.enabled")

'true'

In [13]:
# Set the number of shuffle partitions to 8 for this session, then read the
# value back to confirm.
spark.conf.set("spark.sql.shuffle.partitions", 8)
spark.conf.get("spark.sql.shuffle.partitions")

'8'

In [14]:
# Load the existing metastore table market5_pby_region as a DataFrame.
# NOTE(review): no cell in the visible notebook creates this table; presumably
# it was written beforehand (partitioned by REGION, judging by the name) --
# confirm it exists before running on a fresh environment.
market5_pby_region = spark.sql("select * from market5_pby_region")

In [15]:
# Bring three rows to the driver as a pandas frame for a quick look at the table.
market5_pby_region.limit(3).toPandas()

Unnamed: 0,LOGICALREF,COUNT_,ITEMCODE,ITEMNAME,FICHENO,DATE_,AMOUNT,PRICE,LINENETTOTAL,LINENET,...,BRANDCODE,BRAND,CATEGORY_NAME1,CATEGORY_NAME2,CATEGORY_NAME3,STARTDATE,ENDDATE,SPECODE,CAPIBLOCK_CREADEDDATE,REGION
0,4757121,1,1903,"COCA COLA 2,5 LT. EKO BOY",617176,2018-12-08,1.0,3.45,3.45,3.19,...,30,COCO COLA,İÇECEK,GAZLI İÇECEK,COLA,2018-12-09 19:48:31,2018-12-09 19:49:11,K,2018-07-14 02:25:02,Marmara
1,145836,1,18920,WINNER SLIMS,50248,2017-01-26,1.0,5.0,5.0,5.0,...,231,VİGOR,SİGARA,,,2017-01-27 14:24:33,2017-01-27 14:24:35,E,2018-07-14 02:24:06,Marmara
2,2067184,1,3117,PINAR MANGAL SUCUK,497754,2017-09-01,,29.75,3.27,3.03,...,118,PINAR,ET TAVUK,ET ŞARKÜTERİ,SUCUK,2017-09-02 11:03:50,2017-09-02 11:04:27,K,2018-07-14 02:16:24,Marmara


In [16]:
import time

# Time the "total net revenue per (CITY, BRANCH), top 20" query against the
# market5_pby_region table under the session's current AQE and
# shuffle-partition settings.
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# intervals; time.time() is a wall clock and can jump if the system clock is
# adjusted while the job runs.
start_time = time.perf_counter()

# toPandas() is the action that actually triggers the Spark job. The returned
# frame is deliberately discarded: only the elapsed time is of interest here.
(
    market5_pby_region
    .groupBy("CITY", "BRANCH")
    .agg(F.sum(F.col("LINENETTOTAL")).alias("Total"))
    .orderBy(F.desc("Total"))
    .limit(20)
    .toPandas()
)

print("----- %s secs -----" %(time.perf_counter() - start_time))

----- 5.5244200229644775 secs -----


In [18]:
# Stop the SparkSession now that the experiments are finished.
spark.stop()