In [1]:
import findspark
findspark.init("/opt/manual/spark")
from pyspark.sql import SparkSession, functions as F

In [2]:
# Create the notebook's SparkSession, or reuse one that is already running.
spark = (
    SparkSession
    .builder
    .appName("Adaptive Query Execution")  # application name for this session
    .master("yarn")                       # run against the YARN cluster manager
    .enableHiveSupport()                  # enable Hive support on the session
    .getOrCreate()
)

# Read Data

In [3]:
# Load the market5mil Parquet dataset and put its rows in random order.
# The random sort key is seeded so that a re-run of the notebook does not
# produce a completely fresh, unrepeatable ordering each time.
# NOTE(review): the exact row order can still differ between runs if the input
# partitioning changes, because rand() is evaluated per partition — confirm.
# NOTE(review): this global sort is part of market5's lineage, so it may be
# re-executed by every action on market5, including the timing cells — confirm
# that this is intended.
market5 = (
    spark.read.format("parquet")
    .load("/user/train/datasets/market5mil_parquet")
    .orderBy(F.rand(seed=42))
)

In [4]:
# Pull three rows to the driver as a pandas DataFrame for a quick look at the columns.
market5.limit(3).toPandas()

Unnamed: 0,LOGICALREF,COUNT_,ITEMCODE,ITEMNAME,FICHENO,DATE_,AMOUNT,PRICE,LINENETTOTAL,LINENET,...,CLIENTNAME,BRANDCODE,BRAND,CATEGORY_NAME1,CATEGORY_NAME2,CATEGORY_NAME3,STARTDATE,ENDDATE,SPECODE,CAPIBLOCK_CREADEDDATE
0,3292899,1,1803,CAFE CROWN KOPUKLU SUTLU KAHVE LATTE TEKLI,336963,2018-07-09,1.0,0.5,0.5,0.46,...,Ahmet COŞGEL,146,ÜLKER,İÇECEK,ÇAY KAHVE,KAHVE,2018-07-10 16:30:13,2018-07-10 16:30:26,E,2018-07-14 02:08:40
1,1812914,1,3094,PINAR HINDI SALAM KG,436896,2017-08-18,,24.75,6.19,5.73,...,Elife YAMAN,118,PINAR,ET TAVUK,ET ŞARKÜTERİ,SALAM,2017-08-19 14:32:15,2017-08-19 14:33:03,K,2018-07-14 02:03:52
2,3997491,1,3863,ETI CANGA 50GR,263942,2018-05-31,1.0,0.75,0.75,0.7,...,Aras ERBEYİ,44,ETİ,GIDA,BÜSKİVİ ÇEREZ,BÜSKİVİ,2018-06-01 09:00:30,2018-06-01 09:00:33,E,2018-07-14 02:25:00


# spark.sql.adaptive.enabled

In [5]:
# Turn Adaptive Query Execution (AQE) on for this session before the first timed run.
spark.conf.set("spark.sql.adaptive.enabled", True)

In [6]:
import time

# Time the whole query end to end: sum LINENETTOTAL per (CITY, BRANCH), keep
# the 10 largest totals and collect them to the driver as a pandas DataFrame.
# perf_counter() is a monotonic, high-resolution clock, so the measured
# interval cannot be distorted by system clock adjustments as time.time() can.
start_time = time.perf_counter()

(
    market5
    .groupBy("CITY", "BRANCH")
    .agg(F.sum(F.col("LINENETTOTAL")).alias("Total"))
    .orderBy(F.desc("Total"))
    .limit(10)
    .toPandas()
)

print("--- %s seconds ---" % (time.perf_counter() - start_time))

--- 82.82320880889893 seconds ---


# Without AQE (spark.sql.adaptive.enabled = false)

In [7]:
# Turn Adaptive Query Execution off for the comparison run, then read the
# setting back to show the value now in effect.
spark.conf.set("spark.sql.adaptive.enabled", False)
spark.conf.get("spark.sql.adaptive.enabled")

'false'

In [8]:
import time

# Time the whole query end to end: sum LINENETTOTAL per (CITY, BRANCH), keep
# the 10 largest totals and collect them to the driver as a pandas DataFrame.
# perf_counter() is a monotonic, high-resolution clock, so the measured
# interval cannot be distorted by system clock adjustments as time.time() can.
start_time = time.perf_counter()

(
    market5
    .groupBy("CITY", "BRANCH")
    .agg(F.sum(F.col("LINENETTOTAL")).alias("Total"))
    .orderBy(F.desc("Total"))
    .limit(10)
    .toPandas()
)

print("--- %s seconds ---" % (time.perf_counter() - start_time))

--- 46.57761883735657 seconds ---


## Execute with bucketing + partitioning and 8 shuffle partitions (AQE disabled)

In [9]:
# Keep Adaptive Query Execution off for this section as well (it was already
# set to False in the previous section), and read the setting back.
spark.conf.set("spark.sql.adaptive.enabled", False)
spark.conf.get("spark.sql.adaptive.enabled")

'false'

In [10]:
# Use 8 shuffle partitions for the queries that follow, then read the setting
# back to show the value now in effect.
spark.conf.set("spark.sql.shuffle.partitions", 8)
spark.conf.get("spark.sql.shuffle.partitions")

'8'

In [13]:
# Load every column of the market5mil_pby_region catalog table as a DataFrame.
# NOTE(review): this table is not created anywhere in the visible notebook; it
# is assumed to already exist in the metastore — confirm before a fresh run.
market5mil_pby_region = spark.table("market5mil_pby_region")

In [14]:
# Pull three rows to the driver as a pandas DataFrame for a quick look at the columns.
market5mil_pby_region.limit(3).toPandas()

Unnamed: 0,LOGICALREF,COUNT_,ITEMCODE,ITEMNAME,FICHENO,DATE_,AMOUNT,PRICE,LINENETTOTAL,LINENET,...,BRANDCODE,BRAND,CATEGORY_NAME1,CATEGORY_NAME2,CATEGORY_NAME3,STARTDATE,ENDDATE,SPECODE,CAPIBLOCK_CREADEDDATE,REGION
0,210147,1,2282,NESTLE PASTANEM KOSTEBEK 120,64854,2017-02-04,1.0,4.25,4.25,3.94,...,97,NESTLE,GIDA,ÇİKOLATA GOFRET,,2017-02-05 15:24:41,2017-02-05 15:24:53,E,2018-07-14 01:45:19,Akdeniz
1,41370,1,5694,DOMATES,25356,2017-01-11,,2.5,2.76,2.56,...,A25,HAL,MEYVE SEBZE,SEBZE,,2017-01-12 18:24:05,2017-01-12 18:25:16,E,2018-07-14 01:58:04,Akdeniz
2,192242,1,11,TRABZON&KOY EKMEGI,60978,2017-02-02,1.0,3.0,3.0,2.97,...,,,GIDA,UNLU MAMÜLLER,EKMEK,2017-02-03 10:57:12,2017-02-03 10:58:17,K,2018-07-14 02:19:40,Akdeniz


In [16]:
import time

# Time the whole query end to end on the market5mil_pby_region table: sum
# LINENETTOTAL per (CITY, BRANCH), keep the 20 largest totals and collect them
# to the driver as a pandas DataFrame.
# perf_counter() is a monotonic, high-resolution clock, so the measured
# interval cannot be distorted by system clock adjustments as time.time() can.
# NOTE(review): this cell keeps 20 rows while the earlier timing cells keep 10 —
# confirm the difference is intended when comparing the timings.
start_time = time.perf_counter()

(
    market5mil_pby_region
    .groupBy("CITY", "BRANCH")
    .agg(F.sum(F.col("LINENETTOTAL")).alias("Total"))
    .orderBy(F.desc("Total"))
    .limit(20)
    .toPandas()
)

print("--- %s seconds ---" % (time.perf_counter() - start_time))

--- 4.664134979248047 seconds ---


In [17]:
# Stop the SparkSession now that the notebook is finished.
spark.stop()