In [1]:
# findspark.init() is called with the local Spark installation path *before*
# pyspark is imported; presumably this is what makes the pyspark package
# importable in this kernel — confirm against the environment setup.
import findspark
findspark.init("/opt/manual/spark")
from pyspark.sql import SparkSession, functions as F

In [2]:
# Build the SparkSession used by every cell below (getOrCreate() returns an
# already-running session if one exists).
#   - master "yarn": the job is submitted to a YARN cluster
#   - Hive support is enabled; a later cell reads a table by name via spark.sql
spark = (
    SparkSession.builder
    .master("yarn")
    .appName("Spark Partitioning")
    .enableHiveSupport()
    .getOrCreate()
)

# Read data

In [3]:
# Load the market dataset from Parquet and order its rows randomly.
# NOTE(review): F.rand() is called without a seed, so the row order (and the
# 3-row preview in the next cell) will differ from run to run.
# NOTE(review): no cache()/persist() is applied, so presumably this random sort
# is re-evaluated by every action on market5, including the timed runs further
# down — confirm that this is intended.
market5 = (
    spark.read
    .format("parquet")
    .load("/user/train/datasets/market5mil_parquet")
    .orderBy(F.rand())
)

In [4]:
# Preview: pull only 3 rows to the driver and render them as a pandas frame.
market5.limit(3).toPandas()

Unnamed: 0,LOGICALREF,COUNT_,ITEMCODE,ITEMNAME,FICHENO,DATE_,AMOUNT,PRICE,LINENETTOTAL,LINENET,...,CLIENTNAME,BRANDCODE,BRAND,CATEGORY_NAME1,CATEGORY_NAME2,CATEGORY_NAME3,STARTDATE,ENDDATE,SPECODE,CAPIBLOCK_CREADEDDATE
0,4423612,1,3254,F BAKTAT ELMA SIRKESI 500 ML,512603,2018-09-17,1,1.9,1.9,1.76,...,Ali Eymen SERDAR,12,F BAKTAT,GIDA,HAZIR YEMEKLER,SİRKE SOS,2018-09-18 18:52:35,2018-09-18 18:54:20,E,2018-07-14 02:19:14
1,4317047,1,242,ULKER FINGER CIK.BITTER 12 GR,445655,2018-08-18,4,0.25,1.0,0.93,...,Sedat GÖZÜBERK,146,ÜLKER,GIDA,ÇİKOLATA GOFRET,,2018-08-19 17:33:30,2018-08-19 17:35:31,E,2018-07-14 02:06:09
2,2899404,1,8222,ETI FORM 45 GR,44173,2018-01-22,2,0.4,0.8,0.74,...,Gözde SEZEN,44,ETİ,GIDA,BÜSKİVİ ÇEREZ,BÜSKİVİ,2018-01-23 10:00:41,2018-01-23 10:00:51,K,2018-07-14 02:23:51


In [5]:
# Print column names, types and nullability of the loaded DataFrame.
market5.printSchema()

root
 |-- LOGICALREF: integer (nullable = true)
 |-- COUNT_: integer (nullable = true)
 |-- ITEMCODE: string (nullable = true)
 |-- ITEMNAME: string (nullable = true)
 |-- FICHENO: string (nullable = true)
 |-- DATE_: timestamp (nullable = true)
 |-- AMOUNT: integer (nullable = true)
 |-- PRICE: float (nullable = true)
 |-- LINENETTOTAL: float (nullable = true)
 |-- LINENET: float (nullable = true)
 |-- BRANCHNR: string (nullable = true)
 |-- BRANCH: string (nullable = true)
 |-- SALESMAN: string (nullable = true)
 |-- CITY: string (nullable = true)
 |-- REGION: string (nullable = true)
 |-- LATITUDE: float (nullable = true)
 |-- LONGITUDE: float (nullable = true)
 |-- CLIENTCODE: string (nullable = true)
 |-- CLIENTNAME: string (nullable = true)
 |-- BRANDCODE: string (nullable = true)
 |-- BRAND: string (nullable = true)
 |-- CATEGORY_NAME1: string (nullable = true)
 |-- CATEGORY_NAME2: string (nullable = true)
 |-- CATEGORY_NAME3: string (nullable = true)
 |-- STARTDATE: timestamp (

# spark.sql.shuffle.partitions

In [None]:
# One of the most popular tuning tips
# Default number is 200
# If too small: GC pressure, disk spilling
# If too large: inefficient I/O, scheduler pressure
# Hard to tune: a single value applies to the whole query plan

In [7]:
# Show the session's current shuffle partition count before changing it.
spark.conf.get("spark.sql.shuffle.partitions")

'200'

## Execute with 8

In [8]:
# Use 8 shuffle partitions for the first timed run.
spark.conf.set("spark.sql.shuffle.partitions", 8)

In [9]:
# Read the setting back to confirm the new value took effect.
spark.conf.get("spark.sql.shuffle.partitions")

'8'

In [10]:
import time

# Time one end-to-end run of the aggregation under the current
# spark.sql.shuffle.partitions setting.
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed intervals; time.time() is wall-clock time and can jump if the system
# clock is adjusted mid-run.
start_time = time.perf_counter()

# Sum LINENETTOTAL per (CITY, BRANCH) and keep the 20 largest totals.
# toPandas() is the action that forces Spark to execute the plan; the returned
# pandas frame is not kept or displayed because only the elapsed time matters here.
(
    market5
    .groupBy("CITY", "BRANCH")
    .agg(F.sum(F.col("LINENETTOTAL")).alias("Total"))
    .orderBy(F.desc("Total"))
    .limit(20)
    .toPandas()
)

print("----- %s secs -----" %(time.perf_counter() - start_time))

----- 19.872991800308228 secs -----


## Execute with 200

In [11]:
# Switch to 200 shuffle partitions for the second timed run,
# then read the setting back to confirm it.
spark.conf.set("spark.sql.shuffle.partitions", 200)
spark.conf.get("spark.sql.shuffle.partitions")

'200'

In [12]:
import time

# Time the same aggregation again, now under the shuffle partition count set
# in the previous cell.
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed intervals; time.time() is wall-clock time and can jump if the system
# clock is adjusted mid-run.
start_time = time.perf_counter()

# Sum LINENETTOTAL per (CITY, BRANCH) and keep the 20 largest totals.
# toPandas() is the action that forces Spark to execute the plan; the returned
# pandas frame is not kept or displayed because only the elapsed time matters here.
(
    market5
    .groupBy("CITY", "BRANCH")
    .agg(F.sum(F.col("LINENETTOTAL")).alias("Total"))
    .orderBy(F.desc("Total"))
    .limit(20)
    .toPandas()
)

print("----- %s secs -----" %(time.perf_counter() - start_time))

----- 31.6656231880188 secs -----


## Execute with bucketing+partitioning and 8

In [14]:
# Go back to 8 shuffle partitions for the run against the pre-partitioned table,
# then read the setting back to confirm it.
spark.conf.set("spark.sql.shuffle.partitions", 8)
spark.conf.get("spark.sql.shuffle.partitions")

'8'

In [15]:
# Load every column of the table `market5_pby_region` through Spark SQL.
# NOTE(review): this table is not created anywhere in the visible notebook —
# presumably it was written beforehand as a bucketed table partitioned by
# REGION (per the section heading); confirm it exists before running this cell.
market5_pby_region = spark.sql("select * from market5_pby_region")

In [16]:
# Preview: pull only 3 rows to the driver and render them as a pandas frame.
market5_pby_region.limit(3).toPandas()

Unnamed: 0,LOGICALREF,COUNT_,ITEMCODE,ITEMNAME,FICHENO,DATE_,AMOUNT,PRICE,LINENETTOTAL,LINENET,...,BRANDCODE,BRAND,CATEGORY_NAME1,CATEGORY_NAME2,CATEGORY_NAME3,STARTDATE,ENDDATE,SPECODE,CAPIBLOCK_CREADEDDATE,REGION
0,4757121,1,1903,"COCA COLA 2,5 LT. EKO BOY",617176,2018-12-08,1.0,3.45,3.45,3.19,...,30,COCO COLA,İÇECEK,GAZLI İÇECEK,COLA,2018-12-09 19:48:31,2018-12-09 19:49:11,K,2018-07-14 02:25:02,Marmara
1,145836,1,18920,WINNER SLIMS,50248,2017-01-26,1.0,5.0,5.0,5.0,...,231,VİGOR,SİGARA,,,2017-01-27 14:24:33,2017-01-27 14:24:35,E,2018-07-14 02:24:06,Marmara
2,2067184,1,3117,PINAR MANGAL SUCUK,497754,2017-09-01,,29.75,3.27,3.03,...,118,PINAR,ET TAVUK,ET ŞARKÜTERİ,SUCUK,2017-09-02 11:03:50,2017-09-02 11:04:27,K,2018-07-14 02:16:24,Marmara


In [17]:
import time

# Time the same aggregation as the earlier runs, this time reading from the
# market5_pby_region table instead of the raw Parquet DataFrame.
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed intervals; time.time() is wall-clock time and can jump if the system
# clock is adjusted mid-run.
start_time = time.perf_counter()

# Sum LINENETTOTAL per (CITY, BRANCH) and keep the 20 largest totals.
# toPandas() is the action that forces Spark to execute the plan; the returned
# pandas frame is not kept or displayed because only the elapsed time matters here.
(
    market5_pby_region
    .groupBy("CITY", "BRANCH")
    .agg(F.sum(F.col("LINENETTOTAL")).alias("Total"))
    .orderBy(F.desc("Total"))
    .limit(20)
    .toPandas()
)

print("----- %s secs -----" %(time.perf_counter() - start_time))

----- 5.07073187828064 secs -----


In [18]:
# Stop the Spark session now that all timed runs are finished.
spark.stop()