In [1]:
import findspark
findspark.init("/opt/manual/spark")
from pyspark.sql import SparkSession, functions as F
from pyspark.sql.types import *

In [2]:
spark = (SparkSession.builder
         .master("yarn")
         .appName("Spark Bucketing")
         .enableHiveSupport()
         .config("spark.sql.shuffle.partitions", 4)
         .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
         .config("spark.memory.fraction", "0.8")
         .config("spark.memory.storageFraction", "0.1")
         .getOrCreate())

2022-09-24 21:57:19,101 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
2022-09-24 21:57:25,687 WARN yarn.Client: Neither spark.yarn.jars nor spark.yarn.archive is set, falling back to uploading libraries under SPARK_HOME.


# read market1mil

In [3]:
# wget -O /home/train/datasets/market1mil.csv.gz https://github.com/erkansirin78/datasets/raw/master/market1mil.csv.gz

# hdfs dfs -put ~/datasets/market1mil.csv.gz /user/train/datasets

In [5]:
market1mil = spark.read.format("csv") \
.option("header", True) \
.option("inferSchema", True) \
.option("sep",";") \
.load("/user/train/datasets/market1mil.csv.gz") \
.orderBy(F.rand()).withColumn("LOGICALREF", F.col("LOGICALREF").cast(IntegerType()))

                                                                                

In [6]:
market1mil.limit(3).toPandas()

2022-09-24 22:11:47,822 WARN util.package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

Unnamed: 0,LOGICALREF,COUNT_,ITEMCODE,ITEMNAME,FICHENO,DATE_,AMOUNT,PRICE,LINENETTOTAL,LINENET,...,CLIENTNAME,BRANDCODE,BRAND,CATEGORY_NAME1,CATEGORY_NAME2,CATEGORY_NAME3,STARTDATE,ENDDATE,SPECODE,CAPIBLOCK_CREADEDDATE
0,980673,1,8121,ORKID ULTRA PLATINUM COMFORT 12AD GECE,245555,21.05.2017 00:00,1,595,595,504,...,Berkay ÇİMENDAĞ,105,ORKİD,KAĞIT,HİJYENİK PEDLER,,22.05.2017 10:48,22.05.2017 10:49,E,14.07.2018 02:01
1,33490,1,2865,ETI PETIT BEURRE 400GR.NO:13101,23653,10.01.2017 00:00,1,27,27,251,...,Ebubekir TULUMCU,44,ETİ,GIDA,BÜSKİVİ ÇEREZ,BÜSKİVİ,11.01.2017 12:21,11.01.2017 12:22,E,14.07.2018 01:56
2,569347,1,3290,F SAFF 6LT BULAŞIK DETERJANI LİMONLU,148417,26.03.2017 00:00,1,875,875,742,...,Ömer Asaf EPİK,50,F SAFF,DETERJAN TEMİZLİK,BULAŞIK YIKAMA,ELDE YIKAMA,27.03.2017 11:51,27.03.2017 11:51,E,14.07.2018 02:19


In [7]:
market1mil.printSchema()

root
 |-- LOGICALREF: integer (nullable = true)
 |-- COUNT_: integer (nullable = true)
 |-- ITEMCODE: integer (nullable = true)
 |-- ITEMNAME: string (nullable = true)
 |-- FICHENO: integer (nullable = true)
 |-- DATE_: string (nullable = true)
 |-- AMOUNT: string (nullable = true)
 |-- PRICE: string (nullable = true)
 |-- LINENETTOTAL: string (nullable = true)
 |-- LINENET: string (nullable = true)
 |-- BRANCHNR: integer (nullable = true)
 |-- BRANCH: string (nullable = true)
 |-- SALESMAN: string (nullable = true)
 |-- CITY: string (nullable = true)
 |-- REGION: string (nullable = true)
 |-- LATITUDE: string (nullable = true)
 |-- LONGITUDE: string (nullable = true)
 |-- CLIENTCODE: string (nullable = true)
 |-- CLIENTNAME: string (nullable = true)
 |-- BRANDCODE: string (nullable = true)
 |-- BRAND: string (nullable = true)
 |-- CATEGORY_NAME1: string (nullable = true)
 |-- CATEGORY_NAME2: string (nullable = true)
 |-- CATEGORY_NAME3: string (nullable = true)
 |-- STARTDATE: string 

# read market 5 million dataset

In [8]:
# open https://downgit.github.io
# paste https://github.com/erkansirin78/datasets/tree/master/market5mil_parquet
# cd ~/Downloads/
# unzip market5mil_parquet.zip 
# mv market5mil_parquet ~/datasets/
# hdfs dfs -put ~/datasets/market5mil_parquet/ /user/train/datasets

In [9]:
market5mil = spark.read.format("parquet") \
.load("/user/train/datasets/market5mil_parquet") \
.orderBy(F.rand())

                                                                                

In [10]:
market5mil.limit(3).toPandas()

                                                                                

Unnamed: 0,LOGICALREF,COUNT_,ITEMCODE,ITEMNAME,FICHENO,DATE_,AMOUNT,PRICE,LINENETTOTAL,LINENET,...,CLIENTNAME,BRANDCODE,BRAND,CATEGORY_NAME1,CATEGORY_NAME2,CATEGORY_NAME3,STARTDATE,ENDDATE,SPECODE,CAPIBLOCK_CREADEDDATE
0,5222006,1,8826,BINGOSIL KREM 470ML.AMONYAKLI YENI,412324,2018-08-10,1,2.35,2.35,1.99,...,Bilal KİRET,224,BİNGO,DETERJAN TEMİZLİK,EV TEMİZLEYİCİ,MUTFAK BANYO TEMİZLEYİCİ,2018-08-11 17:36:00,2018-08-11 17:36:35,E,2018-07-14 01:48:30
1,1826812,1,5362,SİHİRLİ ELLER CİG KÖFTE 200GR,439922,2017-08-20,1,2.8,2.8,2.59,...,Nuray NURSAL,346,SİHİRLİ ELLER,GIDA,HAZIR YEMEKLER,MEZE,2017-08-21 15:44:27,2017-08-21 15:46:42,K,2018-07-14 01:59:23
2,1714015,1,21313,H.SAKIR SIVI SAB.400ML BADEM,412373,2017-08-10,1,2.95,2.95,2.5,...,Eslem BOİS,58,H.ŞAKİR,KOZMETİK,DUŞ BANYO,SIVI JEL SABUNLAR,2017-08-11 15:00:07,2017-08-11 15:01:15,K,2018-07-14 02:22:52


In [11]:
market5mil.printSchema()

root
 |-- LOGICALREF: integer (nullable = true)
 |-- COUNT_: integer (nullable = true)
 |-- ITEMCODE: string (nullable = true)
 |-- ITEMNAME: string (nullable = true)
 |-- FICHENO: string (nullable = true)
 |-- DATE_: timestamp (nullable = true)
 |-- AMOUNT: integer (nullable = true)
 |-- PRICE: float (nullable = true)
 |-- LINENETTOTAL: float (nullable = true)
 |-- LINENET: float (nullable = true)
 |-- BRANCHNR: string (nullable = true)
 |-- BRANCH: string (nullable = true)
 |-- SALESMAN: string (nullable = true)
 |-- CITY: string (nullable = true)
 |-- REGION: string (nullable = true)
 |-- LATITUDE: float (nullable = true)
 |-- LONGITUDE: float (nullable = true)
 |-- CLIENTCODE: string (nullable = true)
 |-- CLIENTNAME: string (nullable = true)
 |-- BRANDCODE: string (nullable = true)
 |-- BRAND: string (nullable = true)
 |-- CATEGORY_NAME1: string (nullable = true)
 |-- CATEGORY_NAME2: string (nullable = true)
 |-- CATEGORY_NAME3: string (nullable = true)
 |-- STARTDATE: timestamp (

In [12]:
spark.conf.get("spark.sql.sources.bucketing.enabled")

'true'

# Join

In [13]:
spark.conf.set("spark.sql.autoBroadcastJoinThreshold", -1)

In [14]:
# join and drop repeated cols from small table

joined_df = market1mil.join(market5mil, "LOGICALREF") \
.drop(*market1mil.columns)

In [15]:
# Measure the time spend for join

import time
start_time = time.time()

joined_df.limit(5).toPandas()

print("--- %s seconds ----" %(time.time() - start_time))

[Stage 9:>                                                          (0 + 1) / 1]

--- 16.879558801651 seconds ----


                                                                                

In [16]:
joined_df.explain("cost")

== Optimized Logical Plan ==
Project, Statistics(sizeInBytes=3.5 TiB)
+- Join Inner, (LOGICALREF#95 = LOGICALREF#152), Statistics(sizeInBytes=7.0 TiB)
   :- Project [cast(LOGICALREF#38 as int) AS LOGICALREF#95], Statistics(sizeInBytes=1035.3 KiB)
   :  +- Project [LOGICALREF#38], Statistics(sizeInBytes=2.4 MiB)
   :     +- Filter isnotnull(cast(LOGICALREF#38 as int)), Statistics(sizeInBytes=42.5 MiB)
   :        +- Relation[LOGICALREF#38,COUNT_#39,ITEMCODE#40,ITEMNAME#41,FICHENO#42,DATE_#43,AMOUNT#44,PRICE#45,LINENETTOTAL#46,LINENET#47,BRANCHNR#48,BRANCH#49,SALESMAN#50,CITY#51,REGION#52,LATITUDE#53,LONGITUDE#54,CLIENTCODE#55,CLIENTNAME#56,BRANDCODE#57,BRAND#58,CATEGORY_NAME1#59,CATEGORY_NAME2#60,CATEGORY_NAME3#61,... 4 more fields] csv, Statistics(sizeInBytes=42.5 MiB)
   +- Project [LOGICALREF#152], Statistics(sizeInBytes=7.0 MiB)
      +- Filter isnotnull(LOGICALREF#152), Statistics(sizeInBytes=227.0 MiB)
         +- Relation[LOGICALREF#152,COUNT_#153,ITEMCODE#154,ITEMNAME#155,FICHEN

# Write hive with buckets

### write market1mil

In [17]:
# note .bucketBy(8, "LOGICALREF") 
# And the time spend for writing

import time
start_time = time.time()

market1mil.orderBy(F.asc("LOGICALREF")) \
.write.format("parquet") \
.mode("overwrite") \
.bucketBy(8, "LOGICALREF") \
.saveAsTable("market1mil_tbl")

print("--- %s seconds ----" %(time.time() - start_time))

2022-09-24 22:14:36,453 WARN conf.HiveConf: HiveConf of name hive.stats.jdbc.timeout does not exist
2022-09-24 22:14:36,456 WARN conf.HiveConf: HiveConf of name hive.stats.retries.wait does not exist
2022-09-24 22:16:11,126 WARN session.SessionState: METASTORE_FILTER_HOOK will be ignored, since hive.security.authorization.manager is set to instance of HiveAuthorizerFactory.
2022-09-24 22:16:11,417 WARN conf.HiveConf: HiveConf of name hive.internal.ss.authz.settings.applied.marker does not exist
2022-09-24 22:16:11,417 WARN conf.HiveConf: HiveConf of name hive.stats.jdbc.timeout does not exist
2022-09-24 22:16:11,420 WARN conf.HiveConf: HiveConf of name hive.stats.retries.wait does not exist


--- 99.7611038684845 seconds ----


### write market5mil

In [18]:
import time
start_time = time.time()

market5mil.orderBy(F.asc("LOGICALREF")) \
.write.format("parquet") \
.mode("overwrite") \
.bucketBy(8, "LOGICALREF") \
.saveAsTable("market5mil_tbl")

print("--- %s seconds ----" %(time.time() - start_time))

                                                                                

--- 197.21966433525085 seconds ----


In [19]:
spark.sql("show tables").show()

2022-09-24 22:19:29,999 WARN metastore.ObjectStore: Failed to get database global_temp, returning NoSuchObjectException


+--------+---------------+-----------+
|database|      tableName|isTemporary|
+--------+---------------+-----------+
| default|    advertising|      false|
| default|   iris_parquet|      false|
| default| market1mil_tbl|      false|
| default| market5mil_tbl|      false|
| default|order_items_tbl|      false|
| default|     orders_tbl|      false|
+--------+---------------+-----------+



# Join bucketed dataframes

In [20]:
# stop spark, restart notebook, create sparksession then continue from here

In [21]:
market1mil_tbl = spark.sql("select * from market1mil_tbl")

In [22]:
market1mil_tbl.limit(5).toPandas()

Unnamed: 0,LOGICALREF,COUNT_,ITEMCODE,ITEMNAME,FICHENO,DATE_,AMOUNT,PRICE,LINENETTOTAL,LINENET,...,CLIENTNAME,BRANDCODE,BRAND,CATEGORY_NAME1,CATEGORY_NAME2,CATEGORY_NAME3,STARTDATE,ENDDATE,SPECODE,CAPIBLOCK_CREADEDDATE
0,503303,1,904,ICIM SUT 1LT SISE SUT,132731,17.03.2017 00:00,1,26,26,241,...,Necdet KİREMİT,67,İÇİM SEHER,SÜT KAHVALTILIK,SÜT,PASTÖRİZE SÜT,18.03.2017 16:51,18.03.2017 16:59,E,14.07.2018 01:56
1,503305,1,904,ICIM SUT 1LT SISE SUT,132731,17.03.2017 00:00,1,26,26,241,...,Necdet KİREMİT,67,İÇİM SEHER,SÜT KAHVALTILIK,SÜT,PASTÖRİZE SÜT,18.03.2017 16:51,18.03.2017 16:59,E,14.07.2018 01:56
2,503313,1,5701,PORTAKAL,132731,17.03.2017 00:00,1845,195,36,333,...,Necdet KİREMİT,A25,HAL,MEYVE SEBZE,MEYVE,,18.03.2017 16:51,18.03.2017 16:59,E,14.07.2018 01:56
3,503319,1,21666,TUZSUZ FISTIK,132731,17.03.2017 00:00,21,129,271,251,...,Necdet KİREMİT,8,ARMONİ,GIDA,BÜSKİVİ ÇEREZ,KURUYEMİŞ,18.03.2017 16:51,18.03.2017 16:59,E,14.07.2018 01:56
4,503321,1,5715,PATATES,132731,17.03.2017 00:00,409,259,1059,981,...,Necdet KİREMİT,A25,HAL,MEYVE SEBZE,SEBZE,,18.03.2017 16:51,18.03.2017 16:59,E,14.07.2018 01:56


In [23]:
market5mil_tbl = spark.sql("select * from market5mil_tbl")

In [24]:
market5mil_tbl.limit(5).toPandas()

Unnamed: 0,LOGICALREF,COUNT_,ITEMCODE,ITEMNAME,FICHENO,DATE_,AMOUNT,PRICE,LINENETTOTAL,LINENET,...,CLIENTNAME,BRANDCODE,BRAND,CATEGORY_NAME1,CATEGORY_NAME2,CATEGORY_NAME3,STARTDATE,ENDDATE,SPECODE,CAPIBLOCK_CREADEDDATE
0,5067166,1,5692,KARPUZ,296319,2018-06-17,,0.7,4.42,4.09,...,Berra ÇIKLAİPLİKÇİ,A25,HAL,MEYVE SEBZE,MEYVE,,2018-06-18 14:00:26,2018-06-18 14:00:46,K,2018-07-14 01:51:02
1,4018570,1,19100,TMO OSMANCIK PIRINC 2 KG,321461,2018-06-30,1.0,5.8,5.8,5.37,...,Turgay OZGUREL,49,F NEFFİS,GIDA,BAKLİYAT,PAKET BAKLİYAT,2018-07-01 13:33:51,2018-07-01 13:34:29,E,2018-07-14 01:52:04
2,5067172,1,2080,ERIKLI DOGAL MEMBA SU 5 LT,296323,2018-06-17,1.0,2.25,2.25,2.09,...,Cafer SANTUR,43,ERİKLİ,İÇECEK,SU MADENSUYU,SU,2018-06-18 14:27:02,2018-06-18 14:27:26,E,2018-07-14 02:05:33
3,4018575,1,9928,HELEN H. KANATLI UZUN 16 AD,266131,2018-06-01,1.0,2.5,2.5,2.12,...,Elif su KURBANOVA,26,CANLEYDİ,KAĞIT,HİJYENİK PEDLER,,2018-06-02 17:27:14,2018-06-02 17:27:25,K,2018-07-14 02:15:46
4,5067193,1,2079,ERIKLI DOGAL MEMBA SU 500ML,296333,2018-06-17,1.0,0.4,0.4,0.37,...,Muhammed Emir DOMRUL,43,ERİKLİ,İÇECEK,SU MADENSUYU,SU,2018-06-18 15:15:33,2018-06-18 15:16:12,E,2018-07-14 02:07:16


In [25]:
joined_df2 = market1mil_tbl.join(market5mil_tbl, "LOGICALREF") \
.drop(*market1mil_tbl.columns)

In [26]:
import time
start_time = time.time()

joined_df2.limit(5).toPandas()

print("--- %s seconds ----" %(time.time() - start_time))

[Stage 18:>                                                         (0 + 1) / 1]

--- 2.2843732833862305 seconds ----


                                                                                

In [27]:
joined_df2.explain("cost")

== Optimized Logical Plan ==
Project, Statistics(sizeInBytes=5.0 TiB)
+- Join Inner, (LOGICALREF#381 = LOGICALREF#437), Statistics(sizeInBytes=10.0 TiB)
   :- Project [LOGICALREF#381], Statistics(sizeInBytes=1247.7 KiB)
   :  +- Filter isnotnull(LOGICALREF#381), Statistics(sizeInBytes=49.5 MiB)
   :     +- Relation[LOGICALREF#381,COUNT_#382,ITEMCODE#383,ITEMNAME#384,FICHENO#385,DATE_#386,AMOUNT#387,PRICE#388,LINENETTOTAL#389,LINENET#390,BRANCHNR#391,BRANCH#392,SALESMAN#393,CITY#394,REGION#395,LATITUDE#396,LONGITUDE#397,CLIENTCODE#398,CLIENTNAME#399,BRANDCODE#400,BRAND#401,CATEGORY_NAME1#402,CATEGORY_NAME2#403,CATEGORY_NAME3#404,... 4 more fields] parquet, Statistics(sizeInBytes=49.5 MiB)
   +- Project [LOGICALREF#437], Statistics(sizeInBytes=8.2 MiB)
      +- Filter isnotnull(LOGICALREF#437), Statistics(sizeInBytes=267.5 MiB)
         +- Relation[LOGICALREF#437,COUNT_#438,ITEMCODE#439,ITEMNAME#440,FICHENO#441,DATE_#442,AMOUNT#443,PRICE#444,LINENETTOTAL#445,LINENET#446,BRANCHNR#447,BRAN

In [28]:
# It is expected to shorter time for join operation with bucketed tables

In [12]:
spark.stop()