In [1]:
# findspark.init is pointed at the Spark installation under /opt/manual/spark
# and is called before the pyspark import below (presumably so that pyspark
# becomes importable in this kernel) -- keep this order.
import findspark
findspark.init("/opt/manual/spark")
from pyspark.sql import SparkSession, functions as F

In [2]:
# Session-level Spark settings, applied in the order listed.
spark_settings = {
    "spark.sql.shuffle.partitions": 4,
    "spark.serializer": "org.apache.spark.serializer.KryoSerializer",
    "spark.memory.fraction": "0.8",
    "spark.memory.storageFraction": "0.1",
}

# YARN-backed session with Hive support, needed later for saveAsTable / spark.sql.
session_builder = SparkSession.builder.master("yarn")
session_builder = session_builder.appName("Spark Partitioning")
session_builder = session_builder.enableHiveSupport()
for setting_name, setting_value in spark_settings.items():
    session_builder = session_builder.config(setting_name, setting_value)

spark = session_builder.getOrCreate()

2022-09-24 22:34:04,269 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
2022-09-24 22:34:10,685 WARN yarn.Client: Neither spark.yarn.jars nor spark.yarn.archive is set, falling back to uploading libraries under SPARK_HOME.


# Read data

In [3]:
# Load the market5mil parquet dataset and put its rows in random order.
# The random sort key is seeded so that the row order (and therefore the
# previews and timings that follow) is reproducible across kernel restarts;
# an unseeded F.rand() picks a fresh seed every time the notebook is re-run.
SHUFFLE_SEED = 42

market5 = (
    spark.read.format("parquet")
    .load("/user/train/datasets/market5mil_parquet")
    .orderBy(F.rand(seed=SHUFFLE_SEED))
)

                                                                                

In [4]:
# Preview three rows as a pandas frame; limit(3) keeps the driver-side collect small.
market5.limit(3).toPandas()

2022-09-24 22:35:07,268 WARN util.package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

Unnamed: 0,LOGICALREF,COUNT_,ITEMCODE,ITEMNAME,FICHENO,DATE_,AMOUNT,PRICE,LINENETTOTAL,LINENET,...,CLIENTNAME,BRANDCODE,BRAND,CATEGORY_NAME1,CATEGORY_NAME2,CATEGORY_NAME3,STARTDATE,ENDDATE,SPECODE,CAPIBLOCK_CREADEDDATE
0,1785069,1,23560,KOSKA KIS HELVASI KAKAOLU 2012,430203,2017-08-15,,10.95,3.57,3.31,...,Alper ARUKASLAN,83,KOSKA,SÜT KAHVALTILIK,KAHVALTILIK,HELVA,2017-08-16 07:16:51,2017-08-16 07:17:45,E,2018-07-14 01:53:47
1,3015690,1,22628,ULKER HANIMELLER 110 GR.118-,203778,2018-04-27,1.0,1.0,1.0,0.93,...,Beril AĞBABA,146,ÜLKER,GIDA,BÜSKİVİ ÇEREZ,BÜSKİVİ,2018-04-28 20:00:22,2018-04-28 20:02:12,K,2018-07-14 01:51:48
2,2023103,1,3612,ETI BURCAK 140 GR,487198,2017-08-30,1.0,1.0,1.0,0.93,...,Resul AKTINAKI,44,ETİ,GIDA,BÜSKİVİ ÇEREZ,BÜSKİVİ,2017-08-31 14:36:18,2017-08-31 14:37:42,E,2018-07-14 02:05:43


In [5]:
# Print column names, types and nullability of the loaded DataFrame.
market5.printSchema()

root
 |-- LOGICALREF: integer (nullable = true)
 |-- COUNT_: integer (nullable = true)
 |-- ITEMCODE: string (nullable = true)
 |-- ITEMNAME: string (nullable = true)
 |-- FICHENO: string (nullable = true)
 |-- DATE_: timestamp (nullable = true)
 |-- AMOUNT: integer (nullable = true)
 |-- PRICE: float (nullable = true)
 |-- LINENETTOTAL: float (nullable = true)
 |-- LINENET: float (nullable = true)
 |-- BRANCHNR: string (nullable = true)
 |-- BRANCH: string (nullable = true)
 |-- SALESMAN: string (nullable = true)
 |-- CITY: string (nullable = true)
 |-- REGION: string (nullable = true)
 |-- LATITUDE: float (nullable = true)
 |-- LONGITUDE: float (nullable = true)
 |-- CLIENTCODE: string (nullable = true)
 |-- CLIENTNAME: string (nullable = true)
 |-- BRANDCODE: string (nullable = true)
 |-- BRAND: string (nullable = true)
 |-- CATEGORY_NAME1: string (nullable = true)
 |-- CATEGORY_NAME2: string (nullable = true)
 |-- CATEGORY_NAME3: string (nullable = true)
 |-- STARTDATE: timestamp (

In [6]:
# Shuffle-requiring group-by query on the DataFrame read straight from parquet:
# total LINENETTOTAL per (CITY, BRANCH), top 20 by total.

import time

# perf_counter is a monotonic clock intended for measuring elapsed intervals;
# time.time() can jump if the system clock is adjusted mid-measurement.
start_time = time.perf_counter()

# Keep the result instead of discarding it so the cell shows what it timed.
city_branch_totals = (
    market5.groupBy("CITY", "BRANCH")
    .agg(F.sum(F.col("LINENETTOTAL")).alias("Total"))
    .orderBy(F.desc("Total"))
    .limit(20)
    .toPandas()
)

print("----- %s secs -----" %(time.perf_counter() - start_time))

city_branch_totals

                                                                                

----- 24.290632486343384 secs -----


# Write to Hive

In [7]:
# Persist market5 as the Hive table "market5_pby_region": parquet files,
# one directory per REGION value, rows bucketed into 8 buckets by CITY.
# mode("overwrite") replaces any existing table of the same name.
#
# NOTE(review): orderBy("CITY") is a global sort and forces a full shuffle
# before the write; if only ordering inside each bucket is wanted,
# DataFrameWriter.sortBy("CITY") after bucketBy may be cheaper -- confirm intent.

# perf_counter is a monotonic clock intended for measuring elapsed intervals;
# time.time() can jump if the system clock is adjusted during this long write.
start_time = time.perf_counter()

(
    market5.orderBy("CITY")
    .write.format("parquet")
    .partitionBy("REGION")
    .bucketBy(8, "CITY")
    .mode("overwrite")
    .saveAsTable("market5_pby_region")
)

print("----- %s secs -----" %(time.perf_counter() - start_time))

2022-09-24 22:36:17,251 WARN conf.HiveConf: HiveConf of name hive.stats.jdbc.timeout does not exist
2022-09-24 22:36:17,254 WARN conf.HiveConf: HiveConf of name hive.stats.retries.wait does not exist
2022-09-24 22:40:54,378 WARN session.SessionState: METASTORE_FILTER_HOOK will be ignored, since hive.security.authorization.manager is set to instance of HiveAuthorizerFactory.
2022-09-24 22:40:56,658 WARN conf.HiveConf: HiveConf of name hive.internal.ss.authz.settings.applied.marker does not exist
2022-09-24 22:40:56,658 WARN conf.HiveConf: HiveConf of name hive.stats.jdbc.timeout does not exist
2022-09-24 22:40:56,661 WARN conf.HiveConf: HiveConf of name hive.stats.retries.wait does not exist
2022-09-24 22:40:59,660 WARN metastore.ObjectStore: Failed to get database global_temp, returning NoSuchObjectException


----- 291.32809805870056 secs -----


In [8]:
# Read the Hive table written above back as a DataFrame (all columns).
market5_pby_region = spark.sql("select * from market5_pby_region")

In [9]:
# Preview three rows of the table-backed DataFrame as a pandas frame.
market5_pby_region.limit(3).toPandas()

                                                                                

Unnamed: 0,LOGICALREF,COUNT_,ITEMCODE,ITEMNAME,FICHENO,DATE_,AMOUNT,PRICE,LINENETTOTAL,LINENET,...,BRANDCODE,BRAND,CATEGORY_NAME1,CATEGORY_NAME2,CATEGORY_NAME3,STARTDATE,ENDDATE,SPECODE,CAPIBLOCK_CREADEDDATE,REGION
0,2133878,1,8367,FAMILIA PECETE 100 AD.,514329,2017-09-18,2,2.25,4.5,3.81,...,226,FAMİLY,KAĞIT,KAĞIT PEÇETELER,,2017-09-19 19:56:43,2017-09-19 20:00:23,E,2018-07-14 02:20:51,Marmara
1,145836,1,18920,WINNER SLIMS,50248,2017-01-26,1,5.0,5.0,5.0,...,231,VİGOR,SİGARA,,,2017-01-27 14:24:33,2017-01-27 14:24:35,E,2018-07-14 02:24:06,Marmara
2,4728998,1,3130,PINAR PIZZA 4LU EKO PAKET,614079,2018-12-06,1,12.35,12.35,11.44,...,118,PINAR,GIDA,DONDURULMUŞ GIDA,,2018-12-07 17:16:33,2018-12-07 17:17:20,E,2018-07-14 02:15:48,Marmara


In [10]:
# Same CITY/BRANCH aggregation as run earlier on market5, this time reading
# from the partitioned and bucketed Hive table, timed for comparison.

import time

# perf_counter is a monotonic clock intended for measuring elapsed intervals;
# time.time() can jump if the system clock is adjusted mid-measurement.
start_time = time.perf_counter()

# Keep the result instead of discarding it so the cell shows what it timed.
city_branch_totals_pby_region = (
    market5_pby_region.groupBy("CITY", "BRANCH")
    .agg(F.sum(F.col("LINENETTOTAL")).alias("Total"))
    .orderBy(F.desc("Total"))
    .limit(20)
    .toPandas()
)

print("----- %s secs -----" %(time.perf_counter() - start_time))

city_branch_totals_pby_region



----- 14.700294494628906 secs -----


                                                                                

In [11]:
# List the table's directory under the Hive warehouse on HDFS (IPython shell
# escape); the write used partitionBy("REGION"), so REGION=<value>
# sub-directories are expected here.
! hdfs dfs -ls /user/hive/warehouse/market5_pby_region

Found 8 items
drwxr-xr-x   - train hive          0 2022-09-24 22:40 /user/hive/warehouse/market5_pby_region/REGION=Akdeniz
drwxr-xr-x   - train hive          0 2022-09-24 22:40 /user/hive/warehouse/market5_pby_region/REGION=Doğu Anadolu
drwxr-xr-x   - train hive          0 2022-09-24 22:40 /user/hive/warehouse/market5_pby_region/REGION=Ege
drwxr-xr-x   - train hive          0 2022-09-24 22:40 /user/hive/warehouse/market5_pby_region/REGION=Güneydoğu Anadolu
drwxr-xr-x   - train hive          0 2022-09-24 22:40 /user/hive/warehouse/market5_pby_region/REGION=Karadeniz
drwxr-xr-x   - train hive          0 2022-09-24 22:40 /user/hive/warehouse/market5_pby_region/REGION=Marmara
drwxr-xr-x   - train hive          0 2022-09-24 22:40 /user/hive/warehouse/market5_pby_region/REGION=İç Anadolu
-rw-r--r--   1 train hive          0 2022-09-24 22:40 /user/hive/warehouse/market5_pby_region/_SUCCESS


In [12]:
# Stop the Spark session created at the top of the notebook.
spark.stop()