In [1]:
import findspark
findspark.init("/opt/manual/spark")
from pyspark.sql import SparkSession, functions as F

In [2]:
# Create (or reuse, via getOrCreate) a Hive-enabled Spark session on YARN.
session_builder = SparkSession.builder.appName("Spark Partitioning").master("yarn")
spark = session_builder.enableHiveSupport().getOrCreate()

In [3]:
# Load the market dataset from Parquet and shuffle it into a random row order.
# A fixed seed keeps the shuffle (and the previews below) repeatable for the
# same input files and partitioning.
# NOTE(review): the sort is lazy and is part of the lineage of every later
# action on market5mil, so the timings below may include it -- confirm intended.
market5mil = (
    spark.read.format("parquet")
    .load("/user/train/datasets/market5mil_parquet")
    .orderBy(F.rand(seed=42))
)

In [4]:
# Preview two rows, converted to a pandas DataFrame for rich display.
market5mil.limit(2).toPandas()

Unnamed: 0,LOGICALREF,COUNT_,ITEMCODE,ITEMNAME,FICHENO,DATE_,AMOUNT,PRICE,LINENETTOTAL,LINENET,...,CLIENTNAME,BRANDCODE,BRAND,CATEGORY_NAME1,CATEGORY_NAME2,CATEGORY_NAME3,STARTDATE,ENDDATE,SPECODE,CAPIBLOCK_CREADEDDATE
0,3239031,1,11742,ULKER GRANINI 330 ML KARISIK,230439,2018-05-12,1,1.25,1.25,1.16,...,Şaziye FERSİZ,146,ÜLKER,İÇECEK,GAZSIZ İÇECEK,MEYVE SUYU,2018-05-13 12:12:40,2018-05-13 12:12:44,K,2018-07-14 02:09:02
1,3208362,1,8664,BINGO SIVI DET.4 LT.RENKLI,226316,2018-05-10,1,14.25,14.25,12.08,...,Hamdi REİSOGLU,224,BİNGO,DETERJAN TEMİZLİK,ÇAMAŞIR YIKAMA,SIVI JEL DETERJAN,2018-05-11 14:30:46,2018-05-11 14:33:46,E,2018-07-14 02:17:04


In [5]:
# Print each column's name, type and nullability.
market5mil.printSchema()

root
 |-- LOGICALREF: integer (nullable = true)
 |-- COUNT_: integer (nullable = true)
 |-- ITEMCODE: string (nullable = true)
 |-- ITEMNAME: string (nullable = true)
 |-- FICHENO: string (nullable = true)
 |-- DATE_: timestamp (nullable = true)
 |-- AMOUNT: integer (nullable = true)
 |-- PRICE: float (nullable = true)
 |-- LINENETTOTAL: float (nullable = true)
 |-- LINENET: float (nullable = true)
 |-- BRANCHNR: string (nullable = true)
 |-- BRANCH: string (nullable = true)
 |-- SALESMAN: string (nullable = true)
 |-- CITY: string (nullable = true)
 |-- REGION: string (nullable = true)
 |-- LATITUDE: float (nullable = true)
 |-- LONGITUDE: float (nullable = true)
 |-- CLIENTCODE: string (nullable = true)
 |-- CLIENTNAME: string (nullable = true)
 |-- BRANDCODE: string (nullable = true)
 |-- BRAND: string (nullable = true)
 |-- CATEGORY_NAME1: string (nullable = true)
 |-- CATEGORY_NAME2: string (nullable = true)
 |-- CATEGORY_NAME3: string (nullable = true)
 |-- STARTDATE: timestamp (

In [6]:
import time

# Baseline: total LINENETTOTAL per (CITY, BRANCH) on the DataFrame read from
# the source Parquet files, keeping the 10 largest totals.
# perf_counter() is a high-resolution clock intended for measuring durations.
start_time = time.perf_counter()

top_branches_source = (
    market5mil.groupBy("CITY", "BRANCH")
    .agg(F.sum(F.col("LINENETTOTAL")).alias("Total"))
    .orderBy(F.desc("Total"))
    .limit(10)
    .toPandas()
)

print("--- %s seconds ---" %(time.perf_counter()- start_time))

# Display the result instead of discarding it, so runs can be compared.
top_branches_source

--- 38.93350434303284 seconds ---


# Write to Hive as a table partitioned by REGION and bucketed by CITY

In [8]:
# Save the data as a Hive table in Parquet format: one directory per REGION
# value, rows bucketed on CITY into 8 buckets, overwriting any existing table.
# NOTE(review): the global orderBy("CITY") adds a full sort before the write --
# confirm it is needed.
start_time = time.time()

(
    market5mil.orderBy("CITY")
    .write.format("parquet")
    .partitionBy("REGION")
    .bucketBy(8, "CITY")
    .mode("overwrite")
    .saveAsTable("market5mil_pby_region")
)

# Elapsed wall-clock seconds for the sort plus the table write.
print("----- %s -----" %(time.time() - start_time))

----- 183.67112684249878 -----


In [9]:
# Load the saved Hive table back into a DataFrame through Spark SQL.
market5mil_pby_region = spark.sql("select * from market5mil_pby_region")

In [10]:
# Preview three rows of the table; REGION now appears as the last column.
market5mil_pby_region.limit(3).toPandas()

Unnamed: 0,LOGICALREF,COUNT_,ITEMCODE,ITEMNAME,FICHENO,DATE_,AMOUNT,PRICE,LINENETTOTAL,LINENET,...,BRANDCODE,BRAND,CATEGORY_NAME1,CATEGORY_NAME2,CATEGORY_NAME3,STARTDATE,ENDDATE,SPECODE,CAPIBLOCK_CREADEDDATE,REGION
0,210147,1,2282,NESTLE PASTANEM KOSTEBEK 120,64854,2017-02-04,1.0,4.25,4.25,3.94,...,97,NESTLE,GIDA,ÇİKOLATA GOFRET,,2017-02-05 15:24:41,2017-02-05 15:24:53,E,2018-07-14 01:45:19,Akdeniz
1,41370,1,5694,DOMATES,25356,2017-01-11,,2.5,2.76,2.56,...,A25,HAL,MEYVE SEBZE,SEBZE,,2017-01-12 18:24:05,2017-01-12 18:25:16,E,2018-07-14 01:58:04,Akdeniz
2,192242,1,11,TRABZON&KOY EKMEGI,60978,2017-02-02,1.0,3.0,3.0,2.97,...,,,GIDA,UNLU MAMÜLLER,EKMEK,2017-02-03 10:57:12,2017-02-03 10:58:17,K,2018-07-14 02:19:40,Akdeniz


In [11]:
# Re-run the baseline aggregation on the source DataFrame so its timing is
# taken right before the table-backed run below. `time` is already imported
# by the first timing cell, so it is not re-imported here.
# perf_counter() is a high-resolution clock intended for measuring durations.
start_time = time.perf_counter()

top_branches_source_rerun = (
    market5mil.groupBy("CITY", "BRANCH")
    .agg(F.sum(F.col("LINENETTOTAL")).alias("Total"))
    .orderBy(F.desc("Total"))
    .limit(10)
    .toPandas()
)

print("--- %s seconds ---" %(time.perf_counter()- start_time))

# Display the result instead of discarding it, so runs can be compared.
top_branches_source_rerun

--- 34.97564673423767 seconds ---


In [12]:
# Same aggregation, now on the Hive table (partitioned by REGION, bucketed by
# CITY). The limit is 10, matching the baseline cells, so both timings measure
# the same query. `time` is already imported by the first timing cell.
# perf_counter() is a high-resolution clock intended for measuring durations.
start_time = time.perf_counter()

top_branches_table = (
    market5mil_pby_region.groupBy("CITY", "BRANCH")
    .agg(F.sum(F.col("LINENETTOTAL")).alias("Total"))
    .orderBy(F.desc("Total"))
    .limit(10)
    .toPandas()
)

print("--- %s seconds ---" %(time.perf_counter()- start_time))

# Display the result instead of discarding it, so it can be checked against
# the baseline output.
top_branches_table

--- 5.712671518325806 seconds ---


In [13]:
# List the table's warehouse directory on HDFS; partitionBy("REGION") shows up
# as one REGION=<value> subdirectory per region.
! hdfs dfs -ls /user/hive/warehouse/market5mil_pby_region

Found 8 items
drwxr-xr-x   - train hive          0 2025-06-14 13:10 /user/hive/warehouse/market5mil_pby_region/REGION=Akdeniz
drwxr-xr-x   - train hive          0 2025-06-14 13:10 /user/hive/warehouse/market5mil_pby_region/REGION=Doğu Anadolu
drwxr-xr-x   - train hive          0 2025-06-14 13:10 /user/hive/warehouse/market5mil_pby_region/REGION=Ege
drwxr-xr-x   - train hive          0 2025-06-14 13:10 /user/hive/warehouse/market5mil_pby_region/REGION=Güneydoğu Anadolu
drwxr-xr-x   - train hive          0 2025-06-14 13:10 /user/hive/warehouse/market5mil_pby_region/REGION=Karadeniz
drwxr-xr-x   - train hive          0 2025-06-14 13:10 /user/hive/warehouse/market5mil_pby_region/REGION=Marmara
drwxr-xr-x   - train hive          0 2025-06-14 13:10 /user/hive/warehouse/market5mil_pby_region/REGION=İç Anadolu
-rw-r--r--   1 train hive          0 2025-06-14 13:10 /user/hive/warehouse/market5mil_pby_region/_SUCCESS


In [14]:
# Stop the Spark session now that the notebook is finished.
spark.stop()