# PARTITION BY

In [1]:
# Bootstrap the local Spark installation before anything imports pyspark.
# Per findspark's documentation, init() locates Spark and adds pyspark to
# sys.path, so this cell has to run ahead of the `pyspark` import below.
import findspark
findspark.init()

In [2]:
from pyspark.sql import SparkSession

# Reuse the session already attached to this kernel if there is one;
# otherwise start a fresh session with default settings.
spark = (
    SparkSession.builder
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/08/20 18:32:22 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Load the bank-customers CSV: the first row is the header, fields are
# comma-separated, and column types are inferred from the data.
df_bank_customer = (
    spark.read
    .format("csv")
    .option("inferschema", True)
    .option("header", True)
    .option("sep", ",")
    .load("../datasets/bank_customers.csv")
)

In [4]:
# Preview the data; the arguments spell out show()'s defaults
# (first 20 rows, long cell values truncated).
df_bank_customer.show(n=20, truncate=True)

+---------+----------+---------+-----------+---------+------+---+------+---------+-------------+---------+--------------+---------------+------+
|RowNumber|CustomerId|  Surname|CreditScore|Geography|Gender|Age|Tenure|  Balance|NumOfProducts|HasCrCard|IsActiveMember|EstimatedSalary|Exited|
+---------+----------+---------+-----------+---------+------+---+------+---------+-------------+---------+--------------+---------------+------+
|        1|  15634602| Hargrave|        619|   France|Female| 42|     2|      0.0|            1|        1|             1|      101348.88|     1|
|        2|  15647311|     Hill|        608|    Spain|Female| 41|     1| 83807.86|            1|        0|             1|      112542.58|     0|
|        3|  15619304|     Onio|        502|   France|Female| 42|     8| 159660.8|            3|        1|             0|      113931.57|     1|
|        4|  15701354|     Boni|        699|   France|Female| 39|     1|      0.0|            2|        0|             0|       93

In [5]:
# Print the column names and the types Spark inferred while reading the CSV.
df_bank_customer.printSchema()

root
 |-- RowNumber: integer (nullable = true)
 |-- CustomerId: integer (nullable = true)
 |-- Surname: string (nullable = true)
 |-- CreditScore: integer (nullable = true)
 |-- Geography: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Tenure: integer (nullable = true)
 |-- Balance: double (nullable = true)
 |-- NumOfProducts: integer (nullable = true)
 |-- HasCrCard: integer (nullable = true)
 |-- IsActiveMember: integer (nullable = true)
 |-- EstimatedSalary: double (nullable = true)
 |-- Exited: integer (nullable = true)



## PartitionBy One Column

In [6]:
# Row count per Geography value, i.e. how many rows each partition
# directory will receive when the data is written partitioned by Geography.
(
    df_bank_customer
    .groupBy("Geography")
    .count()
    .show(truncate=False)
)

+---------+-----+
|Geography|count|
+---------+-----+
|Germany  |2509 |
|France   |5014 |
|Spain    |2477 |
+---------+-----+



In [7]:
# Write the data as CSV, partitioned by Geography; any previous output at
# the target path is replaced.
(
    df_bank_customer.write
    .option("header", True)
    .partitionBy("Geography")
    .mode("overwrite")
    .csv("../output_partitions/bank/")
)

                                                                                

In [8]:
# Inspect the output root: expect one `Geography=<value>` directory per
# distinct Geography value, plus the _SUCCESS marker file.
%ls ../output_partitions/bank/

[0m[01;34m'Geography=France'[0m/  [01;34m'Geography=Germany'[0m/  [01;34m'Geography=Spain'[0m/   _SUCCESS


## PartitionBy Multiple Columns

In [10]:
# Row count per (Geography, HasCrCard) combination; each combination maps
# to one nested partition directory in the two-column write below.
(
    df_bank_customer
    .groupBy("Geography", "HasCrCard")
    .count()
    .show(truncate=False)
)

+---------+---------+-----+
|Geography|HasCrCard|count|
+---------+---------+-----+
|Germany  |0        |718  |
|Spain    |0        |756  |
|France   |0        |1471 |
|Germany  |1        |1791 |
|France   |1        |3543 |
|Spain    |1        |1721 |
+---------+---------+-----+



In [11]:
# Partition by two columns. The directory nesting follows the argument
# order: Geography=<value>/HasCrCard=<value>/.
(
    df_bank_customer.write
    .option("header", True)
    .partitionBy("Geography", "HasCrCard")
    .mode("overwrite")
    .csv("../output_partitions/bank_v2/")
)

                                                                                

In [15]:
# List the part files inside one nested partition (Geography first, then
# HasCrCard) to confirm the directory layout of the two-column write.
%ls ../output_partitions/bank_v2/Geography=Germany/HasCrCard=1/

part-00000-3ed94a69-1ce7-4c92-b81c-c041811d090d.c000.csv


In [16]:
# Same two partition columns in the opposite order, so the nesting flips
# to HasCrCard=<value>/Geography=<value>/.
(
    df_bank_customer.write
    .option("header", True)
    .partitionBy("HasCrCard", "Geography")
    .mode("overwrite")
    .csv("../output_partitions/bank_v3/")
)

                                                                                

In [18]:
# With the column order reversed, HasCrCard is now the outer directory and
# Geography the inner one.
%ls ../output_partitions/bank_v3/HasCrCard=0/Geography=Germany

part-00000-b05d5a5e-3942-4177-ac65-9af676e69a72.c000.csv


## PartitionBy with a Maximum Number of Records per File

In [19]:
# Row count per HasCrCard value, used to judge how many files each
# partition needs once a per-file record cap is applied.
(
    df_bank_customer
    .groupBy("HasCrCard")
    .count()
    .show(truncate=False)
)

+---------+-----+
|HasCrCard|count|
+---------+-----+
|1        |7055 |
|0        |2945 |
+---------+-----+



In [26]:
# Partition by HasCrCard and cap every output file at 4000 records, so a
# partition holding more rows than that is split across several part files.
(
    df_bank_customer.write
    .option("header", True)
    .option("maxRecordsPerFile", 4000)
    .partitionBy("HasCrCard")
    .mode("overwrite")
    .csv("../output_partitions/bank_v4/")
)

In [27]:
# HasCrCard=1 holds 7055 rows (see the count above), which exceeds the
# 4000-record cap, so this partition should contain more than one part file.
%ls ../output_partitions/bank_v4/HasCrCard=1/

part-00000-c0711a68-5d59-4b2a-8b7d-7251acfe7df7.c000.csv
part-00000-c0711a68-5d59-4b2a-8b7d-7251acfe7df7.c001.csv
