# PARTITIONS

## Default Values

In [0]:
# Display the SparkContext's default minimum partition count
# (the recorded output for this cluster is 2).
sc.defaultMinPartitions

Out[1]: 2

In [0]:
# Display the SparkContext's default parallelism (recorded output: 8).
# The createDataFrame examples further down also report 8 partitions.
sc.defaultParallelism

Out[2]: 8

In [0]:
# Read the session's current limit on bytes packed into one partition when
# reading files (recorded output: '134217728b', i.e. 128 MiB).
spark.conf.get("spark.sql.files.maxPartitionBytes")

Out[3]: '134217728b'

## Data per partition

In [0]:
from pyspark.sql.types import IntegerType

# Build a one-column DataFrame holding the integers 0..9; with a bare
# IntegerType schema the column is named "value" (see the table below).
df_range = spark.createDataFrame(data=range(10), schema=IntegerType())

# Print the rows so the content can be compared with the per-partition view.
df_range.show()

+-----+
|value|
+-----+
|    0|
|    1|
|    2|
|    3|
|    4|
|    5|
|    6|
|    7|
|    8|
|    9|
+-----+



In [0]:
# Number of partitions backing df_range (recorded output: 8, the same value
# as sc.defaultParallelism shown earlier).
df_range.rdd.getNumPartitions()

Out[5]: 8

In [0]:
# Show which rows live in which partition: the result is one inner list per
# partition (8 lists here, holding one or two rows each).
df_range.rdd.glom().collect()

Out[6]: [[Row(value=0)],
 [Row(value=1)],
 [Row(value=2)],
 [Row(value=3), Row(value=4)],
 [Row(value=5)],
 [Row(value=6)],
 [Row(value=7)],
 [Row(value=8), Row(value=9)]]

## Change Partition Size

In [0]:
# List the input folder: the recorded output shows three small CSV files
# (1123, 1122 and 1307 bytes).
dbutils.fs.ls("dbfs:/FileStore/tables/fantasy/")

Out[7]: [FileInfo(path='dbfs:/FileStore/tables/fantasy/characters.csv', name='characters.csv', size=1123, modificationTime=1723331905000),
 FileInfo(path='dbfs:/FileStore/tables/fantasy/inventory.csv', name='inventory.csv', size=1122, modificationTime=1723331932000),
 FileInfo(path='dbfs:/FileStore/tables/fantasy/items.csv', name='items.csv', size=1307, modificationTime=1723331936000)]

In [0]:
# Load every CSV in the fantasy folder into one DataFrame (header row,
# inferred column types, comma separator).
df_multiple_files = (
    spark.read.format("csv")
    .option("inferschema", True)
    .option("header", True)
    .option("sep", ",")
    .load("dbfs:/FileStore/tables/fantasy/")
)

# Partition count produced by the read (recorded output: 3, matching the
# three files listed above).
df_multiple_files.rdd.getNumPartitions()

Out[8]: 3

In [0]:
# Lower the per-partition byte limit for file reads to 1000 bytes, which is
# below the size of each CSV listed above, then echo the new value.
# NOTE(review): this changes the session setting and nothing visible in this
# notebook restores the earlier value ('134217728b'); re-running earlier read
# cells after this one will use the new limit.
partition_bytes_key = "spark.sql.files.maxPartitionBytes"
spark.conf.set(partition_bytes_key, 1000)
spark.conf.get(partition_bytes_key)

Out[9]: '1000'

In [0]:
# Read the same folder again, now that spark.sql.files.maxPartitionBytes has
# been set to 1000 in the previous cell.
df_multiple_files = (
    spark.read.format("csv")
    .option("inferschema", True)
    .option("header", True)
    .option("sep", ",")
    .load("dbfs:/FileStore/tables/fantasy/")
)

# Recorded output is 6 (up from 3) - presumably each file, being larger than
# 1000 bytes, is now split across two partitions.
df_multiple_files.rdd.getNumPartitions()

Out[10]: 6

## Single Partition

In [0]:
# Distribute the integers 0..99 explicitly into a single partition.
# NOTE(review): despite the df_ prefix this is created with sc.parallelize and
# is queried directly (no .rdd), i.e. it is an RDD rather than a DataFrame.
df_single_partition = sc.parallelize(range(100), numSlices=1)

# Confirm the requested partition count (recorded output: 1).
df_single_partition.getNumPartitions()

Out[11]: 1

# REPARTITION

In [0]:
from pyspark.sql.types import IntegerType

# Rebind df_range to a 20-row DataFrame (0..19); the earlier cell used
# range(10). The repartition/coalesce cells below work on this version.
df_range = spark.createDataFrame(data=range(20), schema=IntegerType())

# Starting partition count before any repartitioning (recorded output: 8).
df_range.rdd.getNumPartitions()

Out[12]: 8

In [0]:
# Baseline layout before repartitioning: one inner list per partition, with
# the 20 rows in ascending order across 8 partitions of two or four rows.
df_range.rdd.glom().collect()

Out[13]: [[Row(value=0), Row(value=1)],
 [Row(value=2), Row(value=3)],
 [Row(value=4), Row(value=5)],
 [Row(value=6), Row(value=7), Row(value=8), Row(value=9)],
 [Row(value=10), Row(value=11)],
 [Row(value=12), Row(value=13)],
 [Row(value=14), Row(value=15)],
 [Row(value=16), Row(value=17), Row(value=18), Row(value=19)]]

## Increase w/repartition

In [0]:
# Increase the partition count from 8 to 16 with repartition().
df_range_repartition = df_range.repartition(numPartitions=16)

# Confirm the new partition count (recorded output: 16).
df_range_repartition.rdd.getNumPartitions()

Out[14]: 16

In [0]:
# Layout after repartition(16): the recorded output shows rows redistributed
# out of their original order, with some of the 16 partitions left empty.
df_range_repartition.rdd.glom().collect()

Out[15]: [[Row(value=5), Row(value=12)],
 [Row(value=4)],
 [],
 [],
 [Row(value=0), Row(value=3)],
 [Row(value=1), Row(value=2)],
 [],
 [Row(value=19)],
 [Row(value=16)],
 [Row(value=18)],
 [Row(value=11), Row(value=17)],
 [Row(value=10)],
 [Row(value=7)],
 [Row(value=6), Row(value=14)],
 [Row(value=8), Row(value=15)],
 [Row(value=9), Row(value=13)]]

## Reduce w/repartition

In [0]:
# Reduce the partition count from 8 to 2, again using repartition().
df_range_repartition_v2 = df_range.repartition(numPartitions=2)

# Confirm the new partition count (recorded output: 2).
df_range_repartition_v2.rdd.getNumPartitions()

Out[16]: 2

In [0]:
# Layout after repartition(2): the recorded output has 10 rows per partition,
# but values from the original partitions are interleaved between the two.
df_range_repartition_v2.rdd.glom().collect()

Out[17]: [[Row(value=1),
  Row(value=2),
  Row(value=5),
  Row(value=7),
  Row(value=8),
  Row(value=11),
  Row(value=13),
  Row(value=14),
  Row(value=16),
  Row(value=17)],
 [Row(value=0),
  Row(value=3),
  Row(value=4),
  Row(value=6),
  Row(value=9),
  Row(value=10),
  Row(value=12),
  Row(value=15),
  Row(value=19),
  Row(value=18)]]

# COALESCE

In [0]:
# Reduce the partition count from 8 to 2 with coalesce() instead of
# repartition(), for comparison with the previous section.
df_range_coalesce = df_range.coalesce(numPartitions=2)

# Confirm the new partition count (recorded output: 2).
df_range_coalesce.rdd.getNumPartitions()

Out[18]: 2

In [0]:
# Layout after coalesce(2): the recorded output keeps the rows in their
# original order (0-9 in one partition, 10-19 in the other), unlike the
# interleaved result of repartition(2) above - consistent with coalesce
# merging existing partitions rather than redistributing rows.
df_range_coalesce.rdd.glom().collect()

Out[19]: [[Row(value=0),
  Row(value=1),
  Row(value=2),
  Row(value=3),
  Row(value=4),
  Row(value=5),
  Row(value=6),
  Row(value=7),
  Row(value=8),
  Row(value=9)],
 [Row(value=10),
  Row(value=11),
  Row(value=12),
  Row(value=13),
  Row(value=14),
  Row(value=15),
  Row(value=16),
  Row(value=17),
  Row(value=18),
  Row(value=19)]]