In [1]:
import findspark
findspark.init()

In [2]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("Partioning - Repartition and Coalesce").getOrCreate()
sc = spark.sparkContext

In [3]:
from pyspark.sql import functions as F

In [11]:
spark

In [9]:
sc

In [6]:
sc.defaultParallelism

8

In [8]:
range_df = spark.range(1000)

In [9]:
type(range_df)

pyspark.sql.dataframe.DataFrame

In [10]:
range_df.show()

+---+
| id|
+---+
|  0|
|  1|
|  2|
|  3|
|  4|
|  5|
|  6|
|  7|
|  8|
|  9|
| 10|
| 11|
| 12|
| 13|
| 14|
| 15|
| 16|
| 17|
| 18|
| 19|
+---+
only showing top 20 rows



In [16]:
range_rdd = sc.range(100)

In [17]:
type(range_rdd)

pyspark.rdd.PipelinedRDD

In [18]:
range_rdd.take(10)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

In [20]:
spark.range(200,100,-2).show(50)

+---+
| id|
+---+
|200|
|198|
|196|
|194|
|192|
|190|
|188|
|186|
|184|
|182|
|180|
|178|
|176|
|174|
|172|
|170|
|168|
|166|
|164|
|162|
|160|
|158|
|156|
|154|
|152|
|150|
|148|
|146|
|144|
|142|
|140|
|138|
|136|
|134|
|132|
|130|
|128|
|126|
|124|
|122|
|120|
|118|
|116|
|114|
|112|
|110|
|108|
|106|
|104|
|102|
+---+



Both the below functions take same parameter as of Python <b><i>range()</i></b> function:-
<ul>
    <li><b><u>spark.range()</u></b> - Creates a Spark Dataframe, with single column named <i>'id'</i></li>
    <li><b><u>spark.range()</u></b> - Creates an RDD, with specified range.</li>
</ul>

In [33]:
range_df.select(range_df.id, F.col('id')*2, range_df.id*3).withColumnRenamed('(id * 2)','id2').show()

+---+---+--------+
| id|id2|(id * 3)|
+---+---+--------+
|  0|  0|       0|
|  1|  2|       3|
|  2|  4|       6|
|  3|  6|       9|
|  4|  8|      12|
|  5| 10|      15|
|  6| 12|      18|
|  7| 14|      21|
|  8| 16|      24|
|  9| 18|      27|
| 10| 20|      30|
| 11| 22|      33|
| 12| 24|      36|
| 13| 26|      39|
| 14| 28|      42|
| 15| 30|      45|
| 16| 32|      48|
| 17| 34|      51|
| 18| 36|      54|
| 19| 38|      57|
+---+---+--------+
only showing top 20 rows



## Creating Dump Data File :
<hr>

In [36]:
df = spark.range(1000000)
df = df.select(df.id,df.id*2,df.id*3)
df = df.union(df)
df = df.union(df)
df = df.union(df)
df = df.union(df)
df = df.union(df)

In [37]:
df.show(5)

+---+--------+--------+
| id|(id * 2)|(id * 3)|
+---+--------+--------+
|  0|       0|       0|
|  1|       2|       3|
|  2|       4|       6|
|  3|       6|       9|
|  4|       8|      12|
+---+--------+--------+
only showing top 5 rows



In [38]:
df.rdd.take(5)

[Row(id=0, (id * 2)=0, (id * 3)=0),
 Row(id=1, (id * 2)=2, (id * 3)=3),
 Row(id=2, (id * 2)=4, (id * 3)=6),
 Row(id=3, (id * 2)=6, (id * 3)=9),
 Row(id=4, (id * 2)=8, (id * 3)=12)]

In [39]:
rdd = df.rdd.map( lambda x: str(x[0]) + ',' + str(x[1]) + ',' + str(x[2]) )

In [40]:
rdd.take(5)

['0,0,0', '1,2,3', '2,4,6', '3,6,9', '4,8,12']

In [41]:
rdd.coalesce(1).saveAsTextFile('data/dump_data.txt')

## Reading the dumped file, and checking it for parttition change:-
<hr>

In [5]:
rdd = sc.textFile('data/dump_data.txt')

In [6]:
rdd.take(10)

['0,0,0',
 '1,2,3',
 '2,4,6',
 '3,6,9',
 '4,8,12',
 '5,10,15',
 '6,12,18',
 '7,14,21',
 '8,16,24',
 '9,18,27']

In [7]:
rdd.getNumPartitions()

21

In [9]:
sc.defaultParallelism

8

In [10]:
rdd.count()

32000000

In [13]:
rdd2 = rdd.map( lambda x: x.split(',') ).filter( lambda x: int(x[0]) == 1 )

In [14]:
rdd2.count()

32

In [15]:
rdd2.collect()

[['1', '2', '3'],
 ['1', '2', '3'],
 ['1', '2', '3'],
 ['1', '2', '3'],
 ['1', '2', '3'],
 ['1', '2', '3'],
 ['1', '2', '3'],
 ['1', '2', '3'],
 ['1', '2', '3'],
 ['1', '2', '3'],
 ['1', '2', '3'],
 ['1', '2', '3'],
 ['1', '2', '3'],
 ['1', '2', '3'],
 ['1', '2', '3'],
 ['1', '2', '3'],
 ['1', '2', '3'],
 ['1', '2', '3'],
 ['1', '2', '3'],
 ['1', '2', '3'],
 ['1', '2', '3'],
 ['1', '2', '3'],
 ['1', '2', '3'],
 ['1', '2', '3'],
 ['1', '2', '3'],
 ['1', '2', '3'],
 ['1', '2', '3'],
 ['1', '2', '3'],
 ['1', '2', '3'],
 ['1', '2', '3'],
 ['1', '2', '3'],
 ['1', '2', '3']]

In [16]:
rdd2.getNumPartitions()

21

Here, we can see that, even if <u>*rdd2*</u> has only 32 records, it is also divided into 21 partitons, as because it is a pipelinedRDD formed by applying *filter()* transformation on *rdd*, and *rdd* has 21 partitions due to its size.

In [22]:
rdd2.coalesce(22).getNumPartitions()

21

In [27]:
rdd2.repartition(22).getNumPartitions()

22

# Repartition and Sorting:-
<hr>
Using <b><i>repartitionAndSortWithinPartitions(numPartitions=None, "partition_function", ascending=True</i></b>

In [17]:
data = [
    (9,('a','z')),
    (4,('c','s')),
    (11,('f','f')),
    (24,('g','g')),
    (19,('f','h')),
    (18,('m','t')),
    (76,('f','k')),
    (-4,('s','t')),
    (8,('f','k')),
    (129,('p','o')),
    (56,('f','l')),
    (85,('q','q')),
    (108,('f','t')),
]

rdd = sc.parallelize(data)

In [18]:
rdd2 = rdd.repartitionAndSortWithinPartitions(2, lambda x: x%2, ascending=True)

In [19]:
rdd2.getNumPartitions()

2

In [20]:
for i in rdd2.glom().collect():
    print(i, end='\n\n')

[(-4, ('s', 't')), (4, ('c', 's')), (8, ('f', 'k')), (18, ('m', 't')), (24, ('g', 'g')), (56, ('f', 'l')), (76, ('f', 'k')), (108, ('f', 't'))]

[(9, ('a', 'z')), (11, ('f', 'f')), (19, ('f', 'h')), (85, ('q', 'q')), (129, ('p', 'o'))]



All the elements are partitioned according to their key being odd or even, and in the partitions, the elements exists in sorted order, as can bee seen in the result above.
<hr>

# Extraction:-
---------------
### Saving results in RDD as textFile in HDFS

In [7]:
# Get the list of supported compression codecs
compression_codecs = sc._jvm.org.apache.hadoop.io.compress.CompressionCodecFactory.getCodecClasses()
codec_names = [codec.getSimpleName() for codec in compression_codecs]

Py4JError: An error occurred while calling z:org.apache.hadoop.io.compress.CompressionCodecFactory.getCodecClasses. Trace:
py4j.Py4JException: Method getCodecClasses([]) does not exist
	at py4j.reflection.ReflectionEngine.getMethod(ReflectionEngine.java:318)
	at py4j.reflection.ReflectionEngine.getMethod(ReflectionEngine.java:339)
	at py4j.Gateway.invoke(Gateway.java:276)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:833)



In [None]:
# Print the codec names
print("Supported Compression Codecs:")
for codec_name in codec_names:
    print(codec_name)

In [12]:
# Get the list of supported compression codecs
compression_codecs = spark._jvm.org.apache.hadoop.io.compress.CompressionCodecFactory.getCodecClasses()
codec_names = [codec.getSimpleName() for codec in compression_codecs]

# Print the codec names
print("Supported Compression Codecs:")
for codec_name in codec_names:
    print(codec_name)

Py4JError: An error occurred while calling z:org.apache.hadoop.io.compress.CompressionCodecFactory.getCodecClasses. Trace:
py4j.Py4JException: Method getCodecClasses([]) does not exist
	at py4j.reflection.ReflectionEngine.getMethod(ReflectionEngine.java:318)
	at py4j.reflection.ReflectionEngine.getMethod(ReflectionEngine.java:339)
	at py4j.Gateway.invoke(Gateway.java:276)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:833)



### Q3-1. Find the number of customers placed order in the month of July and August. Store the output at HDFS in textFile format. Create 5 files. Compressions format should be bzip2.

In [16]:
ord_rdd = sc.textFile('data/orders.csv').map( lambda x: x.split(',') )

In [17]:
ord_rdd.take(5)

[['1', '2013-07-25 00:00:00.0', '11599', 'CLOSED'],
 ['2', '2013-07-25 00:00:00.0', '256', 'PENDING_PAYMENT'],
 ['3', '2013-07-25 00:00:00.0', '12111', 'COMPLETE'],
 ['4', '2013-07-25 00:00:00.0', '8827', 'CLOSED'],
 ['5', '2013-07-25 00:00:00.0', '11318', 'COMPLETE']]

In [21]:
ord_in_jul_aug = ord_rdd.map( lambda x: (int(x[1].split('-')[1]), int(x[2])) ).filter( lambda x: x[0] in (7,8) )

In [24]:
cust_ordered_in_jul_aug = ord_in_jul_aug.map( lambda x: x[1] ).distinct()

In [26]:
cust_ordered_in_jul_aug.take(20)

[256,
 11318,
 7130,
 4530,
 5648,
 918,
 9842,
 2568,
 7276,
 9488,
 9198,
 7562,
 656,
 196,
 3960,
 4840,
 11586,
 8214,
 12092,
 8136]

In [27]:
cust_ordered_in_jul_aug.count()

7633

In [28]:
cust_ordered_in_jul_aug.getNumPartitions()

2

In [29]:
cust_ordered_in_jul_aug = cust_ordered_in_jul_aug.repartition(5)

In [30]:
cust_ordered_in_jul_aug.getNumPartitions()

5

In [31]:
cust_ordered_in_jul_aug.glom().map(len).collect()

[1526, 1527, 1520, 1530, 1530]

In [32]:
cust_ordered_in_jul_aug.saveAsTextFile('data/customers_ordered_in_july_august.txt')

In [33]:
cust_ordered_in_jul_aug.saveAsTextFile('data/test', compressionCodecClass='org.apache.hadoop.io.compress.BZip2Codec')

## Saving as Sequence File:-
<hr>

In [34]:
cust_ordered_in_jul_aug.take(5)

[824, 8912, 6932, 2256, 9032]

In [35]:
cust_ordered_in_jul_aug.saveAsSequenceFile('data/ord_jul_aug_seq')

Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.saveAsSequenceFile.
: org.apache.spark.SparkException: RDD element of type java.lang.Integer cannot be used
	at org.apache.spark.api.python.SerDeUtil$.pythonToPairRDD(SerDeUtil.scala:210)
	at org.apache.spark.api.python.PythonRDD$.saveAsHadoopFile(PythonRDD.scala:608)
	at org.apache.spark.api.python.PythonRDD$.saveAsSequenceFile(PythonRDD.scala:585)
	at org.apache.spark.api.python.PythonRDD.saveAsSequenceFile(PythonRDD.scala)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:77)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:568)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:833)


### The above code shows error, because we are trying to write a normal rdd as sequence file.
<hr>
<h3> A sequence file should always be used for writing a key-value RDD, as follows: </h3>

In [36]:
ord_in_jul_aug.take(10)

[(7, 11599),
 (7, 256),
 (7, 12111),
 (7, 8827),
 (7, 11318),
 (7, 7130),
 (7, 4530),
 (7, 2911),
 (7, 5657),
 (7, 5648)]

In [37]:
ord_in_jul_aug.saveAsSequenceFile('data/ord_jul_aug_seq')

### Checking the contents of the sequence file being written:-

In [38]:
seq_file_rdd = sc.sequenceFile('data/ord_jul_aug_seq')

In [39]:
seq_file_rdd.take(10)

[(7, 11599),
 (7, 256),
 (7, 12111),
 (7, 8827),
 (7, 11318),
 (7, 7130),
 (7, 4530),
 (7, 2911),
 (7, 5657),
 (7, 5648)]

If we do not have suitable key in an RDD, then to write that RDD as sequence file, we can use <b><i>None</i></b> as the key.