In [1]:
import findspark

# Set up the environment so that pyspark can be imported; this is run
# before the `import pyspark` in the next cell.
findspark.init()

In [2]:
import pyspark
from pyspark.sql import SparkSession

# Create the session (or reuse an already running one), then keep a handle
# on its SparkContext, which is the entry point for the RDD API used below.
spark = (
    SparkSession.builder
    .appName("RDD persistence and shared variable")
    .getOrCreate()
)
sc = spark.sparkContext

In [3]:
# Default parallelism of this SparkContext. Left as the cell's last
# expression so the notebook displays it, like the other inspection cells.
sc.defaultParallelism

8


In [4]:
# Read the current value of the SQL shuffle-partition setting.
spark.conf.get("spark.sql.shuffle.partitions")

'200'

In [5]:
# Match the SQL shuffle-partition count to the context's default parallelism.
# NOTE(review): this setting governs DataFrame/SQL shuffles; the RDD
# operations in this notebook are presumably unaffected by it — confirm.
spark.conf.set(
    "spark.sql.shuffle.partitions",
    sc.defaultParallelism,
)

In [6]:
# Re-read the setting to confirm the new value took effect.
spark.conf.get("spark.sql.shuffle.partitions")

'8'

In [7]:
# Lookup from weekday abbreviation to full weekday name.
weekday = dict(sun="Sunday", mon="Monday", tue="Tuesday")

# Sample records: (first name, last name, country, weekday abbreviation).
input_data = [
    ("James", "Smith", "USA", "mon"),
    ("Michael", "Rose", "USA", "tue"),
    ("Robert", "Williams", "USA", "sun"),
    ("Maria", "Jones", "USA", "tue"),
]

In [8]:
# Turn the local list into an RDD. No partition count is passed here,
# so Spark chooses the number of partitions itself.
input_rdd = sc.parallelize(input_data)

In [9]:
# Number of partitions of the freshly parallelized RDD.
input_rdd.getNumPartitions()

8

In [10]:
# glom() turns each partition into a list, so collect() shows which
# records sit in which partition (empty lists are empty partitions).
input_rdd.glom().collect()

[[],
 [('James', 'Smith', 'USA', 'mon')],
 [],
 [('Michael', 'Rose', 'USA', 'tue')],
 [],
 [('Robert', 'Williams', 'USA', 'sun')],
 [],
 [('Maria', 'Jones', 'USA', 'tue')]]

In [11]:
# Reduce the RDD to 4 partitions.
# NOTE: this rebinds `input_rdd`, so the original RDD from sc.parallelize
# is no longer reachable under that name after this cell runs.
input_rdd = input_rdd.coalesce(4)

In [12]:
# Partition count after coalesce(4).
input_rdd.getNumPartitions()

4

In [13]:
# Per-partition view of the records after coalescing.
input_rdd.glom().collect()

[[('James', 'Smith', 'USA', 'mon')],
 [('Michael', 'Rose', 'USA', 'tue')],
 [('Robert', 'Williams', 'USA', 'sun')],
 [('Maria', 'Jones', 'USA', 'tue')]]

In [20]:
# Storage level of the RDD before persist() is called on it.
input_rdd.getStorageLevel()

StorageLevel(False, False, False, False, 1)

In [21]:
# Persist using the named storage level instead of five positional flags:
# MEMORY_AND_DISK_2 is StorageLevel(True, True, False, False, 2), i.e.
# use disk and memory, no off-heap, replication factor 2.
# persist() only marks the RDD; the data is stored when an action first runs.
# NOTE(review): replication 2 presumably needs more than one executor to
# have any effect — confirm it is intended for this session.
input_rdd.persist(pyspark.StorageLevel.MEMORY_AND_DISK_2)

CoalescedRDD[5] at coalesce at NativeMethodAccessorImpl.java:0

In [22]:
# Storage level of the RDD after persist() has been requested.
input_rdd.getStorageLevel()

StorageLevel(True, True, False, False, 2)

In [24]:
# Create a broadcast variable holding the weekday lookup dict; code that
# runs on the RDD reads it through `bc_weekday.value`.

bc_weekday = sc.broadcast(weekday)

In [25]:
# .value gives back the object that was broadcast.
bc_weekday.value

{'sun': 'Sunday', 'mon': 'Monday', 'tue': 'Tuesday'}

In [26]:
# Check the Python type of the broadcast payload.
type( bc_weekday.value )

dict

### Replace the weekday abbreviations in the RDD with their full names, using the broadcast variable created above
<hr>

In [27]:
# Records as they are before the weekday abbreviations are replaced.
input_rdd.collect()

[('James', 'Smith', 'USA', 'mon'),
 ('Michael', 'Rose', 'USA', 'tue'),
 ('Robert', 'Williams', 'USA', 'sun'),
 ('Maria', 'Jones', 'USA', 'tue')]

In [28]:
def expand_weekday(record):
    """Return a copy of `record` with its weekday abbreviation spelled out.

    `record` is indexed positionally: fields 0-2 are copied through and
    field 3 is looked up in the broadcast dict `bc_weekday.value`.
    An abbreviation that is not a key of the dict is kept unchanged rather
    than raising KeyError, which would otherwise fail the job as soon as an
    action is run on the mapped RDD.
    """
    day = record[3]
    return (record[0], record[1], record[2], bc_weekday.value.get(day, day))


# map() is lazy: nothing is computed until an action such as collect().
fullday_rdd = input_rdd.map(expand_weekday)

In [29]:
# Run the map and bring the transformed records back to the driver.
fullday_rdd.collect()

[('James', 'Smith', 'USA', 'Monday'),
 ('Michael', 'Rose', 'USA', 'Tuesday'),
 ('Robert', 'Williams', 'USA', 'Sunday'),
 ('Maria', 'Jones', 'USA', 'Tuesday')]

In [30]:
# Storage level of the mapped RDD; persist() was called on input_rdd only,
# not on fullday_rdd.
fullday_rdd.getStorageLevel()

StorageLevel(False, False, False, False, 1)

In [31]:
# Partition count of the mapped RDD.
fullday_rdd.getNumPartitions()

4