In [None]:
# Start with an ordinary Python list (no Spark involved yet).
wordList = [
    "doug",
    "jon",
    "sameer",
    "eliano",
    "richard",
]

In [None]:
# Distribute the Python list as an RDD, explicitly asking for 4 partitions.
wordsRDD = sc.parallelize(wordList, numSlices=4)

In [None]:
# Why won't this print anything?
def println(value):
    """Print ``value`` on its own line; returns None.

    Used as the function passed to ``RDD.map`` in the following cells.
    """
    print(value)
    
# map() is a lazy transformation: this only describes a new RDD and returns it.
# No action is called here, so println never runs and nothing is printed.
wordsRDD.map(println)

In [None]:
# This will print something... but where?
# collect() is an action, so the map really runs now and println is called once
# per element. println returns nothing, so the collected result is a list of None.
wordsRDD.map(println).collect()
# Let's look in the (executor) logs: the printing happens where the tasks run,
# which is not necessarily this notebook's output area.

In [None]:
# Rebuild the RDD from the same Python list, this time requesting 6 partitions.
# wordList holds only 5 words, so at least one partition will be empty.
wordsRDD = sc.parallelize(wordList, numSlices=6)

In [None]:
# collect() is an action: it returns every element of the RDD to the driver as a Python list.
wordsRDD.collect()

In [None]:
# Display the `sc` object. It is not created anywhere in this notebook, so it is
# presumably the SparkContext pre-defined by the notebook environment -- confirm.
sc

In [None]:
# Version of Spark this context is running against.
sc.version

In [None]:
# Name of the Spark application this context belongs to.
sc.appName

In [None]:
# Default level of parallelism Spark uses when a partition count is not given explicitly.
sc.defaultParallelism

In [None]:
# Text of "The Adventures of Tom Sawyer"; source: https://www.gutenberg.org/ebooks/74
# NOTE(review): absolute path specific to the training environment -- adjust it if the file lives elsewhere.
datapath = "/data/training/tom_sawyer.txt"

In [None]:
# Read the book as an RDD of lines (at least 2 partitions) and pull every line
# back to the driver. Fine for one small book; avoid collect() on large data.
sc.textFile(datapath, minPartitions=2).collect()

In [None]:
# Note: collect() does not return an RDD -- in PySpark it returns a plain Python list.
ebook = sc.textFile(datapath, minPartitions=2).collect()

In [None]:
# Why doesn't this work?
# `ebook` is the plain Python list returned by collect(), not an RDD. list.count(value)
# needs the value to look for, so calling it with no arguments raises TypeError.
# The error is the point of this cell, so catch it and show it instead of halting
# a full "Restart & Run All".
try:
    ebook.count()
except TypeError as err:
    print('TypeError: {0}'.format(err))

In [None]:
# Confirm what collect() handed back by printing the type of `ebook`.
print('type of ebook: {0}'.format(type(ebook)))

In [None]:
# Why doesn't this show up in the UI?
# textFile() is lazy: it only defines the RDD. No job runs until an action is called on it.
ebookRDD = sc.textFile(datapath, minPartitions=2)

In [None]:
# count() is an action: this launches a job and returns the number of lines in the book.
ebookRDD.count()

In [None]:
# How many partitions the RDD actually has (2 was requested when it was created).
ebookRDD.getNumPartitions()

In [None]:
# Unlike `ebook`, this name is bound to the RDD itself rather than to collected data.
print('type of ebookRDD: {0}'.format(type(ebookRDD)))

In [None]:
# Display the RDD object itself; no action is called, so no data is fetched.
ebookRDD

In [None]:
# Let's use RDD.glom to look at how data is distributed across our partitions.
#
# What does glom do? It replaces each partition of the RDD with a single list
# holding all of the items that were in that partition.
# https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.RDD.glom.html

glomDemo = sc.parallelize([1, 2, 3, 4, 5, 6, 7, 8, 9], numSlices=3)

# After glom(), collect() returns one inner list per partition.
glomDemo.glom().collect()

# How does it work? This is a good one to guess about, and then check the source.

In [None]:
# Same idea on the real data: one list of lines per partition.
# Note that this returns (and displays) the entire book.
ebookRDD.glom().collect()

In [None]:
# Count the number of items in each partition.
# glom() turns each partition into one list and map(len) reduces that list to its
# size on the executors, so only small (size, index) pairs are collected to the
# driver instead of the whole book.
for (size, i) in ebookRDD.glom().map(len).zipWithIndex().collect():
    print('%d: %d item(s)' % (i, size))

In [None]:
# Split every line on single spaces and flatten the pieces into one RDD of words.
(
    sc.textFile(datapath, minPartitions=2)
    .flatMap(lambda line: line.split(' '))
    .collect()
)

In [None]:
# Same split as before, then pair every word with an initial count of 1.
(
    sc.textFile(datapath, minPartitions=2)
    .flatMap(lambda line: line.split(' '))
    .map(lambda word: (word, 1))
    .collect()
)

In [None]:
# Full word count: split into words, pair each with 1, then sum the 1s per word.
(
    sc.textFile(datapath, minPartitions=2)
    .flatMap(lambda line: line.split(' '))
    .map(lambda word: (word, 1))
    .reduceByKey(lambda a, b: a + b)
    .collect()
)

In [None]:
# Why doesn't this show up?
# cache() only marks the RDD to be cached; nothing is stored until an action computes it.
cachedBookRDD = sc.textFile(datapath, minPartitions=2).cache()

In [None]:
# Give the RDD a human-readable name so it is easy to recognise (e.g. in the Spark UI).
cachedBookRDD.setName("My Cached Book")

In [None]:
# First action on the RDD marked with cache(): computing it is what populates the cache.
cachedBookRDD.count()

In [None]:
# Release the cached copy of this RDD.
cachedBookRDD.unpersist()

In [None]:
from pyspark.storagelevel import StorageLevel

# Same effect as cache() above: in PySpark the stored records are always serialized.
# StorageLevel.MEMORY_ONLY_SER was deprecated in Spark 2.0 for that reason and is
# absent from newer PySpark releases, so fall back to MEMORY_ONLY when it is missing
# instead of failing with AttributeError.
cachedBookSerializedRDD = sc.textFile(datapath, minPartitions=2).persist(
    getattr(StorageLevel, "MEMORY_ONLY_SER", StorageLevel.MEMORY_ONLY)
)

In [None]:
# Name this RDD too, so it can be told apart from "My Cached Book".
cachedBookSerializedRDD.setName("My Serialized Cached Book")

In [None]:
# Run an action so the persisted RDD is actually computed and stored.
cachedBookSerializedRDD.count()

In [None]:
# Run the word count again. This pipeline starts from sc.textFile, so it re-reads
# the file and does not go through either of the persisted RDDs defined above.
(
    sc.textFile(datapath, minPartitions=2)
    .flatMap(lambda line: line.split(' '))
    .map(lambda word: (word, 1))
    .reduceByKey(lambda a, b: a + b)
    .collect()
)

In [None]:
# Read the book with a minimum of 2 partitions, then redistribute it into 4 partitions.
tempRDD = sc.textFile(datapath, minPartitions=2).repartition(numPartitions=4)

In [None]:
# Check the partition count after repartition(4).
tempRDD.getNumPartitions()

In [None]:
# Count the number of items in each partition of the repartitioned RDD.
# glom() turns each partition into one list and map(len) reduces that list to its
# size on the executors, so only small (size, index) pairs are collected to the
# driver instead of the whole book.
for (size, i) in tempRDD.glom().map(len).zipWithIndex().collect():
    print('%d: %d item(s)' % (i, size))

In [None]:
# Word count again, after redistributing the lines into 12 partitions and marking them cached.
# NOTE(review): the cached RDD is never bound to a name, so it cannot be reused
# or unpersisted from this notebook.
(
    sc.textFile(datapath, minPartitions=2)
    .repartition(numPartitions=12)
    .cache()
    .flatMap(lambda line: line.split(' '))
    .map(lambda word: (word, 1))
    .reduceByKey(lambda a, b: a + b)
    .collect()
)

In [None]:
# Same word count with only 4 partitions after the repartition, for comparison with the 12-partition run.
# NOTE(review): the cached RDD is never bound to a name, so it cannot be reused
# or unpersisted from this notebook.
(
    sc.textFile(datapath, minPartitions=2)
    .repartition(numPartitions=4)
    .cache()
    .flatMap(lambda line: line.split(' '))
    .map(lambda word: (word, 1))
    .reduceByKey(lambda a, b: a + b)
    .collect()
)

In [None]:
# Same pipeline, but reduceByKey is asked to produce its result in 8 partitions.
# The result RDD is kept in a variable so later cells can run more actions on it.
myRDD = (
    sc.textFile(datapath, minPartitions=2)
    .repartition(numPartitions=12)
    .cache()
    .flatMap(lambda line: line.split(' '))
    .map(lambda word: (word, 1))
    .reduceByKey(lambda a, b: a + b, numPartitions=8)
)
myRDD.collect()

In [None]:
# Partition count of the reduceByKey result (numPartitions=8 was requested).
myRDD.getNumPartitions()

In [None]:
# This job uses data that is downstream of a shuffle we ran recently (the
# reduceByKey behind myRDD), so Spark should be able to reuse those shuffle
# output files and take a shortcut.
myRDD.count()
# Let's look in the UI for skipped stages.