# Lab 7: Spark Low-Level APIs

## Part 1. Basic Spark RDD Operations

In [1]:
from pyspark.sql import SparkSession
from pyspark.context import SparkContext 
import pyspark.sql.functions as F
from pyspark import SparkConf

# Build (or reuse) the session through the builder, which is the documented
# entry point, instead of calling the SparkSession constructor directly.
conf = SparkConf().setAppName("Exp7_Guide")
spark = SparkSession.builder.config(conf=conf).getOrCreate()
# Keep the `sc` alias: later cells call sc.parallelize(...) directly.
sc = spark.sparkContext
# Last expression of the cell: switch the time parser policy to LEGACY.
spark.sql("set spark.sql.legacy.timeParserPolicy=LEGACY")

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/04/04 01:35:37 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


DataFrame[key: string, value: string]

In [2]:
# Drop from the DataFrame API to the low-level API: .rdd exposes the RDD behind spark.range(10).
spark.range(10).rdd

MapPartitionsRDD[5] at javaToPython at NativeMethodAccessorImpl.java:0

In [3]:
# Map every element of the DataFrame's RDD to its first field, i.e. the value of the "id" column.
spark.range(10).toDF("id").rdd.map(lambda row: row[0])

PythonRDD[12] at RDD at PythonRDD.scala:53

In [4]:
# Round trip: take the RDD behind the DataFrame and convert it back into a DataFrame.
spark.range(10).rdd.toDF()

                                                                                

DataFrame[id: bigint]

In [5]:
# Tokenise the book title on single spaces and spread the tokens over two partitions.
myCollection = "Spark The Definitive Guide : Big Data Processing Made Simple".split(" ")
# setName returns the RDD it was called on, so the label can be attached inline.
words = spark.sparkContext.parallelize(myCollection, 2).setName("myWords")
words.name()  # myWords
# Asking for more elements than the RDD holds simply returns all ten words.
words.take(12)

['Spark',
 'The',
 'Definitive',
 'Guide',
 ':',
 'Big',
 'Data',
 'Processing',
 'Made',
 'Simple']

In [6]:

def startsWithS(individual):
    """Return True when ``individual`` begins with an upper-case "S"."""
    prefix = "S"
    return individual.startswith(prefix)

# Keep only the words accepted by startsWithS; the predicate is passed directly.
words.filter(startsWithS).collect()


['Spark', 'Simple']

In [7]:
# Turn each word into a triple: (word, first character, starts-with-"S" flag).
words2 = words.map(lambda word: (word, word[0], word.startswith("S")))
# Filter on the flag (third field) and show up to five of the surviving triples.
words2.filter(lambda record: record[2]).take(5)
# Alternative view without the filter:
# words2.take(5)

[('Spark', 'S', True), ('Simple', 'S', True)]

In [8]:
# flatMap flattens the list of characters produced for every word into one RDD of characters.
words.flatMap(lambda word: list(word)).take(5)

['S', 'p', 'a', 'r', 'k']

In [9]:
# (word) is just word: parentheses without a comma do not build a tuple. A string is
# itself iterable, so flatMap yields the same characters as wrapping it in list().
words.flatMap(lambda word: (word)).take(5)

['S', 'p', 'a', 'r', 'k']

In [10]:
# Keep only the triples whose third field (the startswith("S") flag) is True.
tmp = words2.filter(lambda record: record[2])
tmp.take(2)

[('Spark', 'S', True), ('Simple', 'S', True)]

In [11]:
# Each element of tmp is a 3-tuple (despite the lambda's parameter name), so flattening
# yields the tuple's fields (word, first character, flag) rather than characters.
tmp.flatMap(lambda word: list(word)).take(10)

['Spark', 'S', True, 'Simple', 'S', True]

In [12]:
# A tuple is already iterable, so returning it unchanged flattens exactly like converting it to a list first.
tmp.flatMap(lambda word: word).take(10)


['Spark', 'S', True, 'Simple', 'S', True]

In [13]:
# Longest words first: sort ascending on the negated length and keep the first two.
words.sortBy(lambda word: -len(word)).take(2)

['Definitive', 'Processing']

In [14]:
# Randomly split `words` into RDDs using two equal weights. No seed is passed here,
# so the assignment of words to each side is not pinned by this cell.
fiftyFiftySplit = words.randomSplit([0.5, 0.5])

In [15]:
# Sum the integers 1..20 by repeatedly combining pairs of elements.
spark.sparkContext.parallelize(range(1, 21)).reduce(lambda x, y: x + y) # 210

210

In [16]:
def wordLengthReducer(leftWord, rightWord):
    """Return the longer of two words.

    ``leftWord`` wins only when it is strictly longer; on equal lengths the
    right-hand word is returned.
    """
    return leftWord if len(leftWord) > len(rightWord) else rightWord
# 'Definitive' and 'Processing' are both 10 characters long. The reducer keeps the
# right-hand word on ties, so the winner depends on the order in which pairs are combined.
words.reduce(wordLengthReducer)

'Processing'

In [17]:
# take(15) asks for more elements than the RDD holds, so all ten words come back.
words.take(15)

['Spark',
 'The',
 'Definitive',
 'Guide',
 ':',
 'Big',
 'Data',
 'Processing',
 'Made',
 'Simple']

In [18]:
# Report the RDD's current storage level; no cache()/persist() has been called on `words` up to here.
words.getStorageLevel()

StorageLevel(False, False, False, False, 1)

In [19]:
# Emit a single 1 per partition, ignoring its contents, so the sum equals the number of partitions.
words.mapPartitions(lambda part: [1]).sum() # 2

2

In [20]:
def indexedFunc(partitionIndex, withinPartIterator):
    """Label every element of a partition with that partition's index.

    Returns a list with one "partition: <index> => <element>" string per
    element yielded by ``withinPartIterator``, in iteration order.
    """
    labelled = []
    for element in withinPartIterator:
        labelled.append("partition: {} => {}".format(partitionIndex, element))
    return labelled
# Run indexedFunc once per partition, passing the partition index and an iterator over its elements.
words.mapPartitionsWithIndex(indexedFunc).collect()

['partition: 0 => Spark',
 'partition: 0 => The',
 'partition: 0 => Definitive',
 'partition: 0 => Guide',
 'partition: 0 => :',
 'partition: 1 => Big',
 'partition: 1 => Data',
 'partition: 1 => Processing',
 'partition: 1 => Made',
 'partition: 1 => Simple']

In [21]:
# glom() gathers each partition into a list; with two partitions each word ends up in its own list.
spark.sparkContext.parallelize(["Hello", "World"], 2).glom().collect()

[['Hello'], ['World']]

## Part 2. Advanced Spark key-value RDD Operations

In [22]:
# Rebuild the word RDD for the key-value section: split the title on spaces, two partitions.
myCollection = "Spark The Definitive Guide : Big Data Processing Made Simple".split(" ")
words = spark.sparkContext.parallelize(myCollection, 2)
# Bring every element back to the driver to confirm the contents.
words.collect()

['Spark',
 'The',
 'Definitive',
 'Guide',
 ':',
 'Big',
 'Data',
 'Processing',
 'Made',
 'Simple']

In [23]:
# Build key-value pairs by hand: the lower-cased word is the key, 1 is the value.
words.map(lambda word: (word.lower(), 1)).collect()

[('spark', 1),
 ('the', 1),
 ('definitive', 1),
 ('guide', 1),
 (':', 1),
 ('big', 1),
 ('data', 1),
 ('processing', 1),
 ('made', 1),
 ('simple', 1)]

In [24]:
# keyBy derives the key from each element: here the lower-cased first character; the word stays as the value.
keyword = words.keyBy(lambda word: word.lower()[0])
keyword.collect()

[('s', 'Spark'),
 ('t', 'The'),
 ('d', 'Definitive'),
 ('g', 'Guide'),
 (':', ':'),
 ('b', 'Big'),
 ('d', 'Data'),
 ('p', 'Processing'),
 ('m', 'Made'),
 ('s', 'Simple')]

In [25]:
# mapValues transforms only the value of each pair (upper-casing the word); keys are left untouched.
keyword.mapValues(lambda word: word.upper()).collect()

[('s', 'SPARK'),
 ('t', 'THE'),
 ('d', 'DEFINITIVE'),
 ('g', 'GUIDE'),
 (':', ':'),
 ('b', 'BIG'),
 ('d', 'DATA'),
 ('p', 'PROCESSING'),
 ('m', 'MADE'),
 ('s', 'SIMPLE')]

In [26]:
# flatMapValues iterates the upper-cased string, so every character becomes its own (key, character) pair.
keyword.flatMapValues(lambda word: word.upper()).collect()

[('s', 'S'),
 ('s', 'P'),
 ('s', 'A'),
 ('s', 'R'),
 ('s', 'K'),
 ('t', 'T'),
 ('t', 'H'),
 ('t', 'E'),
 ('d', 'D'),
 ('d', 'E'),
 ('d', 'F'),
 ('d', 'I'),
 ('d', 'N'),
 ('d', 'I'),
 ('d', 'T'),
 ('d', 'I'),
 ('d', 'V'),
 ('d', 'E'),
 ('g', 'G'),
 ('g', 'U'),
 ('g', 'I'),
 ('g', 'D'),
 ('g', 'E'),
 (':', ':'),
 ('b', 'B'),
 ('b', 'I'),
 ('b', 'G'),
 ('d', 'D'),
 ('d', 'A'),
 ('d', 'T'),
 ('d', 'A'),
 ('p', 'P'),
 ('p', 'R'),
 ('p', 'O'),
 ('p', 'C'),
 ('p', 'E'),
 ('p', 'S'),
 ('p', 'S'),
 ('p', 'I'),
 ('p', 'N'),
 ('p', 'G'),
 ('m', 'M'),
 ('m', 'A'),
 ('m', 'D'),
 ('m', 'E'),
 ('s', 'S'),
 ('s', 'I'),
 ('s', 'M'),
 ('s', 'P'),
 ('s', 'L'),
 ('s', 'E')]

In [27]:
# Only the last expression of a cell is displayed: the keys are collected but not shown,
# and the output below is the list of values.
keyword.keys().collect()
keyword.values().collect()

['Spark',
 'The',
 'Definitive',
 'Guide',
 ':',
 'Big',
 'Data',
 'Processing',
 'Made',
 'Simple']

In [28]:
# Print the collected words as plain text (a single-line list rather than the cell's rich repr).
print(words.collect())

['Spark', 'The', 'Definitive', 'Guide', ':', 'Big', 'Data', 'Processing', 'Made', 'Simple']


In [29]:
import random

# Collect the distinct lower-cased characters of all words to the driver (a plain list).
distinctChars = words.flatMap(lambda word: list(word.lower())).distinct().collect()
# Assign each character its own random fraction in [0, 1).
sampleMap = {c: random.random() for c in distinctChars}
sampleMap

{'p': 0.4089615009326121,
 't': 0.17393092817098388,
 'd': 0.7003901218544413,
 'g': 0.29309422287517994,
 'b': 0.6885177680408501,
 'o': 0.9980665744148226,
 'c': 0.15867419426356688,
 'l': 0.3228691950159117,
 's': 0.5495345503631938,
 'a': 0.6159079273538373,
 'r': 0.9605775307166814,
 'k': 0.30016829821267343,
 'h': 0.40549778302363704,
 'e': 0.76231133041526,
 'f': 0.1763391907764913,
 'i': 0.08528855230787413,
 'n': 0.7929297562725831,
 'v': 0.9339393863845421,
 'u': 0.2210212822338966,
 ':': 0.20080112351527657,
 'm': 0.3978069624829508}

In [30]:
# Key each word by its lower-cased first character, then sample per key: with replacement (True),
# using the per-character fractions in sampleMap, and the fixed seed 10.
words.map(lambda word: (word.lower()[0], word)).sampleByKey(True, sampleMap,10).collect()

[('g', 'Guide'), ('g', 'Guide'), (':', ':'), ('d', 'Data')]

In [31]:
# Explode the lower-cased words into single characters, then pair every character with a 1.
chars = words.flatMap(lambda word: word.lower())
KVcharacters = chars.map(lambda letter: (letter, 1))

def maxFunc(left, right):
    """Return the larger of ``left`` and ``right`` (as decided by the built-in ``max``)."""
    larger = max(left, right)
    return larger
def addFunc(left, right):
    """Return ``left + right``."""
    total = left + right
    return total

In [32]:
# Count how many pairs exist per character; the result is returned to the driver as a dictionary.
KVcharacters.countByKey()

defaultdict(int,
            {'s': 4,
             'p': 3,
             'a': 4,
             'r': 2,
             'k': 1,
             't': 3,
             'h': 1,
             'e': 7,
             'd': 4,
             'f': 1,
             'i': 7,
             'n': 2,
             'v': 1,
             'g': 3,
             'u': 1,
             ':': 1,
             'b': 1,
             'o': 1,
             'c': 1,
             'm': 2,
             'l': 1})

In [58]:
# Group the 1s by character. Each grouped value is a ResultIterable, so collect() shows
# object reprs rather than the grouped numbers.
KVcharacters.groupByKey().collect()

[('p', <pyspark.resultiterable.ResultIterable at 0x7f002605f510>),
 ('t', <pyspark.resultiterable.ResultIterable at 0x7f002605ff50>),
 ('d', <pyspark.resultiterable.ResultIterable at 0x7f002605da90>),
 ('g', <pyspark.resultiterable.ResultIterable at 0x7f002605e050>),
 ('b', <pyspark.resultiterable.ResultIterable at 0x7f002605e590>),
 ('o', <pyspark.resultiterable.ResultIterable at 0x7f002605c490>),
 ('c', <pyspark.resultiterable.ResultIterable at 0x7f002605c810>),
 ('l', <pyspark.resultiterable.ResultIterable at 0x7f002606d950>),
 ('s', <pyspark.resultiterable.ResultIterable at 0x7f002d3957d0>),
 ('a', <pyspark.resultiterable.ResultIterable at 0x7f00261be450>),
 ('r', <pyspark.resultiterable.ResultIterable at 0x7f002605c290>),
 ('k', <pyspark.resultiterable.ResultIterable at 0x7f002606fd50>),
 ('h', <pyspark.resultiterable.ResultIterable at 0x7f002605e4d0>),
 ('e', <pyspark.resultiterable.ResultIterable at 0x7f0025f08210>),
 ('f', <pyspark.resultiterable.ResultIterable at 0x7f0025f0831

In [33]:
from functools import reduce 
# Fold each key's grouped 1s with addFunc on the Python side, giving the per-character totals.
KVcharacters.groupByKey().map(lambda row: (row[0], reduce(addFunc, row[1]))).collect()

[('p', 3),
 ('t', 3),
 ('d', 4),
 ('g', 3),
 ('b', 1),
 ('o', 1),
 ('c', 1),
 ('l', 1),
 ('s', 4),
 ('a', 4),
 ('r', 2),
 ('k', 1),
 ('h', 1),
 ('e', 7),
 ('f', 1),
 ('i', 7),
 ('n', 2),
 ('v', 1),
 ('u', 1),
 (':', 1),
 ('m', 2)]

In [34]:

# The integers 1..30 spread over 5 partitions.
nums = sc.parallelize(range(1,31), 5)
# aggregate(start, within-partition op, across-partition op): take the max inside each
# partition, then add the partition maxima together (6 + 12 + 18 + 24 + 30 = 90).
nums.aggregate(0, maxFunc, addFunc)

90

In [35]:
# Same start value and functions as the plain aggregate, but the partial results are merged
# in a multi-level tree of the given depth; the answer is unchanged.
depth = 3
nums.treeAggregate(0, maxFunc, addFunc, depth)

90

In [36]:
# The roles are swapped relative to the aggregate() cell: addFunc sums a key's 1s inside each
# partition, maxFunc merges those sums, so each key reports its largest per-partition count, not its total.
KVcharacters.aggregateByKey(0, addFunc, maxFunc).collect()

[('p', 2),
 ('t', 2),
 ('d', 2),
 ('g', 2),
 ('b', 1),
 ('o', 1),
 ('c', 1),
 ('l', 1),
 ('s', 3),
 ('a', 3),
 ('r', 1),
 ('k', 1),
 ('h', 1),
 ('e', 4),
 ('f', 1),
 ('i', 4),
 ('n', 1),
 ('v', 1),
 ('u', 1),
 (':', 1),
 ('m', 2)]

In [37]:
def valToCombiner(value):
    """Start a combiner for a key: a new one-element list holding ``value``."""
    combiner = [value]
    return combiner
def mergeValuesFunc(vals, valToAppend):
    """Add one more value to an existing combiner list.

    ``vals`` is modified in place and the same list object is returned.
    """
    vals.append(valToAppend)
    return vals
def mergeCombinerFunc(vals1, vals2):
    """Merge two combiner lists into a new list: ``vals1`` followed by ``vals2``."""
    merged = vals1 + vals2
    return merged

# combineByKey with the three helpers: start a list per key, append values to it, and
# concatenate lists for the same key; the result is spread over 6 output partitions.
outputPartitions = 6
KVcharacters.combineByKey(valToCombiner,mergeValuesFunc,mergeCombinerFunc,outputPartitions).collect()

[('p', [1, 1, 1]),
 ('b', [1]),
 ('l', [1]),
 ('a', [1, 1, 1, 1]),
 ('k', [1]),
 ('h', [1]),
 ('i', [1, 1, 1, 1, 1, 1, 1]),
 ('u', [1]),
 ('t', [1, 1, 1]),
 ('d', [1, 1, 1, 1]),
 ('g', [1, 1, 1]),
 ('o', [1]),
 ('s', [1, 1, 1, 1]),
 ('r', [1, 1]),
 ('f', [1]),
 ('v', [1]),
 ('c', [1]),
 ('e', [1, 1, 1, 1, 1, 1, 1]),
 ('n', [1, 1]),
 (':', [1]),
 ('m', [1, 1])]

In [61]:
# This cell was executed out of order (In [61]): `charRDD2` is first assigned further
# down the notebook, so a top-to-bottom run on a fresh kernel would raise NameError here.
# Build it in this cell from `words` so the cell is self-contained.
import random

charRDD2 = words.flatMap(lambda word: word.lower()).distinct()\
  .map(lambda c: (c, random.random()))
# The RDD is not cached, so the random values are drawn again whenever it is evaluated.
charRDD2.collect()

[('p', 0.878558772790426),
 ('t', 0.19628106892390296),
 ('d', 0.04543389416423482),
 ('g', 0.8934469163617617),
 ('b', 0.7672533295435618),
 ('o', 0.9600161772791295),
 ('c', 0.8786052957364385),
 ('l', 0.010514296073009244),
 ('s', 0.6236675437073809),
 ('a', 0.039234786964737145),
 ('r', 0.17212513109472938),
 ('k', 0.15105657226611224),
 ('h', 0.7548283330760189),
 ('e', 0.5612347709480946),
 ('f', 0.7168667920051603),
 ('i', 0.5986491782303409),
 ('n', 0.5394281726066129),
 ('v', 0.2021713781718958),
 ('u', 0.7473018187584111),
 (':', 0.16331919547689666),
 ('m', 0.28570959651037997)]

In [39]:
import random
# Note: from here on distinctChars is an RDD (no collect()), unlike the list built for sampleMap.
distinctChars = words.flatMap(lambda word: word.lower()).distinct()
# Two RDDs over the same characters, each pairing a character with its own random number.
charRDD = distinctChars.map(lambda c: (c, random.random()))
charRDD2 = distinctChars.map(lambda c: (c, random.random()))
# cogroup collects, per key, the values from both RDDs; they are shown as ResultIterable objects.
charRDD.cogroup(charRDD2).take(5)

[('p',
  (<pyspark.resultiterable.ResultIterable at 0x7f0026072190>,
   <pyspark.resultiterable.ResultIterable at 0x7f0026072210>)),
 ('t',
  (<pyspark.resultiterable.ResultIterable at 0x7f0026071b90>,
   <pyspark.resultiterable.ResultIterable at 0x7f0026071e10>)),
 ('d',
  (<pyspark.resultiterable.ResultIterable at 0x7f0026072410>,
   <pyspark.resultiterable.ResultIterable at 0x7f0026070750>)),
 ('g',
  (<pyspark.resultiterable.ResultIterable at 0x7f00260720d0>,
   <pyspark.resultiterable.ResultIterable at 0x7f0026072390>)),
 ('l',
  (<pyspark.resultiterable.ResultIterable at 0x7f00260705d0>,
   <pyspark.resultiterable.ResultIterable at 0x7f00260708d0>))]

In [40]:
# One (character, random number) pair per distinct character.
keyedChars = distinctChars.map(lambda c: (c, random.random()))
outputPartitions = 10
# Inner join on the character. The first count is computed but not displayed (only the last
# expression is shown); the second passes an explicit number of output partitions.
# Every (character, 1) pair matches exactly one keyedChars entry, hence 51 rows.
KVcharacters.join(keyedChars).count()
KVcharacters.join(keyedChars, outputPartitions).count()

51

In [41]:
# Ten integers in two partitions, matching the ten words in two partitions.
numRange = sc.parallelize(range(10), 2)
# zip pairs the two RDDs element by element, in order.
words.zip(numRange).collect()

[('Spark', 0),
 ('The', 1),
 ('Definitive', 2),
 ('Guide', 3),
 (':', 4),
 ('Big', 5),
 ('Data', 6),
 ('Processing', 7),
 ('Made', 8),
 ('Simple', 9)]

In [42]:
# Collapse the two partitions of `words` into one and confirm the new partition count.
words.coalesce(1).getNumPartitions() # 1

1

In [43]:
# Read one day of the retail data set from CSV, treating the first line as the header and
# inferring column types. The path is absolute and specific to the lab environment.
df = spark.read.option("header", "true").option("inferSchema", "true")\
  .csv("/shareddata/data/retail-data/by-day/2011-03-24.csv")
# Coalesce to 10 partitions before dropping down to the RDD API.
rdd = df.coalesce(10).rdd

In [44]:
def partitionFunc(key):
    """Choose a partition index (0, 1 or 2) for ``key``.

    The keys 17850 and 12583 always map to partition 0; any other key is sent
    to partition 1 or 2, picked at random on every call.
    """
    import random  # imported locally, inside the function body

    pinned_to_zero = key == 17850 or key == 12583
    return 0 if pinned_to_zero else random.randint(1, 2)


# Key every row by its field at index 6 (presumably the customer id column; confirm against the CSV header).
keyedRDD = rdd.keyBy(lambda row: row[6])
# Repartition into 3 partitions using partitionFunc, keep only the keys, gather each
# partition into a list, and count the distinct keys that landed in each partition.
keyedRDD\
  .partitionBy(3, partitionFunc)\
  .map(lambda x: x[0])\
  .glom()\
  .map(lambda x: len(set(x)))\
  .take(5)


[0, 65, 66]

## Part 3. Spark Distributed Shared Variables

In [45]:
# Rebuild the word RDD for the shared-variables section (two partitions).
my_collection = "Spark The Definitive Guide : Big Data Processing Made Simple"\
  .split(" ")
words = spark.sparkContext.parallelize(my_collection, 2)
# Lookup table of per-word numbers; words not listed here have no entry.
supplementalData = {"Spark":1000, "Definitive":200,
                    "Big":-300, "Simple":100}
# Wrap the dictionary in a broadcast variable; tasks read it through .value.
suppBroadcast = spark.sparkContext.broadcast(supplementalData)

In [46]:
# Read the broadcast payload back on the driver: the original dictionary.
suppBroadcast.value

{'Spark': 1000, 'Definitive': 200, 'Big': -300, 'Simple': 100}

In [63]:
# Pair each word with its number from the broadcast dictionary (0 when the word has no
# entry), then sort the pairs in ascending order of that number.
words.map(lambda word: (word, suppBroadcast.value.get(word, 0)))\
  .sortBy(lambda wordPair: wordPair[1])\
  .collect()

[('Big', -300),
 ('The', 0),
 ('Guide', 0),
 (':', 0),
 ('Data', 0),
 ('Processing', 0),
 ('Made', 0),
 ('Simple', 100),
 ('Definitive', 200),
 ('Spark', 1000)]

In [48]:
# Explicit schema for the flight summary CSV: two string columns and an integer count.
schema = "DEST_COUNTRY_NAME STRING, ORIGIN_COUNTRY_NAME STRING, count INT"
# Read the CSV with that schema, treating the first line as a header. The path is absolute
# and specific to the lab environment.
flights = spark.read.csv("/shareddata/data/flight-data/csv/2010-summary.csv", schema=schema, header=True)
# Accumulator starting at 0; accChinaFunc adds to it.
accChina = spark.sparkContext.accumulator(0)

In [49]:
def accChinaFunc(flight_row):
    """Add a row's ``count`` to the ``accChina`` accumulator for each China endpoint.

    The count is added once if the destination is "China" and once more if the
    origin is "China"; rows touching China on neither side add nothing.
    """
    endpoints = (
        flight_row["DEST_COUNTRY_NAME"],
        flight_row["ORIGIN_COUNTRY_NAME"],
    )
    for country in endpoints:
        if country == "China":
            accChina.add(flight_row["count"])

# Apply accChinaFunc to every row for its side effect on the accumulator; nothing is returned.
flights.foreach(accChinaFunc)

In [50]:
# Read the accumulated total back on the driver.
accChina.value

953

## Part 4. Spark Configurations and Persist

In [51]:
from pyspark.sql.functions import col
# Ten million ids plus a derived "square" column (id * id).
df = spark.range(1 * 10000000).toDF("id").withColumn("square", col("id") * col("id"))
# Mark the DataFrame for caching and run a first count on it.
df.cache().count()

# Count again on the same DataFrame; this is the value displayed below.
df.count()



                                                                                

10000000

In [52]:

# Release the cached data of the previous DataFrame.
df.unpersist() 
from pyspark import StorageLevel

# Same data built again, this time persisted with the DISK_ONLY storage level before counting.
df2 = spark.range(1 * 10000000).toDF("id").withColumn("square", col("id") * col("id"))
df2.persist(StorageLevel.DISK_ONLY).count()

                                                                                

10000000

In [53]:
# Count the persisted DataFrame a second time.
df2.count()

10000000

In [54]:
# Drop the persisted copy; the call returns the DataFrame itself, which is what the cell displays.
df2.unpersist()

DataFrame[id: bigint, square: bigint]

In [55]:
# Expose `df` to SQL under the name dfTable, then cache that table through a SQL statement.
df.createOrReplaceTempView("dfTable")
spark.sql("CACHE TABLE dfTable")

                                                                                

DataFrame[]

In [56]:
# Query the temp view with SQL and print the row count as a table.
spark.sql("SELECT count(*) FROM dfTable").show()

+--------+
|count(1)|
+--------+
|10000000|
+--------+



# END
Thank you