## 04-pyspark-rdd-actions.py

In [None]:
# 04-pyspark-rdd-actions.py
from pyspark.sql import SparkSession

# Obtain a session named 'PySparkExamples' (getOrCreate reuses an active one).
spark = (
    SparkSession.builder
    .appName('PySparkExamples')
    .getOrCreate()
)

# Sample (key, value) records; note the repeated ("B", 30) entry.
data = [("Z", 1), ("A", 20), ("B", 30), ("C", 40), ("B", 30), ("B", 60)]

inputRDD = spark.sparkContext.parallelize(data)
listRDD = spark.sparkContext.parallelize([1, 2, 3, 4, 5, 3, 2])


# aggregate
def seqOp(x, y):
    """Fold one RDD element ``y`` into the running accumulator ``x``."""
    return x + y


def combOp(x, y):
    """Merge two partial accumulators into one."""
    return x + y


# Sanity-check both helpers locally before handing them to Spark.
print(seqOp(100, 200), combOp(100, 300))

# Sum of all elements, starting from the neutral value 0.
agg1 = listRDD.aggregate(0, seqOp, combOp)
print(agg1, type(agg1))  # output 20

300 400
20 <class 'int'>


In [None]:
# aggregate 2
def seqOp2(x, y):
    """Add element ``y`` to the (sum, count) accumulator ``x``."""
    running_sum = x[0] + y
    running_count = x[1] + 1
    return (running_sum, running_count)


def combOp2(x, y):
    """Merge two (sum, count) accumulators component-wise."""
    return (x[0] + y[0], x[1] + y[1])


# The accumulator starts at (0, 0) and ends as (sum of elements, element count).
agg2 = listRDD.aggregate((0, 0), seqOp2, combOp2)
print(agg2)  # output (20, 7)

# treeAggregate is called with the plain-sum seqOp/combOp defined earlier in
# this file; note that it rebinds agg2 to an int.
agg2 = listRDD.treeAggregate(0, seqOp, combOp)
print(agg2, type(agg2))  # output 20
# For a step-by-step walkthrough of the calculation, see:
# https://stackoverflow.com/questions/43270798/how-does-rdd-aggregate-action-work-i-e-how-to-understand-the-parameters

(20, 7)
20 <class 'int'>


In [None]:
# fold
from operator import add

# 0 is the neutral start value for addition.
foldRes = listRDD.fold(0, add)
print(foldRes)  # output 20

# reduce
redRes = listRDD.reduce(add)
print(redRes)  # output 20

# treeReduce. This is similar to reduce, so the imported operator.add is
# reused directly instead of rebinding `add` to an equivalent lambda (which
# would shadow the import above).
redRes = listRDD.treeReduce(add)
print(redRes)  # output 20

# Collect
# NOTE: this rebinds the module-level name `data` (previously the list of
# (key, value) tuples) to the collected list of ints.
data = listRDD.collect()
print(data)

20
20
20
[1, 2, 3, 4, 5, 3, 2]


In [None]:
# count, countApprox, countApproxDistinct
# Each line prints a label immediately followed by the action's result;
# sep="" keeps the output identical to plain string concatenation.
print("Count: ", listRDD.count(), sep="")
# 1200 is the positional argument given to countApprox
# (a timeout in milliseconds per the PySpark API docs).
print("countApprox: ", listRDD.countApprox(1200), sep="")
print("countApproxDistinct: ", listRDD.countApproxDistinct(), sep="")
print("countApproxDistinct: ", inputRDD.countApproxDistinct(), sep="")

# countByValue
# Yields a mapping of each distinct element to its number of occurrences.
print("countByValue: ", listRDD.countByValue(), sep="")

Count: 7
countApprox: 7
countApproxDistinct: 5
countApproxDistinct: 5
countByValue: defaultdict(<class 'int'>, {1: 1, 2: 2, 3: 2, 4: 1, 5: 1})


In [None]:
# first
# Show the first element of the int RDD and of the (key, value) RDD.
first_of_list = listRDD.first()
print("first: " + str(first_of_list))
first_of_pairs = inputRDD.first()
print("first:  " + str(first_of_pairs))

# top
# Show the two largest elements of each RDD.
top_of_list = listRDD.top(2)
print("top: " + str(top_of_list))
top_of_pairs = inputRDD.top(2)
print("top: " + str(top_of_pairs))

first: 1
first:  ('Z', 1)
top: [5, 4]
top: [('Z', 1), ('C', 40)]


In [None]:
# min
# Smallest element of the int RDD, then of the (key, value) RDD.
print("min: ", listRDD.min(), sep="")
print("min: ", inputRDD.min(), sep="")

# max
# Largest element of the int RDD, then of the (key, value) RDD.
print("max: ", listRDD.max(), sep="")
print("max: ", inputRDD.max(), sep="")

min: 1
min: ('A', 20)
max: 5
max: ('Z', 1)


In [None]:
# take, takeOrdered, takeSample
# take(n): the first n elements in RDD order.
print("take: " + str(listRDD.take(2)))
# Output: take: [1, 2]

# takeOrdered(n): the n smallest elements in ascending order.
print("takeOrdered: " + str(listRDD.takeOrdered(2)))
# Output: takeOrdered: [1, 2]

# takeSample(withReplacement, num, seed=None): the first argument is a boolean
# flag, not a count. The previous call passed 10 there, which only worked
# because it is truthy; spell the flag out so the intent is explicit.
# No seed is given, so the sampled values differ between runs.
# NOTE: the label below still reads "take: " although this is takeSample.
print("take: " + str(listRDD.takeSample(withReplacement=True, num=2)))

take: [1, 2]
takeOrdered: [1, 2]
take: [3, 2]
