In [1]:
from pyspark import SparkContext
from pyspark import SparkConf

In [2]:
# Reuse the SparkContext that is already running when this cell is re-executed.
# Calling SparkContext() a second time in the same kernel raises ValueError,
# because only one active context is allowed per process.
sc = SparkContext.getOrCreate()

## Chapter 3

In [3]:
# Read the log file, build one filtered view per keyword, then merge the views.
inputRDD = sc.textFile("./log.txt")
errorsRDD = inputRDD.filter(lambda record: "error" in record)
warningsRDD = inputRDD.filter(lambda record: "warning" in record)
badLineRDD = errorsRDD.union(warningsRDD)
# NOTE: when only the combined result is needed, a single filter is preferable to union:
#   inputRDD.filter(lambda x: "error" in x or "warning" in x)

In [4]:
# Report how many lines matched, then print up to five of them as a sample.
concerningCount = badLineRDD.count()
print("Input had", concerningCount, "concerning lines")
print("Here are examples:")
for line in badLineRDD.take(5):
    print(line)

Input had 2 concerning lines
Here are examples:
The application closes the API session, resulting in an error being reported because the state of the flow is SESSIONED. This error can be ignored.


In [7]:
class SearchFunctions(object):
    """Substring-search helpers built around a fixed query value."""

    def __init__(self, query):
        """Remember ``query`` as the value to look for."""
        self.query = query

    def isMatch(self, s):
        """Return True when the stored query is contained in ``s``."""
        return self.query in s

    def getMatchNoReference(self, rdd):
        """Return ``rdd.filter(...)`` keeping the elements that contain the query.

        The query is copied into a local variable first, so the predicate
        handed to ``filter`` closes over that value only and never
        references ``self``.
        """
        needle = self.query

        def contains_needle(element):
            return needle in element

        return rdd.filter(contains_needle)

In [8]:
# Square every element, pull the results back as a list and print one per line.
nums = sc.parallelize([1, 2, 3, 4])
squared = nums.map(lambda value: value * value).collect()
for num in squared:
    print("%i" % num)

1
4
9
16


In [9]:
# flatMap flattens the per-line word lists into a single collection of words.
lines = sc.parallelize(["hello world", "hi"])
words = lines.flatMap(lambda sentence: sentence.split(" "))
words.first()

'hello'

In [10]:
# Fold the integers 1..8 into their sum with a pairwise addition.
nums2 = sc.parallelize(list(range(1, 9)))
sum_ = nums2.reduce(lambda left, right: left + right)

In [12]:
# Accumulate a (running total, element count) pair, starting from (0, 0).
sumCount = nums.aggregate(
    (0, 0),
    # fold one element into an accumulator
    lambda acc, value: (acc[0] + value, acc[1] + 1),
    # merge two accumulators
    lambda left, right: (left[0] + right[0], left[1] + right[1]),
)

In [14]:
# Mean = accumulated total divided by the number of elements.
total, count = sumCount
total / float(count)

2.5

## Chapter 4

In [15]:
# Key every line by its first space-separated token: (first word, full line).
pairs = lines.map(lambda line: (line.split(" ")[0], line))

In [16]:
# Display every (first word, full line) pair as a Python list.
pairs.collect()

[('hello', 'hello world'), ('hi', 'hi')]

In [17]:
# Keep only the pairs whose value (the full line) is shorter than 20 characters.
results = pairs.filter(lambda pair: len(pair[1]) < 20)

In [19]:
# Per key, build (sum of values, number of values): wrap each value as
# (value, 1), then add the tuples component-wise.
rdd = sc.parallelize([(1, 2), (3, 4), (3, 6)])
rddreduce = (
    rdd.mapValues(lambda value: (value, 1))
       .reduceByKey(lambda left, right: (left[0] + right[0], left[1] + right[1]))
)

In [20]:
# Display the per-key (sum, count) tuples as a Python list.
rddreduce.collect()

[(1, (2, 1)), (3, (10, 2))]

In [22]:
# Classic word count over the log file: tokenize, emit (word, 1), sum per word.
rdd = sc.textFile("./log.txt")
# NOTE: splitting on a single space yields empty-string tokens wherever spaces
# are consecutive, so '' is counted as a "word" too.
words = rdd.flatMap(lambda line: line.split(" "))
results = words.map(lambda word: (word, 1)).reduceByKey(lambda left, right: left + right)

In [24]:
# Peek at three (word, count) pairs instead of collecting the whole result.
results.take(3)

[('03/22', 284), ('INFO', 147), ('', 954)]

In [39]:
# Per-key average via combineByKey, using a (sum, count) accumulator per key.
nums = sc.parallelize([('k', 2), ('k', 4), ('r', 9), ('a', 5), ('t', 2), ('t', 7), ('t', 3)])
sumCount = nums.combineByKey(
    # first value seen for a key -> fresh accumulator
    lambda value: (value, 1),
    # fold another value into an existing accumulator
    lambda acc, value: (acc[0] + value, acc[1] + 1),
    # merge two accumulators for the same key
    lambda left, right: (left[0] + right[0], left[1] + right[1]),
)
# Turn each (key, (sum, count)) entry into (key, sum / count).
avg2 = sumCount.map(lambda entry: (entry[0], entry[1][0] / entry[1][1]))

In [40]:
# Display the (key, average) pairs as a Python list.
avg2.collect()

[('k', 3.0), ('t', 4.0), ('a', 5.0), ('r', 9.0)]

In [41]:
# Same (key, average) data, returned as a dict keyed by the RDD keys.
avg2.collectAsMap()

{'k': 3.0, 't': 4.0, 'a': 5.0, 'r': 9.0}

In [43]:
# Run the same per-key sum twice to compare partition counts of the results.
data = [("a", 3), ("b", 4), ("a", 1)]
# No partition count given: Spark picks its default parallelism.
par_default = sc.parallelize(data).reduceByKey(lambda left, right: left + right)
# Explicitly request 10 partitions for the reduced RDD.
par_customize = sc.parallelize(data).reduceByKey(lambda left, right: left + right, numPartitions=10)


In [44]:
# Partition count when none was requested (12 in the recorded run; presumably
# environment-dependent -- confirm against the local Spark configuration).
par_default.getNumPartitions()

12

In [45]:
# Partition count of the RDD reduced with an explicit second argument of 10.
par_customize.getNumPartitions()

10

## Chapter 5