In [40]:
!pip install findspark
!pip install pyspark
import findspark
import pyspark
findspark.init()
sc =  pyspark.SparkContext.getOrCreate()



In [41]:
# Show the installed PySpark version so the outputs below can be tied to it.
pyspark.__version__

'2.4.5'

In [42]:
# Read notebook.log as an RDD with one element per line of the file.
logFile = sc.textFile("notebook.log")

In [43]:
# Peek at the first two lines to see what the raw log looks like.
logFile.take(2)

['[I 12:09:13.491 NotebookApp] Using MathJax: /static/vendor/MathJax-2.5-latest/MathJax.js',
 "[I 12:09:13.494 NotebookApp] Using existing profile dir: u'/home/notebook/.ipython/profile_default'"]

In [44]:
info = logFile.filter(lambda line: "INFO" in line) # keep only the lines that contain "INFO"

In [45]:
# Spot-check the first two lines of the filtered RDD.
info.take(2)

['15/10/14 14:29:21 INFO SparkContext: Running Spark version 1.4.1',
 '15/10/14 14:29:22 INFO SecurityManager: Changing view acls to: notebook']

In [46]:
info.count() # count the lines that contain "INFO"

13438

In [47]:
# Count the INFO lines that contain "spark" (case-sensitive substring match)
# by chaining a transformation (filter) with an action (count).
info.filter(lambda line: "spark" in line).count()

156

In [48]:
# Same filter as the cell above, but kept as an RDD instead of being counted right away.
sp = info.filter(lambda line: "spark" in line)

In [49]:
# import numpy as np
# np.array(sp.take(3), dtype = 'object')

In [50]:
# info.filter(lambda line: "spark" in line).collect()

In [51]:
# Create one RDD of text lines per input file.
readmeFile = sc.textFile("README.md")
pomFile = sc.textFile("pom.xml")

In [52]:
# Number of lines containing "spark" in each file: README first, then the pom.
print(
    readmeFile.filter(lambda text: "spark" in text).count(),
    pomFile.filter(lambda text: "spark" in text).count(),
    sep="\n",
)

11
31


In [53]:
# First five lines of the README.
readmeFile.take(5)

['# Apache Spark',
 '',
 'Spark is a fast and general cluster computing system for Big Data. It provides',
 'high-level APIs in Scala, Java, Python, and R, and an optimized engine that',
 'supports general computation graphs for data analysis. It also supports a']

In [54]:
# First three lines of pom.xml.
pomFile.take(3)

['<?xml version="1.0" encoding="UTF-8"?>',
 '<!--',
 '  ~ Licensed to the Apache Software Foundation (ASF) under one or more']

In [55]:
# WordCount on each RDD so that the results are (K,V) pairs of (word,count)

def word_count(lines):
    """Return an RDD of (word, count) pairs for an RDD of text lines.

    Each line is split on whitespace, every word is paired with 1, and the
    ones are then summed per word.
    """
    return (
        lines.flatMap(lambda line: line.split())
        .map(lambda word: (word, 1))
        .reduceByKey(lambda a, b: a + b)
    )


# One shared helper instead of two copy-pasted pipelines.
readmeCount = word_count(readmeFile)
pomCount = word_count(pomFile)

In [56]:
# Sample a few (word, count) pairs from the README counts.
readmeCount.take(3)

[('#', 1), ('Apache', 1), ('Spark', 14)]

In [57]:
# Sample a few (word, count) pairs from the pom.xml counts.
pomCount.take(4)

[('<?xml', 1), ('version="1.0"', 1), ('Apache', 2), ('more', 1)]

In [58]:
# join combines two pair RDDs, (K, V) and (K, W), into (K, (V, W)).
# Here that is (word, (readme_count, pom_count)); only words present in both RDDs are kept.
joined = readmeCount.join(pomCount)

In [59]:
# Each element is (word, (readme_count, pom_count)).
joined.take(3)

[('Apache', (1, 2)), ('Spark', (14, 1)), ('is', (6, 2))]

In [60]:
# Add the README count and the pom count for each word.
# mapValues only transforms the (readme_count, pom_count) value, leaves the key
# untouched, and retains the RDD's partitioning.
joinedsum = joined.mapValues(lambda counts: counts[0] + counts[1])

In [61]:
# Each element is now (word, readme_count + pom_count).
joinedsum.take(3)

[('Apache', 3), ('Spark', 15), ('is', 8)]

### Shared variables

Normally, when a function passed to a Spark operation (such as map or reduce) is executed on a remote cluster node, it works on separate copies of all the variables used in the function. These variables are copied to each machine, and no updates to the variables on the remote machine are propagated back to the driver program. Supporting general, read-write shared variables across tasks would be inefficient. However, Spark does provide two limited types of shared variables for two common usage patterns: broadcast variables and accumulators.

#### Broadcast variables
Broadcast variables are useful when you have a large dataset that you want to use across all the worker nodes. A read-only variable is cached on each machine rather than shipping a copy of it with tasks.

Read more here: http://spark.apache.org/docs/latest/programming-guide.html#broadcast-variables

In [62]:
broadcastVar = sc.broadcast([1,2,3]) # create a broadcast variable holding the list [1, 2, 3]

In [63]:
broadcastVar.value # read back the value held by the broadcast variable

[1, 2, 3]

#### Accumulators

Accumulators are variables that can only be "added" to through an associative and commutative operation. They are used to implement counters and sums efficiently in parallel. An accumulator is shared across all the worker nodes, but it is write-only for them: workers can only add to it, and their updates are sent back to the driver program. Only the driver program can read the accumulator's value.

In [64]:
accum = sc.accumulator(10) # create an accumulator variable with initial value 10

In [65]:
# Display the accumulator (its id and current value).
accum

Accumulator<id=1, value=10>

In [66]:
# Small RDD whose elements are added to the accumulator below.
rdd = sc.parallelize([1,2,3,4]) 

In [67]:
# Return all elements of the RDD as a list (fine here: there are only four).
rdd.collect()

[1, 2, 3, 4]

In [68]:
# Function that adds each integer value to the accumulator variable.
def f(x):
    """Add ``x`` to the shared accumulator ``accum``."""
    # add() updates the accumulator in place, so the global name is only
    # read here and never rebound.
    accum.add(x)

In [69]:
# Iterate through each element of the RDD and apply the function f to it.
# foreach: applies a function to every element of the RDD.
# NOTE: each run of this cell adds 1 + 2 + 3 + 4 to accum again, so re-running
# it without re-creating accum changes the value shown in the next cell.
rdd.foreach(f) 

In [70]:
# Read the total: 10 (initial value) + 1 + 2 + 3 + 4 = 20.
accum.value

20

In [38]:
# Key-value pairs: a plain 2-tuple, the same (key, value) shape as the
# (word, count) elements used earlier in this notebook.
pair = ('a', 'b')

In [39]:
# Print the first element of the pair, then the second, one per line.
print(pair[0], pair[1], sep="\n")

a
b
