# PySpark: Fundamentals
SparkContext is the entry point to the Spark world. <br/>
It manages the connection to the cluster. 

In [7]:
import warnings

from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext

In [59]:
# Build (or reuse) the single SparkContext for this kernel.
# Constructing SparkContext(...) directly fails when a context is already
# active (e.g. when this cell is re-run), so the configuration is handed to
# getOrCreate(), which returns the existing context if there is one.
spark_conf = SparkConf().setAppName("SDDM").setMaster("local[*]")
sc = SparkContext.getOrCreate(spark_conf)
sqlContext = SQLContext(sc)
# Remember to call sc.stop() once all work is done.

In [14]:
# Report the basic properties of the active SparkContext, one per line.
context_facts = (
    ("Version of SparkContext: {}", sc.version),
    ("Python version of Sparkcontext: {}", sc.pythonVer),
    ("URL of the cluster or local mode of cluster: {}", sc.master),
)
for template, value in context_facts:
    print(template.format(value))

Version of SparkContext: 2.4.4
Python version of Sparkcontext: 3.7
URL of the cluster or local mode of cluster: local[*]


### RDDs

Creating RDDs from python:

In [16]:
# parallelize() turns a local Python collection into an RDD.
numRDD = sc.parallelize([1, 2, 3, 4])
# A string is a sequence of characters, so every character becomes one element.
helloRDD = sc.parallelize("hello spark")

# Printing an RDD shows its description, not its contents.
for created_rdd in (numRDD, helloRDD):
    print(created_rdd)

ParallelCollectionRDD[2] at parallelize at PythonRDD.scala:195
ParallelCollectionRDD[3] at parallelize at PythonRDD.scala:195


In [18]:
# Both objects are RDDs, regardless of the Python collection they came from.
for created_rdd in (numRDD, helloRDD):
    print(type(created_rdd))

<class 'pyspark.rdd.RDD'>
<class 'pyspark.rdd.RDD'>


In [19]:
# collect() returns all elements of the RDD as a Python list.
numRDD.collect()

[1, 2, 3, 4]

In [20]:
# Each character of the original string is a separate element of the RDD.
helloRDD.collect()

['h', 'e', 'l', 'l', 'o', ' ', 's', 'p', 'a', 'r', 'k']

In [27]:
# textFile() loads a text file as an RDD with one element per line.
# NOTE(review): absolute, machine-specific path — this cell only runs where the file exists.
rdd2= sc.textFile("/Users/ahmetemintek/Desktop/new_pyspark/example_text.txt")
rdd2.collect()

['YASAMAK SAKAYA GELMEZ,',
 'BÜYÜK BİR CİDDİYETLE YASAYACAKSIN',
 'BİR SİNCAP GİBİ MESELA,',
 'YANI, YASAMIN DIŞINDA VE ÖTESİNDE HİÇBİR ŞEY BEKLEMEDEN',
 'YANI, BÜTÜN İŞİN GÜCÜN YASAMAK OLACAK.']

In [28]:
rdd2.count() # returns the number of elements, i.e. the number of lines in the file

5

In [29]:
# take(3) fetches just the first three lines instead of collecting the
# whole RDD and then slicing the resulting list.
rdd2.take(3)

['YASAMAK SAKAYA GELMEZ,',
 'BÜYÜK BİR CİDDİYETLE YASAYACAKSIN',
 'BİR SİNCAP GİBİ MESELA,']

In [33]:
# Map every line of the text file to its length in characters.
lineLenght = rdd2.map(len)

# First the number of lines, then the individual line lengths.
for action in (lineLenght.count, lineLenght.collect):
    print(action())

5
[22, 33, 23, 55, 38]


In [39]:
# wholeTextFiles() reads every file in the directory; the keys of the resulting
# pair RDD are the file paths.
# The second argument (5) is the requested minimum number of partitions.
# NOTE(review): absolute, machine-specific directory — confirm before sharing.
rdd3= sc.wholeTextFiles("/Users/ahmetemintek/Desktop/new_pyspark", 5)
rdd3.keys().collect()

['file:/Users/ahmetemintek/Desktop/new_pyspark/example_text.txt',
 'file:/Users/ahmetemintek/Desktop/new_pyspark/Untitled.ipynb',
 'file:/Users/ahmetemintek/Desktop/new_pyspark/example_text2.txt']

### Functions 

Lambda functions are anonymous functions in Python. They are quite effective with the map() and filter() functions.  

**Transformations (lazy evaluations):** <br/>
map() : this applies a function to ***all elements*** in RDD. 

In [41]:
# Square every element of the RDD.
rdd = sc.parallelize([1, 2, 3, 4])
rdd_map = rdd.map(lambda value: value * value)
rdd_map.collect()

[1, 4, 9, 16]

filter() : This transformation returns a new RDD containing only the elements that ***pass the condition***. 

In [43]:
# Keep only the elements that are strictly greater than 3.
rdd = sc.parallelize([1, 2, 3, 4, 5, 6, 7])
rdd_filter = rdd.filter(lambda value: 3 < value)
rdd_filter.collect()

[4, 5, 6, 7]

flatMap() : This transformation can return multiple values for each element in the original RDD.

In [50]:
# Split every sentence on single spaces and flatten the pieces into one RDD.
# The trailing space in the first sentence yields an empty-string token.
rdd = sc.parallelize(["hello world ", "how are you"])
rdd_flat = rdd.flatMap(lambda sentence: sentence.split(" "))
rdd_flat.collect()

['hello', 'world', '', 'how', 'are', 'you']

In [61]:
# union() transformation: the result holds the elements of the first RDD
# followed by the elements of the second one.
# NOTE(review): absolute, machine-specific paths — confirm before sharing.
rdd= sc.textFile("/Users/ahmetemintek/Desktop/new_pyspark/example_text.txt")
rdd_2= sc.textFile("/Users/ahmetemintek/Desktop/new_pyspark/example_text2.txt")

combining_rdd= rdd.union(rdd_2)
combining_rdd.collect()

['YASAMAK SAKAYA GELMEZ,',
 'BÜYÜK BİR CİDDİYETLE YASAYACAKSIN',
 'BİR SİNCAP GİBİ MESELA,',
 'YANI, YASAMIN DIŞINDA VE ÖTESİNDE HİÇBİR ŞEY BEKLEMEDEN',
 'YANI, BÜTÜN İŞİN GÜCÜN YASAMAK OLACAK.',
 'Hello new text file']

**RDD Actions:** an action returns a value after running a computation on the RDD. 

In [62]:
# take(n) returns the first n elements of the RDD as a list.
combining_rdd.take(3)

['YASAMAK SAKAYA GELMEZ,',
 'BÜYÜK BİR CİDDİYETLE YASAYACAKSIN',
 'BİR SİNCAP GİBİ MESELA,']

first(), top() and count() actions.  <br/>
first() returns the first element of the RDD.

In [63]:
# first() returns the first element of the RDD.
combining_rdd.first()

'YASAMAK SAKAYA GELMEZ,'

In [64]:
# Fetch the last element by collecting every element and indexing the list.
# NOTE(review): fine for this tiny RDD, but it materialises the whole RDD locally.
combining_rdd.collect()[-1]

'Hello new text file'

top(): <br/>
Returns the top N elements of the RDD. <br/>
This method should only be used if the resulting array is expected to be small, as all the data is loaded into the driver's memory.

In [67]:
rdd_top= sc.parallelize([1,2,10,16,3, 6])
rdd_top.top(3)  # returns the 3 largest elements as a list sorted in descending order

[16, 10, 6]

In [68]:
# count() returns the number of elements in the RDD
# (here: the lines of both text files combined).
combining_rdd.count()

6

In [69]:
# Example: cube every number with map(), bring the results back with
# collect(), and print them one per line.
num_rdd = sc.parallelize([1, 2, 3, 4])
cube_rdd = num_rdd.map(lambda base: base ** 3)
cubes = cube_rdd.collect()

for cube in cubes:
    print(cube)

1
8
27
64


In [75]:
# The source collection here is a range object rather than a list.
rdd1= sc.parallelize(range(10))
rdd1.collect()

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

### Pair RDDs
Pair RDDs are a special kind of RDD that holds key/value data.  <br/>
The key is the identifier and the value is the data. <br/>

Creating Pair RDDs (there are 2 ways):

- From a list of key/value tuple
- From a regular rdd


In [78]:
# Creating a pair RDD from a list of (key, value) tuples:
# each tuple becomes one element, with the name as key and the number as value.
my_tuple= [("ronaldo", 22), ("messi", 24), ("neymar", 26)]

pairRDD_tuple= sc.parallelize(my_tuple)
pairRDD_tuple.collect()

[('ronaldo', 22), ('messi', 24), ('neymar', 26)]

In [79]:
# Creating a pair RDD from a regular RDD of "name value" strings.
my_list= ["ronaldo 24", "messi 25", "neymar 26"]

regularRDD= sc.parallelize(my_list)


def to_pair(entry):
    """Split a "key value" string on spaces and return (key, value).

    The string is split only once; both parts stay strings.
    """
    fields = entry.split(" ")
    return (fields[0], fields[1])


pairRDD_list= regularRDD.map(to_pair)
pairRDD_list.collect()

[('ronaldo', '24'), ('messi', '25'), ('neymar', '26')]

In [81]:
# Fetching values from a pair RDD: values() keeps only the value of each pair.
pair_values= pairRDD_tuple.values()
pair_values.collect()

[22, 24, 26]

In [82]:
# keys() keeps only the key of each (key, value) pair.
pair_keys= pairRDD_tuple.keys()
pair_keys.collect()

['ronaldo', 'messi', 'neymar']

**Transformations On Pair RDDs:** <br/>

All regular transformations also work on pair RDDs. <br/>

Examples of pair rdd transformations: <br/>



reduceByKey(func): Combine values with the same key <br/>
groupByKey(): Group values with the same key <br/>
sortByKey(): Return an RDD sorted by the key <br/>
join(): Join two pair RDDs based on their key <br/>

In [84]:
# we can also map user-defined functions over an RDD

def get_squares(num):
    """Return ``num`` raised to the power of two."""
    squared = pow(num, 2)
    return squared

# A named function can be handed to map() in place of a lambda.
rdd= sc.parallelize([1,2,3,4,5,1,2])
rdd.map(get_squares).collect()

[1, 4, 9, 16, 25, 1, 4]

In [86]:
# Finding the distinct numbers: distinct() drops duplicate elements.
rdd.distinct().collect()

[1, 2, 3, 4, 5]

In [87]:
# subtract(): keeps the elements of rdd that do not appear in numRDD2.
numRDD2 = sc.parallelize([1,2,3])
rdd.subtract(numRDD2).collect()

[4, 5]

In [88]:
# intersection(): keeps the elements present in both RDDs; the recorded
# result contains each common element once, even though rdd holds duplicates.
rdd.intersection(numRDD2).collect()

[1, 2, 3]

In [91]:
# Calculating basic statistics of a numeric RDD.
rdd = sc.parallelize([1, 2, 3, 4, 5, 6, 7])

# Each entry is an action; call them in order and print the result.
for action in (rdd.min, rdd.max, rdd.mean, rdd.variance, rdd.stdev, rdd.stats):
    print(action())

# NOTE(review): in the recorded output asDict() reports a different
# stdev/variance than stdev()/variance() above — presumably sample vs.
# population statistics; confirm against the StatCounter documentation.
print(rdd.stats().asDict())

1
7
4.0
4.0
2.0
(count: 7, mean: 4.0, stdev: 2.0, max: 7, min: 1)
{'count': 7, 'mean': 4.0, 'sum': 28.0, 'min': 1, 'max': 7, 'stdev': 2.160246899469287, 'variance': 4.666666666666667}


In [92]:
# reduceByKey() merges the values that share a key using the given function
# (here: addition). It is a transformation, not an action, so nothing is
# computed until collect() is called.
rdd = sc.parallelize(
    [("ronaldo", 22), ("messi", 24), ("neymar", 25), ("messi", 27)]
)
reduce_rdd = rdd.reduceByKey(lambda total, value: total + value)
reduce_rdd.collect()

[('messi', 51), ('neymar', 25), ('ronaldo', 22)]

In [95]:
# sortByKey() orders a pair RDD by its keys, so the (name, total) pairs are
# first flipped to (total, name) to sort by the totals.
rdd_new = reduce_rdd.map(lambda pair: pair[::-1])
rdd_new.sortByKey(ascending=True).collect()

[(22, 'ronaldo'), (25, 'neymar'), (51, 'messi')]

In [101]:
# groupByKey() gathers all values that share a key in the pair RDD;
# here the key is the country code and the values are airport codes.
airports = [("US", "JFK"), ("UK", "LHR"), ("FR", "CDG"), ("US", "SFO")]

rdd = sc.parallelize(airports)
rdd_group = rdd.groupByKey().collect()

# The grouped values come back as an iterable, so convert them to a list for display.
for country, codes in rdd_group:
    print(country, list(codes))

UK ['LHR']
FR ['CDG']
US ['JFK', 'SFO']


In [102]:
# join() transformation joins the two pair RDDs based on their key.
# Each result element has the form (key, (value_from_RDD1, value_from_RDD2)).
RDD1 = sc.parallelize([("Messi", 34),("Ronaldo", 32),("Neymar", 24)])

RDD2 = sc.parallelize([("Ronaldo", 80),("Neymar", 120),("Messi", 100)])

RDD1.join(RDD2).collect()

[('Messi', (34, 100)), ('Neymar', (24, 120)), ('Ronaldo', (32, 80))]

In [103]:
# reduce() action: aggregates the elements of a regular RDD by repeatedly
# combining two elements with the given function (here: addition).
x = [1, 2, 3, 4]
rdd = sc.parallelize(x)
rdd.reduce(lambda left, right: left + right)

10

In [105]:
# countByKey() action: only available for RDDs of (K, V) pairs.
# It counts how many elements exist for each key.
rdd = sc.parallelize([("a", 1), ("b", 2), ("a", 3), ("a", 1)])
items = rdd.countByKey().items()

# Print every key next to its count.
for pair in items:
    print(*pair)

a 3
b 1


In [106]:
# collectAsMap() is an action that returns the key-value pairs of the RDD as a dictionary

sc.parallelize([(1, 3), (2, 6)]).collectAsMap()

{1: 3, 2: 6}

**Word Count Example**

In [108]:
# Word count: split every line on spaces, pair each word with 1,
# then add up the ones per word.
text_file = sc.textFile("philly_text.txt")
counts_rdd = (
    text_file.flatMap(lambda line: line.split(" "))
    .map(lambda word: (word, 1))
    .reduceByKey(lambda x, y: x + y)
)

# Flip to (count, word) so sortByKey() orders by frequency, then fetch only
# the ten most frequent pairs with take(10) instead of collecting the entire
# sorted result and slicing it.
(
    counts_rdd.map(lambda x: (x[1], x[0]))
    .sortByKey(ascending=False)
    .take(10)
)

[(12, 'the'),
 (8, 'A'),
 (6, 'I'),
 (4, 'of'),
 (4, 'Philadelphia'),
 (4, 'from'),
 (4, 'To'),
 (4, 'to'),
 (3, 'am'),
 (3, 'Geordie')]