In [1]:
import findspark
findspark.init()
from pyspark import SparkConf, SparkContext
from pyspark.rdd import RDD

In [2]:
conf = SparkConf().setMaster('local').setAppName('sample app')
sc = SparkContext(conf = conf)
sc

#sc.stop() 

In [250]:
# Create RDD using parallelize
lines = sc.parallelize(['pandas', 'i like pandas', 'panda'])
rdd2 = sc.parallelize(['coffee', 'pandas', 'party'])

def filterdata(data):
    print(data)
    return 'pandas' in data


#Check instance
isinstance(lines, RDD)

listrdd = sc.parallelize(list({("maths", 50),
                              ("maths", 60),
                              ("english", 65),
                              ("physics", 66), 
                              ("physics", 61), 
                              ("physics", 87)}), 
                         1)
print("\nList RDD: " , listrdd.collect())
newrdd = listrdd.flatMap(lambda x: x)
#print(newrdd.collect())

tupleRdd = sc.parallelize(tuple({("maths", 50),
                              ("maths", 60),
                              ("english", 65),
                              ("physics", 66), 
                              ("physics", 61), 
                              ("physics", 87)}), 1)

print("\nTuple RDD: ", tupleRdd.collect())

dictRdd = sc.parallelize(dict({"maths": 50,"maths": 60,"english": 65,"physics": 66,"physics": 61,"physics": 87}))
print("\nDict RDD: ", dictRdd.collect())


List RDD:  [('physics', 66), ('maths', 50), ('maths', 60), ('physics', 61), ('english', 65), ('physics', 87)]

Tuple RDD:  [('physics', 66), ('maths', 50), ('maths', 60), ('physics', 61), ('english', 65), ('physics', 87)]

Dict RDD:  ['maths', 'english', 'physics']


In [251]:
# Create RDD using external data
textfile = sc.textFile("C:\\Users\\EdwinVivekN\Desktop\logwork.txt")
word = textfile.filter(lambda x: 'Mongo' in x)

<h1>#Transformations

In [137]:
filteredlines = lines.filter(filterdata)
print('Filter: ', filteredlines.collect())

map = lines.map(lambda x: x + x)
print('\nMap: ', map.collect())

fmap = lines.flatMap(lambda x: x.split(' '))
print('\nFlat map: ', fmap.collect())

print('\nDistinct: ', lines.distinct().collect())

print('\nUnion: ',lines.union(rdd2).collect())
print('\nIntersction: ',lines.intersection(rdd2).collect())
print('\nSubtract: ',lines.subtract(rdd2).collect())
print('\nCartesian: ',lines.cartesian(rdd2).collect())

Filter:  ['pandas', 'i like pandas']

Map:  ['pandaspandas', 'i like pandasi like pandas', 'pandapanda']

Flat map:  ['pandas', 'i', 'like', 'pandas', 'panda']

Distinct:  ['pandas', 'i like pandas', 'panda']

Union:  ['pandas', 'i like pandas', 'panda', 'coffee', 'pandas', 'party']

Intersction:  ['pandas']

Subtract:  ['panda', 'i like pandas']

Cartesian:  [('pandas', 'coffee'), ('pandas', 'pandas'), ('pandas', 'party'), ('i like pandas', 'coffee'), ('i like pandas', 'pandas'), ('i like pandas', 'party'), ('panda', 'coffee'), ('panda', 'pandas'), ('panda', 'party')]


In [19]:
#sample(withReplacement, fraction, seed)
for w in word.sample(False, 0.5).collect():
    print(w)

Mongodb - testing mapreduce from local mongo server
MongoDB connection not reachable issue details shared
MongoDB OOM exception handling options
MongoDB customer meeting on live connection, data warehouse and other connection ways
MongoDB SSH code review
MongoDB SSH connection testing 


<h1>Actions

In [52]:
print('\nCount: ',word.count())
print('\nCount by value: ',word.filter(lambda x: 'Mongodb'  in x).countByValue())
print('\nCollect:', word.flatMap(lambda x: x[0]).collect())


Count:  14

Count by value:  defaultdict(<class 'int'>, {'Mongodb - fixed issues faced in mapreduce': 1, 'Mongodb - testing mapreduce from local mongo server': 1, 'Mongodb data conversion issue fixing': 1, 'Mongodb connection issue and unauthorized issue for customer - 323957': 1})

Count by key:  defaultdict(<class 'int'>, {'M': 11, 'C': 2, 'S': 1})

Collect: ['M', 'M', 'M', 'C', 'C', 'M', 'M', 'M', 'M', 'M', 'S', 'M', 'M', 'M']


In [51]:
print('\nTop:', word.top(2)) # sorted and descending (default)
print('\nFirst:', word.first()) #unsorted

print('\nTake:', word.take(2)) #unsorted
print('\nTake ordered:', word.takeOrdered(2))  # sorted and ascending (default)
print('\nTake sample:', word.takeSample(False, 2)) # random


Top: ['SSH gathering details on MongoDB', 'Mongodb data conversion issue fixing']

First: Mongodb - fixed issues faced in mapreduce

Take: ['Mongodb - fixed issues faced in mapreduce', 'Mongodb - testing mapreduce from local mongo server']

Take ordered: ['Checking the feasibility to expand MongoDB arrays in Bold BI ', 'Customer meeting on MongoBD related queries']

Take sample: ['MongoDB dll mismatch issue in linux site', 'Checking the feasibility to expand MongoDB arrays in Bold BI ']


In [110]:
print('\nReduce:',lines.reduce(lambda x,y: x + ' ' + y)) # returns single result

print('\nFold:',lines.fold("X", lambda x,y: x + ' ' + y))  # 0 for +, 1 for *, or an empty list for concatenation

seqOp = (lambda x, y: x + ' ' + y)
combOp = (lambda x, y: x + y)
print('\nAggregate:', lines.aggregate(("X"), seqOp, combOp))



Reduce: pandas i like pandas panda

Fold: X X pandas i like pandas panda

Aggregate: XX pandas i like pandas panda


In [203]:
#Pair Aggregate
seqOp = (lambda x, y: (x[0] + y,  x[1] + 1))
combOp = (lambda x, y: (x[0] + y[0], x[1] + y[1]))
pairagg = sc.parallelize([1, 2, 3, 4]).aggregate((0, 1), seqOp, combOp)
print(pairagg)

#agg on dict(string,int)
inputRDD = sc.parallelize([dict({"Z": 1,"A": 20,"B": 30,"C": 40,"B": 30,"B": 60})])
itemsRDD = inputRDD.flatMap(lambda x: x.items())
print("\ninput:", itemsRDD.collect())
seqop = lambda acc, value: (acc[0] + value[0], acc[1] + value[1])
combop = lambda acc1, acc2: (acc1[0] + acc2[0], acc1[1] + acc2[1])
result = itemsRDD.aggregate(("",0), seqop, combop)
print(result)         

#agg on list(string, int)
def sequenceOp(acc, value):
    accOp = acc[0] + " " + value[0]
    valueOp = acc[1] + value[1]
    return (accOp.upper(), valueOp)
print(listrdd.collect())
combop = lambda acc1, acc2: (acc1[0] + acc2[0], acc1[1] + acc2[1])
agg = listrdd.aggregate(("",0), sequenceOp, combop)
print(agg)

(10, 6)

input: [('Z', 1), ('A', 20), ('B', 60), ('C', 40)]
('ZABC', 121)
[('physics', 66), ('maths', 50), ('maths', 60), ('physics', 61), ('english', 65), ('physics', 87)]
(' PHYSICS MATHS MATHS PHYSICS ENGLISH PHYSICS', 389)


In [228]:
def printdata(x):
    print(x)

#foreach
word.foreach(lambda x: print(x))

#Iteration
iterator = word.toLocalIterator()
for i in iterator:
    pass
    #print(i)

<h1>Pair RDD Transformation

In [253]:
# Creating pair RDD
wordpair = word.map(lambda x: (x.split(" ")[0], x.split(" ")[1]))
wordpair.collect()

def splitfunc(word):
    key = word.split(" ")[0]
    value = word.split(" ")[1:]
    return [key, value] #returns list

wordpair2 = word.map(lambda x: (splitfunc(x)[0], splitfunc(x)[1]))
wordpair2.collect()


[('Mongodb', ['-', 'fixed', 'issues', 'faced', 'in', 'mapreduce']),
 ('Mongodb',
  ['-', 'testing', 'mapreduce', 'from', 'local', 'mongo', 'server']),
 ('MongoDB', ['-', 'mapreduce', 'testing', 'in', 'local']),
 ('Checking',
  ['the',
   'feasibility',
   'to',
   'expand',
   'MongoDB',
   'arrays',
   'in',
   'Bold',
   'BI',
   '']),
 ('Customer', ['meeting', 'on', 'MongoBD', 'related', 'queries']),
 ('MongoDB', ['connection', 'not', 'reachable', 'issue', 'details', 'shared']),
 ('MongoDB', ['dll', 'mismatch', 'issue', 'in', 'linux', 'site']),
 ('Mongodb', ['data', 'conversion', 'issue', 'fixing']),
 ('MongoDB', ['OOM', 'exception', 'handling', 'options']),
 ('Mongodb',
  ['connection',
   'issue',
   'and',
   'unauthorized',
   'issue',
   'for',
   'customer',
   '-',
   '323957']),
 ('SSH', ['gathering', 'details', 'on', 'MongoDB']),
 ('MongoDB',
  ['customer',
   'meeting',
   'on',
   'live',
   'connection,',
   'data',
   'warehouse',
   'and',
   'other',
   'connection',


In [6]:
print('\nMap Values:', wordpair.mapValues(lambda x: x + " new val").collect())
print('\nFlap Map Values:', wordpair.flatMapValues(lambda x: x[0:4]).reduceByKey(lambda x,y: x +" "+ y).collect())
print('\nKeys: ', wordpair.keys().collect())
print('\nValues: ', wordpair.values().collect())
print('\nReduce By Key: ', wordpair.reduceByKey(lambda x,y: x +" "+ y).collect())
print('\nFold By Key: ', wordpair.foldByKey("XX", lambda x,y: x +" "+ y).collect())
print('\nGroup By Key: ', wordpair.groupByKey().collect())
print('\nSort By Key: ', wordpair.sortByKey().collect())


Reduce By Key:  [('Mongodb', '- - data connection'), ('MongoDB', '- connection dll OOM customer SSH SSH'), ('Checking', 'the'), ('Customer', 'meeting'), ('SSH', 'gathering')]

Group By Key:  [('Mongodb', <pyspark.resultiterable.ResultIterable object at 0x04C28F10>), ('MongoDB', <pyspark.resultiterable.ResultIterable object at 0x04C28F90>), ('Checking', <pyspark.resultiterable.ResultIterable object at 0x04C70070>), ('Customer', <pyspark.resultiterable.ResultIterable object at 0x04C70050>), ('SSH', <pyspark.resultiterable.ResultIterable object at 0x04C70110>)]

Map Values: [('Mongodb', '- new val'), ('Mongodb', '- new val'), ('MongoDB', '- new val'), ('Checking', 'the new val'), ('Customer', 'meeting new val'), ('MongoDB', 'connection new val'), ('MongoDB', 'dll new val'), ('Mongodb', 'data new val'), ('MongoDB', 'OOM new val'), ('Mongodb', 'connection new val'), ('SSH', 'gathering new val'), ('MongoDB', 'customer new val'), ('MongoDB', 'SSH new val'), ('MongoDB', 'SSH new val')]

Flap 

In [242]:
#combineByKey
print(listrdd.collect())

combine = listrdd.combineByKey(lambda v: v, lambda v1,v2: v1+v2, lambda c1,c2: c1+c2)
print(combine.collect())

def createCombiner(v):   #initial values of each keys only are passed here, not keys
    return (v, 1)

def mergeValues(v1, v2):  #values are merged here
    return (v1[0] + v2, v1[1] + 1)

def mergeCombiners(c1, c2):  #values are combined here
    return (c1[0] + c2[0], c1[1]+ c2[1])
    

combine2 = listrdd.combineByKey(createCombiner, mergeValues, mergeCombiners)
print(combine2.collect())



[('physics', 66), ('maths', 50), ('maths', 60), ('physics', 61), ('english', 65), ('physics', 87)]
[('physics', 214), ('maths', 110), ('english', 65)]
[('physics', (214, 3)), ('maths', (110, 2)), ('english', (65, 1))]


In [268]:
#aggregateByKey
seqop = lambda acc, value: (acc + value)
combop = lambda acc1, acc2: (acc1 + acc2)  #acc1[0] + acc2[0], acc1[1] + acc2[1]
aggbykey = listrdd.aggregateByKey(0, seqop , combop)
print(aggbykey.collect())


[('physics', 214), ('maths', 110), ('english', 65)]


In [261]:
gbk = wordpair.groupByKey()
gbk.collect()



[('Mongodb', <pyspark.resultiterable.ResultIterable at 0xba56f90>),
 ('MongoDB', <pyspark.resultiterable.ResultIterable at 0xba56450>),
 ('Checking', <pyspark.resultiterable.ResultIterable at 0xba561f0>),
 ('Customer', <pyspark.resultiterable.ResultIterable at 0xba56430>),
 ('SSH', <pyspark.resultiterable.ResultIterable at 0xba562d0>)]

In [259]:
otherListrdd = sc.parallelize(list({("maths", 20),
                              ("maths", 100),
                              ("physics", 66), 
                              ("physics", 90)}), 
                         1)

print('\nSubtract By Key: ', listrdd.subtractByKey(otherListrdd).collect())

print('\nJoin: ', listrdd.join(otherListrdd).collect())

print("\nleft outer join: ", listrdd.leftOuterJoin(otherListrdd).collect())

print("\nright outer join: ", listrdd.rightOuterJoin(otherListrdd).collect())

print("\ncogroup: ",  listrdd.cogroup(otherListrdd).collect())




Subtract By Key:  [('english', 65)]

Join:  [('physics', (66, 66)), ('physics', (66, 90)), ('physics', (61, 66)), ('physics', (61, 90)), ('physics', (87, 66)), ('physics', (87, 90)), ('maths', (50, 100)), ('maths', (50, 20)), ('maths', (60, 100)), ('maths', (60, 20))]

left outer join:  [('physics', (66, 66)), ('physics', (66, 90)), ('physics', (61, 66)), ('physics', (61, 90)), ('physics', (87, 66)), ('physics', (87, 90)), ('maths', (50, 100)), ('maths', (50, 20)), ('maths', (60, 100)), ('maths', (60, 20)), ('english', (65, None))]

right outer join:  [('physics', (66, 66)), ('physics', (66, 90)), ('physics', (61, 66)), ('physics', (61, 90)), ('physics', (87, 66)), ('physics', (87, 90)), ('maths', (50, 100)), ('maths', (50, 20)), ('maths', (60, 100)), ('maths', (60, 20))]

cogroup:  [('physics', (<pyspark.resultiterable.ResultIterable object at 0x0B61C1F0>, <pyspark.resultiterable.ResultIterable object at 0x0B61CA70>)), ('maths', (<pyspark.resultiterable.ResultIterable object at 0x0B6

In [272]:
#Join + MapValues + AggregateByKey
print("Input1:", inputrdd.collect())
print("Input2:", otherListrdd.collect())
joinrdd = listrdd.join(otherListrdd)
print("\nJoin:", joinrdd.collect())
maprdd = joinrdd.mapValues(lambda x: x[0] + x[1])
print("\nJoin+MapValues:", maprdd.collect())
joinMapAgg = maprdd.aggregateByKey(0, lambda x,y: x + y, lambda x,y: x + y)
print("\nJoin+Map+Agg:", joinMapAgg.collect())

Input1: [('physics', 66), ('maths', 50), ('maths', 60), ('physics', 61), ('english', 65), ('physics', 87)]
Input2: [('maths', 100), ('physics', 66), ('physics', 90), ('maths', 20)]

Join: [('physics', (66, 66)), ('physics', (66, 90)), ('physics', (61, 66)), ('physics', (61, 90)), ('physics', (87, 66)), ('physics', (87, 90)), ('maths', (50, 100)), ('maths', (50, 20)), ('maths', (60, 100)), ('maths', (60, 20))]

Join+MapValues: [('physics', 132), ('physics', 156), ('physics', 127), ('physics', 151), ('physics', 153), ('physics', 177), ('maths', 150), ('maths', 70), ('maths', 160), ('maths', 80)]

Join+Map+Agg: [('physics', 896), ('maths', 460)]


<h1>Pair RDD Actions

In [254]:
print('\nCount by key: ',wordpair.countByKey())
print('\nCollect As Map: ', wordpair.collectAsMap()) #if there are multiple keys with different values, then collectASMap() will collect by returning the updated value with respect to the key
print('\nLookup', wordpair.lookup('MongoDB'))


Count by key:  defaultdict(<class 'int'>, {'Mongodb': 4, 'MongoDB': 7, 'Checking': 1, 'Customer': 1, 'SSH': 1})

Collect As Map:  {'Mongodb': 'connection', 'MongoDB': 'SSH', 'Checking': 'the', 'Customer': 'meeting', 'SSH': 'gathering'}

Lookup ['-', 'connection', 'dll', 'OOM', 'customer', 'SSH', 'SSH']


In [None]:
#Persisting 

#word.persist() or word.cache()
#word.unpersist()


In [None]:
#Partitioning
word.getNumPartitions()

In [None]:
#RDD to Data frame
columns = ['title']
df = word.toDF(columns)
