In [1]:
from pyspark import SparkConf, SparkContext
import collections

In [2]:
conf = SparkConf().setMaster("local").setAppName("RDD-Ops")
sc = SparkContext(conf = conf)

# Find total count of movie ratings

In [3]:
lines = sc.textFile("file:///C:/Users/dennis/Spark/Spark/resources/u.data")
print(lines)

file:///C:/Users/dennis/Spark/Spark/resources/u.data MapPartitionsRDD[1] at textFile at NativeMethodAccessorImpl.java:0


In [4]:
# Extract the required data 
ratings = lines.map(lambda x:x.split()[2])
print(ratings)

PythonRDD[2] at RDD at PythonRDD.scala:53


In [5]:
# Perform action on RDD
results = ratings.countByValue()
for result in results:   
    print(result)

3
1
2
4
5


In [6]:
# Sort and display the results
sortedResults = collections.OrderedDict(sorted(results.items()))
for key,value in sortedResults.items():   
    print(key,value)

1 6110
2 11370
3 27145
4 34174
5 21201


# Find average number of friends by age

Demo on key pair 

In [7]:
# Load data
lines = sc.textFile("file:///C:/Users/dennis/Spark/Spark/resources/fakefriends.csv")
print(lines)

file:///C:/Users/dennis/Spark/Spark/resources/fakefriends.csv MapPartitionsRDD[5] at textFile at NativeMethodAccessorImpl.java:0


In [8]:
def parseLine(line):
    fields = line.split(',')
    age = int(fields[2])
    numFriends = int(fields[3])
    return (age, numFriends)

In [9]:
# Parsing the input data to get ootput in form key/value pair of (age, numFriends) 
rdd=lines.map(parseLine)

In [10]:
mappedRDD = rdd.mapValues(lambda x: (x,1))

In [11]:
totalsByAge= mappedRDD.reduceByKey(lambda x,y:(x[0] + y[0],x[1] + y[1]))
print(totalsByAge)

PythonRDD[10] at RDD at PythonRDD.scala:53


In [12]:
averageByAge = totalsByAge.mapValues(lambda x:x[0]/x[1])

In [13]:
results = averageByAge.collect()
for result in results:
    print(result)

(33, 325.3333333333333)
(26, 242.05882352941177)
(55, 295.53846153846155)
(40, 250.8235294117647)
(68, 269.6)
(59, 220.0)
(37, 249.33333333333334)
(54, 278.0769230769231)
(38, 193.53333333333333)
(27, 228.125)
(53, 222.85714285714286)
(57, 258.8333333333333)
(56, 306.6666666666667)
(43, 230.57142857142858)
(36, 246.6)
(22, 206.42857142857142)
(35, 211.625)
(45, 309.53846153846155)
(60, 202.71428571428572)
(67, 214.625)
(19, 213.27272727272728)
(30, 235.8181818181818)
(51, 302.14285714285717)
(25, 197.45454545454547)
(21, 350.875)
(42, 303.5)
(49, 184.66666666666666)
(48, 281.4)
(50, 254.6)
(39, 169.28571428571428)
(32, 207.9090909090909)
(58, 116.54545454545455)
(64, 281.3333333333333)
(31, 267.25)
(52, 340.6363636363636)
(24, 233.8)
(20, 165.0)
(62, 220.76923076923077)
(41, 268.55555555555554)
(44, 282.1666666666667)
(69, 235.2)
(65, 298.2)
(61, 256.22222222222223)
(28, 209.1)
(66, 276.44444444444446)
(46, 223.69230769230768)
(29, 215.91666666666666)
(18, 343.375)
(47, 233.22222222222

# Find min and max temperature in a year

Demo for filtering in RDD. Filtering is for removing unwanted data from RDD.

In [14]:
# Load data
lines = sc.textFile("file:///C:/Users/dennis/Spark/Spark/resources/1800.csv")

In [15]:
def parseLine(line):
    fields = line.split(',')
    stationID = fields[0]
    entryType = fields[2]
    temperature = float(fields[3]) * 0.1 * (9.0/5.0) + 32.0
    return (stationID, entryType, temperature)

In [16]:
# Parsing the input data to get output in form (stationID, entryType, temperature) 
rdd=lines.map(parseLine)

In [17]:
# Apply filter
minTemps = rdd.filter(lambda x: "TMIN" in x[1])
maxTemps = rdd.filter(lambda x: "TMAX" in x[1])

In [18]:
# Create key value pair for stationID, temperature
minStationTemps = minTemps.map(lambda x:(x[0],x[2]))
maxStationTemps = maxTemps.map(lambda x:(x[0],x[2]))

In [19]:
# Find min temp by stationID
minTemp = minStationTemps.reduceByKey(lambda x,y:min(x,y))
maxTemp = maxStationTemps.reduceByKey(lambda x,y:max(x,y))

In [20]:
# Collect and print min results 
minResults = minTemp.collect()
for result in minResults:
    print(result[0]+"\t{:.2f}F".format(result[1]))

ITE00100554	5.36F
EZE00100082	7.70F


In [21]:
# Collect and print max results 
maxResults = maxTemp.collect()
for result in maxResults:
    print(result[0]+"\t{:.2f}F".format(result[1]))

ITE00100554	90.14F
EZE00100082	90.14F


# Count number of word occurrence

#### Map vs Flatmap
Map transforms each element of an RDD into one new element. 
Flatmap can transform each element of an RDD into many new elements. 

In [22]:
# Load data
data = sc.textFile("file:///C:/Users/dennis/Spark/Spark/resources/book.txt")

In [23]:
import re
def normalizeWords(text):
    return re.compile(r'\W+',re.UNICODE).split(text.lower())

In [24]:
# Split to words
words = data.flatMap(normalizeWords)

In [25]:
# Count each word occurrence
wordCounts = words.map( lambda x:(x, 1)).reduceByKey(lambda x, y: x + y)

In [26]:
# Shuffle and Sort
wordCountSorted = sorted((v,k) for k,v in wordCounts.collect())

In [27]:
# Print results 
for result in wordCountSorted:
    count=str(result[0])
    word = result[1].encode('ascii','ignore')
    if(word):
        print(word , count)

b'0' 1
b'05' 1
b'07' 1
b'1000' 1
b'104' 1
b'1099' 1
b'1124' 1
b'12' 1
b'125' 1
b'13' 1
b'14' 1
b'15' 1
b'150' 1
b'17' 1
b'18' 1
b'19' 1
b'2006' 1
b'20081' 1
b'2009' 1
b'212' 1
b'28' 1
b'312' 1
b'360' 1
b'401' 1
b'43' 1
b'47' 1
b'49' 1
b'500' 1
b'60' 1
b'65' 1
b'68' 1
b'70' 1
b'800' 1
b'82' 1
b'85' 1
b'93' 1
b'abandon' 1
b'abandoned' 1
b'abbreviation' 1
b'absolutely' 1
b'absorbed' 1
b'abstract' 1
b'accelerator' 1
b'accepting' 1
b'accompanies' 1
b'accompany' 1
b'accomplishment' 1
b'accomplishments' 1
b'achievable' 1
b'achievement' 1
b'achieves' 1
b'achieving' 1
b'acknowledge' 1
b'acqui' 1
b'acquirers' 1
b'acted' 1
b'adapted' 1
b'adaptive' 1
b'added' 1
b'addictive' 1
b'admit' 1
b'admittedly' 1
b'admitting' 1
b'adopt' 1
b'adsenseformobileapps' 1
b'advance' 1
b'advanced' 1
b'advancement' 1
b'advantages' 1
b'advertorial' 1
b'advise' 1
b'advocates' 1
b'affected' 1
b'affiliated' 1
b'affordable' 1
b'afoul' 1
b'afterward' 1
b'afterwards' 1
b'ageist' 1
b'agenda' 1
b'aggregate' 1
b'aggregated' 1
b

b'furthermore' 1
b'gaining' 1
b'gallup' 1
b'gambling' 1
b'gamed' 1
b'garage' 1
b'gas' 1
b'gates' 1
b'gathered' 1
b'gauge' 1
b'gender' 1
b'generating' 1
b'generation' 1
b'generous' 1
b'genius' 1
b'geographic' 1
b'germany' 1
b'gestating' 1
b'glass' 1
b'gloat' 1
b'god' 1
b'godaddy' 1
b'goodbye' 1
b'grab' 1
b'graciously' 1
b'grades' 1
b'graduated' 1
b'graphics' 1
b'graphs' 1
b'grew' 1
b'gross' 1
b'grumpy' 1
b'guaranteed' 1
b'guarantees' 1
b'guard' 1
b'guts' 1
b'h' 1
b'h1' 1
b'h2' 1
b'habits' 1
b'hacking' 1
b'hall' 1
b'hallway' 1
b'handed' 1
b'handedly' 1
b'handful' 1
b'handing' 1
b'happening' 1
b'happier' 1
b'happily' 1
b'happiness' 1
b'harass' 1
b'hardest' 1
b'harmony' 1
b'harper' 1
b'harsh' 1
b'hassle' 1
b'hats' 1
b'headache' 1
b'heading' 1
b'healthy' 1
b'hearing' 1
b'heart' 1
b'heavily' 1
b'hell' 1
b'helpareporter' 1
b'helpcenter' 1
b'helped' 1
b'helpful' 1
b'highlight' 1
b'highlighted' 1
b'himself' 1
b'hinges' 1
b'hired' 1
b'hires' 1
b'history' 1
b'hits' 1
b'hitting' 1
b'hobbies' 1
b'h

b'washington' 1
b'wasteful' 1
b'watching' 1
b'water' 1
b'weather' 1
b'webpage' 1
b'weekends' 1
b'weekly' 1
b'weeks' 1
b'weigh' 1
b'welcome' 1
b'welcomes' 1
b'whatsapp' 1
b'whirlwind' 1
b'willingness' 1
b'window' 1
b'wireframes' 1
b'withdraw' 1
b'withhold' 1
b'wonder' 1
b'wondered' 1
b'wonderful' 1
b'wondering' 1
b'worded' 1
b'words' 1
b'workaholic' 1
b'workweek' 1
b'worlds' 1
b'worries' 1
b'xxxx' 1
b'y' 1
b'yahoo' 1
b'years2' 1
b'years3' 1
b'yoast' 1
b'york' 1
b'younger' 1
b'youth' 1
b'zealand' 1
b'zoho' 1
b'101' 2
b'2013' 2
b'29' 2
b'3d' 2
b'41' 2
b'55' 2
b'75' 2
b'99' 2
b'accomplishing' 2
b'accountants' 2
b'accounts' 2
b'accustomed' 2
b'acknowledgments' 2
b'acquire' 2
b'activities' 2
b'activity' 2
b'adapt' 2
b'adapting' 2
b'adjust' 2
b'adsense' 2
b'advertisements' 2
b'advisors' 2
b'aerospace' 2
b'aids' 2
b'algorithms' 2
b'alternative' 2
b'amateur' 2
b'analyze' 2
b'announce' 2
b'announcing' 2
b'anymore' 2
b'apart' 2
b'appeal' 2
b'application' 2
b'approve' 2
b'apps' 2
b'arm' 2
b'arms' 

b'throughout' 3
b'ties' 3
b'timely' 3
b'tips' 3
b'titles' 3
b'touch' 3
b'trademarks' 3
b'trained' 3
b'transition' 3
b'tread' 3
b'trigger' 3
b'true' 3
b'truly' 3
b'twofold' 3
b'type' 3
b'underserved' 3
b'university' 3
b'unlike' 3
b'updates' 3
b'version' 3
b'vesting' 3
b'viewed' 3
b'virtual' 3
b'walk' 3
b'wary' 3
b'wasted' 3
b'watch' 3
b'waters' 3
b'weekend' 3
b'wherever' 3
b'workplace' 3
b'worst' 3
b'worthy' 3
b'yield' 3
b'2011' 4
b'2015' 4
b'40' 4
b'access' 4
b'accomplish' 4
b'acquired' 4
b'active' 4
b'actual' 4
b'adventure' 4
b'afford' 4
b'ago' 4
b'alongside' 4
b'american' 4
b'among' 4
b'anywhere' 4
b'armed' 4
b'arrange' 4
b'article' 4
b'assessment' 4
b'authority' 4
b'aware' 4
b'basis' 4
b'becomes' 4
b'becoming' 4
b'belongings' 4
b'bill' 4
b'break' 4
b'bring' 4
b'built' 4
b'busy' 4
b'calls' 4
b'carrot' 4
b'challenging' 4
b'changes' 4
b'channel' 4
b'china' 4
b'clear' 4
b'clicking' 4
b'closing' 4
b'co' 4
b'cold' 4
b'collecting' 4
b'compare' 4
b'competitive' 4
b'competitors' 4
b'continue

b'old' 8
b'opportunities' 8
b'reserves' 8
b'rich' 8
b'security' 8
b'several' 8
b'soon' 8
b'stage' 8
b'successfully' 8
b'tasks' 8
b'taxes' 8
b'tend' 8
b'three' 8
b'took' 8
b'top' 8
b'triton' 8
b'under' 8
b'understanding' 8
b'useful' 8
b'user' 8
b'venture' 8
b'visit' 8
b'worry' 8
b'write' 8
b'www' 8
b'4' 9
b'accept' 9
b'achieve' 9
b'across' 9
b'affect' 9
b'alone' 9
b'amazon' 9
b'answer' 9
b'apply' 9
b'bit' 9
b'boss' 9
b'called' 9
b'cannot' 9
b'ceo' 9
b'clients' 9
b'college' 9
b'created' 9
b'creative' 9
b'deal' 9
b'demand' 9
b'easier' 9
b'following' 9
b'fraud' 9
b'goods' 9
b'government' 9
b'happens' 9
b'he' 9
b'hear' 9
b'house' 9
b'http' 9
b'increase' 9
b'involved' 9
b'loan' 9
b'manager' 9
b'maximize' 9
b'measuring' 9
b'meet' 9
b'minimum' 9
b'mobile' 9
b'option' 9
b'phone' 9
b'position' 9
b'purchasing' 9
b'quality' 9
b'rate' 9
b'reading' 9
b'relationship' 9
b'said' 9
b'savings' 9
b'share' 9
b'simply' 9
b'sometimes' 9
b'stock' 9
b'stream' 9
b'system' 9
b'tech' 9
b'text' 9
b'times' 9
b'twit

# Find the Total Amount Spent by Customer

In [28]:
# Load data
data = sc.textFile("file:///C:/Users/dennis/Spark/Spark/resources/customer-orders.csv")

In [29]:
def parseLine(data):
    fields = data.split(',')
    custId = int(fields[0])
    itemId = int(fields[1])
    amount = float(fields[2])
    return (custId, itemId, amount)

In [30]:
# Parsing the input data to get output in form (custId, itemId, amount) 
rdd=data.map(parseLine)

In [31]:
totalByCustomer = rdd.map(lambda x:(x[0],x[2])).reduceByKey(lambda x, y: x + y)

In [32]:
# Shuffle and Sort
totalByCustomerSorted = totalByCustomer.sortByKey()

In [33]:
# Collect results 
results = totalByCustomerSorted.collect()
for result in results:
    print(result[0],result[1])

0 5524.949999999998
1 4958.600000000001
2 5994.59
3 4659.63
4 4815.050000000002
5 4561.069999999999
6 5397.879999999998
7 4755.070000000001
8 5517.240000000001
9 5322.649999999999
10 4819.700000000001
11 5152.290000000002
12 4664.589999999998
13 4367.62
14 4735.030000000001
15 5413.510000000001
16 4979.06
17 5032.679999999999
18 4921.27
19 5059.4299999999985
20 4836.859999999999
21 4707.41
22 5019.449999999999
23 4042.6499999999987
24 5259.920000000003
25 5057.610000000001
26 5250.4
27 4915.889999999999
28 5000.709999999998
29 5032.529999999999
30 4990.72
31 4765.05
32 5496.050000000004
33 5254.659999999998
34 5330.8
35 5155.419999999999
36 4278.049999999997
37 4735.200000000002
38 4898.460000000002
39 6193.109999999999
40 5186.429999999999
41 5637.62
42 5696.840000000003
43 5368.83
44 4756.8899999999985
45 3309.38
46 5963.109999999999
47 4316.299999999999
48 4384.33
49 4394.599999999999
50 4517.27
51 4975.22
52 5245.059999999999
53 4945.299999999999
54 6065.389999999999
55 5298.090000

In [34]:
sc.stop()