# Python RDD Playground

## Initialize Spark

In [1]:
!pip install findspark

Defaulting to user installation because normal site-packages is not writeable


In [2]:
!pip install pyspark==3.4.1

Defaulting to user installation because normal site-packages is not writeable


In [3]:
from pyspark import SparkContext, SparkConf

In [4]:
# Configure a local Spark application named "test" using the local[*] master.
conf = SparkConf().setAppName("test").setMaster("local[*]")
# getOrCreate reuses a context that is already running, so re-executing this
# cell does not fail the way constructing a second SparkContext would.
sc = SparkContext.getOrCreate(conf=conf)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/11/02 22:44:35 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


## Basic Operations

In [5]:
# The integers 1 through 10, used as the input for the basic RDD examples.
data = list(range(1, 11))

In [6]:
# Distribute the local list `data` as an RDD.
rdd1=sc.parallelize(data)

In [7]:
# Add one to every element of rdd1.
rdd2 = rdd1.map(lambda value: value + 1)

In [8]:
# Show up to the first 10 elements of rdd2 as a local list.
rdd2.take(10)

[2, 3, 4, 5, 6, 7, 8, 9, 10, 11]

In [9]:
# Print every element of rdd2. The recorded output is not in element order,
# presumably because partitions are processed in parallel.
rdd2.foreach(print)

6
7
2
3
8
9
10
11
4
5


In [10]:
# Bring the contents of rdd2 back as a local collection.
coll = rdd2.collect()

In [11]:
# Iterate over the collected values locally; unlike rdd2.foreach(print),
# the recorded output here is in element order.
for val in coll:
    print(val)

2
3
4
5
6
7
8
9
10
11


In [12]:
# Keep only the even numbers from rdd1.
rdd3 = rdd1.filter(lambda number: number % 2 == 0)

In [13]:
# Show up to the first 5 elements of the filtered RDD.
rdd3.take(5)

[2, 4, 6, 8, 10]

In [14]:
# Print every element of rdd3.
rdd3.foreach(print)

2
4
6
8
10


In [15]:
# An RDD with repeated values, used to demonstrate distinct().
rdd4 = sc.parallelize([1, 2, 2, 3, 3, 3, 4])

In [16]:
# Remove duplicate values from rdd4.
rdd5 = rdd4.distinct()

In [17]:
# Print the distinct values; the recorded output is not in the input order.
rdd5.foreach(print)

4
3
1
2


## Nested Lists

In [18]:
# A list of lists of varying length, flattened with flatMap further down.
x = [
    [1, 2, 3],
    [2, 3],
    [3],
    [3],
]

In [19]:
# Show the nested list before it is turned into an RDD.
print(x)

[[1, 2, 3], [2, 3], [3], [3]]


In [20]:
# Each inner list of x becomes one element of the RDD.
rdd6 = sc.parallelize(x)

In [21]:
# Flatten the nested lists: flatMap concatenates the iterable returned for
# each element, so each inner list can be returned as-is.
rdd7 = rdd6.flatMap(lambda inner: inner)

In [22]:
# Show the flattened elements as a single local list.
rdd7.collect()

[1, 2, 3, 2, 3, 3, 3]

## Wordcount in Spark!

In [23]:
# Read the input text file as an RDD of lines. The path is relative, so it
# is resolved against the directory the notebook is run from.
lines=sc.textFile("word-count/input/Complete-Shakespeare.txt")

In [24]:
# Split each line into words on runs of whitespace. split() with no argument
# drops the empty strings that split(" ") produces for leading or repeated
# spaces, so blank tokens are not counted as words.
rdd8 = lines.flatMap(lambda line: line.split())

In [25]:
# Peek at the first few words.
rdd8.take(4)

['This', 'is', 'the', '100th']

In [26]:
# Pair every word with an initial count of 1.
rdd9 = rdd8.map(lambda word: (word, 1))

In [27]:
# Peek at the first two (word, 1) pairs.
rdd9.take(2)

[('This', 1), ('is', 1)]

In [28]:
# Sum the per-word 1s to get the number of occurrences of each word.
counts = rdd9.reduceByKey(lambda left, right: left + right)

In [29]:
# Peek at two (word, count) results.
counts.take(2)

[('is', 7851), ('Etext', 4)]

In [30]:
# Sample the (word, count) pairs without replacement at a fraction of 0.01.
# No seed is given, so the sample differs from run to run.
s = counts.sample(withReplacement=False, fraction=0.01)

In [31]:
# Number of pairs in the sample; varies between runs because the sample is unseeded.
s.count()

687

In [32]:
# Show up to 10 of the sampled (word, count) pairs.
s.take(10)

[('Project', 13),
 ('GUTENBERG', 221),
 ('YOU', 31),
 ('do', 2835),
 ('etext/etext93]', 1),
 ('money', 92),
 ('Will', 454),
 ('Then', 581),
 ('low', 60),
 ('belongs', 14)]

## Sampling

In [33]:
# A small RDD of integers for the sampling examples.
# NOTE(review): this rebinds rdd9, which earlier held the (word, 1) pairs of
# the word count; re-running the reduceByKey cell after this one would
# operate on these integers instead. A distinct name would avoid that.
rdd9 = sc.parallelize([1, 2, 3, 2, 3, 4, 5, 6, 6])

Run the sample command below twice to see that it is random. The first argument specifies whether to sample with replacement (`True`) or without (`False`); we chose without.

In [34]:
# Unseeded sample without replacement at a fraction of 0.5.
s = rdd9.sample(withReplacement=False, fraction=0.5)

In [35]:
# Show the sampled elements; the result changes between runs.
s.collect()

[1, 2, 4, 5, 6]

With the seed argument (third), the sample is the same each time we run the code.

In [36]:
# Same sample as before, but with a fixed seed so the result is repeatable.
s = rdd9.sample(withReplacement=False, fraction=0.5, seed=12345)

In [37]:
# Show the seeded sample.
s.collect()

[2, 2, 3, 5, 6]

## Set Operations 
### (Union, Intersection, Subtraction, Cartesian Product)

In [38]:
# First operand for the set-operation examples.
set1 = sc.parallelize([1,2,3])

In [39]:
# Second operand; it shares the value 3 with set1.
set2 = sc.parallelize([3,4,5])

In [40]:
# Union of the two RDDs. The recorded output keeps the duplicate 3.
set3 = set1.union(set2)

In [41]:
# Show the union.
set3.collect()

[1, 2, 3, 3, 4, 5]

In [42]:
# Elements present in both RDDs (set3 is rebound here).
set3 = set1.intersection(set2)

In [43]:
# Show the intersection.
set3.collect()

[3]

In [44]:
# Elements of set1 that are not in set2 (set3 is rebound here).
set3 = set1.subtract(set2)

In [45]:
# Show the difference.
set3.collect()

[1, 2]

In [46]:
# All (a, b) pairs with a from set1 and b from set2. The pair examples that
# follow read set3, so they depend on this being its last assignment.
set3 = set1.cartesian(set2)

In [47]:
# Show the Cartesian product as a list of tuples.
set3.collect()

[(1, 3), (1, 4), (1, 5), (2, 3), (2, 4), (2, 5), (3, 3), (3, 4), (3, 5)]

## Playing with Pairs

In [48]:
# Group the pairs of set3 by their first component (the key).
set4 = set3.groupByKey()

In [49]:
# Print each (key, grouped values) pair.
# NOTE(review): the grouped values print as opaque ResultIterable objects
# (see the recorded output); something like set4.mapValues(list) should
# make the grouped values visible -- confirm before changing.
set4.foreach(print)

(3, <pyspark.resultiterable.ResultIterable object at 0xffff98518220>)
(1, <pyspark.resultiterable.ResultIterable object at 0xffff9851a500>)
(2, <pyspark.resultiterable.ResultIterable object at 0xffff98519e40>)


In [50]:
# Multiply the value of every pair by 10, leaving the keys untouched.
set5 = set3.mapValues(lambda value: value * 10)

In [51]:
# Peek at the first three scaled pairs.
set5.take(3)

[(1, 30), (1, 40), (1, 50)]

In [52]:
# Sort the pairs by key in descending order.
set6 = set3.sortByKey(ascending=False)

In [53]:
# Show the pairs ordered by descending key.
set6.collect()

[(3, 3), (3, 4), (3, 5), (2, 3), (2, 4), (2, 5), (1, 3), (1, 4), (1, 5)]

In [54]:
# Keep only the first component (key) of each pair.
set7 = set3.keys()

In [55]:
# Show the keys; each key appears once per pair it came from.
set7.collect()

[1, 1, 1, 2, 2, 2, 3, 3, 3]

In [56]:
# Show the keys with duplicates removed.
set7.distinct().collect()

[1, 2, 3]

In [57]:
# Keep only the second component (value) of each pair.
set8 = set3.values()

In [58]:
# Show the values.
set8.collect()

[3, 4, 5, 3, 4, 5, 3, 4, 5]

## Whole Text Files

In [59]:
# Read every file in the input directory. The cells below treat the keys as
# file locations and the values as file contents.
folder = sc.wholeTextFiles('word-count/input')

In [60]:
# The keys of `folder` are the file locations.
files = folder.keys()

In [61]:
# Show the file locations. The recorded output contains absolute file: URIs
# of the machine the notebook was run on.
files.collect()

['file:/home/amit/Documents/CS535-resources/examples/spark/word-count/input/Alice-in-Wonderland.txt',
 'file:/home/amit/Documents/CS535-resources/examples/spark/word-count/input/Bill-of-Rights.txt',
 'file:/home/amit/Documents/CS535-resources/examples/spark/word-count/input/Complete-Shakespeare.txt',
 'file:/home/amit/Documents/CS535-resources/examples/spark/word-count/input/Decl-of-Ind-USA.txt',
 'file:/home/amit/Documents/CS535-resources/examples/spark/word-count/input/Encyclopaedia.txt',
 'file:/home/amit/Documents/CS535-resources/examples/spark/word-count/input/Flatland.txt',
 'file:/home/amit/Documents/CS535-resources/examples/spark/word-count/input/Gettysburg-Address.txt',
 'file:/home/amit/Documents/CS535-resources/examples/spark/word-count/input/Gift-of-the-Magi.txt',
 'file:/home/amit/Documents/CS535-resources/examples/spark/word-count/input/Les-Miserables.txt',
 'file:/home/amit/Documents/CS535-resources/examples/spark/word-count/input/Patrick-Henry.txt',
 'file:/home/amit/Do

In [65]:
import os

# Strip the directory part of each file URI, leaving only the file name.
names = files.map(os.path.basename)

In [66]:
# Show the bare file names.
names.collect()

['Alice-in-Wonderland.txt',
 'Bill-of-Rights.txt',
 'Complete-Shakespeare.txt',
 'Decl-of-Ind-USA.txt',
 'Encyclopaedia.txt',
 'Flatland.txt',
 'Gettysburg-Address.txt',
 'Gift-of-the-Magi.txt',
 'Les-Miserables.txt',
 'Patrick-Henry.txt',
 'Scarlet-Letter.txt',
 'Through-the-Looking-Glass.txt',
 'Tom-Sawyer-Abroad.txt',
 'US-Constitution.txt']

In [67]:
# The values of `folder` are the file contents, one element per file.
content = folder.values()

In [69]:
# Number of files read; this matches the 14 names listed above.
content.count()

14

In [70]:
# Length of each file's text, obtained by applying len to every element.
sizes = content.map(len)

In [71]:
# Show the per-file lengths, in the same order as the file names above.
sizes.collect()

[158315,
 10641,
 5458248,
 15890,
 8441343,
 203918,
 1688,
 21421,
 3263187,
 14678,
 517303,
 178845,
 183972,
 34553]