**PySpark RDD Tutorial**

In [2]:
pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Downloading pyspark-3.2.1.tar.gz (281.4 MB)
     |████████████████████████████████| 281.4 MB 38 kB/s 
Collecting py4j==0.10.9.3
  Downloading py4j-0.10.9.3-py2.py3-none-any.whl (198 kB)
     |████████████████████████████████| 198 kB 60.6 MB/s 
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... done
  Created wheel for pyspark: filename=pyspark-3.2.1-py2.py3-none-any.whl size=281853642 sha256=5362b85f980948516d0adcb22795afa69783e64d680ac47220bccbc95de57cbc
  Stored in directory: /root/.cache/pip/wheels/9f/f5/07/7cd8017084dce4e93e84e92efd1e1d5334db05f2e83bcef74f
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9.3 pyspark-3.2.1


In [3]:
import pyspark

In [4]:
from pyspark.sql import SparkSession

In [6]:
# Build (or fetch the already-running) SparkSession for this notebook.
spark = (
    SparkSession.builder
    .appName('PySpark-RDD')
    .getOrCreate()
)
# Keep a short handle on the session's SparkContext; the RDD cells below use it.
sc = spark.sparkContext

In [7]:
# Distribute a small Python list as an RDD, then inspect it with a few actions.
rdd = sc.parallelize([1, 2, 3, 4, 5])
rddCollect = rdd.collect()  # bring every element back to the driver as a list
print(f"Number of Partitions: {rdd.getNumPartitions()}")
print(f"Action: First element: {rdd.first()}")
print(rddCollect)

Number of Partitions: 2
Action: First element: 1
[1, 2, 3, 4, 5]


In [8]:
# An RDD with no elements, built from an empty Python list.
rdd_empty = sc.parallelize(list())

**PySpark RDD Transformations**

In [9]:
# Create an RDD from an external text file; each line of the file becomes one
# string element.
# Uses the `sc` handle created alongside the session so every cell goes through
# the same SparkContext reference instead of re-deriving it from `spark`.
# NOTE(review): this is an absolute path on the runtime's filesystem; the file
# presumably has to be uploaded before this cell runs — confirm and consider a
# configurable data directory.
rdd_txt = sc.textFile("/content/t1.txt")


In [10]:
# Collect the whole file onto the driver and echo it, one line per row.
for line in rdd_txt.collect():
    print(line)

Project Gutenberg’s
Alice’s Adventures in Wonderland
by Lewis Carroll
This eBook is for the use
of anyone anywhere
at no cost and with
Alice’s Adventures in Wonderland
by Lewis Carroll
This eBook is for the use
of anyone anywhere
at no cost and with
This eBook is for the use
of anyone anywhere
at no cost and with
Project Gutenberg’s
Alice’s Adventures in Wonderland
by Lewis Carroll
This eBook is for the use
of anyone anywhere
at no cost and with
Alice’s Adventures in Wonderland
by Lewis Carroll
This eBook is for the use
of anyone anywhere
at no cost and with
This eBook is for the use
of anyone anywhere
at no cost and with
Project Gutenberg’s
Alice’s Adventures in Wonderland
by Lewis Carroll
This eBook is for the use
of anyone anywhere
at no cost and with
Alice’s Adventures in Wonderland
by Lewis Carroll
This eBook is for the use
of anyone anywhere
at no cost and with
This eBook is for the use
of anyone anywhere
at no cost and with
Project Gutenberg’s
Alice’s Adventures in Wonderland
by

In [11]:
# flatMap transformation: split each line into words and flatten the per-line
# lists into a single RDD of words.
# split() with no argument breaks on any run of whitespace and yields nothing
# for a blank line, so repeated spaces, tabs or empty lines never produce ''
# tokens that would later be counted as words (split(" ") would emit them).
rdd2 = rdd_txt.flatMap(lambda line: line.split())

In [12]:
# Collect every word onto the driver and print one per row.
for word in rdd2.collect():
    print(word)

Project
Gutenberg’s
Alice’s
Adventures
in
Wonderland
by
Lewis
Carroll
This
eBook
is
for
the
use
of
anyone
anywhere
at
no
cost
and
with
Alice’s
Adventures
in
Wonderland
by
Lewis
Carroll
This
eBook
is
for
the
use
of
anyone
anywhere
at
no
cost
and
with
This
eBook
is
for
the
use
of
anyone
anywhere
at
no
cost
and
with
Project
Gutenberg’s
Alice’s
Adventures
in
Wonderland
by
Lewis
Carroll
This
eBook
is
for
the
use
of
anyone
anywhere
at
no
cost
and
with
Alice’s
Adventures
in
Wonderland
by
Lewis
Carroll
This
eBook
is
for
the
use
of
anyone
anywhere
at
no
cost
and
with
This
eBook
is
for
the
use
of
anyone
anywhere
at
no
cost
and
with
Project
Gutenberg’s
Alice’s
Adventures
in
Wonderland
by
Lewis
Carroll
This
eBook
is
for
the
use
of
anyone
anywhere
at
no
cost
and
with
Alice’s
Adventures
in
Wonderland
by
Lewis
Carroll
This
eBook
is
for
the
use
of
anyone
anywhere
at
no
cost
and
with
This
eBook
is
for
the
use
of
anyone
anywhere
at
no
cost
and
with
Project
Gutenberg’s
Alice’s
Adventures
in
Wonderland
by

In [13]:
# map transformation: pair every word with an initial count of 1.
rdd3 = rdd2.map(lambda word: (word, 1))
for pair in rdd3.collect():
    print(pair)

('Project', 1)
('Gutenberg’s', 1)
('Alice’s', 1)
('Adventures', 1)
('in', 1)
('Wonderland', 1)
('by', 1)
('Lewis', 1)
('Carroll', 1)
('This', 1)
('eBook', 1)
('is', 1)
('for', 1)
('the', 1)
('use', 1)
('of', 1)
('anyone', 1)
('anywhere', 1)
('at', 1)
('no', 1)
('cost', 1)
('and', 1)
('with', 1)
('Alice’s', 1)
('Adventures', 1)
('in', 1)
('Wonderland', 1)
('by', 1)
('Lewis', 1)
('Carroll', 1)
('This', 1)
('eBook', 1)
('is', 1)
('for', 1)
('the', 1)
('use', 1)
('of', 1)
('anyone', 1)
('anywhere', 1)
('at', 1)
('no', 1)
('cost', 1)
('and', 1)
('with', 1)
('This', 1)
('eBook', 1)
('is', 1)
('for', 1)
('the', 1)
('use', 1)
('of', 1)
('anyone', 1)
('anywhere', 1)
('at', 1)
('no', 1)
('cost', 1)
('and', 1)
('with', 1)
('Project', 1)
('Gutenberg’s', 1)
('Alice’s', 1)
('Adventures', 1)
('in', 1)
('Wonderland', 1)
('by', 1)
('Lewis', 1)
('Carroll', 1)
('This', 1)
('eBook', 1)
('is', 1)
('for', 1)
('the', 1)
('use', 1)
('of', 1)
('anyone', 1)
('anywhere', 1)
('at', 1)
('no', 1)
('cost', 1)
('and'

In [14]:
# reduceByKey: add up the 1s attached to each distinct word to get its total.
rdd4 = rdd3.reduceByKey(lambda total, count: total + count)
for word_total in rdd4.collect():
    print(word_total)

('Project', 9)
('Gutenberg’s', 9)
('Alice’s', 18)
('in', 18)
('Lewis', 18)
('Carroll', 18)
('is', 27)
('use', 27)
('of', 27)
('anyone', 27)
('anywhere', 27)
('at', 27)
('no', 27)
('Adventures', 18)
('Wonderland', 18)
('by', 18)
('This', 27)
('eBook', 27)
('for', 27)
('the', 27)
('cost', 27)
('and', 27)
('with', 27)


In [15]:
# sortByKey: flip each (word, count) pair to (count, word) so that the count is
# the key, then order the pairs by that key.
rdd5 = (
    rdd4
    .map(lambda pair: (pair[1], pair[0]))
    .sortByKey()
)
for count_word in rdd5.collect():
    print(count_word)

(9, 'Project')
(9, 'Gutenberg’s')
(18, 'Alice’s')
(18, 'in')
(18, 'Lewis')
(18, 'Carroll')
(18, 'Adventures')
(18, 'Wonderland')
(18, 'by')
(27, 'is')
(27, 'use')
(27, 'of')
(27, 'anyone')
(27, 'anywhere')
(27, 'at')
(27, 'no')
(27, 'This')
(27, 'eBook')
(27, 'for')
(27, 'the')
(27, 'cost')
(27, 'and')
(27, 'with')


In [16]:
# filter: keep only the (count, word) pairs whose word contains the letter 'n'.
rdd6 = rdd5.filter(lambda pair: 'n' in pair[1])
for kept in rdd6.collect():
    print(kept)

(9, 'Gutenberg’s')
(18, 'in')
(18, 'Adventures')
(18, 'Wonderland')
(27, 'anyone')
(27, 'anywhere')
(27, 'no')
(27, 'and')


**RDD Actions**

In [18]:
# Sample inputs for the action examples: a list of (letter, number) pairs and a
# plain list of integers, each distributed as its own RDD.
data = [
    ("Z", 1),
    ("A", 20),
    ("B", 30),
    ("C", 40),
    ("B", 30),
    ("B", 60),
]
irdd = sc.parallelize(data)
lrdd = sc.parallelize([1, 2, 3, 4, 5, 3, 4])

In [21]:
from operator import add

In [19]:
# aggregate(zeroValue, seqOp, combOp)
# seqOp folds the elements of a partition into a running value that starts at
# zeroValue; combOp then merges the per-partition results into the final value.
# Both steps are a plain sum here, so the operator.add already imported by this
# notebook is used rather than binding equivalent lambdas to names.
seqOp = add
combOp = add
agg = lrdd.aggregate(0, seqOp, combOp)
print(agg)


22


In [22]:
# reduce: collapse the RDD to a single value by combining elements with add.
redRes = lrdd.reduce(add)
print(redRes)


22


In [23]:
# count: number of elements in the pair RDD.
print(f"Count : {irdd.count()}")

Count : 6


In [27]:
# countByValue: how many times each distinct value occurs in the RDD.
print(f"countByValue :  {lrdd.countByValue()}")

countByValue :  defaultdict(<class 'int'>, {1: 1, 2: 1, 3: 2, 4: 2, 5: 1})


In [31]:
# first: fetch the first element of the pair RDD and show it.
first_element = irdd.first()
print(first_element)

('Z', 1)


In [32]:
# top: the 3 largest elements, largest first (pairs compare as tuples, so the
# letter is compared before the number).
print(irdd.top(num=3))

[('Z', 1), ('C', 40), ('B', 60)]


In [33]:
# min / max: smallest and largest element of the integer RDD, one per row.
print(lrdd.min(), lrdd.max(), sep="\n")

1
5


In [36]:
# take(n): the first n elements in the RDD's existing order, without sorting.
# takeOrdered(n): the n smallest elements, returned in ascending order. Its
# descending counterpart is top(n); take(n) is not its opposite.
print(f"take : {lrdd.take(6)}")
print(f"takeOrdered : {lrdd.takeOrdered(6)}")



take : [1, 2, 3, 4, 5, 3]
takeOrdered : [1, 2, 3, 3, 4, 4]
