### Import the required libraries then Create SparkContext

In [1]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://archive.apache.org/dist/spark/spark-3.5.1/spark-3.5.1-bin-hadoop3.tgz
!tar xf spark-3.5.1-bin-hadoop3.tgz
!pip install -q findspark

In [2]:
import os

# Tell Spark tooling where the JDK and the unpacked Spark distribution live
# (the /content prefix is the working directory the archive was extracted into).
os.environ.update({
    "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
    "SPARK_HOME": "/content/spark-3.5.1-bin-hadoop3",
})

In [3]:
# List the working directory to confirm the Spark archive was downloaded and unpacked.
!ls

sample_data  spark-3.5.1-bin-hadoop3  spark-3.5.1-bin-hadoop3.tgz


### Create and display an RDD from the following list

In [4]:
import findspark

# findspark.init() has to run before the pyspark import below: it locates the
# Spark installation (via SPARK_HOME) and makes the pyspark package importable.
findspark.init()

from pyspark.sql import SparkSession

# Local-mode session; "local[*]" asks for as many worker threads as there are cores.
spark = (
    SparkSession.builder
    .master("local[*]")
    .getOrCreate()
)

# Property used to format output tables better
spark.conf.set("spark.sql.repl.eagerEval.enabled", True)
spark

In [5]:
# Take the SparkContext that backs the session; the RDD cells below use it as `sc`.
sc = spark.sparkContext
sc

### Create a sample1.txt file to contain the text shown below.

In [6]:
# The eight lines of sample text used by the RDD exercises below.
sample_text = '''Utilitatis causa amicitia est quaesita.
Lorem ipsum dolor sit amet, consectetur adipiscing elit.
Collatio igitur ista tenihil iuvat.
Honesta oratio, Socratica, Platonis etiam.
Primum in nostranepotestate est, quid meminerimus?
Duo Reges: constructio interrete.
Quid, sietiam iucunda memoria est praeteritorum malorum?
Si quidem, inquit, tollerem,'''

# Actually create sample1.txt (the section title asks for the file, not just a
# printout), so sc.textFile('sample1.txt') has a local file to read even when
# no external download is available. One trailing newline keeps it at 8 lines.
with open('sample1.txt', 'w', encoding='utf-8') as sample_file:
    sample_file.write(sample_text + '\n')

# Echo the text, preceded by a blank line, so the cell still shows what was written.
print('\n' + sample_text)


Utilitatis causa amicitia est quaesita.
Lorem ipsum dolor sit amet, consectetur adipiscing elit. 
Collatio igitur ista tenihil iuvat. 
Honesta oratio, Socratica, Platonis etiam. 
Primum in nostranepotestate est, quid meminerimus? 
Duo Reges: constructio interrete.
Quid, sietiam iucunda memoria est praeteritorum malorum? 
Si quidem, inquit, tollerem,


### Read the sample1.txt file into an RDD and display the first 4 elements

In [11]:
!gdown --id 1uchxn7gfk6ybKgjm3eZJr5TBG-RUFnuJ

Downloading...
From: https://drive.google.com/uc?id=1uchxn7gfk6ybKgjm3eZJr5TBG-RUFnuJ
To: /content/sample1.txt
100% 352/352 [00:00<00:00, 1.40MB/s]


In [12]:
# List the working directory to confirm sample1.txt is present before reading it.
!ls

 sample1.txt   spark-3.5.1-bin-hadoop3	    'view?usp=drive_link'
 sample_data   spark-3.5.1-bin-hadoop3.tgz


In [13]:
# Load the text file as an RDD of strings (one element per line of the file),
# then pull the first four elements back to the driver for display.
txt_rdd = sc.textFile('sample1.txt')
txt_rdd.take(4)

['Utilitatis causa amicitia est quaesita.',
 'Lorem ipsum dolor sit amet, consectetur adipiscing elit. ',
 'Collatio igitur ista tenihil iuvat. ',
 'Honesta oratio, Socratica, Platonis etiam. ']

### Count the total number of rows in RDD

In [14]:
# count() is an action: it runs the job and returns the number of elements (lines) in the RDD.
rdd_count = txt_rdd.count()
rdd_count

8

### Create a function to convert the data to lower case and split it into words

In [17]:
def to_lower_tokens(line):
    """Lower-case one line of text and split it on whitespace into a list of words."""
    return line.lower().split()


# One list of lower-cased words per input line; punctuation stays attached to the words.
rdd_convert_lower = txt_rdd.map(to_lower_tokens)
rdd_convert_lower.collect()

[['utilitatis', 'causa', 'amicitia', 'est', 'quaesita.'],
 ['lorem',
  'ipsum',
  'dolor',
  'sit',
  'amet,',
  'consectetur',
  'adipiscing',
  'elit.'],
 ['collatio', 'igitur', 'ista', 'tenihil', 'iuvat.'],
 ['honesta', 'oratio,', 'socratica,', 'platonis', 'etiam.'],
 ['primum', 'in', 'nostranepotestate', 'est,', 'quid', 'meminerimus?'],
 ['duo', 'reges:', 'constructio', 'interrete.'],
 ['quid,',
  'sietiam',
  'iucunda',
  'memoria',
  'est',
  'praeteritorum',
  'malorum?'],
 ['si', 'quidem,', 'inquit,', 'tollerem,']]

### Remove the stopwords from the previous text, i.e. filter them out of the word lists.

In [21]:
# Words to drop from the tokenised text.
# NOTE(review): 'I' and 'I’d' are capitalised, while the tokens produced by the
# lower-casing step are all lower case, so these two entries can only match if
# the comparison normalises case.
stopwords = ['a','all','the','as','is','am','an','and',
             'be','been','from','had','I','I’d','why','with']
# Hint: you may need to use flatMap

In [22]:
# The tokens in rdd_convert_lower are already lower-cased, so the stopwords are
# normalised the same way; otherwise capitalised entries such as 'I' could never
# match. A set also gives constant-time membership tests inside the per-word filter.
stopword_set = {word.lower() for word in stopwords}

# flatMap flattens the per-line word lists into a single RDD of words while
# dropping every word that is a stopword.
# NOTE: punctuation is still attached to the tokens (e.g. 'est,'), so a stopword
# followed by punctuation is not removed by this exact-match comparison.
rdd_stopwords = rdd_convert_lower.flatMap(
    lambda words: [word for word in words if word not in stopword_set]
)
rdd_stopwords.collect()

['utilitatis',
 'causa',
 'amicitia',
 'est',
 'quaesita.',
 'lorem',
 'ipsum',
 'dolor',
 'sit',
 'amet,',
 'consectetur',
 'adipiscing',
 'elit.',
 'collatio',
 'igitur',
 'ista',
 'tenihil',
 'iuvat.',
 'honesta',
 'oratio,',
 'socratica,',
 'platonis',
 'etiam.',
 'primum',
 'in',
 'nostranepotestate',
 'est,',
 'quid',
 'meminerimus?',
 'duo',
 'reges:',
 'constructio',
 'interrete.',
 'quid,',
 'sietiam',
 'iucunda',
 'memoria',
 'est',
 'praeteritorum',
 'malorum?',
 'si',
 'quidem,',
 'inquit,',
 'tollerem,']

### Find the words starting with ‘c’

In [23]:
# Keep only the words whose first character is a lower-case 'c'
# (the words were lower-cased earlier, so no separate check for 'C' is made here).
rdd_c = rdd_stopwords.filter(lambda x: x.startswith('c'))
rdd_c.collect()

['causa', 'consectetur', 'collatio', 'constructio']

### Reduce the data by key and sum it (use the data from the following list)

In [25]:
list = [('JK', 22), ('V', 24), ('Jimin',24), ('RM', 25)
        , ('J-Hope', 25), ('Suga', 26), ('Jin', 27)
       , ('J-Hope', 12), ('Suga', 25), ('Jin', 34)
       , ('JK', 32), ('V', 44), ('Jimin',14), ('RM', 35)]
# Hint: use reduceByKey

In [26]:
s1 = sc.parallelize(list)

In [27]:
# Merge the values of pairs that share a key by adding them, giving one (key, total) pair per name.
s2 = s1.reduceByKey(lambda x,y: x+y)
s2.collect()

[('Jimin', 38),
 ('RM', 60),
 ('J-Hope', 37),
 ('Suga', 51),
 ('JK', 54),
 ('V', 68),
 ('Jin', 61)]

### Create some key-value pair RDDs

In [28]:
# Two small pair RDDs sharing the keys 'a' and 'b'; only rdd2 has the key 'c'.
rdd1 = sc.parallelize([('a',2),('b',3)])
rdd2 = sc.parallelize([('a',9),('b',7),('c',10)])

### Perform Join operation on the RDDs (rdd1,rdd2)

In [29]:
# Join on key: each result is (key, (value_from_rdd1, value_from_rdd2)).
# Only keys present in both RDDs appear, so 'c' (rdd2 only) is absent from the output.
rdd_join = rdd1.join(rdd2)
rdd_join.collect()

[('b', (3, 7)), ('a', (2, 9))]