### Import the required libraries, then create a SparkContext

In [1]:
from pyspark.sql import SparkSession

In [2]:
# Reuse the active SparkSession if there is one (otherwise build it) and
# keep a handle on its SparkContext for the RDD API used below.
sc = (
    SparkSession.builder
    .getOrCreate()
    .sparkContext
)

In [3]:
sc

### Create and display an RDD from the following list

In [4]:
# Sample (string, integer) pairs used to build the first RDD.
lst = [
    ('JK', 22),
    ('V', 24),
    ('Jimin', 24),
    ('RM', 25),
    ('J-Hope', 25),
    ('Suga', 26),
    ('Jin', 27),
]

In [6]:
rdd = sc.parallelize(lst)

In [7]:
rdd.collect()

[('JK', 22),
 ('V', 24),
 ('Jimin', 24),
 ('RM', 25),
 ('J-Hope', 25),
 ('Suga', 26),
 ('Jin', 27)]

### Read the sample1.txt file into an RDD and display the first 4 elements

In [8]:
# Load the text file into an RDD.
rdd_text = sc.textFile("sample1.txt")
# Display the first 4 elements, as the section heading asks.
rdd_text.take(4)

### Count the total number of rows in RDD

In [9]:
rdd_text.count()

7

### Create a function to convert the data to lower case and split it into words

In [29]:
def to_lower(item):
    """Lower-case a string and split it on single space characters.

    Because the separator is the literal " ", consecutive or trailing
    spaces (and an empty input string) produce empty-string tokens.

    Args:
        item: the text to process (a ``str``).

    Returns:
        A list of lower-cased tokens.
    """
    lowered = str.lower(item)
    tokens = lowered.split(" ")
    return tokens
     

In [33]:
rdd_text1 = rdd_text.map(to_lower)

In [34]:
rdd_text1

PythonRDD[12] at RDD at PythonRDD.scala:53

In [35]:
rdd_text1.collect()

[['utilitatis', 'causa', 'amicitia', 'est', 'quaesita.'],
 ['lorem',
  'ipsum',
  'dolor',
  'sit',
  'amet,',
  'consectetur',
  'adipiscing',
  'elit.',
  ''],
 ['collatio',
  'igitur',
  'ista',
  'te',
  'nihil',
  'iuvat.',
  'honesta',
  'oratio,',
  'socratica,',
  'platonis',
  'etiam.',
  'primum',
  'in',
  'nostrane',
  'potestate',
  'est,',
  'quid',
  'meminerimus?',
  ''],
 ['duo', 'reges:', 'constructio', 'interrete.', ''],
 ['quid,',
  'si',
  'etiam',
  'iucunda',
  'memoria',
  'est',
  'praeteritorum',
  'malorum?',
  'si',
  'quidem,',
  'inquit,',
  'tollerem,',
  'sed',
  'relinquo.',
  'an',
  'nisi',
  'populari',
  'fama?'],
 [''],
 ['quamquam',
  'id',
  'quidem',
  'licebit',
  'iis',
  'existimare,',
  'qui',
  'legerint.',
  'summum',
  'a',
  'vobis',
  'bonum',
  'voluptas',
  'dicitur.',
  'at',
  'hoc',
  'in',
  'eo',
  'm.',
  'refert',
  'tamen,',
  'quo',
  'modo.',
  'quid',
  'sequatur,',
  'quid',
  'repugnet,',
  'vident.',
  'iam',
  'id',
  '

### Filter the stopwords from the previous text

In [145]:
# Stopwords to filter out of the text.
stopwords = [
    'a', 'all', 'the', 'as',
    'is', 'am', 'an', 'and',
    'be', 'been', 'from', 'had',
    'I', 'I’d', 'why', 'with',
]
# Hint: you may need to use flatMap

In [146]:
def remove_stops(lst):
    """Return the tokens of ``lst`` that are neither stopwords nor empty.

    Every occurrence of a stopword (and every empty-string token) is
    dropped, not just the first one, and the relative order of the
    remaining tokens is preserved. The input list is not modified.

    Args:
        lst: a list of string tokens.

    Returns:
        A new list containing only the tokens that are not stopwords.
    """
    # "" is part of the set so that empty tokens are filtered out as well.
    # A frozenset gives constant-time membership checks.
    stopwords = frozenset([
        'a', 'all', 'the', 'as', 'is', 'am', 'an', 'and',
        'be', 'been', 'from', 'had', 'I', 'I’d', 'why', 'with', "",
    ])
    # Matching is case-sensitive: 'I' and 'I’d' only match tokens with
    # exactly that capitalisation.
    return [word for word in lst if word not in stopwords]

In [147]:
# flatMap (rather than map) so the per-line lists returned by remove_stops
# are flattened into a single RDD of individual words.
rdd_text2 = rdd_text1.flatMap(remove_stops)

In [148]:
rdd_text2.collect()

['utilitatis',
 'causa',
 'amicitia',
 'est',
 'quaesita.',
 'lorem',
 'ipsum',
 'dolor',
 'sit',
 'amet,',
 'consectetur',
 'adipiscing',
 'elit.',
 'collatio',
 'igitur',
 'ista',
 'te',
 'nihil',
 'iuvat.',
 'honesta',
 'oratio,',
 'socratica,',
 'platonis',
 'etiam.',
 'primum',
 'in',
 'nostrane',
 'potestate',
 'est,',
 'quid',
 'meminerimus?',
 'duo',
 'reges:',
 'constructio',
 'interrete.',
 'quid,',
 'si',
 'etiam',
 'iucunda',
 'memoria',
 'est',
 'praeteritorum',
 'malorum?',
 'si',
 'quidem,',
 'inquit,',
 'tollerem,',
 'sed',
 'relinquo.',
 'nisi',
 'populari',
 'fama?',
 'quamquam',
 'id',
 'quidem',
 'licebit',
 'iis',
 'existimare,',
 'qui',
 'legerint.',
 'summum',
 'vobis',
 'bonum',
 'voluptas',
 'dicitur.',
 'at',
 'hoc',
 'in',
 'eo',
 'm.',
 'refert',
 'tamen,',
 'quo',
 'modo.',
 'quid',
 'sequatur,',
 'quid',
 'repugnet,',
 'vident.',
 'iam',
 'id',
 'ipsum',
 'absurdum,',
 'maximum',
 'malum',
 'neglegi.']

In [None]:
# NOTE(review): self-assignment — rebinding rdd_text2 to itself has no
# effect; this unexecuted cell looks like leftover scratch work.
rdd_text2 = rdd_text2

### Filter the words starting with ‘c’

In [153]:
# Keep only the words whose first character is "c". startswith() is used
# instead of x[0] so an empty-string token evaluates to False rather than
# raising IndexError.
rdd_text4 = rdd_text2.filter(lambda x: x.startswith("c"))

In [154]:
rdd_text4.collect()

['causa', 'consectetur', 'collatio', 'constructio']

### Reduce the data by key and sum it (use the data from the following list)

In [155]:
# (key, value) pairs; each key appears twice so reduceByKey has
# something to combine.
lst = [
    ('JK', 22), ('V', 24), ('Jimin', 24), ('RM', 25),
    ('J-Hope', 25), ('Suga', 26), ('Jin', 27),
    ('J-Hope', 12), ('Suga', 25), ('Jin', 34),
    ('JK', 32), ('V', 44), ('Jimin', 14), ('RM', 35),
]
# Hint: use reduceByKey

In [156]:
rdd = sc.parallelize(lst)

In [158]:
rdd.collect()

[('JK', 22),
 ('V', 24),
 ('Jimin', 24),
 ('RM', 25),
 ('J-Hope', 25),
 ('Suga', 26),
 ('Jin', 27),
 ('J-Hope', 12),
 ('Suga', 25),
 ('Jin', 34),
 ('JK', 32),
 ('V', 44),
 ('Jimin', 14),
 ('RM', 35)]

In [159]:
# Sum the values that share a key, then bring the result to the driver.
rdd.reduceByKey(lambda total, value: total + value).collect()

[('Suga', 51),
 ('Jin', 61),
 ('JK', 54),
 ('V', 68),
 ('Jimin', 38),
 ('RM', 60),
 ('J-Hope', 37)]

### Create some key-value pair RDDs

In [161]:
rdd1 = sc.parallelize([('a',2),('b',3)])
rdd2 = sc.parallelize([('a',9),('b',7),('c',10)])

### Perform Join operation on the RDDs (rdd1,rdd2)

In [164]:
rdd1.join(rdd2).collect()

[('b', (3, 7)), ('a', (2, 9))]