### Import the required libraries, then create the SparkContext

In [1]:
import findspark
# Run findspark.init() before pyspark is imported in the next cell.
# NOTE(review): presumably this locates the local Spark installation and makes
# pyspark importable — confirm for this environment.
findspark.init()

In [5]:
# import pyspark
from pyspark.sql import SparkSession

In [6]:
# Build (or reuse) a SparkSession running in local mode for the "Word Count"
# app, then keep a handle on its SparkContext for the RDD API used below.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("Word Count")
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)
sc = spark.sparkContext

### Create and display an RDD from the following list

In [10]:
list = [('JK', 22), ('V', 24), ('Jimin',24), ('RM', 25), ('J-Hope', 25), ('Suga', 26), ('Jin', 27)]

In [13]:
rdd = sc.parallelize(list)
rdd.collect()

[('JK', 22),
 ('V', 24),
 ('Jimin', 24),
 ('RM', 25),
 ('J-Hope', 25),
 ('Suga', 26),
 ('Jin', 27)]

### Read the sample1.txt file into an RDD and display the first 4 elements

In [28]:
# Sample text as a list of lines, each line already split into its words
# (punctuation stays attached to the word it follows).
llT = [['utilitatis', 'causa', 'amicitia', 'est', 'quaesita.'],
 ['lorem','ipsum','dolor','sit','amet,','consectetur','adipiscing','elit.'],
 ['collatio',  'igitur','ista','te','nihil','iuvat.','honesta','oratio,','socratica,',
  'platonis','etiam.','primum','in','nostrane','potestate','est,','quid','meminerimus?'],
 ['duo', 'reges:', 'constructio', 'interrete.'],
 ['quid,','si','etiam','iucunda','memoria','est','praeteritorum','malorum?','si',
  'quidem,','inquit,','tollerem,','sed','relinquo.','an','nisi','populari','fama?']]

In [29]:
# Rebuild the plain text from the token lists: every word is followed by a
# single space and every line is terminated by a newline.
text = ''.join(
    ''.join(word + ' ' for word in words) + '\n'
    for words in llT
)
print(text)

utilitatis causa amicitia est quaesita. 
lorem ipsum dolor sit amet, consectetur adipiscing elit. 
collatio igitur ista te nihil iuvat. honesta oratio, socratica, platonis etiam. primum in nostrane potestate est, quid meminerimus? 
duo reges: constructio interrete. 
quid, si etiam iucunda memoria est praeteritorum malorum? si quidem, inquit, tollerem, sed relinquo. an nisi populari fama? 



In [57]:
%%writefile sample1.txt
utilitatis causa amicitia est quaesita. 
lorem ipsum dolor sit amet, consectetur adipiscing elit. 
collatio igitur ista te nihil iuvat. honesta oratio, socratica, platonis etiam.
primum in nostrane potestate est, quid meminerimus? 
duo reges: constructio interrete. 
quid, si etiam iucunda memoria est praeteritorum malorum? si quidem, inquit, tollerem, sed relinquo.
an nisi populari fama? 

Overwriting sample1.txt


In [58]:
# Load sample1.txt as an RDD with one element per line of the file.
rdd_sam = sc.textFile('sample1.txt')
# Show only the first four lines.
rdd_sam.take(4)

['utilitatis causa amicitia est quaesita. ',
 'lorem ipsum dolor sit amet, consectetur adipiscing elit. ',
 'collatio igitur ista te nihil iuvat. honesta oratio, socratica, platonis etiam.',
 'primum in nostrane potestate est, quid meminerimus? ']

### Count the total number of rows in the RDD

In [59]:
# Number of elements in rdd_sam, i.e. the number of lines read from sample1.txt.
rdd_sam.count()

7

### Create a function to convert the data to lower case and split it into words

In [60]:
def lower_and_split(line):
    """Return the whitespace-separated words of ``line``, lower-cased."""
    return line.lower().split()


# One list of lower-cased words per input line.
rdd_sam_lower = rdd_sam.map(lower_and_split)
rdd_sam_lower.collect()

[['utilitatis', 'causa', 'amicitia', 'est', 'quaesita.'],
 ['lorem',
  'ipsum',
  'dolor',
  'sit',
  'amet,',
  'consectetur',
  'adipiscing',
  'elit.'],
 ['collatio',
  'igitur',
  'ista',
  'te',
  'nihil',
  'iuvat.',
  'honesta',
  'oratio,',
  'socratica,',
  'platonis',
  'etiam.'],
 ['primum', 'in', 'nostrane', 'potestate', 'est,', 'quid', 'meminerimus?'],
 ['duo', 'reges:', 'constructio', 'interrete.'],
 ['quid,',
  'si',
  'etiam',
  'iucunda',
  'memoria',
  'est',
  'praeteritorum',
  'malorum?',
  'si',
  'quidem,',
  'inquit,',
  'tollerem,',
  'sed',
  'relinquo.'],
 ['an', 'nisi', 'populari', 'fama?']]

### Filter the stopwords from the previous text

In [38]:
# Words to be removed from the text.
# NOTE(review): 'I' is upper-case and 'I’d' uses a typographic apostrophe, so
# neither matches lower-cased or plain-ASCII tokens — confirm this is intended.
stopwords = ['a','all','the','as','is','am','an','and',
             'be','been','from','had','I','I’d','why','with']
# Hint: you may need to use flatMap

In [44]:
# flatMap flattens the per-line word lists into a single RDD of words.
# NOTE(review): this is built from rdd_sam, not rdd_sam_lower, so the words are
# not lower-cased here.
rdd_sam_flat = rdd_sam.flatMap(lambda line : line.split())
rdd_sam_flat.take(10)

['utilitatis',
 'causa',
 'amicitia',
 'est',
 'quaesita.',
 'lorem',
 'ipsum',
 'dolor',
 'sit',
 'amet,']

In [46]:
# Remove the stopwords: keep a word only when it is NOT in the stopword list.
# (Testing `in stopwords` would keep nothing but the stopwords themselves.)
rdd_sam_filter = rdd_sam_flat.filter(lambda word: word not in stopwords)
rdd_sam_filter.take(10)

['utilitatis',
 'causa',
 'amicitia',
 'est',
 'quaesita.',
 'lorem',
 'ipsum',
 'dolor',
 'sit',
 'amet,']

### Filter the words starting with ‘c’

In [47]:
# Keep the words whose first character is 'c'. str.startswith() is used rather
# than indexing [0] so that an empty token would be skipped instead of raising
# IndexError.
rdd_sam_Cfilter = rdd_sam_flat.filter(lambda word: word.startswith('c'))
rdd_sam_Cfilter.take(10)

['causa', 'consectetur', 'collatio', 'constructio']

### Reduce the data by key and sum it (use the data from the following list)

In [67]:
list = [('JK', 22), ('V', 24), ('Jimin',24), ('RM', 25)
        , ('J-Hope', 25), ('Suga', 26), ('Jin', 27),
         ('J-Hope', 12), ('Suga', 25), ('Jin', 34)
       , ('JK', 32), ('V', 44), ('Jimin',14), ('RM', 35)]
# Hint: use reduceByKey

In [68]:
rdd_list = sc.parallelize(list)
rdd_list.collect()

[('JK', 22),
 ('V', 24),
 ('Jimin', 24),
 ('RM', 25),
 ('J-Hope', 25),
 ('Suga', 26),
 ('Jin', 27),
 ('J-Hope', 12),
 ('Suga', 25),
 ('Jin', 34),
 ('JK', 32),
 ('V', 44),
 ('Jimin', 14),
 ('RM', 35)]

In [75]:
# Sum the values of each key with reduceByKey, as the exercise asks. Partial
# sums are combined inside each partition before the shuffle, whereas
# groupByKey() moves every individual value first. Keys are then sorted so the
# output order is stable.
rdd_list.reduceByKey(lambda left, right: left + right).sortByKey().collect()

[('J-Hope', 37),
 ('JK', 54),
 ('Jimin', 38),
 ('Jin', 61),
 ('RM', 60),
 ('Suga', 51),
 ('V', 68)]

In [76]:
# Two small key/value RDDs for the join below; key 'c' exists only in rdd2.
rdd1 = sc.parallelize([('a',2),('b',3)])
rdd2 = sc.parallelize([('a',9),('b',7),('c',10)])

### Perform Join operation on the RDDs (rdd1,rdd2)

In [80]:
# join() pairs the values of keys present in both RDDs as (key, (v1, v2));
# 'c' exists only in rdd2, so it does not appear in the result.
rdds = rdd1.join(rdd2)
rdds.collect()

[('b', (3, 7)), ('a', (2, 9))]