#### Application
A user program built on Spark using its APIs. It consists of a driver program and executors on the cluster.

#### SparkSession
An object that provides a point of entry to interact with underlying Spark functionality and allows programming Spark with its APIs. In an interactive Spark shell, the Spark driver instantiates a SparkSession for you, while in a Spark application, you create a SparkSession object yourself.

#### Job
A parallel computation consisting of multiple tasks that gets spawned in response to a Spark action (e.g., save(), collect()).

#### Stage
Each job gets divided into smaller sets of tasks called stages that depend on each other.

#### Task
A single unit of work or execution that will be sent to a Spark executor.

### Import the required libraries, then create a SparkContext

In [41]:
#pip install findspark
# ! pip install numpy 
import shutil

import findspark 
import numpy 
# findspark.init() has to run before pyspark is imported, so the order below is kept as is.
findspark.init()
import pyspark
from  pyspark.sql import SparkSession 

In [6]:
import pandas


In [7]:
# Display the installed pandas version (pandas is not used by the cells visible here).
pandas.__version__

'2.0.1'

In [42]:
# Entry point to Spark: getOrCreate() hands back the session that is already active
# in this kernel, or builds a new one with default settings if there is none.
spark_instance = SparkSession.builder.getOrCreate()

In [43]:
# Echo the session object so the notebook renders its summary.
spark_instance

In [44]:
# The RDD API used in the rest of this notebook hangs off the SparkContext,
# which is taken from the session created above.
spark_context = spark_instance.sparkContext


### Create and display an RDD from the following list

In [45]:
# Plain Python list of (name, number) tuples that is turned into an RDD below.
# NOTE(review): despite the name this is a list, not a DataFrame.
df = [
    ('JK', 22),
    ('V', 24),
    ('Jimin', 24),
    ('RM', 25),
    ('J-Hope', 25),
    ('Suga', 26),
    ('Jin', 27),
]

In [47]:
# Distribute the local list as an RDD. Echoing the RDD only shows its description,
# not its contents; an action such as collect() or take() is needed to see the data.
rdd0 = spark_context.parallelize(df)
rdd0

ParallelCollectionRDD[11] at readRDDFromFile at PythonRDD.scala:287

In [48]:
# Bring every element of the RDD back to the driver as a Python list.
rdd0.collect()

[('JK', 22),
 ('V', 24),
 ('Jimin', 24),
 ('RM', 25),
 ('J-Hope', 25),
 ('Suga', 26),
 ('Jin', 27)]

In [49]:
# Fetch only the first five elements instead of the whole RDD.
rdd0.take(5)

[('JK', 22), ('V', 24), ('Jimin', 24), ('RM', 25), ('J-Hope', 25)]

### Create a sample1.txt file to contain the text shown below.

In [50]:
# Contents of sample1.txt, one entry per line of the file. The trailing spaces on
# some entries are part of the original text and are kept deliberately.
sample_lines = [
    "Utilitatis causa amicitia est quaesita.",
    "Lorem ipsum dolor sit amet, consectetur adipiscing elit. ",
    "Collatio igitur ista tenihil iuvat.",
    "Honesta oratio, Socratica, Platonis etiam. ",
    "Primum in nostranepotestate est, quid meminerimus? ",
    "Duo Reges: constructio interrete.",
    "Quid, sietiam iucunda memoria est praeteritorum malorum?",
    "Si quidem, inquit, tollerem,",
]

# Write with an explicit UTF-8 encoding: Spark's textFile() decodes files as UTF-8,
# so the writer should not depend on the platform's default encoding.
# As before, no newline is written after the last line.
with open("sample1.txt", "w", encoding="utf-8") as my_file:
    my_file.write("\n".join(sample_lines))

### Read the sample1.txt file into an RDD and display the first 4 elements

In [51]:
# Load the text file as an RDD of strings, one element per line of the file.
distfile = spark_context.textFile("sample1.txt")

In [52]:
# Peek at the first four lines of the file.
distfile.take(4)

['Utilitatis causa amicitia est quaesita.',
 'Lorem ipsum dolor sit amet, consectetur adipiscing elit. ',
 'Collatio igitur ista tenihil iuvat.',
 'Honesta oratio, Socratica, Platonis etiam. ']

### Count the total number of rows in the RDD

In [53]:
# Count the lines with the count() action: only the total is returned to the driver,
# whereas len(collect()) first copied every line into driver memory just to measure
# the length of the resulting list.
distfile.count()

8

### Create a function to convert the data to lower case and split it into words

In [62]:
def lowercase_tokens(line):
    """Lower-case one line of text and split it on whitespace into a list of tokens.

    Punctuation stays attached to the tokens (e.g. 'amet,').
    """
    lowered = line.lower()
    return lowered.split()


# One list of tokens per input line.
rdd_split = distfile.map(lowercase_tokens)
rdd_split.collect()

[['utilitatis', 'causa', 'amicitia', 'est', 'quaesita.'],
 ['lorem',
  'ipsum',
  'dolor',
  'sit',
  'amet,',
  'consectetur',
  'adipiscing',
  'elit.'],
 ['collatio', 'igitur', 'ista', 'tenihil', 'iuvat.'],
 ['honesta', 'oratio,', 'socratica,', 'platonis', 'etiam.'],
 ['primum', 'in', 'nostranepotestate', 'est,', 'quid', 'meminerimus?'],
 ['duo', 'reges:', 'constructio', 'interrete.'],
 ['quid,',
  'sietiam',
  'iucunda',
  'memoria',
  'est',
  'praeteritorum',
  'malorum?'],
 ['si', 'quidem,', 'inquit,', 'tollerem,']]

In [55]:
# Number of partitions the tokenised RDD is split into.
rdd_split.getNumPartitions()

2

### Remove the stopwords from the previous text, i.e. filter them out of the tokens.

In [56]:
# Words to drop from the tokenised text.
# NOTE(review): 'I' and 'I’d' are capitalised while the text is lower-cased before
# filtering, so they can only match if the comparison ignores case.
stopwords = [
    'a', 'all', 'the', 'as',
    'is', 'am', 'an', 'and',
    'be', 'been', 'from', 'had',
    'I', 'I’d', 'why', 'with',
]
# Hint: flatMap is the transformation to reach for here.

In [68]:
# Build the lookup once on the driver. Lower-casing matters because the tokens were
# lower-cased when the text was split, so capitalised entries such as 'I' could never
# match otherwise; a set also replaces the per-token scan of the list.
stopword_set = {word.lower() for word in stopwords}

# Drop the stopwords and flatten the per-line token lists into a single RDD of words.
rdd_split_remove = rdd_split.flatMap(
    lambda tokens: [token for token in tokens if token not in stopword_set]
)

In [69]:
# Show the flattened list of words that remain after stopword removal.
rdd_split_remove.collect()

['utilitatis',
 'causa',
 'amicitia',
 'est',
 'quaesita.',
 'lorem',
 'ipsum',
 'dolor',
 'sit',
 'amet,',
 'consectetur',
 'adipiscing',
 'elit.',
 'collatio',
 'igitur',
 'ista',
 'tenihil',
 'iuvat.',
 'honesta',
 'oratio,',
 'socratica,',
 'platonis',
 'etiam.',
 'primum',
 'in',
 'nostranepotestate',
 'est,',
 'quid',
 'meminerimus?',
 'duo',
 'reges:',
 'constructio',
 'interrete.',
 'quid,',
 'sietiam',
 'iucunda',
 'memoria',
 'est',
 'praeteritorum',
 'malorum?',
 'si',
 'quidem,',
 'inquit,',
 'tollerem,']

### Find the words starting with ‘c’

In [None]:
# Persist the tokenised lines as text part-files in the 'rdd_split' directory.
# saveAsTextFile refuses to write into a directory that already exists, so output left
# behind by an earlier run is removed first to keep this cell re-runnable
# (assumes the path resolves to the local filesystem -- TODO confirm for cluster runs).
# The collect() that used to precede the save was dropped: its result was discarded,
# so it only triggered an extra Spark job.
shutil.rmtree('rdd_split', ignore_errors=True)
rdd_split.saveAsTextFile('rdd_split')


In [13]:
# Tokenised sample text, one inner list per line of sample1.txt.
# NOTE(review): this repeats the collected output of rdd_split shown earlier in the notebook.
data = [
    ['utilitatis', 'causa', 'amicitia', 'est', 'quaesita.'],
    ['lorem', 'ipsum', 'dolor', 'sit', 'amet,', 'consectetur', 'adipiscing', 'elit.'],
    ['collatio', 'igitur', 'ista', 'tenihil', 'iuvat.'],
    ['honesta', 'oratio,', 'socratica,', 'platonis', 'etiam.'],
    ['primum', 'in', 'nostranepotestate', 'est,', 'quid', 'meminerimus?'],
    ['duo', 'reges:', 'constructio', 'interrete.'],
    ['quid,', 'sietiam', 'iucunda', 'memoria', 'est', 'praeteritorum', 'malorum?'],
    ['si', 'quidem,', 'inquit,', 'tollerem,'],
]

In [59]:
# Distribute the hard-coded token lists as an RDD (one element per line of text).
rdd_split_c = spark_context.parallelize(data)

In [60]:
# Keep only the tokens that begin with 'c', flattening the per-line lists into one RDD.
# The RDD is rebuilt from `data` here rather than rebinding rdd_split_c to a
# transformation of itself: re-running the old form applied the filter to the already
# flattened strings, character by character, and silently changed the result.
rdd_split_c = spark_context.parallelize(data).flatMap(
    lambda tokens: [token for token in tokens if token.startswith("c")]
)

In [61]:
# Show the words that start with 'c'.
rdd_split_c.collect()

['causa', 'consectetur', 'collatio', 'constructio']

### Reduce the data by key and sum it (use the data from the following list)

In [70]:
# (key, value) records in which every key appears twice, so there is something to
# reduce per key. NOTE(review): this rebinds `df`, which held a different list earlier.
df = [
    ('JK', 22), ('V', 24), ('Jimin', 24), ('RM', 25),
    ('J-Hope', 25), ('Suga', 26), ('Jin', 27),
    ('J-Hope', 12), ('Suga', 25), ('Jin', 34),
    ('JK', 32), ('V', 44), ('Jimin', 14), ('RM', 35),
]
# Hint: use reduceByKey

In [71]:
# Distribute the (key, value) tuples as a pair RDD.
rdd_key = spark_context.parallelize(df)

In [72]:
# Group the values that share a key; each element becomes (key, iterable of values).
rdd_key_grouped = rdd_key.groupByKey()

In [73]:
# The grouped values come back as ResultIterable objects, so this display shows
# object reprs rather than the numbers themselves.
rdd_key_grouped.collect()

[('JK', <pyspark.resultiterable.ResultIterable at 0x20ddd938d00>),
 ('J-Hope', <pyspark.resultiterable.ResultIterable at 0x20ddd938c10>),
 ('Suga', <pyspark.resultiterable.ResultIterable at 0x20ddd938820>),
 ('Jin', <pyspark.resultiterable.ResultIterable at 0x20ddd938760>),
 ('V', <pyspark.resultiterable.ResultIterable at 0x20ddd938280>),
 ('Jimin', <pyspark.resultiterable.ResultIterable at 0x20ddd938730>),
 ('RM', <pyspark.resultiterable.ResultIterable at 0x20ddd9388e0>)]

In [74]:
# Turn each key's grouped values into a plain list so the pairs print readably.
results = rdd_key_grouped.mapValues(list).collect()
results

[('JK', [22, 32]),
 ('J-Hope', [25, 12]),
 ('Suga', [26, 25]),
 ('Jin', [27, 34]),
 ('V', [24, 44]),
 ('Jimin', [24, 14]),
 ('RM', [25, 35])]

In [75]:
# Sum the values per key with reduceByKey, as the exercise hint asks. Values are
# combined pairwise inside each partition before the shuffle, whereas groupByKey()
# followed by sum() ships every individual value across the network first.
# The per-key totals are unchanged; the order of pairs in the collected list is not
# something to rely on.
rdd_key_grouped_sum = rdd_key.reduceByKey(lambda left, right: left + right)
rdd_key_grouped_sum.collect()

[('JK', 54),
 ('J-Hope', 37),
 ('Suga', 51),
 ('Jin', 61),
 ('V', 68),
 ('Jimin', 38),
 ('RM', 60)]

### Create some key-value pair RDDs

In [76]:
# Two small pair RDDs: both contain the keys 'a' and 'b', only the second has 'c'.
left_pairs = [('a', 2), ('b', 3)]
right_pairs = [('a', 9), ('b', 7), ('c', 10)]

rdd1 = spark_context.parallelize(left_pairs)
rdd2 = spark_context.parallelize(right_pairs)

In [77]:
# Show the contents of the second pair RDD.
rdd2.collect()

[('a', 9), ('b', 7), ('c', 10)]

In [78]:
# Show the contents of the first pair RDD.
rdd1.collect()

[('a', 2), ('b', 3)]

### Perform a join operation on the RDDs (rdd1, rdd2)

In [79]:
# Join the two pair RDDs on their keys; each result is (key, (value_from_rdd1, value_from_rdd2)).
# Only keys present in both RDDs appear, so 'c' (found in rdd2 only) is absent from the output.
rdd1.join(rdd2).collect()

[('a', (2, 9)), ('b', (3, 7))]