### Import the required libraries, then create the SparkContext

In [None]:
! pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Downloading pyspark-3.3.0.tar.gz (281.3 MB)
[K     |████████████████████████████████| 281.3 MB 43 kB/s 
[?25hCollecting py4j==0.10.9.5
  Downloading py4j-0.10.9.5-py2.py3-none-any.whl (199 kB)
[K     |████████████████████████████████| 199 kB 50.2 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.3.0-py2.py3-none-any.whl size=281764026 sha256=70a54cf77ee91fab33472c5622bb1b53b4a11a8bc41cb81e0cf3ac3251b14c57
  Stored in directory: /root/.cache/pip/wheels/7a/8e/1b/f73a52650d2e5f337708d9f6a1750d451a7349a867f928b885
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9.5 pyspark-3.3.0


In [None]:
from pyspark.sql import SparkSession

# Reuse the active session if there is one, otherwise start a new one,
# then expose its SparkContext for the RDD API used in the cells below.
session_builder = SparkSession.builder
spark = session_builder.getOrCreate()
sc = spark.sparkContext

### Create and display an RDD from the following list

In [None]:
list = [('JK', 22), ('V', 24), ('Jimin',24), ('RM', 25), ('J-Hope', 25), ('Suga', 26), ('Jin', 27)]

In [None]:
rdd1=sc.parallelize(list)
rdd1.collect()

[('JK', 22),
 ('V', 24),
 ('Jimin', 24),
 ('RM', 25),
 ('J-Hope', 25),
 ('Suga', 26),
 ('Jin', 27)]

### Read the sample1.txt file into an RDD and display the first 4 elements

In [None]:
# Location of the text file to load into an RDD.
sample_path = '/content/sample_data/sample1.txt'
distFile = sc.textFile(sample_path)

# Show only the first four elements instead of the whole RDD.
distFile.take(4)

['Text messages are used for personal, family, business and social purposes. Governmental and non-governmental organizations',
 ' use text messaging for communication between colleagues. In the 2010s, the sending of short informal messages became',
 ' an accepted part of many cultures, as happened earlier with emailing.[1] This makes texting a quick and easy way to communicate with friends, family and colleagues, including in contexts where a call would be impolite or inappropriate (e.g., calling very late at night or when one knows the other person is busy with family or work activities).',
 'Like e-mail and voicemail and unlike calls (in which the caller hopes to speak directly with the recipient)']

### Count the total number of rows in the RDD

In [None]:
# Total number of elements held in the distFile RDD.
distFile.count()

9

### Create a function that converts the data to lower case and splits it into words

In [None]:
def lower_split(line):
    """Lower-case ``line`` and break it into whitespace-separated tokens.

    Only whitespace acts as a separator, so punctuation stays attached to
    the neighbouring word (e.g. ``'family,'``).

    Args:
        line: A string of text.

    Returns:
        A list of lower-cased tokens; empty when ``line`` contains no
        non-whitespace characters.
    """
    lowered = line.lower()
    tokens = lowered.split()
    return tokens

# map() applies lower_split to each element, so the result holds one list of
# tokens per input line (a nested list); collect() returns all of them to the driver.
distFile.map(lower_split).collect()

[['text',
  'messages',
  'are',
  'used',
  'for',
  'personal,',
  'family,',
  'business',
  'and',
  'social',
  'purposes.',
  'governmental',
  'and',
  'non-governmental',
  'organizations'],
 ['use',
  'text',
  'messaging',
  'for',
  'communication',
  'between',
  'colleagues.',
  'in',
  'the',
  '2010s,',
  'the',
  'sending',
  'of',
  'short',
  'informal',
  'messages',
  'became'],
 ['an',
  'accepted',
  'part',
  'of',
  'many',
  'cultures,',
  'as',
  'happened',
  'earlier',
  'with',
  'emailing.[1]',
  'this',
  'makes',
  'texting',
  'a',
  'quick',
  'and',
  'easy',
  'way',
  'to',
  'communicate',
  'with',
  'friends,',
  'family',
  'and',
  'colleagues,',
  'including',
  'in',
  'contexts',
  'where',
  'a',
  'call',
  'would',
  'be',
  'impolite',
  'or',
  'inappropriate',
  '(e.g.,',
  'calling',
  'very',
  'late',
  'at',
  'night',
  'or',
  'when',
  'one',
  'knows',
  'the',
  'other',
  'person',
  'is',
  'busy',
  'with',
  'family',
  'o

### Filter the stopwords from the previous text

In [None]:
# Words to be removed from the text in the next cell.
stopwords = [
    'a', 'all', 'the', 'as',
    'is', 'am', 'an', 'and',
    'be', 'been', 'from', 'had',
    'I', 'I’d', 'why', 'with',
]
# Hint: flatMap may be needed to obtain one flat sequence of words

In [None]:
# lower_split lower-cases every token, so the stopwords are lower-cased too
# before comparing; otherwise mixed-case entries such as 'I' could never match.
# A set makes each membership test a hash lookup instead of a list scan.
stopword_set = {word.lower() for word in stopwords}

# flatMap flattens the per-line token lists into a single RDD of words.
flatRdd = distFile.flatMap(lower_split)
flatRdd.filter(lambda word: word not in stopword_set).collect()

['text',
 'messages',
 'are',
 'used',
 'for',
 'personal,',
 'family,',
 'business',
 'social',
 'purposes.',
 'governmental',
 'non-governmental',
 'organizations',
 'use',
 'text',
 'messaging',
 'for',
 'communication',
 'between',
 'colleagues.',
 'in',
 '2010s,',
 'sending',
 'of',
 'short',
 'informal',
 'messages',
 'became',
 'accepted',
 'part',
 'of',
 'many',
 'cultures,',
 'happened',
 'earlier',
 'emailing.[1]',
 'this',
 'makes',
 'texting',
 'quick',
 'easy',
 'way',
 'to',
 'communicate',
 'friends,',
 'family',
 'colleagues,',
 'including',
 'in',
 'contexts',
 'where',
 'call',
 'would',
 'impolite',
 'or',
 'inappropriate',
 '(e.g.,',
 'calling',
 'very',
 'late',
 'at',
 'night',
 'or',
 'when',
 'one',
 'knows',
 'other',
 'person',
 'busy',
 'family',
 'or',
 'work',
 'activities).',
 'like',
 'e-mail',
 'voicemail',
 'unlike',
 'calls',
 '(in',
 'which',
 'caller',
 'hopes',
 'to',
 'speak',
 'directly',
 'recipient)',
 'texting',
 'does',
 'not',
 'require',
 '

### Filter the words starting with ‘c’

In [None]:
# startswith() states the prefix test directly and, unlike indexing with
# word[0], does not raise IndexError if a token is an empty string.
flatRdd.filter(lambda word: word.startswith('c')).collect()

['communication',
 'colleagues.',
 'cultures,',
 'communicate',
 'colleagues,',
 'contexts',
 'call',
 'calling',
 'calls',
 'caller',
 'caller',
 'communication',
 'can',
 'contests.']

### Reduce the data by key and sum it (use the data from the following list)

In [None]:
list = [('JK', 22), ('V', 24), ('Jimin',24), ('RM', 25)
        , ('J-Hope', 25), ('Suga', 26), ('Jin', 27)
       , ('J-Hope', 12), ('Suga', 25), ('Jin', 34)
       , ('JK', 32), ('V', 44), ('Jimin',14), ('RM', 35)]
# Hint: use reduceByKey

In [None]:
rdd=sc.parallelize(list)
rdd.reduceByKey(lambda x,y:x+y).collect()

[('Suga', 51),
 ('Jin', 61),
 ('JK', 54),
 ('V', 68),
 ('Jimin', 38),
 ('RM', 60),
 ('J-Hope', 37)]

### Create some key-value pair RDDs

In [None]:
# Two small (key, value) datasets; key 'c' exists only in the second one.
left_pairs = [('a', 2), ('b', 3)]
right_pairs = [('a', 9), ('b', 7), ('c', 10)]

rdd1 = sc.parallelize(left_pairs)
rdd2 = sc.parallelize(right_pairs)

### Perform Join operation on the RDDs (rdd1,rdd2)

In [None]:
# Pair up the values of rdd1 and rdd2 that share a key, then display the result.
joined = rdd1.join(rdd2)
joined.collect()

[('b', (3, 7)), ('a', (2, 9))]