### 1. Importing Libraries

Even after successfully installing PySpark, you may have issues importing pyspark in Python. You can resolve this by installing and importing findspark.
findspark searches for the PySpark installation on the server and adds the PySpark installation path to sys.path at runtime so that you can import PySpark modules.

In [1]:
import findspark

# init() locates the Spark installation on its own (it falls back to
# find() when no spark_home is passed) and then adds PySpark to sys.path,
# so a separate find() call whose result is discarded is unnecessary.
findspark.init()

### 2. Create a SparkContext, which acts as the entry point to Spark

In [2]:
from pyspark import SparkConf, SparkContext

# Describe the application explicitly: run on a single local thread under
# the same application name as before.
conf = SparkConf().setMaster("local").setAppName("PySpark Word Count Example")

# getOrCreate() returns the already-running context if there is one, so
# re-running this cell does not fail with
# "Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate(conf)

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/08/21 09:59:02 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


### 3. Actual Implementation of pyspark
Reading a text file
Note that reading "words.txt" with sc.textFile gives us an RDD of lines, and we use the "flatMap", "map", and "reduceByKey" transformations.
We split each line into words using a single space as the separator.

In [3]:
# Load the file as an RDD of lines, then flatten every line into its
# individual space-separated tokens.
lines = sc.textFile("words.txt")
words = lines.flatMap(lambda text: text.split(" "))

### collect() gives us a list of all the words, obtained by splitting each sentence on spaces

In [4]:
# collect() is an action: it runs the pending transformations and returns
# the RDD's elements to the driver as a Python list (shown below).
words.collect()

                                                                                

['word',
 'count',
 'from',
 'Wikipedia',
 'the',
 'free',
 'encyclopedia',
 'the',
 'word',
 'count',
 'is',
 'the',
 'number',
 'of',
 'words',
 'in',
 'a',
 'document',
 'or',
 'passage',
 'of',
 'text',
 'Word',
 'counting',
 'may',
 'be',
 'needed',
 'when',
 'a',
 'text',
 'is',
 'required',
 'to',
 'stay',
 'within',
 'certain',
 'numbers',
 'of',
 'words',
 'This',
 'may',
 'particularly',
 'be',
 'the',
 'case',
 'in',
 'academia',
 'legal',
 'proceedings',
 'journalism',
 'and',
 'advertising',
 'Word',
 'count',
 'is',
 'commonly',
 'used',
 'by',
 'translators',
 'to',
 'determine',
 'the',
 'price',
 'for',
 'the',
 'translation',
 'job',
 'Word',
 'counts',
 'may',
 'also',
 'be',
 'used',
 'to',
 'calculate',
 'measures',
 'of',
 'readability',
 'and',
 'to',
 'measure',
 'typing',
 'and',
 'reading',
 'speeds',
 'usually',
 'in',
 'words',
 'per',
 'minute',
 'When',
 'converting',
 'character',
 'counts',
 'to',
 'words',
 'a',
 'measure',
 'of',
 'five',
 'or',
 'six'

Then we will map each word to a key:value pair of word:1, 1 being the number of occurrences.

In [5]:
# Pair every word with an initial count of 1, producing (word, 1) tuples.
a = words.map(lambda token: (token, 1))
# Bring the pairs back to the driver so the notebook displays them.
a.collect()

[('word', 1),
 ('count', 1),
 ('from', 1),
 ('Wikipedia', 1),
 ('the', 1),
 ('free', 1),
 ('encyclopedia', 1),
 ('the', 1),
 ('word', 1),
 ('count', 1),
 ('is', 1),
 ('the', 1),
 ('number', 1),
 ('of', 1),
 ('words', 1),
 ('in', 1),
 ('a', 1),
 ('document', 1),
 ('or', 1),
 ('passage', 1),
 ('of', 1),
 ('text', 1),
 ('Word', 1),
 ('counting', 1),
 ('may', 1),
 ('be', 1),
 ('needed', 1),
 ('when', 1),
 ('a', 1),
 ('text', 1),
 ('is', 1),
 ('required', 1),
 ('to', 1),
 ('stay', 1),
 ('within', 1),
 ('certain', 1),
 ('numbers', 1),
 ('of', 1),
 ('words', 1),
 ('This', 1),
 ('may', 1),
 ('particularly', 1),
 ('be', 1),
 ('the', 1),
 ('case', 1),
 ('in', 1),
 ('academia', 1),
 ('legal', 1),
 ('proceedings', 1),
 ('journalism', 1),
 ('and', 1),
 ('advertising', 1),
 ('Word', 1),
 ('count', 1),
 ('is', 1),
 ('commonly', 1),
 ('used', 1),
 ('by', 1),
 ('translators', 1),
 ('to', 1),
 ('determine', 1),
 ('the', 1),
 ('price', 1),
 ('for', 1),
 ('the', 1),
 ('translation', 1),
 ('job', 1),
 ('Wo

### How reduceByKey works internally: it combines all the values for the first key, then moves on to the next key
hello [1] => hello 1 <br>
pyspark [a=1, b=1] => pyspark 2 (a+b)

The result is then reduced by key, which is the word, and the values are added.

In [6]:
# Add up the counts of identical words. The lambda receives two counts
# belonging to the same key and returns their sum.
b = a.reduceByKey(lambda total, count: total + count)

In [7]:
# Run the job and return the (word, total count) pairs to the driver as a
# Python list (shown below).
b.collect()

[('word', 24),
 ('count', 11),
 ('from', 2),
 ('Wikipedia', 1),
 ('the', 38),
 ('free', 1),
 ('encyclopedia', 1),
 ('is', 19),
 ('number', 3),
 ('of', 25),
 ('words', 21),
 ('in', 11),
 ('a', 28),
 ('document', 2),
 ('or', 11),
 ('passage', 1),
 ('text', 8),
 ('Word', 3),
 ('counting', 6),
 ('may', 8),
 ('be', 8),
 ('needed', 1),
 ('when', 2),
 ('required', 1),
 ('to', 18),
 ('stay', 1),
 ('within', 1),
 ('certain', 2),
 ('numbers', 1),
 ('This', 2),
 ('particularly', 1),
 ('case', 1),
 ('academia', 1),
 ('legal', 1),
 ('proceedings', 1),
 ('journalism', 1),
 ('and', 23),
 ('advertising', 1),
 ('commonly', 1),
 ('used', 4),
 ('by', 5),
 ('translators', 1),
 ('determine', 1),
 ('price', 1),
 ('for', 10),
 ('translation', 1),
 ('job', 1),
 ('counts', 3),
 ('also', 5),
 ('calculate', 1),
 ('measures', 1),
 ('readability', 1),
 ('measure', 2),
 ('typing', 1),
 ('reading', 1),
 ('speeds', 1),
 ('usually', 3),
 ('per', 3),
 ('minute', 1),
 ('When', 1),
 ('converting', 1),
 ('character', 2),


### A wide transformation (such as reduceByKey) splits the job into separate stages in the Spark UI

### Last step: stop the SparkContext

In [8]:
# A bare expression on the last line of a cell is displayed by the
# notebook; here it shows the SparkContext created earlier.
sc

In [9]:
# Shut down the SparkContext now that the word count is finished.
sc.stop()