### In order for Python to find the Spark, download the findspark library and start it with findspark.init() function.

In [1]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.1.tar.gz (317.0 MB)
     ---------------------------------------- 0.0/317.0 MB ? eta -:--:--
     ---------------------------------------- 0.0/317.0 MB ? eta -:--:--
     -------------------------------------- 0.0/317.0 MB 435.7 kB/s eta 0:12:08
     -------------------------------------- 0.1/317.0 MB 762.6 kB/s eta 0:06:56
     ---------------------------------------- 0.2/317.0 MB 1.2 MB/s eta 0:04:14
     ---------------------------------------- 0.4/317.0 MB 1.7 MB/s eta 0:03:05
     ---------------------------------------- 0.6/317.0 MB 2.2 MB/s eta 0:02:26
     ---------------------------------------- 0.7/317.0 MB 2.3 MB/s eta 0:02:15
     ---------------------------------------- 0.7/317.0 MB 2.3 MB/s eta 0:02:15
     ---------------------------------------- 0.7/317.0 MB 1.8 MB/s eta 0:02:53
     ---------------------------------------- 0.7/317.0 MB 1.8 MB/s eta 0:02:53
     ---------------------------------------- 1.2/317.0 MB 2.6 MB/s e

In [8]:
import findspark
findspark.init()

ModuleNotFoundError: No module named 'findspark'

In [6]:
!pip show pyspark

Name: pyspark
Version: 3.5.1
Summary: Apache Spark Python API
Home-page: https://github.com/apache/spark/tree/master/python
Author: Spark Developers
Author-email: dev@spark.apache.org
License: http://www.apache.org/licenses/LICENSE-2.0
Location: C:\Users\peter\miniconda3\envs\big_data_env\Lib\site-packages
Requires: py4j
Required-by: 


### In order to work with RDDs, we need to create a SparkContext.

In [9]:
from pyspark.sql import SparkSession
from pyspark.conf import SparkConf


## Since we write local [*] in the master, it will use all cores in our machine. If we said local [4] it will work with 4 cores.

## getOrCreate is used to create a SparkSession if not present.

In [10]:
spark=SparkSession.builder\
    .master("local[*]")\
    .appName("WordCount")\
    .getOrCreate()

PySparkRuntimeError: [JAVA_GATEWAY_EXITED] Java gateway process exited before sending its port number.

In [None]:
sc=spark.sparkContext

## Read Data - RomeoJuliet Txt File

In [None]:
veri_dosyasi="romeojuliet.txt"

In [None]:
# read the text file
shakespeare_rdd=sc.textFile(veri_dosyasi)

In [None]:
# show the first 100 lines
shakespeare_rdd.take(100)

['',
 "                    WILLIAM SHAKESPEARE'S",
 '',
 '                       ROMEO & JULIET',
 '',
 '   ADAPTED FOR THE SCREEN BY CRAIG PEARCE AND BAZ LUHRMANN',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '                                       FINAL SHOOTING SCRIPT',
 '',
 '                                             October 6, 1995',
 '',
 '',
 '',
 '',
 'EXT.  HIGHWAY.  AFTERNOON.',
 '',
 'A ribbon of freeway stretching into a blue and pink late',
 'afternoon sky. A huge dark sedan, windows tinted gold,',
 'headlights blazing, powers directly for us.',
 '',
 'CUT TO: A heavy, low-slung, pickup truck traveling toward',
 'the sedan.',
 '',
 'WIDE SHOT: Sky, freeway, the cars closing.',
 '',
 'TIGHT ON: The sedan.',
 '',
 'TIGHT ON: The pickup.',
 '',
 'Like thunderous, jousting opponents, the cars pass in a',
 'deafening cacophony of noise.',
 '',
 'INT.  TRUCK.  AFTERNOON.',
 '',
 'TIGHT ON: 

In [None]:
# the number of lines in the file
shakespeare_rdd.count()

6247

# Remove Punctuation and Transform All Words to Lowercase.

### To exclude punctuation values and convert all words to lowercase, we wrote a function like the one below.

In [None]:
def lower_clean_str(x):
  punc='!"#$%&\'()*+,./:;<=>?@[\\]^_`{|}~-'
  lowercased_str = x.lower()
  for ch in punc:
    lowercased_str = lowercased_str.replace(ch, '')
  return lowercased_str

In [None]:
# map takes a function and applies it to all elements in the RDD
# map returns a new RDD so it is a transformation
shakespeare_rdd = shakespeare_rdd.map(lower_clean_str)
# the size of the RDD will be the same as the original RDD

In [None]:
shakespeare_rdd.take(40)

['',
 '                    william shakespeares',
 '',
 '                       romeo  juliet',
 '',
 '   adapted for the screen by craig pearce and baz luhrmann',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '                                       final shooting script',
 '',
 '                                             october 6 1995',
 '',
 '',
 '',
 '',
 'ext  highway  afternoon']

## We use split function to separate the words in all lines .

In [None]:
# it's one to many relationship because the input RDD size is less than the output RDD size
# flatMap takes a function and applies it to all elements in the RDD
# we 34an al atnan m4 nfs al size bst5dm flatMap
shakespeare_rdd=shakespeare_rdd.flatMap(lambda satir: satir.split(" "))

In [None]:
shakespeare_rdd.take(5)

['', '', '', '', '']

## We do a filtering below to exclude whitespaces.

In [None]:
# remove empty strings
# the output RDD size will be less than the input RDD size as it is a filter
# the fucntion return true if we want to keep the element and false if we want to remove it
hakespeare_rdd = shakespeare_rdd.filter(lambda x:x!='')

In [None]:
shakespeare_rdd.take(4)

['william', 'shakespeares', 'romeo', 'juliet']

## Count how many times each word occurs.
### To make this calculation we can apply the “reduceByKey” transformation on (key,val) pair RDD. We need to first convert “shakespeare_rdd” to (key,val) pair RDD.

### In this new (key,val) pair RDD (shakespeare_count), key is the word and val is 1 for each word in RDD (1 represents the number for the each word in “shakespeare_rdd”).


In [None]:
# return key,value pairs where the key is the word and the value is 1 for each word
shakespeare_count=shakespeare_rdd.map(lambda  word:(word,1))

In [None]:
shakespeare_count.take(4)

[('william', 1), ('shakespeares', 1), ('romeo', 1), ('juliet', 1)]

## Apply ReduceByKey to find frequent words

In [None]:
# this function will apply our reduce function to the entire RDD
# it doesn't take the key but it takes a fucntion to define the operation we wanna do
# the function takes two arguments as 2 different values for the same key and it returns theri sum
shakespeare_count_RBK=shakespeare_count.reduceByKey(lambda x,y:(x+y)).sortByKey()
# sortByKey will sort the RDD based on the key

In [None]:
shakespeare_count_RBK.take(40)

[('1995', 1),
 ('21', 1),
 ('6', 1),
 ('60', 2),
 ('9mm', 2),
 ('a', 563),
 ('abandoned', 1),
 ('able', 1),
 ('about', 3),
 ('above', 12),
 ('abra', 24),
 ('abras', 3),
 ('abroad', 1),
 ('abrupt', 1),
 ('abruptly', 5),
 ('absolved', 1),
 ('abuse', 2),
 ('abuses', 1),
 ('accidentally', 1),
 ('accompanied', 1),
 ('according', 1),
 ('accusation', 1),
 ('accustomed', 2),
 ('ache', 1),
 ('aches', 1),
 ('achingly', 2),
 ('acoustic', 1),
 ('across', 24),
 ('actually', 1),
 ('adagio', 1),
 ('adapted', 1),
 ('address', 1),
 ('addressed', 1),
 ('addresses', 1),
 ('adept', 1),
 ('adieu', 4),
 ('adjacent', 1),
 ('adjoining', 1),
 ('adjust', 1),
 ('admired', 1)]

### We want to sort the most frequent words in descending order. As the first step, we switch (key,val) pairs as (val,key).

In [None]:
shakespeare_count_RBK=shakespeare_count_RBK.map(lambda x:(x[1],x[0]))

In [None]:
shakespeare_count_RBK.take(5)

[(1, '1995'), (1, '21'), (1, '6'), (2, '60'), (2, '9mm')]

## We see that the most common word is "the". However, these values are words that we call stopwords which brings value to our analysis.

In [None]:
shakespeare_count_RBK.sortByKey(False).take(10)

[(1372, 'the'),
 (563, 'a'),
 (506, 'to'),
 (469, 'of'),
 (464, 'romeo'),
 (461, 'and'),
 (258, 'in'),
 (251, 'juliet'),
 (246, 'is'),
 (224, 'i')]

## To exclude stopwords words, we download the nltk library and get the list of English stopwords.

In [None]:
import nltk

In [None]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
from nltk.corpus import stopwords
stopwords =stopwords.words('english')
stopwords

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

## When we exclude stopwords values, we see that the word "romeo" is the most common.

In [None]:
shakespeare_count_RBK = shakespeare_count_RBK.filter(lambda x: x[1] not in stopwords).sortByKey(False)

In [None]:
shakespeare_count_RBK.sortByKey(False).take(20)

[(464, 'romeo'),
 (251, 'juliet'),
 (143, 'mercutio'),
 (133, 'capulet'),
 (114, 'thou'),
 (111, 'benvolio'),
 (111, 'night'),
 (98, 'father'),
 (97, 'ext'),
 (96, 'close'),
 (96, 'nurse'),
 (92, 'cont'),
 (88, 'int'),
 (87, 'cut'),
 (84, 'car'),
 (82, 'love'),
 (81, 'laurence'),
 (79, 'tybalt'),
 (71, 'gloria'),
 (66, 'day')]