# Word Counting Using Spark 

## Standard Library and create the Spark Context

In [1]:
from pyspark import SparkContext, SparkConf

#We can create a SparkConf() object and use it to initialize the spark context
conf = SparkConf().setAppName("Collinear Points").setMaster("local[4]") #Initialize spark context using 4 local cores as workers
sc = SparkContext(conf=conf)    

from pyspark.rdd import RDD


## Opening the File 
Define an RDD that will read the file 

## Download File From S3

In [2]:
%%time

##If this cell fails, download the file from https://mas-dse-open.s3.amazonaws.com/Moby-Dick.txt on your browser
# and put it in the '../../Data/ directory
import requests
data_dir='../../Data'
filename='Moby-Dick.txt'
url = "https://mas-dse-open.s3.amazonaws.com/"+filename
local_path = data_dir+'/'+filename
#!mkdir -p {data_dir}
# Copy URL content to local_path
r = requests.get(url, allow_redirects=True)
open(local_path, 'wb').write(r.content)

# check that the text file is where we expect it to be
!ls -l $local_path

-rw-r--r-- 1 asfetu 197121 1257260 Jul 11 23:54 ../../Data/Moby-Dick.txt
Wall time: 1.27 s


In [4]:
%%time
text_file=sc.textFile(local_path)
text_file

Wall time: 44.9 ms


## steps for counting words 
*  split line by space
*  Map each to tupe as (word,1)
*  Count the number of words occurance of each word 

In [5]:
words=text_file.flatMap(lambda line:line.split(" ") ) # Generate A long list of word 
not_empty=words.filter(lambda x:x!=' ')     # clean up operation , remove all empty space recoded as words in the aove list
key_values=not_empty.map(lambda word:(word,1)) # map words to (word,1)
counts= key_values.reduceByKey(lambda a,b:a+b) # for each key sum the value , that is counting the number of words 

In [7]:
## Counting the number of words and sum up all words in the file 
Count=counts.count()  # count the number of words
Sum=counts.map(lambda x:x[1]).reduce(lambda x,y:x+y) # x[1] is value of the key-value pair. sumup all the values 
print("The number of total words in the file is:",Sum)

The number of words in the file is: 219480


## Number of Total words

In [19]:
Sum

219480

## The number of different words 


In [18]:
Count

33782

## Mean number of  occurance per word

In [17]:
round(Sum/Count,3)

6.497

## Method 2 Using chain

In [20]:
word_pairs=text_file.flatMap(lambda x: x.split(' '))\
    .filter(lambda x: x!='')\
    .map(lambda word: (word,1))

# Counts occurance of words

In [21]:
counts=word_pairs.reduceByKey(lambda x,y:x+y)

## Reverse (word,count) to (count,word) and sort by key

In [22]:
reverse_counts=counts.map(lambda x:(x[1],x[0]))   # reverse order of word and count
sorted_counts=reverse_counts.sortByKey(ascending=False)

## Display the Top 5 most common words words 

In [29]:
D=sorted_counts.take(5)
print('most common words\n'+'\n'.join(['%d:\t%s'%c for c in D]))
D

most common words
13766:	the
6587:	of
5951:	and
4533:	a
4510:	to


[(13766, 'the'), (6587, 'of'), (5951, 'and'), (4533, 'a'), (4510, 'to')]