In [1]:
from collections import Counter

from pyspark import SparkContext
# getOrCreate() reuses an already-running context, so re-executing this cell
# does not fail with "Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate()

## Word Count

#### Load the RDD from the text file

In [6]:
# Load the text file as an RDD holding one record per line.
rdd_input = sc.textFile('data/HP.txt')
# Why flatMap rather than map? split() returns a list of words for every line;
# flatMap merges those lists into one flat RDD of words instead of an RDD of lists.
words = rdd_input.flatMap(lambda text_line: text_line.split())
words.collect()

['The',
 'place',
 'where',
 'things',
 'are',
 'hidden',
 'If',
 'you',
 'have',
 'to',
 'ask',
 'you',
 'will',
 'never',
 'know',
 'If',
 'you',
 'know',
 'you',
 'need',
 'only',
 'ask',
 'The',
 'hidden',
 'room',
 'of',
 'requirement']

#### Map each word to a (word, 1) pair so that the counts can later be aggregated by key

In [7]:
# Pair every word with an initial count of 1, giving (key, value) records.
words_map = words.map(lambda word: (word, 1))

In [9]:
# Bring the (word, 1) pairs back to the driver to inspect them.
words_map.collect()

[('The', 1),
 ('place', 1),
 ('where', 1),
 ('things', 1),
 ('are', 1),
 ('hidden', 1),
 ('If', 1),
 ('you', 1),
 ('have', 1),
 ('to', 1),
 ('ask', 1),
 ('you', 1),
 ('will', 1),
 ('never', 1),
 ('know', 1),
 ('If', 1),
 ('you', 1),
 ('know', 1),
 ('you', 1),
 ('need', 1),
 ('only', 1),
 ('ask', 1),
 ('The', 1),
 ('hidden', 1),
 ('room', 1),
 ('of', 1),
 ('requirement', 1)]

#### Apply reduceByKey to complete the counting

In [10]:
# Add up the 1s that share the same word; the result is one (word, total) per key.
words_count = words_map.reduceByKey(lambda left, right: left + right)
words_count.collect()

[('The', 2),
 ('where', 1),
 ('things', 1),
 ('are', 1),
 ('hidden', 2),
 ('have', 1),
 ('ask', 2),
 ('never', 1),
 ('know', 2),
 ('only', 1),
 ('room', 1),
 ('of', 1),
 ('place', 1),
 ('If', 2),
 ('you', 4),
 ('to', 1),
 ('will', 1),
 ('need', 1),
 ('requirement', 1)]

#### Sort the words by their counts (most frequent first)

In [12]:
# Negating the count makes the default ascending sort come out in descending order;
# use the plain key `lambda pair: pair[1]` instead to sort ascending.
words_count_sorted_freq = words_count.sortBy(lambda pair: -pair[1])
words_count_sorted_freq.collect()

[('you', 4),
 ('The', 2),
 ('hidden', 2),
 ('ask', 2),
 ('know', 2),
 ('If', 2),
 ('where', 1),
 ('things', 1),
 ('are', 1),
 ('have', 1),
 ('never', 1),
 ('only', 1),
 ('room', 1),
 ('of', 1),
 ('place', 1),
 ('to', 1),
 ('will', 1),
 ('need', 1),
 ('requirement', 1)]

#### ----

### A different scenario

In [13]:
# NOTE(review): this note originally named the file GoT.txt, while the path below
# reads GOT.txt -- on a case-sensitive filesystem only one spelling will load; confirm.

rdd_got = sc.textFile('data/GOT.txt')
rdd_got.collect()

['A Lannister always pays his debts.',
 'The night is dark and full of terrors.',
 'The next time you raise a hand to me will be the last time you have hands.',
 'When you play the Game of Thrones you win or you die.',
 'You know nothing Jon Snow.']

In [14]:
# Clean-up pass: remove every full stop with str.replace and lower-case the text,
# so that tokens such as 'The' and 'the' end up as the same word.
rdd_got = rdd_got.map(lambda sentence: sentence.replace('.', '').lower())
rdd_got.collect()

['a lannister always pays his debts',
 'the night is dark and full of terrors',
 'the next time you raise a hand to me will be the last time you have hands',
 'when you play the game of thrones you win or you die',
 'you know nothing jon snow']

### Now let's try to calculate the frequency of the words "per document" instead of the whole file

In [15]:
# map (not flatMap) keeps one word list per line, so each line stays a separate document.
rdd_parsed = rdd_got.map(lambda sentence: sentence.split())
rdd_parsed.collect()

[['a', 'lannister', 'always', 'pays', 'his', 'debts'],
 ['the', 'night', 'is', 'dark', 'and', 'full', 'of', 'terrors'],
 ['the',
  'next',
  'time',
  'you',
  'raise',
  'a',
  'hand',
  'to',
  'me',
  'will',
  'be',
  'the',
  'last',
  'time',
  'you',
  'have',
  'hands'],
 ['when',
  'you',
  'play',
  'the',
  'game',
  'of',
  'thrones',
  'you',
  'win',
  'or',
  'you',
  'die'],
 ['you', 'know', 'nothing', 'jon', 'snow']]

In [16]:
def count_words_in_docs(doc_record):
    """Count how often each word occurs within a single document.

    Args:
        doc_record: Iterable of hashable tokens (here, the list of words
            obtained by splitting one line of text).

    Returns:
        A list of ``(word, count)`` tuples ordered by each word's first
        appearance in ``doc_record``. An empty document yields ``[]``.
    """
    # Why tally inside the function? Each record already holds one whole document,
    # so counting locally keeps the frequencies per document.
    return list(Counter(doc_record).items())

In [17]:
# One output record per document: map (not flatMap) keeps each document's
# list of (word, count) pairs together.
rdd_loop_counts = rdd_parsed.map(count_words_in_docs)
rdd_loop_counts.collect()

[[('a', 1),
  ('lannister', 1),
  ('always', 1),
  ('pays', 1),
  ('his', 1),
  ('debts', 1)],
 [('the', 1),
  ('night', 1),
  ('is', 1),
  ('dark', 1),
  ('and', 1),
  ('full', 1),
  ('of', 1),
  ('terrors', 1)],
 [('the', 2),
  ('next', 1),
  ('time', 2),
  ('you', 2),
  ('raise', 1),
  ('a', 1),
  ('hand', 1),
  ('to', 1),
  ('me', 1),
  ('will', 1),
  ('be', 1),
  ('last', 1),
  ('have', 1),
  ('hands', 1)],
 [('when', 1),
  ('you', 3),
  ('play', 1),
  ('the', 1),
  ('game', 1),
  ('of', 1),
  ('thrones', 1),
  ('win', 1),
  ('or', 1),
  ('die', 1)],
 [('you', 1), ('know', 1), ('nothing', 1), ('jon', 1), ('snow', 1)]]

In [18]:
# Pair every record with its positional index so it can serve as a document id;
# record ids are otherwise hard to track.
rdd_counts_docids = rdd_loop_counts.zipWithIndex()
rdd_counts_docids.collect()

[([('a', 1),
   ('lannister', 1),
   ('always', 1),
   ('pays', 1),
   ('his', 1),
   ('debts', 1)],
  0),
 ([('the', 1),
   ('night', 1),
   ('is', 1),
   ('dark', 1),
   ('and', 1),
   ('full', 1),
   ('of', 1),
   ('terrors', 1)],
  1),
 ([('the', 2),
   ('next', 1),
   ('time', 2),
   ('you', 2),
   ('raise', 1),
   ('a', 1),
   ('hand', 1),
   ('to', 1),
   ('me', 1),
   ('will', 1),
   ('be', 1),
   ('last', 1),
   ('have', 1),
   ('hands', 1)],
  2),
 ([('when', 1),
   ('you', 3),
   ('play', 1),
   ('the', 1),
   ('game', 1),
   ('of', 1),
   ('thrones', 1),
   ('win', 1),
   ('or', 1),
   ('die', 1)],
  3),
 ([('you', 1), ('know', 1), ('nothing', 1), ('jon', 1), ('snow', 1)], 4)]

### User Exercise: Try to calculate the frequency of the term 'csc' "across all" documents

In [19]:
# Toy corpus: each record is a plain 'term-docid-na' string with mixed-case terms.
rdd_csc = sc.parallelize([
    'Csc-doc1-na',
    'csc-doc2-na',
    'cSc-doc2-na',
    'csC-doc3-na',
    'csc-doc3-na',
    'Csc-doc3-na',
    'csc-doc4-na',
    'cSc-doc4-na',
    'CSC-doc4-na',
    'csc-doc4-na',
])

#### NOTE: Please get rid of the term 'na' after you have done the pre-processing and splitting

In [45]:
# Lower-case each record, split it once on '-', and key the count by (term, doc id);
# the trailing 'na' field is dropped simply by not selecting it.
rdd_csc1 = (
    rdd_csc
    .map(lambda record: record.lower().split('-'))
    .map(lambda fields: ((fields[0], fields[1]), 1))
)
rdd_csc1.collect()

[(('csc', 'doc1'), 1),
 (('csc', 'doc2'), 1),
 (('csc', 'doc2'), 1),
 (('csc', 'doc3'), 1),
 (('csc', 'doc3'), 1),
 (('csc', 'doc3'), 1),
 (('csc', 'doc4'), 1),
 (('csc', 'doc4'), 1),
 (('csc', 'doc4'), 1),
 (('csc', 'doc4'), 1)]

In [59]:
# Sum the 1s per (term, doc id) key to obtain the per-document frequency.
rdd_csc2 = rdd_csc1.reduceByKey(lambda left, right: left + right)
rdd_csc2.collect()

[(('csc', 'doc3'), 3),
 (('csc', 'doc4'), 4),
 (('csc', 'doc1'), 1),
 (('csc', 'doc2'), 2)]

In [None]:
# The sort key can index as deep into a nested record as its structure allows

In [61]:
# Order by the doc id, i.e. the second element of the (term, doc id) key.
rdd_csc2.sortBy(lambda record: record[0][1]).collect()

[(('csc', 'doc1'), 1),
 (('csc', 'doc2'), 2),
 (('csc', 'doc3'), 3),
 (('csc', 'doc4'), 4)]

In [66]:
# countByKey() counts the records per key and hands the result back as a dict,
# so it gives the same totals here without a separate reduceByKey + collect.
rdd_csc1.countByKey()

defaultdict(int,
            {('csc', 'doc1'): 1,
             ('csc', 'doc2'): 2,
             ('csc', 'doc3'): 3,
             ('csc', 'doc4'): 4})