In [None]:
# Install mrjob if not already installed
!pip install mrjob



In [None]:
# !pip install pyspark

In [None]:
# # Import necessary libraries
# from mrjob.job import MRJob
# from pyspark.sql import SparkSession

# # Create a SparkSession
# spark = SparkSession.builder \
#     .appName("WordCount") \
#     .getOrCreate()

In [None]:
%%file wc.py
from mrjob.job import MRJob
import re

# Tokenizer: each match is a run of word characters (letters, digits,
# underscore) and/or straight apostrophes.
WORD_RE = re.compile(r"[\w']+")

class MRWordFrequencyCount(MRJob):
    """Count how often each word occurs in the input, ignoring case.

    Words are whatever ``WORD_RE`` matches on a line; each one is
    lower-cased before counting, so "The" and "the" share one total.
    """

    def mapper(self, _, line):
        """Emit ``(word, 1)`` for every token found on ``line``.

        The input key is unused.
        """
        for word in WORD_RE.findall(line):
            # Emit each word with a count of 1
            yield word.lower(), 1

    def combiner(self, key, values):
        """Pre-sum counts on the mapper side.

        Addition is associative and commutative, so summing partial groups
        here leaves the reducer's totals unchanged while reducing the number
        of pairs handed to the shuffle.
        """
        yield key, sum(values)

    def reducer(self, key, values):
        """Emit ``(word, total)`` where total is the sum of all counts."""
        yield key, sum(values)


# Run the job only when wc.py is executed as a script; a plain `import wc`
# skips this block.
if __name__ == '__main__':
    MRWordFrequencyCount.run()

Overwriting wc.py


In [None]:
import importlib
import re

import wc

# The `%%file wc.py` cell can rewrite the module after it has already been
# imported in this kernel; a plain `import` would then return the cached
# (stale) module. Reload so the job class matches the file on disk.
wc = importlib.reload(wc)

mr_job = wc.MRWordFrequencyCount(args=['/content/text_01_hw02.txt'])
with mr_job.make_runner() as runner:
    runner.run()
    # Turn the runner's output back into (word, count) pairs and print them.
    for key, value in mr_job.parse_output(runner.cat_output()):
        print(key, value)



fright 1
from 10
front 2
frozen 1
fulfill 1
full 2
fully 1
furrowed 1
fuss 1
g 1
gap 2
gigantic 1
give 1
glimmering 1
glimpse 1
gnarled 1
go 4
goblet 9
going 4
gold 1
gone 3
good 2
got 1
grasping 1
grate 1
graying 1
greatly 1
grip 1
gripping 1
groped 1
growing 1
had 21
hair 1
hairs 1
hall 2
hand 3
hands 1
hard 2
harry 15
has 5
have 16
having 1
he 41
head 4
health 1
hear 4
heard 6
hearth 1
heavy 1
her 8
here 5
hidden 1
hide 1
high 1
him 10
himself 5
his 38
hiss 2
hissed 1
hissing 2
hoarse 1
hogwarts 1
hold 1
holidays 1
honor 1
hook 1
horrible 1
horrified 1
hot 2
hours 1
house 6
how 3
however 1
hungry 1
hurried 1
i 61
icy 1
idea 3
identities 1
if 14
impossible 1
in 29
inclined 1
incoherently 1
incredibly 1
indeed 2
information 2
inns 1
inserted 1
inside 4
inspiration 1
insult 1
intently 1
interest 1
interesting 1
into 13
intruders 1
invaluable 1
invite 1
is 20
it 32
its 4
ivy 1
j 9
jorkins 3
journey 2
judging 1
just 1
k 9
kettle 2
key 2
kill 3
killed 2
kind 1
kitchen 3
knee 1
knew 2
know

In [None]:
!pip install pyspellchecker



In [None]:
%%file wc_non_eng.py

import re
from mrjob.job import MRJob
from spellchecker import SpellChecker

# Regular expression used to extract words: each match is a run of word
# characters (letters, digits, underscore) and/or straight apostrophes.
WORD_RE = re.compile(r"[\w']+")

class MRNonEnglishWordCount(MRJob):
    """Count tokens that the English spell checker does not recognise.

    Tokens are whatever ``WORD_RE`` matches on a line, lower-cased. Because
    ``\\w`` also matches digits, purely numeric tokens are checked against
    the dictionary as well.

    NOTE(review): the recorded output contains fragments such as 'hadn',
    'll' and 've' -- presumably the input uses typographic apostrophes that
    WORD_RE does not match, which splits contractions. Confirm against the
    input file before relying on those counts.
    """

    def mapper_init(self):
        """Create the spell checker once per mapper task.

        Building it inside ``mapper`` would construct a new ``SpellChecker``
        for every input line; the instance does not depend on the line, so
        it is created here and reused.
        """
        self._spell_checker = SpellChecker(language='en')

    def mapper(self, _, line):
        """Emit ``(word, 1)`` for every token on ``line`` not in the dictionary.

        The input key is unused.
        """
        for word in WORD_RE.findall(line):
            lowered = word.lower()
            # Check if the word is non-English
            if lowered not in self._spell_checker:
                # Emit the non-English word with a count of 1
                yield lowered, 1

    def combiner(self, word, counts):
        """Pre-sum counts on the mapper side; totals are unaffected."""
        yield word, sum(counts)

    def reducer(self, word, counts):
        """Emit ``(word, total)``: sum up the counts for each non-English word."""
        yield word, sum(counts)

# Run the job only when wc_non_eng.py is executed as a script; a plain
# `import wc_non_eng` skips this block.
if __name__ == '__main__':
    MRNonEnglishWordCount.run()

Overwriting wc_non_eng.py


In [None]:
import importlib

import wc_non_eng

# The `%%file wc_non_eng.py` cell can rewrite the module after it has already
# been imported in this kernel; a plain `import` would then return the cached
# (stale) module. Reload so the job class matches the file on disk.
wc_non_eng = importlib.reload(wc_non_eng)

mr_job = wc_non_eng.MRNonEnglishWordCount(args=['/content/txt_02_hw02.txt'])

with mr_job.make_runner() as runner:
    runner.run()
    # Turn the runner's output back into (word, count) pairs and print them.
    for key, value in mr_job.parse_output(runner.cat_output()):
        print(key, value)



george 1
gladrags 1
hadn 1
hermione 5
hogsmeade 1
hogwarts 2
irish 1
krum 2
lanternlit 1
ll 6
london 1
malfoy 1
mr 16
mrs 1
muggle 2
muggles 1
omnioculars 4
oooh 1
paris 1
percy 5
portkeys 1
quidditch 1
ron 9
rowling 10
saleswizard 1
shouldn 1
skower 1
undrunk 1
ve 6
viktor 1
wasn 1
weasley 10
weasleys 1
weatherby 2
winky 10
wizardwear 1
100 1
101 1
102 1
103 1
104 1
105 1
106 1
107 1
108 1
109 1
ali 1
apparating 1
aren 1
arthur 1
axminster 1
barty 4
batlike 1
britain 1
bulgarian 4
bulgarians 1
charmable 1
christmas 1
cornelius 2
couldn 1
didn 1
disapparated 1
firebolts 1
fred 3
