In [98]:
import re
from functools import reduce
from google.colab import files


# My date of birth : 11/18/2000 (18th November 2000)
# Book Number: 6 (11/2 = 5.5 ~ 6)
# File1 = pages wrt birth date(18) :- PAGES(18-27)
# File2 = pages wrt birth year(2000) :- PAGES(100-109)
# Function to process text and map words to (word, 1)
def mapping_words_file1(txt_file1):
    # Converting the text to lowercase
    txt_file1 = txt_file1.lower()

    # Removing the punctuation **except** apostrophes inside the words
    cleaned_txt = ""
    for char in txt_file1:
        if re.match(r"[\w\s’'\-]", char):  # Keeping the word characters, spaces, apostrophes, and hyphens
            cleaned_txt += char
        else:
            cleaned_txt += " "  # Replacing the other characters with a space

    txt_file1 = cleaned_txt

    # Splitting text into words while keeping contractions and removing numbers
    wrds = []
    for wrd in txt_file1.split():
        if re.fullmatch(r"[a-zA-Z]+(['’][a-zA-Z]+)?", wrd):
            wrds.append(wrd)

    # Returning words as (word, 1) tuples
    return [(wrd, 1) for wrd in wrds]

# Reduce function: This will count the number of occurrences of each word
def reduce_words(cnt_of_wrds, new_wrd_cnts):
    for wrd, count in new_wrd_cnts:
        if wrd in cnt_of_wrds:
            cnt_of_wrds[wrd] += count
        else:
            cnt_of_wrds[wrd] = count

    return cnt_of_wrds

# MapReduce function
def mapreduce(path_file1):
    # Read the content of the file
    with open(path_file1, 'r', encoding='utf-8') as file:
        txt_file1 = file.read()

    # Map step: extract words and map them to (word, 1)
    mapped_data = mapping_words_file1(txt_file1)

    # Reduce step: count occurrences of each word
    cnt_of_wrds = reduce(reduce_words, [mapped_data], {})

    return cnt_of_wrds

# Google Colab File Upload
print("Upload text file")  # Prompt to upload the file
upload_file1 = files.upload()  # Upload the file manually

# Getting the uploaded file name, because i need to upload only one file i'm using this
path_file1 = list(upload_file1.keys())[0]  # Get the uploaded file name (e.g., 'file1.txt')

# Runnig the MapReduce function with the uploaded file
cnt_of_wrds = mapreduce(path_file1)

# Printing the word counts
print(f"\nWord counts for file '{path_file1}':")
for word, count in cnt_of_wrds.items():
    print(f'{word}: {count}')


Upload text file


Saving file1.txt to file1.txt

Word counts for file 'file1.txt':
p: 10
a: 92
g: 10
e: 10
harry: 10
potter: 10
and: 59
the: 140
half: 10
blood: 10
prince: 10
j: 10
k: 10
rowling: 10
there: 8
were: 9
streaks: 1
of: 51
gray: 1
in: 31
his: 17
mane: 1
tawny: 1
hair: 3
bushy: 1
eyebrows: 1
he: 32
had: 19
keen: 1
yellowish: 1
eyes: 7
behind: 7
pair: 2
spectacles: 1
certain: 1
rangy: 1
loping: 1
grace: 1
even: 1
though: 7
walked: 1
with: 15
slight: 1
limp: 1
was: 19
an: 9
immediate: 1
impression: 2
shrewdness: 1
toughness: 1
prime: 16
minister: 16
thought: 3
understood: 1
why: 1
wizarding: 1
community: 1
preferred: 1
scrimgeour: 13
to: 50
fudge: 5
as: 22
leader: 1
these: 2
dangerous: 3
times: 1
how: 2
do: 8
you: 24
said: 26
politely: 1
holding: 2
out: 6
hand: 4
grasped: 1
it: 11
briefly: 1
scanning: 1
room: 4
then: 8
pulled: 1
wand: 7
from: 13
under: 6
robes: 1
told: 3
everything: 1
asked: 5
striding: 1
over: 5
door: 10
tapping: 1
keyhole: 1
heard: 3
lock: 1
click: 1
er: 2
yes: 3
if: 5
don’t: 

In [15]:
!pip install pyspellchecker


Collecting pyspellchecker
  Downloading pyspellchecker-0.8.2-py3-none-any.whl.metadata (9.4 kB)
Downloading pyspellchecker-0.8.2-py3-none-any.whl (7.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m31.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyspellchecker
Successfully installed pyspellchecker-0.8.2


In [92]:
from spellchecker import SpellChecker
from collections import defaultdict
import re

# Initializing the SpellChecker
spell_chkr = SpellChecker()

def if_it_is_non_english(wrd):
    #Checking if the word is non-English
    return wrd.lower() not in spell_chkr and wrd.lower()

def mapper(line):
    """Tokenizes a line and returns non-English words"""
    wrds = []
    matching_file2 = re.finditer(r"\b[a-zA-Z]+(?:'[a-zA-Z]+)?\b", line)

    for match in matching_file2:  # Iterating over the matching
      wrds.append(match.group())  # Adding the matched word to the list


    filtered_wrds = []
    for wrd in wrds:
       if if_it_is_non_english(wrd):
         filtered_wrds.append(wrd.lower())

    return filtered_wrds


def reducer(mapped_data):
    """Aggregates counts for each non-English word"""
    wrd_counts = defaultdict(int)
    for wrd in mapped_data:
        wrd_counts[wrd] += 1
    return wrd_counts

# Reading the file content properly
path_file2 = "/content/file2.txt"  # path of the file

content_file2 = []
file = open(path_file2, "r", encoding="utf-8")  # Opening the file manually
try:
  # Reading the file line by line and storing it in the list
    for line in file:
        content_file2.append(line)
finally:
    file.close()  # Ensuring the file is closed after reading properly



# 1: Map-phase (processing the text line by line)
mapped_results = []
for each_line in content_file2:
    mapped_results.extend(mapper(each_line))  # Mapping each line

# 2: Reduce phase (aggregating the word counts)
final_counts = reducer(mapped_results)

# 3: Output of the final non-English words
# Creating a list of tuples sorted by count in descending order
sorted_cnts = []
for item in final_counts.items():
    sorted_cnts.append(item)

# Now sorting the list by the second element (count) in descending order
sorted_cnts.sort(key=lambda x: x[1], reverse=True)


# Iterating over the sorted list
for wrd, count in sorted_cnts:
    print(f"{wrd}: {count}")


hermione: 28
weasley: 12
fleur: 8
tonks: 6
arry: 4
ve: 4
umbridge: 3
ll: 3
isn: 3
eet: 2
hadn: 2
hasn: 2
wasn: 2
didn: 2
slytherin: 1
slughorn: 1
delacour: 1
im: 1
seester: 1
ze: 1
gringotts: 1
eenglish: 1
zere: 1
tchah: 1
auror: 1
triwizard: 1
azkaban: 1
lestrange: 1
couldn: 1
doesn: 1
diagon: 1
malfoy: 1
