<a href="https://colab.research.google.com/github/Anik-Adnan/Natural-Language-Processing/blob/main/spell_corrector.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [12]:
import re
from collections import Counter

In [13]:
# Mount Google Drive (Colab only); the seed document read below lives under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [14]:
# function to tokenise words
def words(document):
    """Lower-case `document` and return its word tokens as a list.

    A token is any maximal run of `\\w` characters (letters, digits and
    underscore); everything else acts as a separator.
    """
    lowered = document.lower()
    return re.findall(r'\w+', lowered)

In [15]:
# Build a frequency table (token -> number of occurrences) for the seed document.
# The context manager closes the file handle as soon as the text has been read,
# instead of leaving it open until the garbage collector gets to it.
with open('/content/drive/MyDrive/Colab-Notebooks/seed-document.txt') as seed_file:
    all_words = Counter(words(seed_file.read()))

In [16]:
# Check the frequency of an arbitrary word, e.g. 'chair'.
# all_words is a Counter, so a word that never occurs yields 0 rather than a KeyError.
all_words['chair']

135

In [17]:
# Look at the 10 most frequent words, as (word, count) pairs from most to least common.
all_words.most_common(10)

[('the', 79809),
 ('of', 40024),
 ('and', 38312),
 ('to', 28765),
 ('in', 22023),
 ('a', 21124),
 ('that', 12512),
 ('he', 12401),
 ('was', 11410),
 ('it', 10681)]

In [18]:
def edits_one(word):
    """Return the set of strings reachable from `word` with a single edit.

    An edit is one of: deleting a character, inserting a lowercase ASCII
    letter, replacing a character with a lowercase ASCII letter, or swapping
    two adjacent characters. Because a character may be replaced by itself,
    the result contains `word` whenever `word` is non-empty.
    """
    letters = 'abcdefghijklmnopqrstuvwxyz'
    candidates = set()
    for cut in range(len(word) + 1):
        head, tail = word[:cut], word[cut:]
        # An insertion is possible at every cut point, including the very end.
        for ch in letters:
            candidates.add(head + ch + tail)
        if not tail:
            # Nothing to the right of the cut: no delete/replace/transpose here.
            continue
        # Delete the character immediately right of the cut.
        candidates.add(head + tail[1:])
        # Replace that character with each letter in turn.
        for ch in letters:
            candidates.add(head + ch + tail[1:])
        # Swap the two characters right of the cut, when there are two.
        if len(tail) > 1:
            candidates.add(head + tail[1] + tail[0] + tail[2:])
    return candidates

# Go ahead and pass the word ‘emfasize’ to the function edits_one(). What is the size of the set returned by this function?

In [32]:
# Count the distinct candidate strings edits_one() generates for 'emfasize'.
len(edits_one("emfasize"))

442

In [19]:
def edits_two(word):
    """Lazily yield every string obtained by applying two successive edits to `word`.

    Returns a generator, not a set: the same string can be produced more than
    once, so wrap the result in `set(...)` when distinct values are needed.
    """
    # The first-level edits are computed right away; second-level edits are
    # generated on demand as the caller iterates.
    first_pass = edits_one(word)
    return (twice for once in first_pass for twice in edits_one(once))

# Go ahead and pass the word ‘emfasize’ to the function edits_two(). What is the total number of unique words returned by this function?

In [38]:
# edits_two() is a generator that may yield the same string more than once,
# so collapse its output into a set before counting.
unique_words = set(edits_two("emfasize"))
len(unique_words)


90902

In [20]:
def known(words):
    """Return, as a set, the members of `words` that occur in `all_words`.

    `words` may be any iterable of strings (list, set or generator).
    """
    return {candidate for candidate in words if candidate in all_words}

In [21]:
def possible_corrections(word):
    """Generate candidate spelling corrections for `word`.

    Candidate groups are tried in order of increasing edit distance and the
    first non-empty group is returned:
      1. `word` itself, if it is a known word;
      2. known words one edit away;
      3. known words two edits away;
      4. `word` unchanged, when nothing known lies within two edits.

    Always returns a set, so callers get the same container type on every path.
    """
    # `or` short-circuits, so the expensive two-edit search only runs when the
    # cheaper lookups come back empty.
    return known([word]) or known(edits_one(word)) or known(edits_two(word)) or {word}

# Go ahead and pass the word ‘emfasize’ to the function possible_corrections(). How many words does it return?

In [39]:
# Number of candidate corrections returned for 'emfasize'.
len(possible_corrections("emfasize"))

1

In [22]:
def prob(word, N=sum(all_words.values())):
    """Relative frequency of `word`: its count in `all_words` divided by `N`.

    `N` defaults to the total number of tokens in `all_words`. The default is
    evaluated once, when this function is defined, so re-run this definition
    after rebuilding `all_words`.
    """
    occurrences = all_words[word]
    return occurrences / N

In [23]:
# Size of the one-edit candidate set for 'monney', followed by the full set.
# edits_one() already returns a set, so the set() wrapper here only makes a copy.
print(len(set(edits_one("monney"))))
print(edits_one("monney"))

336
{'tmonney', 'monnep', 'monnxy', 'mobney', 'sonney', 'monnwy', 'monneym', 'qmonney', 'oonney', 'mognney', 'mxonney', 'monbey', 'monnney', 'monnez', 'monjey', 'mogney', 'monkney', 'monneu', 'monneyq', 'monnek', 'moknney', 'mzonney', 'monnmy', 'mconney', 'monntey', 'mnoney', 'monbney', 'monneyl', 'monneya', 'montey', 'monuney', 'monnhy', 'monneny', 'monsey', 'money', 'mnonney', 'mosney', 'monoey', 'mqonney', 'monqney', 'gonney', 'ymonney', 'monnety', 'monnfy', 'fonney', 'mdnney', 'smonney', 'monnbey', 'monneyc', 'mondney', 'mznney', 'monneyg', 'monngy', 'moznney', 'omonney', 'umonney', 'xmonney', 'monniy', 'mvnney', 'monaney', 'wmonney', 'moncey', 'mozney', 'moxney', 'monncey', 'monneky', 'monyey', 'monneiy', 'monnqy', 'moenney', 'monnefy', 'honney', 'monnmey', 'moonney', 'mpnney', 'monnee', 'msnney', 'kmonney', 'monnpey', 'muonney', 'monnvy', 'monneyh', 'mkonney', 'motnney', 'mvonney', 'monner', 'monnby', 'motney', 'mxnney', 'monny', 'monnwey', 'monnewy', 'mocnney', 'lmonney', 'monne

In [24]:
# Keep only the one-edit candidates that actually occur in the seed document.
print(known(edits_one("monney")))

{'monkey', 'money'}


In [25]:
# Count the distinct strings produced by applying two successive edits to 'monney'.
print(len(set(edits_two("monney"))))
# NOTE(review): this still filters the ONE-edit candidates; given the topic of this
# cell, known(edits_two("monney")) was probably intended — confirm before changing.
print(known(edits_one("monney")))

51013
{'monkey', 'money'}


In [26]:
# Look at the possible corrections of a word: 'monney' itself is not known,
# so the known words one edit away are returned.
print(possible_corrections("monney"))

{'monkey', 'money'}


In [27]:
# Compare the corpus probabilities of the two candidates; spell_check() picks
# the candidate with the highest value.
print(prob("money"))
print(prob("monkey"))

0.0002922233626303688
5.378344097491451e-06


In [28]:
def spell_check(word):
    """Return a message with the most probable spelling correction for `word`.

    The best candidate is the member of `possible_corrections(word)` with the
    highest `prob`. The message is returned, not printed.
    """
    best_guess = max(possible_corrections(word), key=prob)
    if best_guess == word:
        return "Correct spelling."
    return "Did you mean " + best_guess + "?"

In [29]:
# Test spell_check() on a misspelling that has two known one-edit candidates.
print(spell_check("monney"))

Did you mean money?


In [47]:
import re
from collections import Counter

def words(document):
    """Lower-case `document` and return its word tokens as a list.

    A token is any maximal run of `\\w` characters (letters, digits and
    underscore); everything else acts as a separator.
    """
    lowered = document.lower()
    return re.findall(r'\w+', lowered)

# Build a frequency table (token -> number of occurrences) for the seed document.
# The context manager closes the file handle as soon as the text has been read,
# instead of leaving it open until the garbage collector gets to it.
with open('/content/drive/MyDrive/Colab-Notebooks/seed-document.txt') as seed_file:
    all_words = Counter(words(seed_file.read()))

def edits_one(word):
    """Return the set of strings reachable from `word` with a single edit.

    An edit is one of: deleting a character, inserting a lowercase ASCII
    letter, replacing a character with a lowercase ASCII letter, or swapping
    two adjacent characters. Because a character may be replaced by itself,
    the result contains `word` whenever `word` is non-empty.
    """
    letters = 'abcdefghijklmnopqrstuvwxyz'
    candidates = set()
    for cut in range(len(word) + 1):
        head, tail = word[:cut], word[cut:]
        # An insertion is possible at every cut point, including the very end.
        for ch in letters:
            candidates.add(head + ch + tail)
        if not tail:
            # Nothing to the right of the cut: no delete/replace/transpose here.
            continue
        # Delete the character immediately right of the cut.
        candidates.add(head + tail[1:])
        # Replace that character with each letter in turn.
        for ch in letters:
            candidates.add(head + ch + tail[1:])
        # Swap the two characters right of the cut, when there are two.
        if len(tail) > 1:
            candidates.add(head + tail[1] + tail[0] + tail[2:])
    return candidates

def edits_two(word):
    """Lazily yield every string obtained by applying two successive edits to `word`.

    Returns a generator, not a set: the same string can be produced more than
    once, so wrap the result in `set(...)` when distinct values are needed.
    """
    # The first-level edits are computed right away; second-level edits are
    # generated on demand as the caller iterates.
    first_pass = edits_one(word)
    return (twice for once in first_pass for twice in edits_one(once))

def known(words):
    """Return, as a set, the members of `words` that occur in `all_words`.

    `words` may be any iterable of strings (list, set or generator).
    """
    return {candidate for candidate in words if candidate in all_words}

def possible_corrections(word):
    """Generate candidate spelling corrections for `word`.

    Candidate groups are tried in order of increasing edit distance and the
    first non-empty group is returned:
      1. `word` itself, if it is a known word;
      2. known words one edit away;
      3. known words two edits away;
      4. `word` unchanged, when nothing known lies within two edits.

    Always returns a set, so callers get the same container type on every path.
    """
    # `or` short-circuits, so the expensive two-edit search only runs when the
    # cheaper lookups come back empty.
    return known([word]) or known(edits_one(word)) or known(edits_two(word)) or {word}

def prob(word, N=sum(all_words.values())):
    """Relative frequency of `word`: its count in `all_words` divided by `N`.

    `N` defaults to the total number of tokens in `all_words`. The default is
    evaluated once, when this function is defined, so re-run this definition
    after rebuilding `all_words`.
    """
    occurrences = all_words[word]
    return occurrences / N

def rectify(word):
    """Return the most probable spelling correction for `word`.

    The result is the member of `possible_corrections(word)` with the highest
    `prob`; a word that is already known comes back unchanged.
    """
    return max(possible_corrections(word), key=prob)

In [49]:
from google.colab import drive
import sys

# Mount Google Drive (Colab only) so the module folder below is reachable.
drive.mount('/content/drive')

# Add the folder that contains spell_corrector.py to the import search path;
# the entry must be the directory, not the .py file itself.
sys.path.append("/content/drive/MyDrive/Colab-Notebooks/")  # <-- directory

# Import rectify from the module on Drive. This rebinds the name `rectify`,
# replacing any definition made earlier in this notebook session.
from spell_corrector import rectify

# Smoke-test the spell corrector on a typo with two letters swapped.
correct = rectify("laern")
print(correct)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
learn
