## Spell Corrector

In [3]:
import re
from collections import Counter

In [4]:
# function to tokenize words
def words(document):
    """Lower-case `document` and split it into word tokens.

    A token is a maximal run of regex word characters (`\\w+`), so
    punctuation and whitespace act as separators and are dropped.
    Returns the tokens as a list, in document order.
    """
    lowered = document.lower()
    tokens = re.findall(r'\w+', lowered)
    return tokens

In [6]:
# Build a frequency table (word -> number of occurrences) from every token in the corpus.
# The context manager closes the file handle as soon as its contents have been read.
with open('big.txt') as corpus_file:
    all_words = Counter(words(corpus_file.read()))

In [7]:
# Total number of tokens in the corpus, then the number of distinct words.
# Summing the stored counts gives the token total without re-opening and
# re-tokenizing big.txt (which would also leave another file handle unclosed).
print(sum(all_words.values()))
print(len(all_words))

1115585
32198


In [12]:
# Check the corpus frequency of an arbitrary word, e.g. 'chair'
all_words['chair']

135

In [14]:
# Look at the 10 most frequent words together with their counts
all_words.most_common(10)

[('the', 79809),
 ('of', 40024),
 ('and', 38312),
 ('to', 28765),
 ('in', 22023),
 ('a', 21124),
 ('that', 12512),
 ('he', 12401),
 ('was', 11410),
 ('it', 10681)]

In [16]:
def edits_one(word):
    """Return the set of strings obtainable from `word` with a single edit.

    At every split point of `word` the following edits are applied:
      * insert  - put one lowercase letter at the split point
      * delete  - drop the character right after the split point
      * replace - swap that character for each lowercase letter
      * transpose - exchange the two characters after the split point

    Replacing a character with itself is included, so a non-empty `word`
    appears in its own result.
    """
    alphabets = 'abcdefghijklmnopqrstuvwxyz'
    candidates = set()
    for cut in range(len(word) + 1):
        head, tail = word[:cut], word[cut:]
        # Insertions are possible at every cut, including the very end.
        for letter in alphabets:
            candidates.add(head + letter + tail)
        if not tail:
            # Nothing after the cut: no character to delete, replace or swap.
            continue
        candidates.add(head + tail[1:])                      # delete
        for letter in alphabets:
            candidates.add(head + letter + tail[1:])         # replace
        if len(tail) > 1:
            candidates.add(head + tail[1] + tail[0] + tail[2:])  # transpose
    return candidates

In [17]:
def edits_two(word):
    """Lazily yield every string produced by applying `edits_one` twice to `word`.

    The first round of edits is computed immediately; the second round is
    generated on demand. The result is a generator, not a set, so the same
    string can be yielded more than once.
    """
    first_round = edits_one(word)
    return (twice for once in first_round for twice in edits_one(once))

In [18]:
def known(words):
    """Return, as a set, the members of `words` that occur in `all_words`."""
    return {candidate for candidate in words if candidate in all_words}

In [19]:
def possible_corrections(word):
    """Return candidate corrections for `word`, preferring the closest match.

    The first non-empty option wins:
      1. `word` itself, if it is a known word
      2. known words one edit away
      3. known words two edits away
      4. `[word]` as a last resort (a list; the other cases return a set)
    """
    exact = known([word])
    if exact:
        return exact
    one_away = known(edits_one(word))
    if one_away:
        return one_away
    two_away = known(edits_two(word))
    if two_away:
        return two_away
    return [word]

In [20]:
def prob(word, N=sum(all_words.values())): 
    """Relative frequency of `word`: its count in `all_words` divided by `N`.

    `N` defaults to the total number of tokens. The default is evaluated once,
    when this function is defined, so it reflects `all_words` as of that moment.
    """
    occurrences = all_words[word]
    return occurrences / N

In [21]:
# Number of distinct strings one edit away from "monney", then the strings themselves
# (edits_one already returns a set, so no extra de-duplication is needed)
print(len(edits_one("monney")))
print(edits_one("monney"))

336
{'mxonney', 'monneey', 'monnxey', 'mvonney', 'mvnney', 'menney', 'morney', 'moneey', 'monhey', 'monnewy', 'monneyz', 'monnky', 'monniey', 'moynney', 'mrnney', 'lonney', 'monneyp', 'monnxy', 'monneky', 'monnley', 'monneys', 'moxney', 'monnhey', 'monhney', 'mobney', 'mofnney', 'mwonney', 'monaney', 'monvey', 'moqney', 'monned', 'wmonney', 'moinney', 'motnney', 'monnev', 'monneya', 'monneyv', 'monyey', 'monjey', 'mounney', 'monnpey', 'ionney', 'monnqey', 'monrney', 'monwey', 'sonney', 'mmonney', 'mongey', 'mhnney', 'mowney', 'zmonney', 'monxney', 'qonney', 'monnef', 'monnty', 'monnbey', 'bmonney', 'monnzey', 'monneyw', 'moeney', 'moncey', 'mfonney', 'pmonney', 'mjonney', 'monneyn', 'mopnney', 'monnefy', 'monnei', 'mojney', 'mownney', 'mqnney', 'monwney', 'monnen', 'monnly', 'motney', 'monuey', 'monqney', 'amonney', 'mgnney', 'molnney', 'mosnney', 'momnney', 'nonney', 'mobnney', 'imonney', 'myonney', 'onney', 'monndey', 'monntey', 'mooney', 'monnee', 'monneyg', 'monjney', 'moniney', 'm

In [22]:
# Of the strings one edit away from "monney", keep only those that occur in the corpus
print(known(edits_one("monney")))

{'monkey', 'money'}


In [23]:
# Let's look at words that are two edits away: first the number of distinct strings generated
print(len(set(edits_two("monney"))))
# NOTE(review): this prints the known ONE-edit words again (same as the previous cell);
# presumably known(edits_two("monney")) was intended here -- confirm and re-run the cell.
print(known(edits_one("monney")))

51013
{'monkey', 'money'}


In [24]:
# Let's look at the possible corrections of a misspelled word
# (the first non-empty set among: the word itself, known one-edit words, known two-edit words)
print(possible_corrections("monney"))

{'monkey', 'money'}


In [25]:
# Let's compare the corpus probabilities of the two candidate corrections
print(prob("money"))
print(prob("monkey"))

0.0002922233626303688
5.378344097491451e-06


In [26]:
def spell_check(word):
    """Return a message with the most probable spelling correction for `word`.

    Returns:
        "Correct spelling." if `word` occurs in `all_words`;
        "Did you mean <candidate>?" with the highest-probability candidate
        from `possible_corrections(word)` otherwise;
        "Unknown word; no suggestion found." if `word` is not in `all_words`
        and no known word lies within two edits of it.
    """
    if word in all_words:
        return "Correct spelling."
    correct_word = max(possible_corrections(word), key=prob)
    if correct_word != word:
        return "Did you mean " + correct_word + "?"
    # possible_corrections fell back to [word]: the word is unknown and nothing
    # close to it is known either, so it must not be reported as correct.
    return "Unknown word; no suggestion found."

In [27]:
# Test spell_check on a misspelled word ("money" has the higher corpus probability of the two candidates)
print(spell_check("monney"))

Did you mean money?
