In [2]:
import string
import re
import numpy as np
from collections import Counter


In [3]:
def read_corpus(filename):
    with open(filename,'r',encoding='utf-8') as file:
        lines = file.readlines()
        
        words = []
        for word in lines:
            words += re.findall(r'\w+',word.lower())
    return words

# invoke this function
corpus = read_corpus(r't8.shakespeare.txt')

In [4]:
len(corpus)

929396

In [5]:
vocab = set(corpus)
len(vocab)

23902

In [6]:
words_count = Counter(corpus)
words_count

Counter({'this': 6853,
         'is': 9784,
         'the': 27660,
         '100th': 1,
         'etext': 245,
         'file': 20,
         'presented': 18,
         'by': 4476,
         'project': 251,
         'gutenberg': 236,
         'and': 26784,
         'in': 11123,
         'cooperation': 2,
         'with': 8016,
         'world': 905,
         'library': 232,
         'inc': 224,
         'from': 2654,
         'their': 2077,
         'of': 18191,
         'future': 17,
         'shakespeare': 268,
         'cdroms': 1,
         'often': 124,
         'releases': 1,
         'etexts': 8,
         'that': 11549,
         'are': 3894,
         'not': 8740,
         'placed': 11,
         'public': 57,
         'domain': 4,
         'has': 388,
         'certain': 176,
         'copyright': 228,
         'implications': 1,
         'you': 13860,
         'should': 1580,
         'read': 207,
         'electronic': 443,
         'version': 222,
         'complete': 246,
       

In [7]:
total_words_count = float(sum(words_count.values()))

In [8]:
word_probabs = {word:words_count[word] / total_words_count for word in words_count.keys()}

In [9]:
word_probabs['the']

0.029761264304989477

In [10]:
def split(word): # why
    return [ (word[:i], word[i:])  for i in range(len(word) + 1)]

In [12]:
print(split('shakespeare'))

[('', 'shakespeare'), ('s', 'hakespeare'), ('sh', 'akespeare'), ('sha', 'kespeare'), ('shak', 'espeare'), ('shake', 'speare'), ('shakes', 'peare'), ('shakesp', 'eare'), ('shakespe', 'are'), ('shakespea', 're'), ('shakespear', 'e'), ('shakespeare', '')]


In [13]:
def delete(word):
    return [left + right[1:] for left,right in split(word) if right]

In [14]:
print(delete('shakespeare'))

['hakespeare', 'sakespeare', 'shkespeare', 'shaespeare', 'shakspeare', 'shakepeare', 'shakeseare', 'shakespare', 'shakespere', 'shakespeae', 'shakespear']


In [15]:
def swap(word):
    return [left + right[1] + right[0] + right[2:] for left,right in split(word) if len(right) > 1 ]

In [16]:
print(swap('shakespeare'))

['hsakespeare', 'sahkespeare', 'shkaespeare', 'shaekspeare', 'shaksepeare', 'shakepseare', 'shakesepare', 'shakespaere', 'shakesperae', 'shakespeaer']


In [17]:
def replace(word): # abcdef...z
    return [left + center + right[1:] for left, right in split(word) if right for center in string.ascii_lowercase]

In [18]:
print(replace('shakespeare'))

['ahakespeare', 'bhakespeare', 'chakespeare', 'dhakespeare', 'ehakespeare', 'fhakespeare', 'ghakespeare', 'hhakespeare', 'ihakespeare', 'jhakespeare', 'khakespeare', 'lhakespeare', 'mhakespeare', 'nhakespeare', 'ohakespeare', 'phakespeare', 'qhakespeare', 'rhakespeare', 'shakespeare', 'thakespeare', 'uhakespeare', 'vhakespeare', 'whakespeare', 'xhakespeare', 'yhakespeare', 'zhakespeare', 'saakespeare', 'sbakespeare', 'scakespeare', 'sdakespeare', 'seakespeare', 'sfakespeare', 'sgakespeare', 'shakespeare', 'siakespeare', 'sjakespeare', 'skakespeare', 'slakespeare', 'smakespeare', 'snakespeare', 'soakespeare', 'spakespeare', 'sqakespeare', 'srakespeare', 'ssakespeare', 'stakespeare', 'suakespeare', 'svakespeare', 'swakespeare', 'sxakespeare', 'syakespeare', 'szakespeare', 'shakespeare', 'shbkespeare', 'shckespeare', 'shdkespeare', 'shekespeare', 'shfkespeare', 'shgkespeare', 'shhkespeare', 'shikespeare', 'shjkespeare', 'shkkespeare', 'shlkespeare', 'shmkespeare', 'shnkespeare', 'shokespe

In [19]:
def insert(word): # abcdef...z
    return [left + center + right[1:] for left, right in split(word) for center in string.ascii_lowercase]

In [21]:
print(replace('monkey'))

['aonkey', 'bonkey', 'conkey', 'donkey', 'eonkey', 'fonkey', 'gonkey', 'honkey', 'ionkey', 'jonkey', 'konkey', 'lonkey', 'monkey', 'nonkey', 'oonkey', 'ponkey', 'qonkey', 'ronkey', 'sonkey', 'tonkey', 'uonkey', 'vonkey', 'wonkey', 'xonkey', 'yonkey', 'zonkey', 'mankey', 'mbnkey', 'mcnkey', 'mdnkey', 'menkey', 'mfnkey', 'mgnkey', 'mhnkey', 'minkey', 'mjnkey', 'mknkey', 'mlnkey', 'mmnkey', 'mnnkey', 'monkey', 'mpnkey', 'mqnkey', 'mrnkey', 'msnkey', 'mtnkey', 'munkey', 'mvnkey', 'mwnkey', 'mxnkey', 'mynkey', 'mznkey', 'moakey', 'mobkey', 'mockey', 'modkey', 'moekey', 'mofkey', 'mogkey', 'mohkey', 'moikey', 'mojkey', 'mokkey', 'molkey', 'momkey', 'monkey', 'mookey', 'mopkey', 'moqkey', 'morkey', 'moskey', 'motkey', 'moukey', 'movkey', 'mowkey', 'moxkey', 'moykey', 'mozkey', 'monaey', 'monbey', 'moncey', 'mondey', 'moneey', 'monfey', 'mongey', 'monhey', 'moniey', 'monjey', 'monkey', 'monley', 'monmey', 'monney', 'monoey', 'monpey', 'monqey', 'monrey', 'monsey', 'montey', 'monuey', 'monvey',

In [22]:
def level_one_edits(word):
    return set((delete(word) + swap(word) + replace(word) + insert(word)))

In [23]:
print(level_one_edits('load'))

{'hoad', 'uoad', 'lotd', 'loxd', 'noad', 'loadb', 'liad', 'joad', 'lomd', 'loag', 'ltad', 'lgad', 'loae', 'laad', 'lokd', 'ooad', 'loar', 'ioad', 'lkad', 'lcad', 'loado', 'loaz', 'loan', 'voad', 'lead', 'lozd', 'lfad', 'poad', 'loas', 'luad', 'loadr', 'loam', 'doad', 'loaq', 'ljad', 'loda', 'loadh', 'loyd', 'loady', 'road', 'loadm', 'ldad', 'yoad', 'olad', 'lowd', 'lqad', 'toad', 'loadx', 'qoad', 'loade', 'soad', 'lnad', 'loap', 'goad', 'loao', 'loah', 'woad', 'loadi', 'loai', 'loadj', 'lrad', 'loax', 'lodd', 'loay', 'aoad', 'lzad', 'loadu', 'loaj', 'lpad', 'lood', 'lobd', 'lold', 'losd', 'loaa', 'lyad', 'loak', 'lod', 'lbad', 'laod', 'loud', 'lond', 'moad', 'lord', 'loadd', 'loadq', 'loa', 'lofd', 'lojd', 'koad', 'loada', 'lwad', 'loal', 'loadt', 'eoad', 'lmad', 'loads', 'lopd', 'lvad', 'loab', 'loadv', 'loadw', 'boad', 'loav', 'loid', 'loadp', 'zoad', 'lad', 'loac', 'loadf', 'loadz', 'loat', 'loadg', 'locd', 'loaw', 'loadk', 'loaf', 'oad', 'llad', 'loadc', 'loadn', 'lohd', 'lhad', 'l

In [24]:
def level_two_edits(word):
    return set(e2  for e1 in level_one_edits(word) for e2 in level_one_edits(e1))

In [25]:
print(level_two_edits('cut'))

{'wutr', 'cutzt', 'uts', 'wmt', 'butw', 'ceu', 'lur', 'cuttm', 'bmt', 'zuty', 'zua', 'hutr', 'cutmm', 'vutl', 'cuvh', 'ilt', 'cok', 'rutu', 'ikt', 'cktk', 'cutis', 'cutxe', 'coi', 'cusr', 'ctq', 'curq', 'uup', 'csc', 'cufm', 'czb', 'cukv', 'cuxz', 'cuthk', 'cutzu', 'vlt', 'chtn', 'cg', 'okt', 'uus', 'zutn', 'cmo', 'uutd', 'cyh', 'rug', 'cpth', 'vum', 'cdn', 'chtm', 'cuthd', 'hutg', 'drt', 'vuv', 'dkt', 'xuk', 'cxo', 'cubx', 'dutp', 'hute', 'cut', 'csh', 'qup', 'sxt', 'cutpc', 'cyv', 'cutru', 'rot', 'chf', 'cuof', 'gutf', 'cal', 'cutdi', 'cukw', 'cutgj', 'cusw', 'cbl', 'ujt', 'cgo', 'cutw', 'cutod', 'qkt', 'xutn', 'cutmq', 'cfto', 'quty', 'dutm', 'cbtj', 'cuxd', 'uug', 'cktu', 'vuz', 'cnu', 'cap', 'cbtv', 'cuhi', 'vutd', 'cdi', 'cuac', 'cucs', 'ruty', 'cutah', 'cbtp', 'citd', 'duj', 'ckk', 'cuzi', 'rpt', 'cktl', 'nuw', 'bit', 'qutv', 'yutb', 'wutp', 'cbta', 'oxt', 'utx', 'cytn', 'guth', 'crtp', 'ckh', 'cuuv', 'suw', 'ctut', 'cusa', 'ou', 'cuttv', 'quts', 'cata', 'cutnx', 'cuak', 'hqt', 

In [26]:
def correct_spelling(word,vocab,word_probabs):
    if word in vocab:
        print(f"{word} is already correctly spelled")
        return 
    #getting all suggesions
    suggestions = level_one_edits(word) or level_two_edits(word) or [word]
    best_guesses = [w for w in suggestions if w in vocab]
    return [(w, word_probabs[w]) for w in best_guesses]

In [36]:
search_word = "willima"
guess = correct_spelling(search_word,vocab,word_probabs)
print(guess)

[('william', 0.00037551269856982383)]
