## File input and unique data separation

In [40]:
import re

In [41]:
words = []
vocab = []

with open('big.txt','r') as fd:
    lines = fd.readlines()
    for line in lines:
        words += re.findall(r'\w+', line.lower())

print("Total number of words ",len(words))
vocab = list(set(words))
print("Total number of unique words ",len(vocab))

print(vocab[:100])

Total number of words  1115585
Total number of unique words  32198
['freesoil', 'rustling', 'oven', 'buttock', 'habit', 'franklin_', 'zakharych', 'scant', 'selle', 'thudding', 'tour', 'furieuse', 'significant', 'citizen', 'hopelessly', 'audible', 'outbreaks', 'moistened', 'suave', 'lacrymal', 'everybody', 'thinkers', 'underline', 'engendered', '_granulomata_', 'internode', 'raft', 'promoters', '1680', '1760', 'hey', 'adults', 'solemnity', 'suggest', 'burnoose', 'foaming', '2003', 'lambskin', 'initiatives', 'apraksins', 'wicker', 'dot', 'refugees', 'ta', 'elets', 'study', 'amputating', '_endothelioma_', 'introduction', 'ulyulyulyu', 'amuses', 'anti', 'solidified', 'correcting', 'roses', 'spain', 'turtle', 'regent', 'try', 'butts', 'rashes', 'irresistible', 'seeming', 'persevered', 'incisors', 'teamster', 'unquestioning', 'eyeglasses', '281', 'chagrined', 'scathing', 'suffice', 'hypodermic', 'palsies', '99ff', 'tproo', 'tuberculosis_', 'tag', 'wherefore', 'sanguinary', 'wonderful', 'conf

## Probability distribution of each word

In [42]:
words.count("is")

9773

In [43]:
from tqdm import tqdm

In [61]:
probability_distribution = dict()

for word in tqdm(vocab):
    probability_distribution[word] = float(words.count(word) / len(words))

  0%|          | 0/32198 [00:00<?, ?it/s]

100%|██████████| 32198/32198 [11:23<00:00, 47.09it/s]


In [62]:
print(len(probability_distribution))

for word in vocab[:10]:
    print(word,":",probability_distribution[word])

print("the",probability_distribution["the"])

32198
freesoil : 8.963906829152417e-07
rustling : 6.2747347804066925e-06
oven : 6.2747347804066925e-06
buttock : 2.061698570705056e-05
habit : 4.9301487560338295e-05
franklin_ : 8.963906829152417e-07
zakharych : 1.7927813658304835e-06
scant : 3.585562731660967e-06
selle : 8.963906829152417e-07
thudding : 8.963906829152417e-07
the 0.07154004401278254


## Word Modification

In [70]:
def split(word):
    output = []
    for i in range(len(word)+1):
        l = word[:i]
        r = word[i:]
        output.append([l,r])
    return output

In [71]:
data = split("happy")
data

[['', 'happy'],
 ['h', 'appy'],
 ['ha', 'ppy'],
 ['hap', 'py'],
 ['happ', 'y'],
 ['happy', '']]

#### <font color="orange">1. Deletion</font>

thae -> the

In [80]:
def delete(word):
    output = []

    for l,r in split(word):
        output+=[l+r[1:]]

    return output

delete("happy")

['appy', 'hppy', 'hapy', 'hapy', 'happ', 'happy']

#### <font color="orange">2. Swap</font>

teh -> the

In [83]:
def swap(word):
    output = []

    for l,r in split(word):
        if len(r)>1:
            output+=[l+r[1]+r[0]+r[2:]]

    return output

swap("happy")

['ahppy', 'hpapy', 'happy', 'hapyp']

#### <font color="orange">3. Replace</font>

tha -> the

In [84]:
characters = 'abcdefghijklmnopqrstuvwxyz'

In [85]:
def replace(word):
    output = []

    for l,r in split(word):
        for ch in characters:
            output += [l+ch+r[1:]]

    return output

replace("happy")

['aappy',
 'bappy',
 'cappy',
 'dappy',
 'eappy',
 'fappy',
 'gappy',
 'happy',
 'iappy',
 'jappy',
 'kappy',
 'lappy',
 'mappy',
 'nappy',
 'oappy',
 'pappy',
 'qappy',
 'rappy',
 'sappy',
 'tappy',
 'uappy',
 'vappy',
 'wappy',
 'xappy',
 'yappy',
 'zappy',
 'happy',
 'hbppy',
 'hcppy',
 'hdppy',
 'heppy',
 'hfppy',
 'hgppy',
 'hhppy',
 'hippy',
 'hjppy',
 'hkppy',
 'hlppy',
 'hmppy',
 'hnppy',
 'hoppy',
 'hpppy',
 'hqppy',
 'hrppy',
 'hsppy',
 'htppy',
 'huppy',
 'hvppy',
 'hwppy',
 'hxppy',
 'hyppy',
 'hzppy',
 'haapy',
 'habpy',
 'hacpy',
 'hadpy',
 'haepy',
 'hafpy',
 'hagpy',
 'hahpy',
 'haipy',
 'hajpy',
 'hakpy',
 'halpy',
 'hampy',
 'hanpy',
 'haopy',
 'happy',
 'haqpy',
 'harpy',
 'haspy',
 'hatpy',
 'haupy',
 'havpy',
 'hawpy',
 'haxpy',
 'haypy',
 'hazpy',
 'hapay',
 'hapby',
 'hapcy',
 'hapdy',
 'hapey',
 'hapfy',
 'hapgy',
 'haphy',
 'hapiy',
 'hapjy',
 'hapky',
 'haply',
 'hapmy',
 'hapny',
 'hapoy',
 'happy',
 'hapqy',
 'hapry',
 'hapsy',
 'hapty',
 'hapuy',
 'hapvy',


#### <font color="orange">4. Insertion</font>

th -> the

In [86]:
def insertion(word):
    output = []

    for l,r in split(word):
        for ch in characters:
            output += [l+ch+r]

    return output

insertion("happy")

['ahappy',
 'bhappy',
 'chappy',
 'dhappy',
 'ehappy',
 'fhappy',
 'ghappy',
 'hhappy',
 'ihappy',
 'jhappy',
 'khappy',
 'lhappy',
 'mhappy',
 'nhappy',
 'ohappy',
 'phappy',
 'qhappy',
 'rhappy',
 'shappy',
 'thappy',
 'uhappy',
 'vhappy',
 'whappy',
 'xhappy',
 'yhappy',
 'zhappy',
 'haappy',
 'hbappy',
 'hcappy',
 'hdappy',
 'heappy',
 'hfappy',
 'hgappy',
 'hhappy',
 'hiappy',
 'hjappy',
 'hkappy',
 'hlappy',
 'hmappy',
 'hnappy',
 'hoappy',
 'hpappy',
 'hqappy',
 'hrappy',
 'hsappy',
 'htappy',
 'huappy',
 'hvappy',
 'hwappy',
 'hxappy',
 'hyappy',
 'hzappy',
 'haappy',
 'habppy',
 'hacppy',
 'hadppy',
 'haeppy',
 'hafppy',
 'hagppy',
 'hahppy',
 'haippy',
 'hajppy',
 'hakppy',
 'halppy',
 'hamppy',
 'hanppy',
 'haoppy',
 'happpy',
 'haqppy',
 'harppy',
 'hasppy',
 'hatppy',
 'hauppy',
 'havppy',
 'hawppy',
 'haxppy',
 'hayppy',
 'hazppy',
 'hapapy',
 'hapbpy',
 'hapcpy',
 'hapdpy',
 'hapepy',
 'hapfpy',
 'hapgpy',
 'haphpy',
 'hapipy',
 'hapjpy',
 'hapkpy',
 'haplpy',
 'hapmpy',

## <font color="green">Prediction</font>

In [117]:
def check_word(word):

    # get all possible solutions
    possible_solutions = []
    possible_solutions.extend(delete(word))
    possible_solutions.extend(swap(word))
    possible_solutions.extend(replace(word))
    possible_solutions.extend(insertion(word))

    # get unique solutions
    possible_solutions = list(set(possible_solutions))

    # search that words are also in vocab
    solutions = []

    for word in possible_solutions:
        if word in probability_distribution.keys():
            solutions.append([word, probability_distribution[word]])
    
    # sort based on frequency of occurrence
    solutions.sort(key=lambda x: x[1])
    
    # create the answer list
    answer = []
    for word in range(len(solutions)-1,-1,-1):
        answer.append(solutions[word][0])
    
    return answer[0:5]

check_word("tha")

['the', 'that', 'than', 'tea', 'ha']

In [120]:
check_word("hapey")

['happy', 'haney']

In [121]:
check_word("sod")

['so', 'god', 'son', 'sad', 'sold']

In [122]:
check_word("necessory")

['necessary']