In [11]:
import re
import pandas as pd
from tqdm import tqdm

#1.Finding unique words

In [12]:
with open('big.txt','r') as fd:
  lines = fd.readlines()
  words = []
  for line in lines:
    words += re.findall('\w+',line.lower())
print(len(words))
vocab = set(words)
print(len(vocab))

184982
12979


# 2.Finding probability distribution

In [13]:
word_probability = {}
for word in tqdm(vocab):
  word_probability[word] = float(words.count(word)/len(words))

100%|██████████| 12979/12979 [00:58<00:00, 220.47it/s]


In [14]:
len(word_probability)

12979

### 3. Text preprocessing
#### 3.1) Splitting and Deleting
"loive" -> "love"

In [29]:
def split(word):
  parts = []
  for i in range(len(word)+1):
    parts += [(word[:i], word[i:])]
  return parts
print(split('anurag'))

def delete(word):
  output = []
  for l,r in split(word):
    output.append(l+r[1:])
  return output
delete('heallo')

[('', 'anurag'), ('a', 'nurag'), ('an', 'urag'), ('anu', 'rag'), ('anur', 'ag'), ('anura', 'g'), ('anurag', '')]


['eallo', 'hallo', 'hello', 'healo', 'healo', 'heall', 'heallo']

#### 3.2) Swap
'lvoe' -> 'love'

In [30]:
def swap(word):
  output = []
  for l,r in split(word):
    if(len(r)>1):
      output.append(l + r[1]+r[0] + r[2:])
  return output
swap('lvoe')

['vloe', 'love', 'lveo']

#### 3.3) Replace
'lave' -> 'love'

In [31]:
def replace(word):
  output = []
  for l,r in split(word):
    for i in range(0,26):
      output.append(l+chr(97+i)+r[1:])
  return output
len(replace('lave'))

130

#### 3.4) Insert
'lve' -> 'love'

In [32]:
def insert(word):
    output = []
    for l,r in split(word):
        for i in range(0,26):
            output.append(l+chr(97+i)+r)
    return output
insert('lve')

['alve',
 'blve',
 'clve',
 'dlve',
 'elve',
 'flve',
 'glve',
 'hlve',
 'ilve',
 'jlve',
 'klve',
 'llve',
 'mlve',
 'nlve',
 'olve',
 'plve',
 'qlve',
 'rlve',
 'slve',
 'tlve',
 'ulve',
 'vlve',
 'wlve',
 'xlve',
 'ylve',
 'zlve',
 'lave',
 'lbve',
 'lcve',
 'ldve',
 'leve',
 'lfve',
 'lgve',
 'lhve',
 'live',
 'ljve',
 'lkve',
 'llve',
 'lmve',
 'lnve',
 'love',
 'lpve',
 'lqve',
 'lrve',
 'lsve',
 'ltve',
 'luve',
 'lvve',
 'lwve',
 'lxve',
 'lyve',
 'lzve',
 'lvae',
 'lvbe',
 'lvce',
 'lvde',
 'lvee',
 'lvfe',
 'lvge',
 'lvhe',
 'lvie',
 'lvje',
 'lvke',
 'lvle',
 'lvme',
 'lvne',
 'lvoe',
 'lvpe',
 'lvqe',
 'lvre',
 'lvse',
 'lvte',
 'lvue',
 'lvve',
 'lvwe',
 'lvxe',
 'lvye',
 'lvze',
 'lvea',
 'lveb',
 'lvec',
 'lved',
 'lvee',
 'lvef',
 'lveg',
 'lveh',
 'lvei',
 'lvej',
 'lvek',
 'lvel',
 'lvem',
 'lven',
 'lveo',
 'lvep',
 'lveq',
 'lver',
 'lves',
 'lvet',
 'lveu',
 'lvev',
 'lvew',
 'lvex',
 'lvey',
 'lvez']

### 4. Finding the prediction
#### 4.1) Combining possible words

In [45]:
def edit(word):
  return list(set(insert(word)+delete(word)+swap(word)+replace(word)))
len(edit('loave'))

286

#### 4.2) Predicting the word

In [46]:
def spell_check_edit_1(word, word_probability, count = 5):
  output = []

  suggested_words = edit(word)

  for wrd in suggested_words:
    if wrd in word_probability.keys():
      output.append([wrd,word_probability[wrd]])

  return list(pd.DataFrame(output,columns = ['word','prob']).sort_values(by = 'prob', ascending = False).head(count)['word'].values)

In [47]:
spell_check_edit_1('lover',word_probability,10)

['over',
 'love',
 'lower',
 'lover',
 'loved',
 'cover',
 'loves',
 'rover',
 'hover',
 'liver']

### 5. Finding the prediction (Level - 2)
#### 5.1) Combining Possible Words

In [56]:
def spell_check_edit_2(word,word_probability,count=5):
  output = []
  suggested_words = edit(word) # first level edit
  for e1 in edit(word):
    suggested_words += edit(e1) # second level edit
  suggested_words = set(suggested_words)
  for wrd in suggested_words:
    if wrd in word_probability.keys():
      output.append([wrd,word_probability[wrd]])
  return list(pd.DataFrame(output,columns = ['word','prob']).sort_values(by = 'prob', ascending = False).head(count)['word'].values)

In [59]:
print(spell_check_edit_1('fameli',word_probability))
spell_check_edit_2('fameli',word_probability)

[]


['family', 'namely', 'fame', 'lamely']