In [24]:
import re
from tqdm import tqdm
import pandas as pd

# **Finding Unique words** -

In [5]:
with open('big.txt','r')as file:
  lines = file.readlines()
  words = []
  for line in lines:
    words += re.findall('\w+',line.lower())

print(len(words))
vocab = list(set(words))
print(len(vocab))

1115585
32198


# 2. **Finding the probability distribution**

In [6]:
word_probability = {}
print("RUNNING")
for word in tqdm(vocab):

  word_probability[word] = float(words.count(word)/len(words))

print('finished!')

RUNNING


100%|██████████| 32198/32198 [11:44<00:00, 45.72it/s]

finished!





In [7]:
len(word_probability)

32198



---



# 3. **Text Preproocessing**

3.1) **Splitting**

'loave' -> 'love'

In [8]:
def split(word):
  parts = []
  for i in range(len(word)+1):
    parts+= [(word[:i], word[i:])]
  return parts
print(split('lvoe'))

[('', 'lvoe'), ('l', 'voe'), ('lv', 'oe'), ('lvo', 'e'), ('lvoe', '')]


3.2 **Delete**

In [9]:
def delete(word):
  output = []
  for l,r in split(word):
    output.append(l+r[1:])
  return output
print(delete('loave'))

['oave', 'lave', 'love', 'loae', 'loav', 'loave']


3.3 **Swap**

In [10]:
# lvoe
def swap(word):
  output = []
  for l,r in split(word):
    if len(r) > 1:
      output.append(l + r[1] + r[0] + r[2:])

  return output

swap('lvoe')

['vloe', 'love', 'lveo']

3.4 **Replace**

In [11]:
characters = 'abcdefghijklmnopqrstuvwxyz'
def replace(word):
  output = []
  for l,r in split(word):
    for char in characters:
      output.append(l + char + r[1:])
  return output

len(replace('lave')) #will try all characters on all indexes

130

3.5 **Insert**

In [12]:
def insert(word):
  output = []
  for l,r in split(word):
    for char in characters:
      output.append(l+char+r)
  return output

response = insert('piza')
'pizza' in response

True

# 4. Finding the Prediction (Level - 1)

4.1) **Combining Possible Words**

In [46]:
def edit(word):
  return list(set(insert(word)+delete(word)+swap(word)+replace(word)))

response = edit('cresh')

4.2) **Predicting the Word**

In [36]:
def spell_check(word,count=5):
  output = []

  suggested_words = edit(word)

  for wrd in suggested_words:
    if wrd in word_probability.keys():
      output.append([wrd, word_probability[wrd]])

  return list(pd.DataFrame(output, columns=['word','prob']).sort_values(by='prob', ascending=False).head(count)['word'].values)
    # print(word, word_probability[word])

In [49]:
# pd.DataFrame(output, columns=['word','prob']).sort_values(by='prob', ascending=False)
spell_check('loave')

['love', 'leave']

# 5. Finding the Prediction (Level - 2)

5.1) **Combining Possible Words**

In [53]:
def spell_check2(word, count=5):
  output = []
  suggested_words = edit(word)  #level one edit

  for e1 in edit(word):
    suggested_words += edit(e1) #level 2 edit

  suggested_words = list(set(suggested_words))

  for wrd in suggested_words:
    if wrd in word_probability.keys():
      output.append([wrd, word_probability[wrd]])

  return list(pd.DataFrame(output, columns=['word','prob']).sort_values(by='prob', ascending=False).head(count)['word'].values)


In [54]:
spell_check2('loave')

['have', 'love', 'gave', 'leave', 'late']