## In the context of machine learning, autocorrect is an application of natural language processing. As the name suggests, it is programmed to correct spelling mistakes and other errors while typing.

In [17]:
#Importing some necessary libraries
import pandas as pd
import numpy as np
import textdistance
import re
from collections import Counter

#### Just as a smartphone uses its typing history to decide whether a typed word is correct, our autocorrect needs a collection of known words to compare against. Here we build that vocabulary from a text file.

In [28]:
# Read the whole corpus; the file is closed as soon as the text is in memory.
with open("moby.txt", encoding="utf8") as f:
    file_name_data = f.read()

# Lowercase everything so the vocabulary is case-insensitive.
file_name_data = file_name_data.lower()

# Raw string: the backslash in \w must reach the regex engine unchanged.
# A non-raw "\w" is an invalid string escape and triggers a warning on
# recent Python versions. Note that \w+ also matches digits and underscores.
words = re.findall(r"\w+", file_name_data)

# This is our vocabulary
V = set(words)
print(f"The first ten words in the text are: \n{words[0:10]}")
print(f"There are {len(V)} unique words in the vocabulary")

The first ten words in the text are: 
['the', 'project', 'gutenberg', 'ebook', 'of', 'moby', 'dick', 'or', 'the', 'whale']
There are 17354 unique words in the vocabulary


#### In the code above we built a list of words. Now we need to count how often each word occurs, which is easily done with the `Counter` class from Python's `collections` module:

In [29]:
# Tally how many times each token occurs in the corpus.
word_freq_dict = Counter(words)
# Display the ten most frequent words together with their counts.
print(word_freq_dict.most_common(10))


[('the', 14718), ('of', 6743), ('and', 6518), ('a', 4807), ('to', 4707), ('in', 4242), ('that', 3100), ('it', 2536), ('his', 2532), ('i', 2127)]


## Relative Frequency of words

#### Now we want the probability of occurrence of each word, which equals the relative frequency of the word in the text:


In [32]:
# Total number of tokens in the corpus (sum of all word counts).
Total = sum(word_freq_dict.values())
# Relative frequency of each word: its count divided by the token total.
probs = {word: count / Total for word, count in word_freq_dict.items()}

## Finding Similar Words

#### We rank the vocabulary words by their Jaccard similarity to the input word, computed over the 2-grams (q-grams with q = 2) of the words. We then return the 5 most similar words, ordered by similarity and then by probability.

In [50]:
def my_autocorrect(input_word, top_n=5):
    """Suggest vocabulary words that resemble ``input_word``.

    The input is lowercased and looked up in the vocabulary ``V``. If it is
    found, a fixed message is returned. Otherwise every word in ``probs`` is
    scored by ``1 - Jaccard distance`` (``qval=2``) against the input, and the
    best-scoring words are returned.

    Parameters
    ----------
    input_word : str
        The word to check.
    top_n : int, default 5
        Number of suggestions to return.

    Returns
    -------
    str or pandas.DataFrame
        The string ``'Your word seems to be correct'`` when the lowercased
        input is in ``V``; otherwise a DataFrame with columns ``Word``,
        ``Prob`` and ``Similarity`` holding the ``top_n`` rows sorted by
        similarity, then probability, both descending. The index holds each
        word's position in ``probs``.
    """
    input_word = input_word.lower()
    if input_word in V:
        return 'Your word seems to be correct'

    # Build the metric once rather than once per vocabulary word.
    jaccard = textdistance.Jaccard(qval=2)

    # Words and probabilities are read from the same dict, and similarities
    # are computed from the Word column itself, so every row's score is
    # guaranteed to belong to the word next to it.
    candidates = pd.DataFrame({
        'Word': list(probs.keys()),
        'Prob': list(probs.values()),
    })
    candidates['Similarity'] = [
        1 - jaccard.distance(word, input_word) for word in candidates['Word']
    ]

    ranked = candidates.sort_values(['Similarity', 'Prob'], ascending=False)
    return ranked.head(top_n)

# Now, let’s find similar words using our autocorrect function:



In [74]:
# Try a misspelled input (presumably meant as "learning"); the value returned
# by the call is displayed as the cell's output.
my_autocorrect("learnig")

Unnamed: 0,Word,Prob,Similarity
4842,learn,4e-05,0.666667
2301,learning,2.7e-05,0.625
5831,learnt,4e-06,0.571429
921,learned,0.000112,0.5
676,clear,0.000184,0.428571
