In [3]:
import gensim
import difflib

In [4]:
# Load a previously saved Word2Vec model with Gensim from 'word2vec.model'
# (a relative path, so it is resolved against the current working directory).
# NOTE(review): the original comment describes this as a CBOW model; the training
# configuration is not visible in this notebook — confirm.
# NOTE(review): per the gensim docs, load() unpickles the file, so only load
# model files that come from a trusted source.
model = gensim.models.Word2Vec.load('word2vec.model')

In [5]:
# Define a function that returns the most similar word based on Word2Vec embeddings
def find_similar_word_word2vec(word, word_list, model, topn=3):
    """Return the nearest Word2Vec neighbour of ``word`` that appears in ``word_list``.

    The neighbours are looked up by key rather than by raw vector: when a raw
    vector is passed to ``most_similar`` the query word is not excluded from
    the results, so the best match is the query word itself and no real
    suggestion is ever produced.

    Args:
        word: The word to find a replacement for.
        word_list: Container of acceptable candidate words (anything that
            supports ``in``, e.g. a list, set or dict-keys view).
        model: Object exposing keyed vectors as ``model.wv`` with a
            ``most_similar(key, topn=...)`` method.
        topn: How many nearest neighbours to inspect before giving up.

    Returns:
        The closest neighbour that is different from ``word`` and contained in
        ``word_list``; ``word`` itself when it is not in the model vocabulary
        or none of the inspected neighbours is an acceptable candidate.
    """
    try:
        # Querying by key makes the lookup raise KeyError for unknown words
        # and keeps the word itself out of its own neighbour list.
        neighbours = model.wv.most_similar(word, topn=topn)
    except KeyError:
        # Return the input word if it's not in the Word2Vec model
        return word

    for candidate, _similarity in neighbours:
        if candidate != word and candidate in word_list:
            return candidate

    # No acceptable neighbour: signal "no suggestion" by echoing the input.
    return word

In [6]:
# Define a function that returns the most similar word based on similarity scores from difflib
def find_similar_word_difflib(word, word_list):
    """Return the entry of ``word_list`` that is lexically closest to ``word``.

    Closeness is ``difflib.SequenceMatcher(None, word, candidate).ratio()``.
    The argument order is kept as (word, candidate) because ``ratio()`` may
    depend on the order of its two sequences.

    Args:
        word: The (possibly misspelled) input word.
        word_list: Iterable of candidate words.

    Returns:
        The candidate with the highest ratio; on ties the candidate that comes
        first in ``word_list`` wins. If ``word_list`` is empty, ``word`` is
        returned unchanged instead of raising ``IndexError``.
    """
    def _ratio(candidate):
        return difflib.SequenceMatcher(None, word, candidate).ratio()

    # A single pass with max() replaces building and sorting the full score
    # list; max() keeps the first of several equally good candidates.
    return max(word_list, key=_ratio, default=word)

In [7]:
# Define a main function that combines the results from the two functions above
def autocorrect_word(word, word_list, model, min_ratio=0.8):
    """Suggest a correction for ``word`` using Word2Vec first and difflib as fallback.

    The Word2Vec suggestion is accepted only when it differs from ``word`` and
    its ``difflib.SequenceMatcher`` ratio against ``word`` is strictly greater
    than ``min_ratio``. In every other case the lexically closest entry of
    ``word_list`` (according to difflib) is returned.

    Args:
        word: The word to correct.
        word_list: Candidate words handed to both lookup helpers.
        model: Word2Vec model handed to the embedding-based helper.
        min_ratio: Minimum difflib ratio (exclusive) the Word2Vec suggestion
            must reach to be trusted. Defaults to 0.8.

    Returns:
        The chosen replacement word.
    """
    embedding_guess = find_similar_word_word2vec(word, word_list, model)

    # The embedding helper hands back the input word when it has no suggestion,
    # so only a different word is worth scoring.
    if embedding_guess != word:
        ratio = difflib.SequenceMatcher(None, word, embedding_guess).ratio()
        if ratio > min_ratio:
            return embedding_guess

    # Either there was no embedding suggestion or it looked too different.
    return find_similar_word_difflib(word, word_list)

In [8]:
# Candidate corrections: every key in the loaded model's vocabulary.
word_list = model.wv.key_to_index.keys()

# Autocorrect the word 'ie'
input_word = 'ie'
corrected_word = autocorrect_word(input_word, word_list, model)
print(corrected_word) # Output: pie

pie


In [9]:
# Sample sentence for the sentence-level corrector; 'ie' is the token that gets
# corrected (the recorded result below is 'Alice has a pie').
input_sentence = "Alice has a ie"

In [10]:
def correct_the_spelling_mistakes(input_sentence, model):
    """Correct every out-of-vocabulary token of a sentence.

    The sentence is split on whitespace. Tokens that are keys of
    ``model.wv.key_to_index`` are kept as they are; every other token is
    replaced by the result of ``autocorrect_word``. The tokens are then joined
    with single spaces and surrounding whitespace is stripped.

    Args:
        input_sentence: The text to correct.
        model: Word2Vec model whose vocabulary defines the known words.

    Returns:
        The corrected sentence as a single string.
    """
    vocabulary = model.wv.key_to_index.keys()
    fixed_tokens = [
        token if token in vocabulary else autocorrect_word(token, vocabulary, model)
        for token in input_sentence.split()
    ]
    return " ".join(fixed_tokens).strip()

In [11]:
# Run the sentence-level corrector on the sample sentence; as the last
# expression of the cell, its return value is shown as the cell output.
correct_the_spelling_mistakes(input_sentence, model)

'Alice has a pie'