In [1]:
import gensim

# Load pre-trained Word2Vec model.
model_path = 'word2vec.model'
model = gensim.models.Word2Vec.load(model_path)

In [5]:
vector = model.wv['nothing']

In [6]:
# Find the closest words to the given vector
def find_closest_words(vector, n=5):
    return model.wv.similar_by_vector(vector, topn=n)

# Get the closest words to the 'alice' vector
closest_words = find_closest_words(vector)
print(closest_words)

[('nothing', 0.9999999403953552), ('calmly', 0.4815042018890381), ('no', 0.4803059995174408), ('doanything', 0.4499460458755493), ('none', 0.44889116287231445)]


In [23]:
import difflib

word_list = ['alice','lic', 'lice', 'ice', 'malice', 'slice', 'bicycle', 'reptile']

def find_similar_words(word, word_list, n=5):
    similarity_scores = [(w, difflib.SequenceMatcher(None, word, w).ratio()) for w in word_list]
    similarity_scores.sort(key=lambda x: x[1], reverse=True)
    return [w[0] for w in similarity_scores[:n]]

input_word = 'alc'
similar_words = find_similar_words(input_word, word_list)
print(similar_words)


['alice', 'lic', 'malice', 'lice', 'slice']


In [2]:
from gensim.models import Word2Vec

# Create character n-grams from a list of words
def create_ngrams(word_list, n=3):
    ngrams = []
    for word in word_list:
        for i in range(len(word) - n + 1):
            ngrams.append(word[i:i+n])
    return ngrams

word_list = ['alice', 'malice', 'slice', 'bicycle', 'reptile']
ngrams = create_ngrams(word_list)

# Train a Word2Vec model with character n-grams
model = Word2Vec(sentences=[ngrams], min_count=1, sg=0, vector_size=100, window=2)

# Find similar n-grams to the input n-gram
def find_similar_ngrams(ngram, n=5):
    return model.wv.similar_by_word(ngram, topn=n)

input_ngram = 'ice'
similar_ngrams = find_similar_ngrams(input_ngram)
print(similar_ngrams)


[('ycl', 0.21617253124713898), ('bic', 0.09310110658407211), ('rep', 0.09289432317018509), ('icy', 0.07958736270666122), ('cyc', 0.06282659620046616)]


In [4]:
import difflib
from gensim.models import Word2Vec

def find_similar_words(word, word_list, n=5):
    similarity_scores = [(w, difflib.SequenceMatcher(None, word, w).ratio()) for w in word_list]
    similarity_scores.sort(key=lambda x: x[1], reverse=True)
    return [w[0] for w in similarity_scores[:n]]

def find_closest_word_from_ngrams(input_ngram, word_list, model, n=5):
    similar_ngrams = find_similar_ngrams(input_ngram, n)
    similar_ngrams = [ngram for ngram, score in similar_ngrams]
    similar_words = find_similar_words(input_ngram, word_list + similar_ngrams)
    return similar_words

word_list = ['alice', 'malice', 'slice', 'bicycle', 'reptile']
ngrams = create_ngrams(word_list)


input_ngram = 'ice'
similar_words = find_closest_word_from_ngrams(input_ngram, word_list, model)
print(similar_words)


['alice', 'slice', 'malice', 'bic', 'icy']


In [7]:
import gensim
import difflib

# Load the CBOW Word2Vec model using Gensim
model = gensim.models.Word2Vec.load('word2vec.model')

# Define a function that returns the most similar word based on Word2Vec embeddings
def find_similar_word_word2vec(word, word_list, model):
    try:
        # Get the Word2Vec embedding for the input word
        word_embedding = model.wv[word]
        # Find the most similar word in the word list based on the Word2Vec embeddings
        most_similar_word = model.wv.most_similar([word_embedding], topn=1)[0][0]
        return most_similar_word
    except KeyError:
        # Return the input word if it's not in the Word2Vec model
        return word

# Define a function that returns the most similar word based on similarity scores from difflib
def find_similar_word_difflib(word, word_list):
    # Calculate similarity scores between input word and words in word list using difflib
    similarity_scores = [(w, difflib.SequenceMatcher(None, word, w).ratio()) for w in word_list]
    # Sort similarity scores in descending order
    similarity_scores.sort(key=lambda x: x[1], reverse=True)
    # Return the most similar word
    return similarity_scores[0][0]

# Define a main function that combines the results from the two functions above
def autocorrect_word(word, word_list, model):
    # First, try to find a similar word based on Word2Vec embeddings
    most_similar_word_word2vec = find_similar_word_word2vec(word, word_list, model)
    # If the Word2Vec model doesn't have a similar word, try to find a similar word using difflib
    if most_similar_word_word2vec == word:
        most_similar_word_difflib = find_similar_word_difflib(word, word_list)
        return most_similar_word_difflib
    # If the Word2Vec model has a similar word, check if it's also similar according to difflib
    else:
        similarity_score = difflib.SequenceMatcher(None, word, most_similar_word_word2vec).ratio()
        if similarity_score > 0.8:
            return most_similar_word_word2vec
        else:
            most_similar_word_difflib = find_similar_word_difflib(word, word_list)
            return most_similar_word_difflib


In [8]:
word_list = ['alice', 'malice', 'slice', 'bicycle', 'reptile']

# Autocorrect the word 'alc'
input_word = 'alc'
corrected_word = autocorrect_word(input_word, word_list, model)
print(corrected_word) # Output: 'alice'


alice
