In [1]:
import numpy as np
import keras.backend as K
from keras.models import Sequential
from keras.layers import Dense, Embedding, Lambda
from keras.utils import np_utils
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
import gensim

In [2]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from spellchecker import SpellChecker
import Levenshtein


In [3]:
# # Load the GloVe word embeddings
# def load_embeddings(embedding_file):
#     embeddings_index = {}
#     with open(embedding_file, encoding='utf-8') as f:
#         for line in f:
#             values = line.split()
#             word = values[0]
#             coefs = np.asarray(values[1:], dtype='float32')
#             embeddings_index[word] = coefs
#     return embeddings_index

In [4]:
# # Define a function to predict the next word given a context
# def predict_next_word(context):
#     context_words = context.split()
#     context_embedding = np.zeros((len(embeddings_index[next(iter(embeddings_index))]),))
#     for word in context_words:
#         if word in embeddings_index:
#             context_embedding += embeddings_index[word]
#     context_embedding /= len(context_words)
    
#     similarities = {}
#     for word in embeddings_index.keys():
#         if word not in context_words:
#             word_embedding = embeddings_index[word]
#             sim = cosine_similarity([context_embedding], [word_embedding])[0][0]
#             similarities[word] = sim
            
#     top_3_words = sorted(similarities.items(), key=lambda x: x[1], reverse=True)[:3]
#     return [w[0] for w in top_3_words]


In [5]:
# Load GloVe word vectors
def load_word_vectors(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        words = set()
        word_to_vec_map = {}
        for line in f:
            line = line.strip().split()
            curr_word = line[0]
            words.add(curr_word)
            word_to_vec_map[curr_word] = np.array(line[1:], dtype=np.float64)
    return words, word_to_vec_map

In [6]:
words, word_to_vec_map = load_word_vectors("glove.6B.100d.txt")

In [7]:
def spell_check(text, vocab):
    list_of_words = text.split()
    misspelled_words = []
    for word in list_of_words:
        if word not in vocab:
            misspelled_words.append(word)
    return misspelled_words    
     

In [12]:
def correct_words(misspelled_words):
    spell = SpellChecker()
    list_of_candidate_words = []
    for word in misspelled_words:
        candidate_words = spell.candidates(word)
        list_of_candidate_words.append([word, candidate_words])
    return list_of_candidate_words

In [13]:
def choose_the_best_candidate(list_of_candidate_words):
    distances = []
    correct_word = []
    for item in list_of_candidate_words:
        for cadidate in item[1]:
            distances.append(Levenshtein.distance(item[0], cadidate))
        max_value = max(distances)
        max_index = distances.index(max_value)
        correct_word.append([item[0], list(item[1])[max_index]])
    return correct_word

In [18]:
text = "you aree forsoken in the laaand beyon the fog"
misspelled_words = spell_check(text=text, vocab=words)
misspelled_words

['forsoken', 'laaand', 'beyon']

In [19]:
list_of_candidate_words = correct_words(misspelled_words=misspelled_words)
list_of_candidate_words

[['forsoken', {'forsaken'}],
 ['laaand',
  {'ahand',
   'anand',
   'baaad',
   'carand',
   'dayand',
   'laaa',
   'lagaan',
   'lamond',
   'land',
   'lazard',
   'leland',
   'lnland',
   'lsland'}],
 ['beyon', {'bayon', 'belon', 'beon', 'beton', 'beyond', 'beyong'}]]

In [20]:
correct_word = choose_the_best_candidate(list_of_candidate_words=list_of_candidate_words)
correct_word

[['forsoken', 'forsaken'], ['laaand', 'leland'], ['beyon', 'beton']]

In [22]:
spell = SpellChecker()
spell.correction("beyon")

'beyond'