In [1]:
import numpy as np
import keras.backend as K
from keras.models import Sequential
from keras.layers import Dense, Embedding, Lambda
from keras.utils import np_utils
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
import gensim

In [2]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from spellchecker import SpellChecker
import Levenshtein


In [3]:
# # Load the GloVe word embeddings
# def load_embeddings(embedding_file):
#     embeddings_index = {}
#     with open(embedding_file, encoding='utf-8') as f:
#         for line in f:
#             values = line.split()
#             word = values[0]
#             coefs = np.asarray(values[1:], dtype='float32')
#             embeddings_index[word] = coefs
#     return embeddings_index

In [4]:
# # Define a function to predict the next word given a context
# def predict_next_word(context):
#     context_words = context.split()
#     context_embedding = np.zeros((len(embeddings_index[next(iter(embeddings_index))]),))
#     for word in context_words:
#         if word in embeddings_index:
#             context_embedding += embeddings_index[word]
#     context_embedding /= len(context_words)
    
#     similarities = {}
#     for word in embeddings_index.keys():
#         if word not in context_words:
#             word_embedding = embeddings_index[word]
#             sim = cosine_similarity([context_embedding], [word_embedding])[0][0]
#             similarities[word] = sim
            
#     top_3_words = sorted(similarities.items(), key=lambda x: x[1], reverse=True)[:3]
#     return [w[0] for w in top_3_words]


In [3]:
# Load GloVe word vectors
def load_word_vectors(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        words = set()
        word_to_vec_map = {}
        for line in f:
            line = line.strip().split()
            curr_word = line[0]
            words.add(curr_word)
            word_to_vec_map[curr_word] = np.array(line[1:], dtype=np.float64)
    return words, word_to_vec_map

In [4]:
words, word_to_vec_map = load_word_vectors("glove.6B.100d.txt")

In [7]:
import nltk
from collections import Counter
from itertools import chain

# Step 1: Preprocess text data
text_corpus = ['This is a sample sentence.', 'Another sentence for the example.']
tokenized_corpus = [nltk.word_tokenize(sentence.lower()) for sentence in text_corpus]
tokens = list(chain(*tokenized_corpus))

# Step 2: Build frequency distribution of words
word_frequencies = Counter(tokens)

# Step 3: Filter the vocabulary
stopwords = set(nltk.corpus.stopwords.words('english'))
vocabulary = [word for word in word_frequencies if word not in stopwords and word_frequencies[word] > 1]

# Step 4: Assign indices to words
word_to_index = {word: i for i, word in enumerate(vocabulary)}

# Step 5: Create word embeddings
words, word_to_vec_map = load_word_vectors('glove.6B.100d.txt')
embedding_matrix = np.zeros((len(vocabulary), 100))
for word, i in word_to_index.items():
    if word in words:
        embedding_matrix[i] = word_to_vec_map[word]


# Tokenize input text
input_text = 'Ths is a sntnce wth sme typoos.'
tokens = nltk.word_tokenize(input_text.lower())

# Identify misspelled words
misspelled_words = []
for token in tokens:
    if token not in words:
        misspelled_words.append(token)

print('Input text:', input_text)
print('Tokens:', tokens)
print('Misspelled words:', misspelled_words)


In [10]:
word_frequencies

Counter({'this': 1,
         'is': 1,
         'a': 1,
         'sample': 1,
         'sentence': 2,
         '.': 2,
         'another': 1,
         'for': 1,
         'the': 1,
         'example': 1})

In [7]:
def spell_check(text, vocab):
    list_of_words = text.split()
    misspelled_words = []
    for word in list_of_words:
        if word not in vocab:
            misspelled_words.append(word)
    return misspelled_words    
     

In [8]:
def correct_words(misspelled_words):
    spell = SpellChecker()
    list_of_candidate_words = []
    for word in misspelled_words:
        candidate_words = spell.candidates(word)
        list_of_candidate_words.append([word, candidate_words])
    return list_of_candidate_words

In [14]:
def choose_the_best_candidate(list_of_candidate_words):
    distances = []
    correct_word = []
    for item in list_of_candidate_words:
        for cadidate in item[1]:
            distances.append(Levenshtein.distance(item[0], cadidate))
        max_value = max(distances)
        max_index = distances.index(max_value)
        correct_word.append([item[0], list(item[1])[max_index]])
    return correct_word

In [15]:
text = "you aree forsoken in the laaand beyon the fog"
misspelled_words = spell_check(text=text, vocab=words)
misspelled_words

['forsoken', 'laaand', 'beyon']

In [16]:
list_of_candidate_words = correct_words(misspelled_words=misspelled_words)
list_of_candidate_words

[['forsoken', {'forsaken'}],
 ['laaand',
  {'ahand',
   'anand',
   'baaad',
   'carand',
   'dayand',
   'laaa',
   'lagaan',
   'lamond',
   'land',
   'lazard',
   'leland',
   'lnland',
   'lsland'}],
 ['beyon', {'bayon', 'belon', 'beon', 'beton', 'beyond', 'beyong'}]]

In [17]:
correct_word = choose_the_best_candidate(list_of_candidate_words=list_of_candidate_words)
correct_word

[['forsoken', 'forsaken'], ['laaand', 'dayand'], ['beyon', 'beton']]

In [18]:
spell = SpellChecker()
spell.correction("laaand")

'land'

In [23]:
"leland" in words

True

In [18]:
import random
import nltk

# Set the seed for reproducibility
random.seed(42)

# Load the NLTK English corpus
nltk.download('words')
words = set(nltk.corpus.words.words())

# Generate a list of random words
num_words = 20
random_words = []
while len(random_words) < num_words:
    # Generate a random word
    word = random.choice(list(words))
    # Check if the word is a single word and not a phrase
    if ' ' not in word:
        random_words.append(word.lower())

# Print the random words


[nltk_data] Downloading package words to
[nltk_data]     C:\Users\basel\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


In [19]:
vocabulary = random_words
vocabulary

['untemptability',
 'refrigeration',
 'tangaridae',
 'chancrous',
 'excogitation',
 'diaphanoscopy',
 'olivinic',
 'thujopsis',
 'sharezer',
 'rimmaking',
 'shirttail',
 'achromatopia',
 'belay',
 'atomism',
 'fluoboride',
 'inbred',
 'chiococcine',
 'misspend',
 'inlay',
 'metanephritic']

In [35]:
from gensim.test.utils import datapath, get_tmpfile
from gensim.models import KeyedVectors


In [38]:

tmp_file = get_tmpfile("glove.6B.100d.txt")

tmp_file


'C:\\Users\\basel\\AppData\\Local\\Temp\\tmpz926yl2m\\glove.6B.100d.txt'

In [39]:
model = KeyedVectors.load_word2vec_format(tmp_file)

FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\basel\\AppData\\Local\\Temp\\tmpz926yl2m\\glove.6B.100d.txt'

In [29]:
import gensim

glove_model_path = 'glove.6B.100d.txt'
glove_model = gensim.models.KeyedVectors.load_word2vec_format(glove_model_path,encoding ='utf8')


ValueError: invalid literal for int() with base 10: 'the'

In [None]:
def generate_corrections(word, vocabulary, glove_model):
    # Get the word vector for the given word
    word_vector = glove_model[word]
    # Find the cosine similarity between the word vector and all other word vectors in the vocabulary
    cosine_similarities = {}
    for vocab_word in vocabulary:
        vocab_word_vector = glove_model[vocab_word]
        cosine_similarity = np.dot(word_vector, vocab_word_vector) / (np.linalg.norm(word_vector) * np.linalg.norm(vocab_word_vector))
        cosine_similarities[vocab_word] = cosine_similarity
    # Sort the candidate corrections by cosine similarity
    candidate_corrections = sorted(cosine_similarities.items(), key=lambda x: x[1], reverse=True)
    return candidate_corrections


In [None]:
def auto_correct(input_text, vocabulary, glove_model):
    # Tokenize the input text into words
    words = input_text.split()
    # Generate candidate corrections for each word
    corrected_words = []
    for word in words:
        # Check if the word is in the vocabulary
        if word in vocabulary:
            corrected_words.append(word)
        else:
            # Generate candidate corrections
            candidate_corrections = generate_corrections(word, vocabulary, glove_model)
            # Select the best correction
            best_correction = candidate_corrections[0][0]
            corrected_words.append(best_correction)
    # Join the corrected words back into a single string
    corrected_text = ' '.join(corrected_words)
    return corrected_text


In [None]:
input_text = "I wnt to the park to play futbol."
vocabulary = ['want', 'to', 'the', 'park', 'play', 'football']
corrected_text = auto_correct(input_text, vocabulary, glove_model)
print(corrected_text)
