In [1]:
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, Lambda
from tensorflow.keras import backend as K
from sklearn.metrics.pairwise import cosine_similarity



# Step 1: Train the CBOW model on a large corpus of text to generate word embeddings.
corpus = ["the cat sat on the mat", "the dog ran in the park", "the bird flew in the sky"]
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)
vocab_size = len(tokenizer.word_index) + 1

embedding_dim = 50
window_size = 2

def generate_context_word_pairs(corpus, window_size, vocab_size):
    context_length = window_size*2
    for sentence in corpus:
        sentence = tokenizer.texts_to_sequences([sentence])[0]
        for i, word in enumerate(sentence):
            context_words = []
            label_word = []
            start = i - window_size
            end = i + window_size + 1
            context_words.append([sentence[idx] for idx in range(start, end) if 0 <= idx < len(sentence) and idx != i])
            label_word.append(word)
            x = pad_sequences(context_words, maxlen=context_length)
            y = np_utils.to_categorical(label_word, vocab_size)
            yield (x, y)



cbow = cbow_model(vocab_size, embedding_dim, window_size)
for epoch in range(1, 6):
    loss = 0
    i = 0
    for x, y in generate_context_word_pairs(corpus, window_size, vocab_size):
        i += 1
        loss += cbow.train_on_batch(x, y)[0]
        if i % 1000 == 0:
            print('Processed %s context/label pairs' % i)
    print('Epoch:', epoch, '\tLoss:', loss)

# Step 2: Generate word embeddings for the input text by averaging the embeddings of the surrounding words.
input_text = "the ct sat on the mat"
input_sequence = tokenizer.texts_to_sequences([input_text])[0]
context_words = []
for i, word in enumerate(input_sequence):
    context_words.append([input_sequence[idx] for idx in range(i-window_size, i+window_size+1) if 0 <= idx < len(input_sequence) and idx != i])
context_words = np.array(context_words)
input_embedding = np.mean(cbow.layers[0].get_weights()[0][context_words], axis=1)

# Step 3: Compute the cosine similarity between the input word and all the words in the vocabulary using the generated embeddings.
cosine_similarities = cosine_similarity(input_embedding.reshape(1, -1), cbow.layers[0].get_weights()[0])

# Step 4: Select the word(s) with the highest cosine similarity as the most likely candidates for autocorrection.
top_indices = np.argsort(cosine_similarities[0])[:-4:-1]

for i, index in enumerate(top_indices):
    if i == 0:
        print(f"Autocorrect suggestions for '{word}':")
    print(f"{reverse_word_index[index+1]} ({cosine_similarities[0][index]:.3f})")



NameError: name 'np_utils' is not defined