In [None]:
import pandas as pd
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from gensim.models.keyedvectors import KeyedVectors


In [None]:
# Read the training table from disk. Later cells read its 'correct_words' and
# 'misspelled_words' columns.
data = pd.read_csv(filepath_or_buffer="dataset.csv")


In [None]:
# Load the pre-trained GloVe vectors. The stock Stanford GloVe text files do
# not start with the "<vocab_size> <dim>" header line that
# load_word2vec_format expects by default, so no_header=True (gensim >= 4.0)
# is required to read them without a prior conversion step.
glove_model = KeyedVectors.load_word2vec_format(
    'glove.6B.100d.txt', binary=False, no_header=True
)

# Build the vocabulary from BOTH columns. A tokenizer fitted only on the
# correct spellings has no index for the misspelled tokens, and
# texts_to_sequences silently drops unknown tokens, which would leave the
# misspelled sequences empty. The OOV token keeps unseen words visible at
# inference time instead of discarding them.
tokenizer = Tokenizer(oov_token='<unk>')
tokenizer.fit_on_texts(pd.concat([data['correct_words'], data['misspelled_words']]))

# Convert each text to a sequence of integer token ids (the GloVe vectors are
# only applied later, through the embedding matrix).
correct_sequences = tokenizer.texts_to_sequences(data['correct_words'])
misspelled_sequences = tokenizer.texts_to_sequences(data['misspelled_words'])

# Pad both sides to one shared length: the model emits one prediction per
# input timestep, so inputs and targets must have the same number of timesteps.
max_len = max((len(seq) for seq in correct_sequences + misspelled_sequences), default=1)
correct_words = pad_sequences(correct_sequences, maxlen=max_len)
misspelled_words = pad_sequences(misspelled_sequences, maxlen=max_len)

# Create the embedding matrix: row i holds the GloVe vector of the token with
# index i. Row 0 (padding) and tokens missing from GloVe stay all-zero.
embedding_dim = glove_model.vector_size  # 100 for glove.6B.100d
embedding_matrix = np.zeros((len(tokenizer.word_index) + 1, embedding_dim))
for word, i in tokenizer.word_index.items():
    if word in glove_model:
        embedding_matrix[i] = glove_model[word]


In [None]:
from keras.layers import Embedding, LSTM, Dense, Input, TimeDistributed
from keras.models import Model

# Number of distinct token ids the network must handle: every index in the
# tokenizer's word_index plus one extra slot for index 0.
vocab_size = len(tokenizer.word_index) + 1

# Input: a batch of variable-length sequences of integer token ids.
input_layer = Input(shape=(None,))

# Frozen embedding lookup, initialised from the pre-built embedding_matrix.
embedding_layer = Embedding(
    input_dim=vocab_size,
    output_dim=100,
    input_length=None,
    weights=[embedding_matrix],
    trainable=False,
)(input_layer)

# Recurrent layer that keeps one hidden state per timestep so that every
# position in the sequence gets its own prediction.
lstm_layer = LSTM(units=100, return_sequences=True)(embedding_layer)

# Per-timestep softmax over the whole vocabulary.
token_classifier = Dense(units=vocab_size, activation='softmax')
output_layer = TimeDistributed(token_classifier)(lstm_layer)

# Assemble the end-to-end model.
model = Model(inputs=input_layer, outputs=output_layer)


In [None]:
# The targets are padded sequences of integer token ids, not one-hot vectors,
# so the sparse variant of the cross-entropy loss is required;
# 'categorical_crossentropy' would expect targets shaped like the softmax output.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

# Train in the direction the model is used at inference time:
# misspelled text is the input, the correct spelling is the target.
model.fit(misspelled_words, correct_words)


In [None]:
# Encode the text to correct. A separate name is used so the training array
# `misspelled_words` is not overwritten (re-running the training cell after
# this one would otherwise train on the query).
# NOTE(review): "misspelled_word" looks like a placeholder - replace it with the
# actual text to correct.
query_sequences = pad_sequences(tokenizer.texts_to_sequences(["misspelled_word"]))

# Predict the corrected word. The model returns a probability distribution over
# the vocabulary for every timestep; take the most likely token id at each one.
token_probabilities = model.predict(query_sequences)
predicted_ids = np.argmax(token_probabilities, axis=-1)

# Decode the predicted token ids back to text, skipping id 0 (padding), which
# has no entry in the tokenizer's vocabulary.
predicted_word = tokenizer.sequences_to_texts(
    [[token_id for token_id in row if token_id != 0] for row in predicted_ids.tolist()]
)