In [None]:
import pandas as pd
# Load the spelling dataset from the working directory. Later cells read its
# 'correct_words' and 'misspelled_words' columns.
data = pd.read_csv(filepath_or_buffer="dataset.csv")


In [None]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Build a single vocabulary from BOTH columns. No oov_token is configured, so
# texts_to_sequences() silently drops every token it did not see while
# fitting; fitting on 'correct_words' alone would therefore erase any
# misspelled token that does not also occur in the correct column.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(list(data['correct_words']) + list(data['misspelled_words']))

# Convert each text to a list of integer token ids (one list per row; the
# lists are NOT padded here and may differ in length).
correct_words = tokenizer.texts_to_sequences(data['correct_words'])
misspelled_words = tokenizer.texts_to_sequences(data['misspelled_words'])


In [None]:
from keras.layers import Embedding, LSTM, Dense, Input, TimeDistributed
from keras.models import Model

# Sequence-to-sequence tagger: one softmax over the vocabulary per time step.

# Number of output classes / embedding rows: every index in word_index plus
# index 0, which the Keras Tokenizer never assigns to a word.
vocab_size = len(tokenizer.word_index) + 1

# Input: a batch of integer token-id sequences of arbitrary length.
input_layer = Input(shape=(None,))

# Map each token id to a 100-dimensional dense vector. The former
# `input_length=None` argument was a no-op and is deprecated in recent Keras
# releases, so it is omitted.
embedding_layer = Embedding(vocab_size, 100)(input_layer)

# return_sequences=True keeps one hidden vector per time step so the output
# has the same length as the input.
lstm_layer = LSTM(100, return_sequences=True)(embedding_layer)

# Per-time-step probability distribution over the vocabulary.
output_layer = TimeDistributed(Dense(vocab_size, activation='softmax'))(lstm_layer)

# Create the model
model = Model(input_layer, output_layer)


In [None]:
# The model emits one prediction per input time step, so inputs and targets
# must be rectangular arrays of the same length. Pad both columns (with 0) to
# the longest sequence found in either of them.
seq_len = max(len(seq) for seq in list(correct_words) + list(misspelled_words))
misspelled_padded = pad_sequences(misspelled_words, maxlen=seq_len, padding='post')
correct_padded = pad_sequences(correct_words, maxlen=seq_len, padding='post')

# Targets are integer token ids, not one-hot vectors, so the sparse variant of
# the cross-entropy loss is required.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

# Learn the mapping misspelled -> correct (the direction used at prediction
# time), not the reverse.
model.fit(misspelled_padded, correct_padded)


In [None]:
# Encode the text to correct. "misspelled_word" is presumably a placeholder
# for the real query -- replace it with the word to correct. A separate name
# is used so the training sequences in `misspelled_words` are not overwritten.
query_sequences = tokenizer.texts_to_sequences(["misspelled_word"])
query_batch = pad_sequences(query_sequences, padding='post')

# model.predict returns per-time-step probabilities over the vocabulary
# (batch, time, vocab); take the most likely token id at every time step.
predicted_probs = model.predict(query_batch)
predicted_ids = predicted_probs.argmax(axis=-1)

# Decode the predicted token ids back to text (one string per query).
predicted_word = tokenizer.sequences_to_texts(predicted_ids.tolist())


In [None]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Input, LSTM, Embedding, Dense
from keras.models import Model

# Encoder-decoder spelling corrector: the encoder reads the misspelled
# sequence, the decoder generates the correct sequence.

# Tokenize the correct and misspelled words with one shared vocabulary
tokenizer = Tokenizer()
tokenizer.fit_on_texts(list(data['correct_words'].values) + list(data['misspelled_words'].values))
# +1 because index 0 is never assigned to a word and is used for padding.
vocab_size = len(tokenizer.word_index) + 1

# Convert the words to sequences of integers
correct_sequences = tokenizer.texts_to_sequences(data['correct_words'].values)
misspelled_sequences = tokenizer.texts_to_sequences(data['misspelled_words'].values)

# Pad the sequences to the same length (zeros are prepended, the
# pad_sequences default)
max_length = max(len(seq) for seq in correct_sequences + misspelled_sequences)
correct_sequences = pad_sequences(correct_sequences, maxlen=max_length)
misspelled_sequences = pad_sequences(misspelled_sequences, maxlen=max_length)

# Teacher forcing: at step t the decoder is fed the target token of step t-1.
# Dropping the last column and re-padding to max_length prepends a 0, i.e.
# shifts every target sequence one step to the right.
decoder_input_sequences = pad_sequences(correct_sequences[:, :-1], maxlen=max_length)

# Define the encoder and decoder inputs
encoder_inputs = Input(shape=(max_length,))
decoder_inputs = Input(shape=(max_length,))

# Use one shared embedding layer to map token ids to dense vectors
embedding_layer = Embedding(vocab_size, 100)
encoder_embedding = embedding_layer(encoder_inputs)
decoder_embedding = embedding_layer(decoder_inputs)

# Use an LSTM layer as the encoder; only its final states are kept
encoder = LSTM(100, return_state=True)
encoder_outputs, state_h, state_c = encoder(encoder_embedding)
encoder_states = [state_h, state_c]

# Use an LSTM layer as the decoder, initialised with the encoder states
decoder = LSTM(100, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder(decoder_embedding, initial_state=encoder_states)

# Use a dense layer to map the decoder output to the vocabulary
dense = Dense(vocab_size, activation='softmax')
outputs = dense(decoder_outputs)

# Define and compile the model. Targets are integer token ids rather than
# one-hot vectors, hence the sparse cross-entropy loss.
model = Model([encoder_inputs, decoder_inputs], outputs)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

# Fit the model: the encoder sees the misspelled text, the decoder sees the
# right-shifted correct text, and the target is the correct text. Feeding the
# correct text to the encoder would hand the model the answer it is supposed
# to produce.
model.fit([misspelled_sequences, decoder_input_sequences], correct_sequences,
          batch_size=32, epochs=100, validation_split=0.2)
