In [None]:
import pandas as pd
import numpy as np
from tensorflow.keras.layers import Input,Dense, LSTM, Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.models import Model
from keras.utils import to_categorical

In [None]:
# Read the spreadsheet holding the training sentences into a DataFrame.
data = pd.read_excel(io='/content/hindidata.xlsx')

In [None]:
# Preprocess the data
# Drop empty cells before converting to text: str(NaN) is the literal string
# 'nan', which would otherwise enter the vocabulary as a fake word.
corpus = data['Hindi'].dropna().astype(str).tolist()
if not corpus:
    raise ValueError("Column 'Hindi' contains no text to train on")

# Learn the word -> integer id mapping and encode every sentence with it.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)
sequences = tokenizer.texts_to_sequences(corpus)

# Pad every sentence on the right with zeros to a common width of
# (longest sentence + 1), so each row ends with at least one padding slot.
max_sequence_length = max(len(x) for x in sequences)
padded_sequences = pad_sequences(sequences, maxlen=max_sequence_length+1, padding='post')

# Word ids start at 1 (0 is the padding value), hence the extra class.
vocab_size = len(tokenizer.word_index) + 1


In [None]:
# Define the LSTM model
# Layer sizes are named so they can be tuned in one place.
EMBEDDING_DIM = 100
LSTM_UNITS = 100

# Input: one row of word ids, as wide as the padded training sequences.
inputs = Input(shape=(max_sequence_length+1,))
x = Embedding(vocab_size, EMBEDDING_DIM)(inputs)
x = LSTM(LSTM_UNITS)(x)
# Output: a probability for every word id in the vocabulary.
outputs = Dense(vocab_size, activation='softmax')(x)
model = Model(inputs=inputs, outputs=outputs)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# summary() prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 26)]              0         
                                                                 
 embedding (Embedding)       (None, 26, 100)           300800    
                                                                 
 lstm (LSTM)                 (None, 100)               80400     
                                                                 
 dense (Dense)               (None, 3008)              303808    
                                                                 
Total params: 685,008
Trainable params: 685,008
Non-trainable params: 0
_________________________________________________________________
None


In [None]:
# Train the LSTM model
# For each target position the model must predict the word at i+1 from the
# words at positions 0..i only. Feeding the full sentence (as before) leaks the
# answer, because the target token is itself part of the input.
NUM_TARGET_POSITIONS = 10
# Never index past the last column when the corpus has only short sentences.
num_positions = min(NUM_TARGET_POSITIONS, padded_sequences.shape[1] - 1)

for i in range(num_positions):
    targets = padded_sequences[:, i+1]

    # Sentences that have already ended have padding (0) here; teaching the
    # model to predict padding is not useful, so keep only real next words.
    has_next_word = targets != 0
    if not has_next_word.any():
        continue

    # Boolean indexing returns a copy, so blanking the tail does not modify
    # padded_sequences. Zeros match the 'post' padding used at inference time.
    prefixes = padded_sequences[has_next_word]
    prefixes[:, i+1:] = 0

    model.fit(prefixes, to_categorical(targets[has_next_word], num_classes=vocab_size),
              epochs=1, verbose=1, batch_size=32)



In [None]:
# Define the word recommender function
def get_word_recommendations(sequence, top_k=1):
    """Suggest the most likely next word(s) for a text prefix.

    Args:
        sequence: The text typed so far. Words the tokenizer does not know
            are dropped. An empty string is allowed and is scored as an
            all-padding input.
        top_k: How many suggestions to return, best first. Defaults to 1.

    Returns:
        A list of up to ``top_k`` words, ordered from most to least probable.
    """
    # Encode with the tokenizer itself so the prefix gets exactly the same
    # normalisation as the training text. Unknown words are skipped rather
    # than being turned into padding zeros in the middle of the prefix.
    token_ids = tokenizer.texts_to_sequences([sequence])[0]
    model_input = pad_sequences([token_ids], maxlen=max_sequence_length+1, padding='post')

    predicted_probs = model.predict(model_input)[0]
    ranked_ids = np.argsort(predicted_probs)[::-1]

    # Class k of the softmax is word id k, so no +1 shift is needed. Class 0
    # is the padding slot and has no entry in index_word, so it is skipped.
    best_ids = [int(word_id) for word_id in ranked_ids if word_id != 0][:max(top_k, 0)]

    return [tokenizer.index_word[word_id] for word_id in best_ids]



In [None]:
# Example usage: request a suggestion for an empty prompt and show it.
word_recommendations = get_word_recommendations(sequence='')
print(word_recommendations)

['है।']
