<a href="https://colab.research.google.com/github/Daksh024/NSP/blob/Colab/lstm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
import torch.nn as nn
import numpy as np

# Read the corpus: one sentence per line, surrounding whitespace removed.
# Blank lines are skipped; they would otherwise turn into empty sequences that
# consist of nothing but padding.
with open("/content/sample_data/filenew.txt", "r", encoding="utf-8") as file:
    stripped_lines = (line.strip() for line in file)
    dataset = [line for line in stripped_lines if line]

if not dataset:
    raise ValueError("The corpus file contains no non-empty lines; nothing to train on.")

# Tokenize the dataset (whitespace split) and create a vocabulary.
# Index 0 is reserved for a dedicated padding token so that padded positions
# can never be confused with a real word. (Previously the padding value 0 was
# also the index of the first word in the sorted vocabulary.)
PAD_TOKEN = "<pad>"
PAD_IDX = 0
words = [sentence.split() for sentence in dataset]
unique_words = sorted({w for sentence in words for w in sentence} - {PAD_TOKEN})
word_list = np.array([PAD_TOKEN] + unique_words)
word2idx = {w: i for i, w in enumerate(word_list)}
idx2word = {i: w for i, w in enumerate(word_list)}

# Convert sentences to sequences of word indices (reuse the token lists built above)
sequences = [[word2idx[w] for w in sentence] for sentence in words]

# Right-pad every sequence with PAD_IDX so all rows share the longest length
max_seq_length = max(len(seq) for seq in sequences)
padded_sequences = [seq + [PAD_IDX] * (max_seq_length - len(seq)) for seq in sequences]

# Convert to PyTorch tensors: one row per sentence, one column per token position
inputs = torch.tensor(padded_sequences, dtype=torch.long)

# LSTM-based model for next word prediction
class NextWordPredictionModel(nn.Module):
    """Embedding -> LSTM -> linear projection onto the vocabulary.

    Args:
        vocab_size: Number of distinct token ids; sets both the embedding
            table size and the width of the output layer.
        embedding_dim: Size of each token embedding vector.
        hidden_dim: Size of the LSTM hidden state.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super().__init__()
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=embedding_dim,
        )
        # batch_first=True: the LSTM consumes (batch, seq, feature) tensors.
        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            batch_first=True,
        )
        self.fc = nn.Linear(in_features=hidden_dim, out_features=vocab_size)

    def forward(self, x):
        """Return unnormalised vocabulary scores for every input position.

        Args:
            x: Integer tensor of token ids, e.g. shaped (batch, seq_len).

        Returns:
            Tensor with the same leading dimensions as ``x`` and a trailing
            dimension of ``vocab_size``.
        """
        # The LSTM's final (h_n, c_n) state is not needed; only per-step outputs are.
        hidden_states, _final_state = self.lstm(self.embedding(x))
        return self.fc(hidden_states)

# Model hyperparameters
vocab_size = len(word2idx)
embedding_dim = 100
hidden_dim = 128

# Seed PyTorch so weight initialisation (and hence the loss curve) is repeatable
SEED = 42
torch.manual_seed(SEED)

# Create model instance
model = NextWordPredictionModel(vocab_size, embedding_dim, hidden_dim)

# Build next-word training pairs from the padded index tensor.
# `input_tensors` / `output_tensors` were used below without ever being defined,
# which raises NameError on a fresh kernel. Here the model reads tokens
# 0..T-2 of each sentence and is trained to predict tokens 1..T-1.
if inputs.size(1) < 2:
    raise ValueError("Need sequences of at least two tokens to form (input, next word) pairs.")
input_tensors = inputs[:, :-1]
output_tensors = inputs[:, 1:]

# Loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Training loop: every epoch is a single full-batch update over all sentences
epochs = 100
for epoch in range(epochs):
    model.train()
    optimizer.zero_grad()
    outputs = model(input_tensors)
    # Flatten to (positions, vocab) vs (positions,). reshape is required because
    # the shifted target slice is not contiguous, so .view(-1) would fail on it.
    loss = criterion(outputs.reshape(-1, vocab_size), output_tensors.reshape(-1))
    loss.backward()
    optimizer.step()

    print(f"Epoch [{epoch+1}/{epochs}], Loss: {loss.item():.4f}")



Epoch [1/100], Loss: 8.2018
Epoch [2/100], Loss: 8.1117
Epoch [3/100], Loss: 8.0213
Epoch [4/100], Loss: 7.9292
Epoch [5/100], Loss: 7.8338
Epoch [6/100], Loss: 7.7338
Epoch [7/100], Loss: 7.6277
Epoch [8/100], Loss: 7.5141
Epoch [9/100], Loss: 7.3919
Epoch [10/100], Loss: 7.2599
Epoch [11/100], Loss: 7.1170
Epoch [12/100], Loss: 6.9623
Epoch [13/100], Loss: 6.7948
Epoch [14/100], Loss: 6.6140
Epoch [15/100], Loss: 6.4194
Epoch [16/100], Loss: 6.2110
Epoch [17/100], Loss: 5.9889
Epoch [18/100], Loss: 5.7540
Epoch [19/100], Loss: 5.5075
Epoch [20/100], Loss: 5.2514
Epoch [21/100], Loss: 4.9883
Epoch [22/100], Loss: 4.7220
Epoch [23/100], Loss: 4.4566
Epoch [24/100], Loss: 4.1969
Epoch [25/100], Loss: 3.9473
Epoch [26/100], Loss: 3.7112
Epoch [27/100], Loss: 3.4904
Epoch [28/100], Loss: 3.2851
Epoch [29/100], Loss: 3.0946
Epoch [30/100], Loss: 2.9175
Epoch [31/100], Loss: 2.7526
Epoch [32/100], Loss: 2.5984
Epoch [33/100], Loss: 2.4539
Epoch [34/100], Loss: 2.3177
Epoch [35/100], Loss: 2

In [None]:
# Inference: greedily extend a seed word with the model's most likely next words
seed_word = "रंग"
if seed_word not in word2idx:
    raise KeyError(f"Seed word {seed_word!r} is not in the vocabulary")
sample_input = torch.tensor([word2idx[seed_word]], dtype=torch.long).unsqueeze(0)

# Number of words to append to the seed. Kept under its own name so that it
# does not overwrite `max_seq_length`, the padded sentence length computed
# during preprocessing.
num_words_to_generate = 3  # You can adjust this based on your requirements

model.eval()
with torch.no_grad():
    for _ in range(num_words_to_generate):
        # Re-run the model on the whole sequence so far; take the arg-max token
        # predicted at the final position.
        predicted_indices = model(sample_input).argmax(dim=-1)
        last_predicted_index = predicted_indices[:, -1]
        # unsqueeze(1) yields a (batch, 1) column, which is the shape needed
        # to concatenate along the sequence dimension.
        sample_input = torch.cat((sample_input, last_predicted_index.unsqueeze(1)), dim=1)

# Index row 0 rather than squeeze(): squeeze() collapses a length-1 sequence to
# a scalar, whose tolist() is a bare int instead of a list.
predicted_indices = sample_input[0].tolist()
predicted_words = [idx2word[idx] for idx in predicted_indices]
print("Generated sentence:", ' '.join(predicted_words))

Generated sentence: रंग तेजस्‍वी 'चलो, -
