In [27]:
import re
import json

def tokenize_line(text):
    return re.findall(r'[\u0900-\u097F]+|[^\s\w]', text, re.UNICODE)

data = []
with open("data/scraped_all.json", "r", encoding="utf-8") as f:
    for line in f:
        if line.strip():
            data.append(json.loads(line))

tokens = []
for entry in data:
    for line in entry.get("lines", []):
        if line.strip():
            tokenss = tokenize_line(line)
            for token in tokenss:
                tokens.append(token)
print(tokens[:200])




['जीकर', 'भी', 'जी', 'न', 'सके', ',', 'हम', 'तो', 'मर', 'न', 'सके', ',', 'ज़िंदगी', 'से', 'हम', 'प्यार', 'कर', 'न', 'सकें', ',', 'चाहा', 'जिसको', 'हमने', 'वो', ',', 'मिला', 'ही', 'नही', ',', ',', 'ज़िंदगी', 'से', 'हमें', 'ये', 'गिला', 'भी', 'नहीं', ',', 'प्यार', 'क्यूं', 'अब', 'एक', 'सज़ा', 'बन', 'गया', ',', 'रह', 'रह', 'कर', 'ये', 'इक', 'सजा', 'बन', 'गया।', 'हो', 'खुशी', 'चाहें', 'गम', ',', 'सब', 'एक', 'ही', 'मौसम', ',', 'हमनें', 'तो', 'दर्द', 'से', 'नाता', 'ये', 'जोड़', 'लिया', ',', 'ये', 'रिश्ते', 'प्यार', 'के', 'अहसास', 'से', 'बनते', 'हैं', ',', 'लेकिन', 'लोगों', 'ने', 'इन्हे', 'व्यापार', 'समझ', 'लिया।', 'कभी', 'गुब्बारे', 'सा', 'फूट', 'गया', 'था', 'जो', 'मेरा', 'दिल', 'वो', 'अब', 'सुकून', 'पा', 'रहा', 'है', 'कुछ', 'समय', 'बाद', 'ही', 'सही', 'पर', 'तुम्हारा', 'फोन', 'आ', 'रहा', 'है', 'इस', 'खुशी', 'का', 'कैसे', 'जश्न', 'मनाऊं', 'मैं', 'तुम्हारा', 'नाम', 'लेकर', 'ही', 'सुकून', 'आ', 'रहा', 'है', 'आसान', 'नहीं', 'है', 'रास्ता', 'नाम', 'कमाने', 'के', 'लिए', '|', 'अभ्यास', 'करना', 'पड़त

In [28]:
vocab = sorted(set(tokens))
word2idx = {word: idx for idx, word in enumerate(vocab)}
idx2word = {idx: word for word, idx in word2idx.items()}

print(f"Vocab size: {len(vocab)}")


Vocab size: 7550


In [29]:
seq_length = 30
inputs = []
targets = []

for i in range(len(tokens) - seq_length):
    seq = tokens[i:i+seq_length]          # input words
    target = tokens[i+seq_length]         # next word to predict
    inputs.append([word2idx[w] for w in seq])
    targets.append(word2idx[target])

import torch

X = torch.tensor(inputs) 
y = torch.tensor(targets) 

dataset_size = len(X)
split_ratio = 0.8
split_idx = int(dataset_size * split_ratio)

X_train = X[:split_idx]
y_train = y[:split_idx]

X_test = X[split_idx:]
y_test = y[split_idx:]

print(f"Train size: {len(X_train)}, Test size: {len(X_test)}")


Train size: 32394, Test size: 8099


In [30]:
import torch.nn as nn

class NextWordRNN(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)   
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers=2, batch_first=True, ) 
        self.fc = nn.Linear(hidden_size, vocab_size)         

    def forward(self, x):
        x = self.embedding(x)    
        out, _ = self.lstm(x)     
        out = out[:, -1, :]   
        out = self.fc(out)
        return out


In [35]:
vocab_size = len(vocab)
embed_size = 64
hidden_size = 128
model = NextWordRNN(vocab_size, embed_size, hidden_size)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)


In [36]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

model = model.to(device)
X = X.to(device)
y = y.to(device)

X_test = X_test.to(device)
y_test = y_test.to(device)

num_epochs = 250
batch_size = 128

for epoch in range(num_epochs):
    # --- Training ---
    model.train()
    epoch_loss = 0
    correct = 0
    total = 0

    for i in range(0, len(X), batch_size):
        xb = X[i:i+batch_size].to(device)
        yb = y[i:i+batch_size].to(device)

        optimizer.zero_grad()
        outputs = model(xb)
        loss = criterion(outputs, yb)
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

        # Calculate accuracy
        _, predicted = torch.max(outputs, dim=1) 
        correct += (predicted == yb).sum().item()
        total += yb.size(0)

    avg_train_loss = epoch_loss / (len(X) // batch_size)
    train_accuracy = correct / total * 100
    print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {avg_train_loss:.4f}, Train Accuracy: {train_accuracy:.2f}%")

    # --- Testing / Validation ---
    model.eval()
    test_loss = 0
    correct_test = 0
    total_test = 0
    with torch.no_grad():
        for i in range(0, len(X_test), batch_size):
            xb = X_test[i:i+batch_size].to(device)
            yb = y_test[i:i+batch_size].to(device)
            outputs = model(xb)
            loss = criterion(outputs, yb)
            test_loss += loss.item()

            _, predicted = torch.max(outputs, dim=1)
            correct_test += (predicted == yb).sum().item()
            total_test += yb.size(0)

    avg_test_loss = test_loss / (len(X_test) // batch_size)
    test_accuracy = correct_test / total_test * 100
    print(f"Epoch {epoch+1}/{num_epochs}, Test Loss: {avg_test_loss:.4f}, Test Accuracy: {test_accuracy:.2f}%")


Using device: cuda
Epoch 1/250, Train Loss: 7.6117, Train Accuracy: 4.18%
Epoch 1/250, Test Loss: 7.2890, Test Accuracy: 4.54%
Epoch 2/250, Train Loss: 7.0857, Train Accuracy: 5.16%
Epoch 2/250, Test Loss: 6.9350, Test Accuracy: 6.07%
Epoch 3/250, Train Loss: 6.9364, Train Accuracy: 5.40%
Epoch 3/250, Test Loss: 6.8112, Test Accuracy: 6.06%
Epoch 4/250, Train Loss: 6.8118, Train Accuracy: 5.69%
Epoch 4/250, Test Loss: 6.6949, Test Accuracy: 6.17%
Epoch 5/250, Train Loss: 6.6984, Train Accuracy: 5.95%
Epoch 5/250, Test Loss: 6.6021, Test Accuracy: 6.56%
Epoch 6/250, Train Loss: 6.5902, Train Accuracy: 6.01%
Epoch 6/250, Test Loss: 6.4984, Test Accuracy: 6.77%
Epoch 7/250, Train Loss: 6.4967, Train Accuracy: 6.15%
Epoch 7/250, Test Loss: 6.5233, Test Accuracy: 6.79%
Epoch 8/250, Train Loss: 6.4039, Train Accuracy: 6.26%
Epoch 8/250, Test Loss: 6.4216, Test Accuracy: 6.83%
Epoch 9/250, Train Loss: 6.2969, Train Accuracy: 6.55%
Epoch 9/250, Test Loss: 6.3750, Test Accuracy: 6.94%
Epoch 10/

In [37]:
torch.save(model.state_dict(), "next_word.pth")
import pickle
with open("word2idx.pkl", "wb") as f:
    pickle.dump(word2idx, f)
with open("idx2word.pkl", "wb") as f:
    pickle.dump(idx2word, f)


In [None]:
def predict_next_word(model, prompt, k=3):
    model.eval()
    prompt_tokens = re.findall(r'[\u0900-\u097F]+|[^\s\w]', prompt, re.UNICODE)
    input_seq = prompt_tokens[-seq_length:]
    input_ids = [word2idx.get(w, 0) for w in input_seq]
    input_tensor = torch.tensor([input_ids]).to(next(model.parameters()).device) 

    with torch.no_grad():
        output = model(input_tensor)                
        probs = torch.softmax(output, dim=1)         # probabilities
        top_probs, top_indices = torch.topk(probs, k)

        predictions = [idx2word[idx.item()] for idx in top_indices[0]]
        return predictions

