In [1]:
!pip install nltk



In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from collections import Counter
from torch.utils.data import Dataset, DataLoader
from nltk.tokenize import word_tokenize
import nltk

In [8]:
# Training corpus: short sayings in romanised Hindi, one sentence per line.
# The text ends with a newline before the closing quotes, so splitting it on
# '\n' yields one trailing empty string.
document = """Karma karo, phal ki chinta mat karo.
Jo tumhare control mein hai, wahi karo.
Jo control mein nahi, usse chhod do.
Intention sahi rakho, result apne aap sudhrega.
Har kaam bhagwan ko samarpit karke karo.
Kaam se bhaagna samasya ka hal nahi.
Dar se decision mat lo.
Dharma ke liye khade raho.
Galat ke saamne chup rehna bhi galat hai.
Apni zimmedari se mat bhago.
Har paristhiti temporary hoti hai.
Dukh bhi aata hai, chala bhi jaata hai.
Sukh bhi aata hai, chala bhi jaata hai.
Jo badalta hai, woh satya nahi hota.
Aatma kabhi marti nahi.
Shareer nashwar hai.
Aatma amar hai.
Jo ja raha hai, usse pakadne ki koshish mat karo.
Jo mil raha hai, uska abhaar rakho.
Laalach tumhe andha bana deta hai.
Ahankar tumhe akela kar deta hai.
Shanti bahar nahi, andar milti hai.
Khud se ladna sabse badi yudh hai.
Jeetne se zyada, sahi rehna zaroori hai.
Dusron se tulna dukh ka kaaran hai.
Apni yatra par dhyaan do.
Dusron ke raste mat copy karo.
Jo tum ho, wahi best hai.
Bhagya tumhare haath mein nahi, karm tumhare haath mein hai.
Aaj ka karm, kal ka bhagya banata hai.
Aalas tumhara sabse bada shatru hai.
Krodh tumhara vivek chheen leta hai.
Kaamna tumhe bandh deti hai.
Asakti tumhe kamzor banati hai.
Tyag tumhe shaktishaali banata hai.
Santulan jeevan ka mool mantra hai.
Zyada bhi theek nahi, kam bhi theek nahi.
Madhyam marg hi sahi marg hai.
Jo ho raha hai, usse samjho.
Jo nahi samajh aaye, use sweekar karo.
Bhagwan par bharosa rakho.
Khud par bhi bharosa rakho.
Dar tab aata hai jab bharosa kam hota hai.
Shraddha tumhe mazboot banati hai.
Man tumhara mitra bhi hai, shatru bhi.
Man ko shant karna seekho.
Dhyaan se man ko niyantrit karo.
Roz thoda samay khud ke liye nikaalo.
Shor se door rehkar sach sunai deta hai.
Chhoti khushiyon ko mehsoos karo.
Bade sapne rakho, par zameen par raho.
Jo mil gaya, us par ghamand mat karo.
Jo nahi mila, us par dukh mat karo.
Har vyakti apni yudh lad raha hai.
Sabke dard ka tumhe pata nahi.
Isliye daya rakho.
Isliye kshama rakho.
Jo tumne boya hai, wahi kaatoge.
Beej aaj daalo, phal kal milega.
Sabr rakho.
Samay sab sikhata hai.
Jo tumhe todta hai, wahi tumhe banata hai.
Kathinai tumhari pariksha hai.
Pariksha se hi paripakvata aati hai.
Paripakvata se hi shanti aati hai.
Shanti hi asli safalta hai.
Safalta sirf paisa nahi hoti.
Safalta man ki shanti hai.
Jahan shanti hai, wahi jeevan hai.
Jahan asha hai, wahi bhagwan hai
"""


In [9]:
# Fetch the NLTK data packages 'punkt' and 'punkt_tab' (presumably the models
# word_tokenize relies on). The second call's return value is what the cell displays.
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [10]:
# Lower-case the whole corpus, then split it into word and punctuation tokens.
tokens = word_tokenize(document.lower())

In [11]:
# Vocabulary: token -> integer id. Id 0 is reserved for '<unk>'; every other
# token receives the next free id in order of first appearance in `tokens`.
vocab = {'<unk>': 0}

for token in Counter(tokens):
  # setdefault only inserts when the token has no id yet
  vocab.setdefault(token, len(vocab))

vocab

{'<unk>': 0,
 'karma': 1,
 'karo': 2,
 ',': 3,
 'phal': 4,
 'ki': 5,
 'chinta': 6,
 'mat': 7,
 '.': 8,
 'jo': 9,
 'tumhare': 10,
 'control': 11,
 'mein': 12,
 'hai': 13,
 'wahi': 14,
 'nahi': 15,
 'usse': 16,
 'chhod': 17,
 'do': 18,
 'intention': 19,
 'sahi': 20,
 'rakho': 21,
 'result': 22,
 'apne': 23,
 'aap': 24,
 'sudhrega': 25,
 'har': 26,
 'kaam': 27,
 'bhagwan': 28,
 'ko': 29,
 'samarpit': 30,
 'karke': 31,
 'se': 32,
 'bhaagna': 33,
 'samasya': 34,
 'ka': 35,
 'hal': 36,
 'dar': 37,
 'decision': 38,
 'lo': 39,
 'dharma': 40,
 'ke': 41,
 'liye': 42,
 'khade': 43,
 'raho': 44,
 'galat': 45,
 'saamne': 46,
 'chup': 47,
 'rehna': 48,
 'bhi': 49,
 'apni': 50,
 'zimmedari': 51,
 'bhago': 52,
 'paristhiti': 53,
 'temporary': 54,
 'hoti': 55,
 'dukh': 56,
 'aata': 57,
 'chala': 58,
 'jaata': 59,
 'sukh': 60,
 'badalta': 61,
 'woh': 62,
 'satya': 63,
 'hota': 64,
 'aatma': 65,
 'kabhi': 66,
 'marti': 67,
 'shareer': 68,
 'nashwar': 69,
 'amar': 70,
 'ja': 71,
 'raha': 72,
 'pakadne': 7

In [12]:
# Number of vocabulary entries, counting the '<unk>' placeholder.
len(vocab)

204

In [13]:
# One entry per line of the corpus. Because `document` ends with a newline, the
# final entry is an empty string.
input_sentences = document.split('\n')

In [14]:
def text_to_indices(sentence, vocab):
  """Map a sequence of tokens to their vocabulary ids.

  Args:
    sentence: iterable of already-tokenized tokens.
    vocab: dict mapping token -> integer id; '<unk>' must be present
      whenever a token may be missing from it.

  Returns:
    A list with one id per token; tokens not found in `vocab` are
    replaced by vocab['<unk>'].
  """
  return [vocab[token] if token in vocab else vocab['<unk>'] for token in sentence]


In [15]:
# Tokenize every lower-cased corpus line and convert it to vocabulary ids.
input_numerical_sentences = [
  text_to_indices(word_tokenize(sentence.lower()), vocab)
  for sentence in input_sentences
]


In [16]:
# Number of encoded sentences (one per entry of input_sentences).
len(input_numerical_sentences)

71

In [17]:
# Expand each encoded sentence into all of its prefixes of length >= 2,
# shortest first. Sentences with fewer than two ids contribute nothing.
training_sequence = [
  sentence[:end]
  for sentence in input_numerical_sentences
  for end in range(2, len(sentence) + 1)
]

In [18]:
# Total number of training prefixes.
len(training_sequence)

451

In [19]:
# Peek at the first five training prefixes.
training_sequence[:5]

[[1, 2], [1, 2, 3], [1, 2, 3, 4], [1, 2, 3, 4, 5], [1, 2, 3, 4, 5, 6]]

In [20]:
# Length of every training prefix; the cell displays the longest one.
len_list = [len(sequence) for sequence in training_sequence]

max(len_list)

12

In [21]:
# The first (shortest) training prefix, before padding.
training_sequence[0]

[1, 2]

In [22]:
# Left-pad every prefix with 0 so that all rows share the length of the longest one.
# Note that 0 is also the id of '<unk>' in `vocab`, so padding and unknown tokens
# are indistinguishable to the model.
max_len = max(len_list)  # computed once rather than on every iteration
padded_training_sequence = [
  [0] * (max_len - len(sequence)) + sequence
  for sequence in training_sequence
]

In [23]:
# Length of one padded row; it should equal the longest prefix length.
len(padded_training_sequence[10])

12

In [24]:
# Convert the padded id lists into one 2-D integer tensor (one row per prefix).
# The name is rebound here from a list of lists to a tensor.
padded_training_sequence = torch.tensor(padded_training_sequence, dtype=torch.long)

In [25]:
# Display the padded tensor.
padded_training_sequence

tensor([[ 0,  0,  0,  ...,  0,  1,  2],
        [ 0,  0,  0,  ...,  1,  2,  3],
        [ 0,  0,  0,  ...,  2,  3,  4],
        ...,
        [ 0,  0,  0,  ..., 13,  3, 14],
        [ 0,  0,  0,  ...,  3, 14, 28],
        [ 0,  0,  0,  ..., 14, 28, 13]])

In [26]:
# Inputs are all columns but the last; the target is the final column,
# i.e. the id of the token that follows the left-padded context.
X = padded_training_sequence[:, :-1]
y = padded_training_sequence[:,-1]

In [27]:
# Display the input contexts.
X

tensor([[  0,   0,   0,  ...,   0,   0,   1],
        [  0,   0,   0,  ...,   0,   1,   2],
        [  0,   0,   0,  ...,   1,   2,   3],
        ...,
        [  0,   0,   0,  ..., 203,  13,   3],
        [  0,   0,   0,  ...,  13,   3,  14],
        [  0,   0,   0,  ...,   3,  14,  28]])

In [28]:
# Display the target ids.
y

tensor([  2,   3,   4,   5,   6,   7,   2,   8,  10,  11,  12,  13,   3,  14,
          2,   8,  11,  12,  15,   3,  16,  17,  18,   8,  20,  21,   3,  22,
         23,  24,  25,   8,  27,  28,  29,  30,  31,   2,   8,  32,  33,  34,
         35,  36,  15,   8,  32,  38,   7,  39,   8,  41,  42,  43,  44,   8,
         41,  46,  47,  48,  49,  45,  13,   8,  51,  32,   7,  52,   8,  53,
         54,  55,  13,   8,  49,  57,  13,   3,  58,  49,  59,  13,   8,  49,
         57,  13,   3,  58,  49,  59,  13,   8,  61,  13,   3,  62,  63,  15,
         64,   8,  66,  67,  15,   8,  69,  13,   8,  70,  13,   8,  71,  72,
         13,   3,  16,  73,   5,  74,   7,   2,   8,  75,  72,  13,   3,  76,
         77,  21,   8,  79,  80,  81,  82,  13,   8,  79,  84,  85,  82,  13,
          8,  87,  15,   3,  88,  89,  13,   8,  32,  91,  92,  93,  94,  13,
          8,  32,  96,   3,  20,  48,  97,  13,   8,  32,  99,  56,  35, 100,
         13,   8, 101, 102, 103,  18,   8,  41, 104,   7, 105,  

In [29]:
class CustomDataset(Dataset):
  """Pairs each row of X with the matching entry of y for use with a DataLoader."""

  def __init__(self, X, y):
    # Only references are kept; nothing is copied or converted.
    self.X, self.y = X, y

  def __len__(self):
    # The sample count is the size of X's first dimension.
    n_samples = self.X.shape[0]
    return n_samples

  def __getitem__(self, idx):
    # Return the (input, target) pair stored at position idx.
    features = self.X[idx]
    target = self.y[idx]
    return features, target

In [30]:
# Wrap the inputs and targets so a DataLoader can batch them.
dataset = CustomDataset(X,y)

In [31]:
# Number of samples in the dataset.
len(dataset)

451

In [32]:
# Training loader: mini-batches of up to 32 samples, drawn in shuffled order.
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

In [33]:
class LSTMModel(nn.Module):
  """Next-token predictor: embedding -> single-layer LSTM -> linear layer over the vocabulary.

  Args:
    vocab_size: number of distinct token ids; also the size of the output layer.
    embedding_dim: size of each token embedding (default 100).
    hidden_dim: size of the LSTM hidden state (default 150).
  """

  def __init__(self, vocab_size, embedding_dim=100, hidden_dim=150):
    super().__init__()
    self.embedding = nn.Embedding(vocab_size, embedding_dim)
    self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
    self.fc = nn.Linear(hidden_dim, vocab_size)

  def forward(self, x):
    """Score every vocabulary entry as the token following each input sequence.

    Args:
      x: integer tensor of token ids, shape (batch, seq_len).

    Returns:
      Tensor of shape (batch, vocab_size) with one unnormalised score per
      vocabulary entry.
    """
    embedded = self.embedding(x)
    # Only the final hidden state is used; the per-step outputs and the cell
    # state are discarded.
    _, (final_hidden_state, _) = self.lstm(embedded)
    # The hidden state carries a leading num_layers dimension. Indexing the last
    # layer always removes it, whereas squeeze(0) only does so when it has size 1.
    output = self.fc(final_hidden_state[-1])
    return output

In [34]:
# The vocabulary size fixes both the embedding table and the output layer.
model = LSTMModel(len(vocab))

In [35]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [36]:
# Move the model to the selected device; the returned module's repr is the cell output.
model.to(device)

LSTMModel(
  (embedding): Embedding(204, 100)
  (lstm): LSTM(100, 150, batch_first=True)
  (fc): Linear(in_features=150, out_features=204, bias=True)
)

In [37]:
# Optimisation settings: number of passes over the data and the Adam step size.
epochs, learning_rate = 50, 1e-3

# Cross-entropy between the model's scores and the target token ids.
criterion = nn.CrossEntropyLoss()

optimizer = optim.Adam(params=model.parameters(), lr=learning_rate)

In [38]:
# Training loop: one optimizer step per mini-batch. The value printed for each
# epoch is the SUM of the per-batch losses, not an average over samples.

for epoch in range(epochs):
  # Explicitly (re-)enter training mode: calculate_accuracy() switches the model
  # to eval mode, so re-running this cell after it would otherwise train in eval mode.
  model.train()
  total_loss = 0

  for batch_x, batch_y in dataloader:

    # Move the batch to the device the model lives on.
    batch_x, batch_y = batch_x.to(device), batch_y.to(device)

    # Gradients accumulate across backward() calls, so clear them first.
    optimizer.zero_grad()

    output = model(batch_x)

    loss = criterion(output, batch_y)

    loss.backward()

    optimizer.step()

    total_loss += loss.item()

  print(f"Epoch: {epoch + 1}, Loss: {total_loss:.4f}")

Epoch: 1, Loss: 78.8082
Epoch: 2, Loss: 68.7965
Epoch: 3, Loss: 61.2350
Epoch: 4, Loss: 58.5772
Epoch: 5, Loss: 54.3442
Epoch: 6, Loss: 50.4760
Epoch: 7, Loss: 47.5215
Epoch: 8, Loss: 42.9104
Epoch: 9, Loss: 41.3246
Epoch: 10, Loss: 37.3016
Epoch: 11, Loss: 36.3039
Epoch: 12, Loss: 33.8987
Epoch: 13, Loss: 31.8856
Epoch: 14, Loss: 27.9623
Epoch: 15, Loss: 26.0276
Epoch: 16, Loss: 23.8528
Epoch: 17, Loss: 22.1571
Epoch: 18, Loss: 21.0534
Epoch: 19, Loss: 18.5350
Epoch: 20, Loss: 16.1364
Epoch: 21, Loss: 15.7142
Epoch: 22, Loss: 14.2167
Epoch: 23, Loss: 13.6456
Epoch: 24, Loss: 11.5543
Epoch: 25, Loss: 11.3589
Epoch: 26, Loss: 9.9606
Epoch: 27, Loss: 9.7168
Epoch: 28, Loss: 8.8845
Epoch: 29, Loss: 8.9935
Epoch: 30, Loss: 7.4683
Epoch: 31, Loss: 7.8186
Epoch: 32, Loss: 7.3589
Epoch: 33, Loss: 6.0470
Epoch: 34, Loss: 5.5495
Epoch: 35, Loss: 5.5762
Epoch: 36, Loss: 5.2767
Epoch: 37, Loss: 5.0169
Epoch: 38, Loss: 4.7331
Epoch: 39, Loss: 4.4253
Epoch: 40, Loss: 4.3462
Epoch: 41, Loss: 4.0677


In [39]:
# prediction

def prediction(model, vocab, text, max_len=61):
  """Append the model's most likely next token to `text`.

  Args:
    model: trained nn.Module mapping a (1, seq_len) tensor of token ids to
      (1, vocab_size) scores.
    vocab: dict mapping token -> integer id (must contain '<unk>').
    text: prompt string; it is lower-cased and tokenized with word_tokenize.
    max_len: length the id sequence is left-padded to with 0. The default of 61
      is kept for backward compatibility; the training inputs `X` are narrower
      (one less than the longest training prefix), so passing
      max_len=X.shape[1] reproduces the padding seen during training.

  Returns:
    `text` followed by a single space and the predicted token.
  """
  # tokenize
  tokenized_text = word_tokenize(text.lower())

  # text -> numerical indices
  numerical_text = text_to_indices(tokenized_text, vocab)

  # left-pad with 0; a prompt longer than max_len is passed through unpadded
  padding = [0] * max(0, max_len - len(numerical_text))

  # build the input on the model's own device so the call also works on CUDA
  first_param = next(model.parameters(), None)
  target_device = first_param.device if first_param is not None else torch.device("cpu")
  padded_text = torch.tensor(padding + numerical_text, dtype=torch.long, device=target_device).unsqueeze(0)

  # inference only: no gradient tracking, and evaluation mode for the duration
  # of the call (the previous mode is restored afterwards)
  was_training = model.training
  model.eval()
  try:
    with torch.no_grad():
      output = model(padded_text)
  finally:
    model.train(was_training)

  # predicted index
  _, index = torch.max(output, dim=1)
  predicted_id = index.item()

  # id -> token lookup that does not rely on dict order matching the ids
  index_to_token = {idx: token for token, idx in vocab.items()}

  # merge with text
  return text + " " + index_to_token[predicted_id]



In [48]:
# Single-step demo: predict the token that follows this prompt.
prediction(model, vocab, "Har kaam bhagwan ko samarpit")

'Har kaam bhagwan ko samarpit karke'

In [49]:
import time

# Greedy generation: each prediction becomes the prompt for the next step.
num_tokens = 10
input_text = "Har kaam "

for _ in range(num_tokens):
  input_text = prediction(model, vocab, input_text)
  print(input_text)
  # short pause so the growing sentence is readable as it is printed
  time.sleep(0.5)


Har kaam  bhagwan
Har kaam  bhagwan ko
Har kaam  bhagwan ko samarpit
Har kaam  bhagwan ko samarpit karke
Har kaam  bhagwan ko samarpit karke karo
Har kaam  bhagwan ko samarpit karke karo .
Har kaam  bhagwan ko samarpit karke karo . .
Har kaam  bhagwan ko samarpit karke karo . . .
Har kaam  bhagwan ko samarpit karke karo . . . .
Har kaam  bhagwan ko samarpit karke karo . . . . .


In [46]:
# Second loader over the same dataset, this time without shuffling.
dataloader1 = DataLoader(dataset, batch_size=32, shuffle=False)

In [47]:
# Function to calculate accuracy
def calculate_accuracy(model, dataloader, device):
    """Return the percentage of samples whose top-scoring class equals the label.

    Args:
        model: module mapping a batch of inputs to per-class scores of shape
            (batch, n_classes).
        dataloader: iterable yielding (inputs, labels) batches.
        device: device the batches are moved to before the forward pass.

    Returns:
        Accuracy in percent as a float; 0.0 when the loader yields no samples.

    Note:
        The model is left in evaluation mode afterwards.
    """
    model.eval()  # Set the model to evaluation mode
    correct = 0
    total = 0

    with torch.no_grad():  # No need to compute gradients
        # Iterate over the loader passed by the caller, not a notebook-level global.
        for batch_x, batch_y in dataloader:
            batch_x, batch_y = batch_x.to(device), batch_y.to(device)

            # Get model predictions
            outputs = model(batch_x)

            # Get the predicted word indices
            _, predicted = torch.max(outputs, dim=1)

            # Compare with actual labels
            correct += (predicted == batch_y).sum().item()
            total += batch_y.size(0)

    # Guard against an empty loader instead of dividing by zero.
    if total == 0:
        return 0.0

    accuracy = correct / total * 100
    return accuracy

# Compute accuracy. Note: `dataloader` wraps the same samples the model was
# trained on, so this is training accuracy, not a held-out estimate.
accuracy = calculate_accuracy(model, dataloader, device)
print(f"Model Accuracy: {accuracy:.2f}%")


Model Accuracy: 94.46%
