<a href="https://colab.research.google.com/github/Adithyan-mp/Sequence_Model/blob/main/RNNLanguageModel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install datasets



In [None]:
import torch
from torch import nn
import numpy as np
import pandas as pd
from torch.utils.data import Dataset,DataLoader
from torch.nn.utils.rnn import pad_sequence
from datasets import load_dataset
import re
from collections import Counter

In [None]:
# Load only the first 1000 rows of the "bookcorpus" train split.
# NOTE: the name `dataset` is re-bound to a CustomDataset in the training cell further
# down, so re-run this cell before using `dataset` as the raw corpus again.
dataset = load_dataset("bookcorpus", split='train[:1000]')

# The 'text' column holds the raw sentences that are tokenized below.
texts = dataset['text']
def tokenization(text):
    """Split ``text`` on whitespace and terminate the token list with "<EOS>".

    Runs of whitespace are first collapsed to a single space, the result is split
    into tokens, and an end-of-sequence marker is appended as the final element.

    Returns:
        list[str]: the whitespace-delimited tokens followed by "<EOS>".
    """
    collapsed = re.sub(r"\s+"," ",text)
    return collapsed.split() + ["<EOS>"]

# Tokenize every sentence; tokenization() appends "<EOS>" to each token list.
tokens = [tokenization(text) for text in texts]
print(texts[0],"|" , tokens[0])

# Flatten all sentences into one token stream and count occurrences.
all_tokens = [token for sentence in tokens for token in sentence]
counter = Counter(all_tokens)

# Reserve the special ids first, then number the corpus tokens by descending frequency.
# Skipping corpus tokens that collide with a special token keeps the ids contiguous in
# [0, len(vocab)); otherwise a literal "<PAD>"/"<UNK>" in the corpus would be overwritten,
# leave a gap, and push the largest id to len(vocab) -- out of range for an embedding
# table sized with len(vocab). For corpora without such tokens the ids are unchanged.
vocab = {'<PAD>': 0, '<UNK>': 1}
corpus_tokens = [token for token, _ in counter.most_common() if token not in vocab]
vocab.update((token, idx) for idx, token in enumerate(corpus_tokens, start=len(vocab)))

# Show a summary instead of dumping the whole mapping into the cell output.
print(f"vocab size: {len(vocab)}")
print(list(vocab.items())[:20])

def tokens_to_indices(tokens, vocab):
    """Translate a sequence of tokens into their integer ids.

    Tokens missing from ``vocab`` are mapped to the id stored under '<UNK>'.

    Returns:
        list: one id per input token, in the same order.
    """
    indices = []
    for tok in tokens:
        indices.append(vocab.get(tok, vocab['<UNK>']))
    return indices

# One 1-D id tensor per sentence (lengths still differ from sentence to sentence).
encoded_token = list(map(torch.tensor, (tokens_to_indices(sent, vocab) for sent in tokens)))
print(encoded_token[:2])

# Right-pad every sentence with id 0 to the longest length, giving a (num_sentences, max_len) tensor.
padded_token = pad_sequence(encoded_token, batch_first=True, padding_value=0)
print(padded_token[0].shape)

class CustomDataset(Dataset):
    """Map-style dataset that pairs ``inputs[i]`` with ``labels[i]``.

    Args:
        inputs: Indexable collection of model inputs; its length is the dataset length.
        labels: Indexable collection of targets, indexed in parallel with ``inputs``.
        transform: Optional callable invoked as ``transform(input_item, label_item)``;
            its result is unpacked into two values that replace the pair.
    """

    def __init__(self, inputs, labels, transform=None):
        self.inputs, self.labels, self.transform = inputs, labels, transform

    def __len__(self):
        """Number of samples, taken from ``inputs`` alone."""
        return len(self.inputs)

    def __getitem__(self, index):
        """Return the ``(input, label)`` pair at ``index``, transformed when configured."""
        sample = self.inputs[index]
        target = self.labels[index]
        if not self.transform:
            return sample, target
        sample, target = self.transform(sample, target)
        return sample, target

class RNN(nn.Module):
    """Single-layer tanh RNN language model, unrolled by hand over the time axis.

    At every time step the token embedding is concatenated with the previous hidden
    state, passed through one linear layer followed by tanh to give the new hidden
    state, and that hidden state is projected to vocabulary logits.

    Args:
        hidden_dim: Size of the recurrent hidden state.
        vocab_size: Number of embedding rows and width of the output logits.
        embedding_dim: Size of each token embedding (default 64).
    """

    def __init__(self, hidden_dim, vocab_size, embedding_dim=64):
        super().__init__()
        self.hidden_dim = hidden_dim
        # Row 0 is the padding embedding (padding_idx=0).
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim, padding_idx=0)
        # Recurrent cell: consumes [embedding ; previous hidden] and emits the next hidden.
        self.h1 = nn.Linear(hidden_dim + embedding_dim, hidden_dim)
        # Projects a hidden state to one logit per vocabulary entry.
        self.out = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x):
        """Compute next-token logits for every position of every sequence.

        Args:
            x: Integer tensor of token ids with shape (batch_size, seq_len).

        Returns:
            Tensor of shape (batch_size, seq_len, vocab_size). A zero-length
            sequence axis yields an empty (batch_size, 0, vocab_size) tensor.
        """
        batch_size, seq_len = x.size()
        x_embed = self.embedding(x)

        # new_zeros inherits dtype and device from the embeddings, so the initial hidden
        # state matches the model's parameters (e.g. after .double() or .to(bfloat16)).
        prev_ac = x_embed.new_zeros((batch_size, self.hidden_dim))

        if seq_len == 0:
            # torch.stack() rejects an empty list; return an empty logits tensor instead.
            return x_embed.new_zeros((batch_size, 0, self.out.out_features))

        outputs = []
        for i in range(seq_len):
            token = x_embed[:, i, :]
            combined = torch.cat((token, prev_ac), dim=1)
            prev_ac = torch.tanh(self.h1(combined))
            outputs.append(self.out(prev_ac))

        # Re-assemble the per-step logits along a new time axis.
        return torch.stack(outputs, dim=1)



usually , he would be tearing around the living room , playing with his toys . | ['usually', ',', 'he', 'would', 'be', 'tearing', 'around', 'the', 'living', 'room', ',', 'playing', 'with', 'his', 'toys', '.', '<EOS>']
{'<EOS>': 2, '.': 3, ',': 4, "''": 5, '``': 6, 'the': 7, 'to': 8, 'a': 9, 'she': 10, 'he': 11, 'her': 12, 'and': 13, 'i': 14, 'you': 15, "'s": 16, 'was': 17, 'of': 18, 'his': 19, 'in': 20, 'megan': 21, '?': 22, 'with': 23, "n't": 24, 'that': 25, 'him': 26, 'it': 27, 'aidan': 28, 'emma': 29, 'had': 30, 'pesh': 31, 'at': 32, 'as': 33, 'for': 34, 'on': 35, 'up': 36, 'noah': 37, 'be': 38, 'do': 39, 'just': 40, 'me': 41, 'could': 42, 'when': 43, 'have': 44, 'is': 45, 'like': 46, 'my': 47, 'by': 48, 'what': 49, 'mason': 50, 'before': 51, 'casey': 52, 'out': 53, "'m": 54, 'would': 55, 'time': 56, 'about': 57, 'not': 58, 'all': 59, 'did': 60, 'so': 61, 'but': 62, 'over': 63, 'into': 64, 'they': 65, 'were': 66, 'asked': 67, 'head': 68, 'if': 69, 'been': 70, 'are': 71, 'after': 72,

In [None]:
# Reproducibility: seed torch's global RNG before the model is built so that both the
# weight initialisation and the DataLoader's shuffle order repeat on a fresh kernel.
SEED = 42
torch.manual_seed(SEED)

# Build next-token prediction pairs: the input is the sequence without its last token,
# the target is the same sequence shifted left by one position.
input_seqs = []
target_seqs = []

for seq in encoded_token:
    if len(seq) < 2:
        # A one-token sequence has nothing to predict.
        continue
    input_seqs.append(seq[:-1])
    target_seqs.append(seq[1:])

# Right-pad with id 0 so all pairs share one length; id 0 is excluded from the loss below.
input_padded = pad_sequence(input_seqs, batch_first=True, padding_value=0)
target_padded = pad_sequence(target_seqs, batch_first=True, padding_value=0)

# NOTE: this re-binds `dataset`, which previously referred to the raw corpus.
dataset = CustomDataset(input_padded, target_padded)
loader = DataLoader(dataset, batch_size=32, shuffle=True)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = RNN(hidden_dim=128, vocab_size=len(vocab), embedding_dim=64).to(device)
# ignore_index=0 keeps padded target positions from contributing to the loss.
criterion = nn.CrossEntropyLoss(ignore_index=0)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
num_epochs = 150

for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    for batch_inputs, batch_targets in loader:
        batch_inputs = batch_inputs.to(device)
        batch_targets = batch_targets.to(device)

        optimizer.zero_grad()
        outputs = model(batch_inputs)  # (batch_size, seq_len, vocab_size)

        # Flatten batch and time so every position is one classification example.
        # The class count is read from the logits themselves, and reshape (unlike view)
        # does not require contiguous memory.
        loss = criterion(outputs.reshape(-1, outputs.size(-1)), batch_targets.reshape(-1))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    # Mean of the per-batch mean losses.
    avg_loss = total_loss / len(loader)
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}")


Epoch 1/150, Loss: 6.8072
Epoch 2/150, Loss: 5.4652
Epoch 3/150, Loss: 5.2195
Epoch 4/150, Loss: 5.0286
Epoch 5/150, Loss: 4.8363
Epoch 6/150, Loss: 4.6689
Epoch 7/150, Loss: 4.5225
Epoch 8/150, Loss: 4.3792
Epoch 9/150, Loss: 4.2505
Epoch 10/150, Loss: 4.1240
Epoch 11/150, Loss: 4.0083
Epoch 12/150, Loss: 3.8913
Epoch 13/150, Loss: 3.7907
Epoch 14/150, Loss: 3.6906
Epoch 15/150, Loss: 3.5895
Epoch 16/150, Loss: 3.5007
Epoch 17/150, Loss: 3.4111
Epoch 18/150, Loss: 3.3216
Epoch 19/150, Loss: 3.2352
Epoch 20/150, Loss: 3.1438
Epoch 21/150, Loss: 3.0662
Epoch 22/150, Loss: 2.9925
Epoch 23/150, Loss: 2.9141
Epoch 24/150, Loss: 2.8392
Epoch 25/150, Loss: 2.7623
Epoch 26/150, Loss: 2.6905
Epoch 27/150, Loss: 2.6249
Epoch 28/150, Loss: 2.5530
Epoch 29/150, Loss: 2.4780
Epoch 30/150, Loss: 2.4181
Epoch 31/150, Loss: 2.3600
Epoch 32/150, Loss: 2.2967
Epoch 33/150, Loss: 2.2385
Epoch 34/150, Loss: 2.1833
Epoch 35/150, Loss: 2.1252
Epoch 36/150, Loss: 2.0631
Epoch 37/150, Loss: 2.0080
Epoch 38/1

In [None]:
def _rnn_step(model, token_emb, hidden):
    """Advance the hidden state by one token: tanh(h1([token_emb ; hidden]))."""
    return torch.tanh(model.h1(torch.cat((token_emb, hidden), dim=1)))


def generate_text(model, vocab, idx_to_token, start_tokens, max_len=20):
    """Greedily continue a prompt with the trained RNN language model.

    The prompt is fed through the recurrence to build up a hidden state, then the
    arg-max token is emitted repeatedly until "<EOS>" is produced or ``max_len``
    tokens have been generated.

    Args:
        model: Module exposing ``embedding``, ``h1``, ``out`` and ``hidden_dim``.
            It is switched to eval mode and left in that mode.
        vocab: Mapping from token to id; must contain '<UNK>' for unknown prompt tokens.
        idx_to_token: Mapping from id to token; ids missing from it render as '<UNK>'.
        start_tokens: Iterable of prompt tokens (list, tuple, ...). It is not modified.
        max_len: Maximum number of tokens to generate after the prompt.

    Returns:
        str: The prompt tokens followed by the generated tokens, joined by single spaces.
    """
    model.eval()

    # Copy into a fresh list: accepts any iterable and leaves the caller's object untouched.
    generated = list(start_tokens)
    input_ids = [vocab.get(token, vocab['<UNK>']) for token in generated]

    weight = model.embedding.weight
    device = weight.device
    input_tensor = torch.tensor([input_ids], dtype=torch.long, device=device)
    # Initial hidden state follows the model's dtype and device.
    prev_ac = weight.new_zeros((1, model.hidden_dim))

    with torch.no_grad():
        x_embed = model.embedding(input_tensor)

        # Feed the prompt tokens to warm up the hidden state.
        for i in range(x_embed.size(1)):
            prev_ac = _rnn_step(model, x_embed[:, i, :], prev_ac)

        # Generate new tokens greedily.
        for _ in range(max_len):
            logits = model.out(prev_ac)
            next_token_id = torch.argmax(logits, dim=1).item()
            next_token = idx_to_token.get(next_token_id, '<UNK>')
            generated.append(next_token)

            if next_token == "<EOS>":
                break

            # Feed the emitted token back in as the next input.
            token_emb = model.embedding(torch.tensor([next_token_id], device=device))
            prev_ac = _rnn_step(model, token_emb, prev_ac)

    return ' '.join(generated)


In [None]:
# Prompt tokens used to prime the model.
prompt = ["he", "was"]

# Invert the token -> id vocabulary so predicted ids can be turned back into text.
idx_to_token = dict(zip(vocab.values(), vocab.keys()))

generated_text = generate_text(model, vocab, idx_to_token, prompt)
print("Generated:", generated_text)


Generated: he was taken aback by her words and the passion with which she delivered them . <EOS>
