<a href="https://colab.research.google.com/github/Busa-sathwika/NLP/blob/main/Asgn8_NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Example text data (you can replace this with any larger corpus) text = """ Once upon a time, there was a little girl named Red Riding Hood. She loved to visit her grandmother, who lived in the woods. One day, her mother asked her to take a basket of goodies to her grandmother. On her way through the woods, she met a big bad wolf who wanted to eat her.

(i) Build the Transformer Model on above dataset

In [1]:
import torch
from torch.utils.data import Dataset, DataLoader
import numpy as np
from collections import Counter
import string

# Sample text data
text = """Once upon a time, there was a little girl named Red Riding Hood. She loved to visit her grandmother, who lived in the woods. One day, her mother asked her to take a basket of goodies to her grandmother. On her way through the woods, she met a big bad wolf who wanted to eat her."""

# Tokenize and clean the text
def clean_text(text):
    text = text.lower()  # Convert text to lowercase
    text = text.translate(str.maketrans('', '', string.punctuation))  # Remove punctuation
    return text.split()

# Tokenize text
tokens = clean_text(text)

# Create vocabulary (mapping from words to indices)
word_counts = Counter(tokens)
vocab = {word: idx for idx, (word, _) in enumerate(word_counts.items())}
reverse_vocab = {idx: word for word, idx in vocab.items()}

# Convert tokens to indices
token_indices = [vocab[word] for word in tokens]

class TextDataset(Dataset):
    def __init__(self, data, seq_len):
        self.data = data
        self.seq_len = seq_len

    def __len__(self):
        return len(self.data) - self.seq_len

    def __getitem__(self, idx):
        x = torch.tensor(self.data[idx:idx+self.seq_len])
        y = torch.tensor(self.data[idx+1:idx+self.seq_len+1])
        return x, y

# Hyperparameters
seq_len = 10
dataset = TextDataset(token_indices, seq_len)
dataloader = DataLoader(dataset, batch_size=2, shuffle=True)


Model Architecture:

In [2]:
import torch.nn as nn

class TransformerModel(nn.Module):
    def __init__(self, vocab_size, embed_size, num_heads, num_layers, seq_len, hidden_size):
        super(TransformerModel, self).__init__()

        self.embed = nn.Embedding(vocab_size, embed_size)
        self.transformer = nn.Transformer(
            d_model=embed_size,
            nhead=num_heads,
            num_encoder_layers=num_layers,
            num_decoder_layers=num_layers,
            dim_feedforward=hidden_size
        )
        self.fc_out = nn.Linear(embed_size, vocab_size)
        self.seq_len = seq_len

    def forward(self, x):
        x = self.embed(x)  # Shape: [batch_size, seq_len, embed_size]
        x = x.permute(1, 0, 2)  # Transformer expects [seq_len, batch_size, embed_size]
        x = self.transformer(x, x)  # Use the same input as both source and target
        x = x.permute(1, 0, 2)  # [batch_size, seq_len, embed_size]
        x = self.fc_out(x)
        return x

# Instantiate the model
vocab_size = len(vocab)
embed_size = 128
num_heads = 8
num_layers = 6
hidden_size = 512

model = TransformerModel(vocab_size, embed_size, num_heads, num_layers, seq_len, hidden_size)




(ii) Train the model using 20, 60, 70 epochs

In [3]:
import torch.optim as optim

# Hyperparameters
epochs = 20
learning_rate = 0.001
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Training loop
def train(model, dataloader, epochs):
    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for x_batch, y_batch in dataloader:
            optimizer.zero_grad()
            output = model(x_batch)  # Get predictions
            loss = criterion(output.view(-1, vocab_size), y_batch.view(-1))  # Compute loss
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        print(f'Epoch [{epoch+1}/{epochs}], Loss: {total_loss / len(dataloader):.4f}')

train(model, dataloader, epochs=20)  # Training for 20 epochs


Epoch [1/20], Loss: 3.6344
Epoch [2/20], Loss: 3.5129
Epoch [3/20], Loss: 3.4692
Epoch [4/20], Loss: 3.3367
Epoch [5/20], Loss: 3.2799
Epoch [6/20], Loss: 3.2440
Epoch [7/20], Loss: 3.3320
Epoch [8/20], Loss: 3.3150
Epoch [9/20], Loss: 3.3030
Epoch [10/20], Loss: 3.2605
Epoch [11/20], Loss: 3.1596
Epoch [12/20], Loss: 3.3501
Epoch [13/20], Loss: 3.3134
Epoch [14/20], Loss: 3.2808
Epoch [15/20], Loss: 3.2804
Epoch [16/20], Loss: 3.2848
Epoch [17/20], Loss: 3.2757
Epoch [18/20], Loss: 3.2675
Epoch [19/20], Loss: 3.2745
Epoch [20/20], Loss: 3.2618


(iii) After training, use the model to generate new text by feeding it an initial seed text

In [4]:
# Add '<unk>' token to the vocabulary
UNK_TOKEN = '<unk>'
vocab = {word: idx + 1 for idx, (word, _) in enumerate(word_counts.items())}  # Start from 1
vocab[UNK_TOKEN] = 0  # Assign 0 to '<unk>' token
reverse_vocab = {idx: word for word, idx in vocab.items()}


In [5]:
import torch
import random
import numpy as np

def generate_text(model, seed_text, length=50, temperature=1.0, top_k=50, top_p=0.9):
    model.eval()

    # Preprocess the seed text
    seed_tokens = clean_text(seed_text)
    seed_indices = [vocab.get(word, vocab['<unk>']) for word in seed_tokens]  # Convert to indices
    input_tensor = torch.tensor(seed_indices).unsqueeze(0)  # Add batch dimension

    generated = seed_tokens.copy()

    # Define a function for temperature scaling
    def temperature_sampling(logits, temperature):
        logits = logits / temperature
        probabilities = torch.nn.functional.softmax(logits, dim=-1)
        return probabilities

    # Define a function for top-k sampling
    def top_k_sampling(logits, k):
        top_k_values, top_k_indices = torch.topk(logits, k)
        probabilities = torch.nn.functional.softmax(top_k_values, dim=-1)
        return top_k_indices, probabilities

    # Define a function for top-p (nucleus) sampling
    def top_p_sampling(logits, p):
        sorted_logits, sorted_indices = torch.sort(logits, descending=True)
        cumulative_probs = torch.cumsum(torch.nn.functional.softmax(sorted_logits, dim=-1), dim=-1)

        # Cut off the tail of the distribution
        sorted_indices_to_keep = cumulative_probs <= p
        sorted_logits[~sorted_indices_to_keep] = -float('Inf')
        probabilities = torch.nn.functional.softmax(sorted_logits, dim=-1)

        return sorted_indices, probabilities

    # Start generating tokens
    for _ in range(length):
        output = model(input_tensor)
        logits = output[:, -1, :]  # Get the logits for the last token prediction

        # Apply temperature scaling
        logits = temperature_sampling(logits, temperature)

        # Sample from top-k or top-p (nucleus) sampling
        if top_p > 0:
            top_indices, probabilities = top_p_sampling(logits, top_p)
        else:
            top_indices, probabilities = top_k_sampling(logits, top_k)

        # Sample the next token from the distribution
        next_token_idx = torch.multinomial(probabilities, 1).item()

        # Get the word for the next token using the sampled index
        next_word = reverse_vocab.get(top_indices[0, next_token_idx].item(), vocab['<unk>'])
        generated.append(next_word)

        # Update input for next iteration (use the predicted token index)
        input_tensor = torch.cat([input_tensor[:, 1:], torch.tensor([[top_indices[0, next_token_idx]]])], dim=1)

    return ' '.join(generated)


# Example generation
seed_text = "Once upon a time"
generated_text = generate_text(model, seed_text, length=50, temperature=0.7, top_k=50, top_p=0.9)
print(generated_text)


once upon a time mother hood in loved girl take asked riding named woods met she upon her on the in goodies wolf her lived way upon lived riding who big who girl take little way lived day loved lived asked grandmother girl little goodies met through wanted who goodies bad goodies bad met


(iv) Experimenting and Improving the Model by large dataset and hyper tune parameter.


In [6]:
from torch.optim.lr_scheduler import StepLR

# Modify the TransformerModel to include dropout
class TransformerModelWithDropout(nn.Module):
    def __init__(self, vocab_size, embed_size, num_heads, num_layers, seq_len, hidden_size, dropout_rate=0.1):
        super(TransformerModelWithDropout, self).__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.transformer = nn.Transformer(
            d_model=embed_size,
            nhead=num_heads,
            num_encoder_layers=num_layers,
            num_decoder_layers=num_layers,
            dim_feedforward=hidden_size,
            dropout=dropout_rate  # Add dropout here
        )
        self.fc_out = nn.Linear(embed_size, vocab_size)

# Define the scheduler
scheduler = StepLR(optimizer, step_size=10, gamma=0.1)  # Decay learning rate by 0.1 every 10 epochs
