In [1]:
import pandas as pd
import numpy as np
import nltk
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset, RandomSampler
from sklearn.model_selection import train_test_split
from tqdm import tqdm

nltk.download('wordnet')
nltk.download('punkt')
nltk.download('omw-1.4')

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


device(type='cuda')

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Load dataset
dataset = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/Dataset/news_summary.csv", encoding='latin1')

In [4]:
# Preprocessing function
def preprocess_text(df, col):
    # converting language data in data frame to lower case and then storing in sentence variable
    sentence = df[col].str.lower()
    sentence = sentence.str.replace('[^0-9A-Za-z\s]+', '', regex=True)
    sentence = sentence.str.normalize('NFD')
    #encoding the string in sentence in UTF-8 format and ignoring errors if any
    sentence = sentence.str.encode('ascii', errors='ignore').str.decode('utf-8')
    return sentence

In [5]:
dataset['headlines'] = preprocess_text(dataset, 'headlines')
dataset['text'] = preprocess_text(dataset, 'text')

In [6]:
PAD_token = 0
SOS_token = 1
EOS_token = 2

# make the token 1 and 2 ,0 is already reserved for the [pad]
class Vocab:
    def __init__(self):
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0:'PAD',1: "SOS", 2: "EOS"}
        self.n_words = 3  # Count SOS and EOS

    def add_sentence(self, sentence):
        for word in sentence.split(' '):
            self.add_word(word)

    def add_word(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1

In [7]:
vocab = Vocab()

_ = dataset.text.apply(lambda x: vocab.add_sentence(x))
_ = dataset.headlines.apply(lambda x: vocab.add_sentence(x))

In [8]:
vocab.n_words

120908

In [9]:
dataset['text_length'] = dataset.text.str.split(' ').apply(lambda x: len(x))
dataset['headlines_length'] = dataset.headlines.str.split(' ').apply(lambda x: len(x))

In [10]:
dataset.headlines_length.max(), dataset.text_length.max()

(18, 92)

In [11]:
MAX_LENGTH_INPUT = 100
MAX_LENGTH_TARGET = 20

In [12]:
def indexes_from_sentence(vocab, sentence):
    return [vocab.word2index[word] for word in sentence.split(' ')]

In [13]:
def tensor_from_sentence(vocab, sentence):
    indexes = indexes_from_sentence(vocab, sentence)
    indexes.append(EOS_token)
    return torch.tensor(indexes, dtype=torch.long, device=device).view(1, -1)

In [14]:
def get_dataloader(dataset, batch_size):
    n = dataset.shape[0]
    input_ids = np.zeros((n, MAX_LENGTH_INPUT), dtype=np.int64)
    target_ids = np.zeros((n, MAX_LENGTH_TARGET), dtype=np.int64)

    for idx in range(n):
        inp_ids = indexes_from_sentence(vocab, dataset.text.iloc[idx])
        tgt_ids = indexes_from_sentence(vocab, dataset.headlines.iloc[idx])

        inp_ids.append(EOS_token)
        tgt_ids.append(EOS_token)

        input_ids[idx, :len(inp_ids)] = inp_ids
        target_ids[idx, :len(tgt_ids)] = tgt_ids

    train_data = TensorDataset(torch.LongTensor(input_ids).to(device),
                               torch.LongTensor(target_ids).to(device))

    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

    return train_dataloader

In [15]:
class Encoder(nn.Module):
    def __init__(self, input_size, hidden_size, dropout_p=0.1):
        super(Encoder, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(input_size, hidden_size)
        self.lstm = nn.LSTM(hidden_size, hidden_size, batch_first=True)
        self.dropout = nn.Dropout(dropout_p)

    def forward(self, input):
        embedded = self.dropout(self.embedding(input))
        # Update to use LSTM
        output, (hidden, cell) = self.lstm(embedded)
        return output, (hidden, cell)

In [16]:
enc = Encoder(100, 64)
print(enc)

Encoder(
  (embedding): Embedding(100, 64)
  (lstm): LSTM(64, 64, batch_first=True)
  (dropout): Dropout(p=0.1, inplace=False)
)


In [17]:
x = torch.randint(1, 100, (1, 61))

In [18]:
enc_outputs, enc_hidden = enc.forward(x)

In [19]:
enc_hidden_h_shape = enc_hidden[0].shape
enc_hidden_c_shape = enc_hidden[1].shape
enc_outputs_shape = enc_outputs.shape

print("enc_outputs_shape:", enc_outputs_shape)
print("enc_hidden_h_shape:", enc_hidden_h_shape)
print("enc_hidden_c_shape:", enc_hidden_c_shape)

enc_outputs_shape: torch.Size([1, 61, 64])
enc_hidden_h_shape: torch.Size([1, 1, 64])
enc_hidden_c_shape: torch.Size([1, 1, 64])


In [20]:
class BahdanauAttention(nn.Module):
    def __init__(self, hidden_size):
        super(BahdanauAttention, self).__init__()
        self.attention = nn.Linear(hidden_size * 2, 1)

    def forward(self, hidden, encoder_outputs):
        batch_size = encoder_outputs.size(0)
        seq_len = encoder_outputs.size(1)
        attn_energies = torch.zeros(batch_size, seq_len, device=encoder_outputs.device)

        for i in range(seq_len):
            attn_energies[:, i] = self.score(hidden, encoder_outputs[:, i, :])

        #this is the alpha
        attn_weights = F.softmax(attn_energies, dim=1)
        context = torch.bmm(attn_weights.unsqueeze(1), encoder_outputs)  # Compute the context vector
        return context, attn_weights

    def score(self, hidden, encoder_output):
        if hidden.dim() == 1:
            hidden = hidden.unsqueeze(0)
        if encoder_output.dim() == 1:
            encoder_output = encoder_output.unsqueeze(0)

        combined = torch.cat((hidden, encoder_output), dim=1)
        energy = self.attention(combined)
        return energy.squeeze(1)


**Explanation-**

1. **Initialization**:
   - The class `BahdanauAttention` is created as a subclass of `nn.Module`.
   - It initializes a linear layer (`self.attention`) with input size `hidden_size * 2` and output size 1. This linear layer is used to compute attention scores.

2. **Forward Pass**:
   - During the forward pass, the method takes two inputs: `hidden` state from the decoder and `encoder_outputs` from the encoder.
   - It iterates through the sequence length of `encoder_outputs` to calculate attention scores for each time step.

3. **Score Calculation**:
   - The `score` method concatenates the `hidden` state and the `encoder_output` for each time step.
   - The concatenated tensor is passed through the linear layer (`self.attention`) to compute attention energies.

4. **Attention Weights**:
   - Softmax is applied to the computed attention energies across the sequence dimension (`dim=1`) to obtain attention weights.
   - Softmax ensures that the attention weights sum up to 1, representing the importance of each encoder output for the current decoding step.

5. **Context Vector**:
   - Using the computed attention weights, a context vector is calculated as the weighted sum of encoder outputs.
   - This context vector captures relevant information from the encoder outputs based on their importance determined by the attention weights.

6. **Return**:
   - Finally, the method returns the context vector and attention weights, providing valuable information for the decoder to generate the next output word.

Overall, the Bahdanau Attention mechanism enables the decoder to focus on different parts of the input sequence dynamically during the decoding process, enhancing the model's ability to generate accurate and contextually relevant outputs.

In [21]:
class AttnDecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size, dropout_p=0.1):
        super(AttnDecoderRNN, self).__init__()
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.attention = BahdanauAttention(hidden_size).to(device)
        self.lstm = nn.LSTM(2 * hidden_size, hidden_size, batch_first=True)
        self.out = nn.Linear(hidden_size, output_size)
        self.dropout = nn.Dropout(dropout_p)

    def forward(self, encoder_outputs, encoder_hidden, target_tensor=None):
        batch_size = encoder_outputs.size(0)
        decoder_input = torch.empty(batch_size, 1, dtype=torch.long, device=device).fill_(SOS_token)

        decoder_hidden, decoder_cell = encoder_hidden
        decoder_outputs = []
        attentions = []

        for i in range(MAX_LENGTH_TARGET):
            decoder_output, (decoder_hidden, decoder_cell), attn_weights = self.forward_step(
                decoder_input, (decoder_hidden, decoder_cell), encoder_outputs
            )
            decoder_outputs.append(decoder_output)
            attentions.append(attn_weights)

            if target_tensor is not None:
                decoder_input = target_tensor[:, i].unsqueeze(1)
            else:
                _, topi = decoder_output.topk(1)
                decoder_input = topi.squeeze(-1).detach()

        decoder_outputs = torch.cat(decoder_outputs, dim=1)
        decoder_outputs = F.log_softmax(decoder_outputs, dim=-1)
        attentions = torch.cat(attentions, dim=1)

        return decoder_outputs, decoder_hidden, attentions

    def forward_step(self, input, hidden_state, encoder_outputs):
        hidden, cell = hidden_state
        embedded = self.dropout(self.embedding(input))  # Shape: [batch_size, 1, hidden_size]

        hidden_state_permuted = hidden.permute(1, 0, 2)[:, -1, :]  # Select only the last layer's hidden state, Shape: [batch_size, hidden_size]
        context, attn_weights = self.attention(hidden_state_permuted, encoder_outputs)  # Context should be [batch_size, 1, hidden_size]

        # Make sure context is not squeezed excessively
        context = context.squeeze(1)  # Correct context shape if necessary. Shape should be [batch_size, hidden_size]
        context = context.unsqueeze(1)  # Maintain the sequence length dimension for LSTM input

        input_lstm = torch.cat((embedded, context), dim=2)  # Concatenate along the feature dimension
        output, (hidden, cell) = self.lstm(input_lstm, (hidden, cell))
        output = self.out(output)

        return output, (hidden, cell), attn_weights


**Explanation-**
Let's break down the `AttnDecoderRNN` class and its methods in chronological order:

1. **Initialization**:
   - The constructor initializes the decoder with an embedding layer (`self.embedding`), Bahdanau attention mechanism (`self.attention`), LSTM layer (`self.lstm`), linear layer for output (`self.out`), and dropout (`self.dropout`).

2. **Forward Pass**:
   - During the forward pass, it takes encoder outputs (`encoder_outputs`), encoder hidden states (`encoder_hidden`), and optional target tensor (`target_tensor`) as inputs.
   - It initializes the decoder input with the start-of-sequence token (`SOS_token`).
   - Decoder hidden and cell states are extracted from the encoder hidden states.
   - It iterates over the maximum length of the target sequence.
   
3. **Forward Step**:
   - Inside the loop, for each timestep:
     - The `forward_step` method is called to compute the output, new hidden and cell states, and attention weights.
     - In `forward_step`, the decoder input is embedded using the embedding layer, and dropout is applied.
     - The previous hidden state is permuted to match the dimensions for attention calculation, and the context vector and attention weights are computed using the Bahdanau attention mechanism.
     - The context vector is concatenated with the embedded input, and passed through the LSTM layer.
     - The LSTM output is passed through the linear layer to obtain the output probabilities for the current timestep.
   
4. **Output Processing**:
   - The outputs, hidden states, and attention weights for each timestep are collected and stored.
   - If a target tensor is provided, the next input token is extracted from it; otherwise, the token with the highest probability is selected.
   
5. **Finalization**:
   - The decoder outputs, which contain the log probabilities of output tokens for each timestep, are concatenated along the sequence dimension and processed using a softmax function.
   - The attention weights are concatenated similarly.
   - Finally, the decoder outputs, hidden states, and attention weights are returned as outputs of the forward pass.

This comprehensive process enables the `AttnDecoderRNN` to decode input sequences with attention, allowing it to produce accurate and contextually relevant output sequences.

In [22]:
tgt_tensor = torch.randint(1, 100, (1, 20))
tgt_tensor.shape

torch.Size([1, 20])

In [23]:
dec = AttnDecoderRNN(64, 100).to(device)

In [24]:

# Move your models to the designated device
encoder = enc.to(device)
decoder = dec.to(device)


In [25]:
# When you load or create tensors, send them to the same device
enc_outputs = enc_outputs.to(device)
(h,c) = enc_hidden
enc_hidden_gpu=h.to(device),c.to(device)
tgt_tensor = tgt_tensor.to(device)
print(h.shape,c.shape)

torch.Size([1, 1, 64]) torch.Size([1, 1, 64])


In [26]:
decoder_outputs, decoder_hidden, attentions = dec.forward(enc_outputs, enc_hidden_gpu, tgt_tensor)


In [27]:
def train_epoch(dataloader, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion):
    total_loss = 0
    for input_tensor, target_tensor in tqdm(dataloader):
        encoder_optimizer.zero_grad()
        decoder_optimizer.zero_grad()

        encoder_outputs, encoder_hidden = encoder(input_tensor)
        decoder_outputs, decoder_hidden, attentions = decoder(encoder_outputs, encoder_hidden, target_tensor)

        loss = criterion(
            decoder_outputs.view(-1, decoder_outputs.size(-1)),
            target_tensor.view(-1)
        )
        loss.backward()

        encoder_optimizer.step()
        decoder_optimizer.step()

        total_loss += loss.item()

    return total_loss / len(dataloader)

def evaluate_model(dataloader, encoder, decoder, criterion):
    total_loss = 0
    with torch.no_grad():
        for input_tensor, target_tensor in tqdm(dataloader):
            encoder_outputs, encoder_hidden = encoder(input_tensor)
            decoder_outputs, decoder_hidden, attentions = decoder(encoder_outputs, encoder_hidden, target_tensor)

            loss = criterion(
                decoder_outputs.view(-1, decoder_outputs.size(-1)),
                target_tensor.view(-1)
            )
            total_loss += loss.item()

    return total_loss / len(dataloader)

def train_model(train_dataloader, valid_dataloader, encoder, decoder, n_epochs, learning_rate=0.001,
                print_every=100, plot_every=100):
    print_loss_total = 0  # Reset every print_every
    train_losses = []
    valid_losses = []

    best_val_loss = float('inf')

    encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate)
    criterion = nn.NLLLoss()

    for epoch in range(1, n_epochs + 1):
        print(f"Epoch: {epoch}/{n_epochs}")
        # Training
        train_loss = train_epoch(train_dataloader, encoder, decoder, encoder_optimizer,
                                 decoder_optimizer, criterion)
        print_loss_total += train_loss
        train_losses.append(train_loss)

        if epoch % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print(f"Train Loss: {round(print_loss_avg, 3)}")

        # Validation
        print('Validation....')
        valid_loss = evaluate_model(valid_dataloader, encoder, decoder, criterion)
        valid_losses.append(valid_loss)
        print(f"Validation Loss: {round(valid_loss, 3)}")

        # Save the model if it has the best validation loss so far
        if valid_loss < best_val_loss:
            best_val_loss = valid_loss
            torch.save(encoder.state_dict(), 'best_encoder.pth')
            torch.save(decoder.state_dict(), 'best_decoder.pth')
            print(f"Saved Best Model at Epoch: {epoch}")

    return train_losses, valid_losses


In [28]:
train_dataset, test_dataset = train_test_split(dataset, shuffle=True, test_size=0.2, random_state=42)
train_dataset, val_dataset = train_test_split(train_dataset, shuffle=True, test_size=0.1, random_state=42)

print(f"Train set size: {len(train_dataset)}")
print(f"Validation set size: {len(val_dataset)}")
print(f"Test set size: {len(test_dataset)}")

Train set size: 70848
Validation set size: 7872
Test set size: 19681


In [None]:
hidden_size = 256
batch_size = 64
n_epochs = 5
print('Making DataLoaders .... .....  ')
train_dataloader = get_dataloader(train_dataset, batch_size)
val_dataloader=get_dataloader(val_dataset,batch_size)
print('Defining Encoder and Decoder .....')
encoder = Encoder(vocab.n_words, hidden_size).to(device)
decoder = AttnDecoderRNN(hidden_size, vocab.n_words).to(device)
train_loss,val_loss=train_model(train_dataloader, val_dataloader, encoder, decoder, n_epochs, learning_rate=0.001,
               print_every=1, plot_every=100)

In [1]:
def evaluate_test_samples(encoder, decoder, sentence, vocab):
    with torch.no_grad():
        input_tensor = tensor_from_sentence(vocab, sentence)

        encoder_outputs, encoder_hidden = encoder(input_tensor)
        decoder_outputs, decoder_hidden, attentions = decoder(encoder_outputs, encoder_hidden)

        _, topi = decoder_outputs.topk(1)
        decoded_ids = topi.squeeze()

        decoded_words = []
        for idx in decoded_ids:
            if idx.item() == EOS_token:
                decoded_words.append('<EOS>')
                break
            decoded_words.append(vocab.index2word[idx.item()])

    return decoded_words

In [2]:
def evaluateRandomly_train(encoder, decoder, vocab, n=10):
    for i in range(n):
        print(i)
        eval_sample = train_dataset.iloc[i:i+1, :]
        print('news_article > ', eval_sample['text'].iloc[0])
        headline = eval_sample['headlines'].iloc[0]
        print('original_headline = ', headline)
        output_words = evaluate_test_samples(encoder, decoder, eval_sample.text.iloc[0], vocab)
        output_sentence = ' '.join(output_words)
        print('predicted_headline < ', output_sentence)
        print('')
        print(f"meteor score: {nltk.translate.meteor_score.single_meteor_score(headline.split(), output_sentence.split())}")

In [3]:
encoder.eval()
decoder.eval()

evaluateRandomly_train(encoder, decoder, vocab)

NameError: name 'encoder' is not defined

In [None]:
def evaluateRandomly_test(encoder, decoder, vocab, n=10):
    for i in range(n):
        print(i)
        eval_sample = test_dataset.iloc[i:i+1, :]
        print('news_article > ', eval_sample['text'].iloc[0])
        headline = eval_sample['headlines'].iloc[0]
        print('original_headline = ', headline)
        output_words = evaluate_test_samples(encoder, decoder, eval_sample.text.iloc[0], vocab)
        output_sentence = ' '.join(output_words)
        print('predicted_headline < ', output_sentence)
        print('')
        print(f"meteor score: {nltk.translate.meteor_score.single_meteor_score(headline.split(), output_sentence.split())}")

In [None]:
encoder.eval()
decoder.eval()

evaluateRandomly_test(encoder, decoder, vocab)

0
news_article >  students in karnataka will get extra marks if their parents cast votes in the upcoming assembly elections the associated management of primary and secondary schools has announced the encouraging marks will be added in the 201819 academic year the association said after casting their votes parents can visit member schoolsand confirm that they voted by showing the indelible ink mark
original_headline =  ktaka students to get extra marks if parents vote in polls
predicted_headline <  sc to kerala <EOS>

meteor score: 0.04854368932038835
1
news_article >  syrian antiaircraft defences on monday shot down missiles over two air bases syrias state media said the missiles targeted shayrat air base in the homs province and another base northeast of the capital damascus this comes days after the us uk and france launched air strikes on syrian chemical weapons facilities in retaliation for the alleged chemical attack in douma
original_headline =  syria shoots down missiles fired 