In [1]:
#Import the libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import json

## Load the Dataset

In [2]:
import json

# Read the file
with open('/kaggle/input/news-category-dataset/News_Category_Dataset_v3.json') as f:
    data = [json.loads(line) for line in f]

#Convert that into Dataframe or easier inspection
df = pd.DataFrame(data)

# View the first item
df = df.head(50000)
df.head()

Unnamed: 0,link,headline,category,short_description,authors,date
0,https://www.huffpost.com/entry/covid-boosters-...,Over 4 Million Americans Roll Up Sleeves For O...,U.S. NEWS,Health experts said it is too early to predict...,"Carla K. Johnson, AP",2022-09-23
1,https://www.huffpost.com/entry/american-airlin...,"American Airlines Flyer Charged, Banned For Li...",U.S. NEWS,He was subdued by passengers and crew when he ...,Mary Papenfuss,2022-09-23
2,https://www.huffpost.com/entry/funniest-tweets...,23 Of The Funniest Tweets About Cats And Dogs ...,COMEDY,"""Until you have a dog you don't understand wha...",Elyse Wanshel,2022-09-23
3,https://www.huffpost.com/entry/funniest-parent...,The Funniest Tweets From Parents This Week (Se...,PARENTING,"""Accidentally put grown-up toothpaste on my to...",Caroline Bologna,2022-09-23
4,https://www.huffpost.com/entry/amy-cooper-lose...,Woman Who Called Cops On Black Bird-Watcher Lo...,U.S. NEWS,Amy Cooper accused investment firm Franklin Te...,Nina Golgowski,2022-09-22


In [3]:
# Total Number of data
len(df)

50000

In [4]:
#Take only headline and short_description
df = df[['headline', 'short_description']]

print("__HEADLINE__")
print(df['headline'][0])
print("__SHORT DESCRIPTION__")
print(df['short_description'][0])
print('-' * 110)
print("__HEADLINE__")
print(df['headline'][10])
print("__SHORT DESCRIPTION__")
print(df['short_description'][10])
print('-' * 110)
print("__HEADLINE__")
print(df['headline'][20])
print("__SHORT DESCRIPTION__")
print(df['short_description'][20])

__HEADLINE__
Over 4 Million Americans Roll Up Sleeves For Omicron-Targeted COVID Boosters
__SHORT DESCRIPTION__
Health experts said it is too early to predict whether demand would match up with the 171 million doses of the new boosters the U.S. ordered for the fall.
--------------------------------------------------------------------------------------------------------------
__HEADLINE__
World Cup Captains Want To Wear Rainbow Armbands In Qatar
__SHORT DESCRIPTION__
FIFA has come under pressure from several European soccer federations who want to support a human rights campaign against discrimination at the World Cup.
--------------------------------------------------------------------------------------------------------------
__HEADLINE__
Golden Globes Returning To NBC In January After Year Off-Air
__SHORT DESCRIPTION__
For the past 18 months, Hollywood has effectively boycotted the Globes after reports that the HFPA’s 87 members of non-American journalists included no Black members.


## Cleaning the Text

In [5]:
import re

def clean_text(text):
    # Convert to lowercase
    text = text.lower()
    
    # Remove URLs
    text = re.sub(r'http\S+|www.\S+', '', text)
    
    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)
    
    # Remove punctuation and special characters (except words and spaces)
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    
    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    
    return text

df['headline'] = df['headline'].apply(clean_text)
df['short_description'] = df['short_description'].apply(clean_text)

print("__HEADLINE__")
print(df['headline'][0])
print("__SHORT DESCRIPTION__")
print(df['short_description'][0])
print('-' * 110)
print("__HEADLINE__")
print(df['headline'][10])
print("__SHORT DESCRIPTION__")
print(df['short_description'][10])
print('-' * 110)
print("__HEADLINE__")
print(df['headline'][20])
print("__SHORT DESCRIPTION__")
print(df['short_description'][20])

__HEADLINE__
over 4 million americans roll up sleeves for omicrontargeted covid boosters
__SHORT DESCRIPTION__
health experts said it is too early to predict whether demand would match up with the 171 million doses of the new boosters the us ordered for the fall
--------------------------------------------------------------------------------------------------------------
__HEADLINE__
world cup captains want to wear rainbow armbands in qatar
__SHORT DESCRIPTION__
fifa has come under pressure from several european soccer federations who want to support a human rights campaign against discrimination at the world cup
--------------------------------------------------------------------------------------------------------------
__HEADLINE__
golden globes returning to nbc in january after year offair
__SHORT DESCRIPTION__
for the past 18 months hollywood has effectively boycotted the globes after reports that the hfpas 87 members of nonamerican journalists included no black members


## Build the Vocab

In [6]:
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from collections import Counter

def tokenize(text):
    return word_tokenize(text.lower())

def build_vocab(texts, min_freq=1):
    """
    Builds vocabulary with proper special tokens.
    """
    all_tokens = []
    for text in texts:
        tokens = tokenize(text)
        all_tokens.extend(tokens)
    
    # Count word frequencies
    word_freq = Counter(all_tokens)
    
    # Filter words by min frequency
    filtered_words = [word for word, freq in word_freq.items() if freq >= min_freq]
    
    # Build vocab with special tokens
    word2idx = {'<PAD>': 0, '<UNK>': 1, '<BOS>': 2, '<EOS>': 3}
    
    for idx, word in enumerate(filtered_words):
        word2idx[word] = idx + 4  # Start after special tokens
    
    # Reverse mapping
    idx2word = {idx: word for word, idx in word2idx.items()}
    
    return word2idx, idx2word, word_freq

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [7]:
texts = df['headline'].tolist() + df['short_description'].tolist()
word2idx, idx2word, word_freq = build_vocab(texts, min_freq=1)

print("Vocab size:", len(word2idx))
print("Top 10 words:", word_freq.most_common(10))

Vocab size: 49302
Top 10 words: [('the', 59090), ('to', 33483), ('a', 26303), ('of', 24966), ('in', 20948), ('and', 18685), ('for', 14349), ('is', 13008), ('on', 11245), ('trump', 9621)]


In [8]:
#Check if there is a null values
df.isna().sum()

headline             0
short_description    0
dtype: int64

In [9]:
from nltk.tokenize import word_tokenize

# Modified encode_text to use BOS/EOS tokens
def encode_text(text, word2idx, max_len=32, add_special_tokens=True):
    """
    Encode text with optional BOS/EOS tokens
    """
    tokens = word_tokenize(text.lower())
    
    if add_special_tokens:
        ids = [word2idx['<BOS>']]  # Start with BOS
        ids.extend([word2idx.get(token, word2idx['<UNK>']) for token in tokens])
        ids.append(word2idx['<EOS>'])  # End with EOS
    else:
        ids = [word2idx.get(token, word2idx['<UNK>']) for token in tokens]
    
    # Truncate if too long, else pad
    if len(ids) > max_len:
        ids = ids[:max_len]
    else:
        ids += [word2idx['<PAD>']] * (max_len - len(ids))
    
    return ids

## Make the Dataset

In [10]:
import torch
from torch.utils.data import Dataset, DataLoader

class SummarizationDataset(Dataset):
    def __init__(self, headline, short_description, word2idx, max_len=32):
        """
        articles: list of article texts
        highlights: list of summary texts
        word2idx: vocabulary dictionary
        max_len: max length for padding/truncation
        """
        self.headline = headline
        self.short_description = short_description
        self.word2idx = word2idx
        self.max_len = max_len

    def __len__(self):
        return len(self.headline)

    def __getitem__(self, idx):
        # Get the raw article and summary
        headline = self.headline[idx]
        short_description = self.short_description[idx]

        # Encode both
        input_ids = encode_text(short_description, self.word2idx, self.max_len)
        target_ids = encode_text(headline, self.word2idx, self.max_len)

        return {
            "input_ids": torch.tensor(input_ids, dtype=torch.long),
            "target_ids": torch.tensor(target_ids, dtype=torch.long)
        }

In [11]:
import torch.nn as nn

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f'Using device: {device}')

Using device: cuda


## Input Embeddings

Let's focus on the Encoder Part for now:

### Transformer Encoder

At first you can see that we have Input Embedding and the Positional Encoding so let's talk about that,

**Embedding** -> So, we know that the first thing we do is tokenize and we recieve the set's of discrete tokens and embedding's job is to change the set of discrete tokens into the continous vector representation.

Why the need to do this?
Because, Transformer is the neural network and they understand the numbers and not the words, so we need to change them to the numerical representation forms such that they captures the semantic meaning and context.

The transformer architecture starts with embedding sequences as vectors, and then encoding each token's position in the sequence so that tokens can be processed in parallel.

Suppose, we have three tokens 
["Cat", "Dog", "Fish"]

We know that each token have their own unique ID in the modle vocab which the model recognize. If we embed them using the embedding layers we get the embedding vector.The length of this vector is also referred to as the number of dimensions, or dimensionality. 

In [12]:
import torch
import torch.nn as nn
import math

class InputEmbeddings(nn.Module):
    """
    Converts token indices into dense vector embeddings and scales them.

    Args:
        vocab_size (int): Size of the vocabulary (number of unique tokens).
        d_model (int): Dimensionality of the embedding vectors (also the model's hidden size).
    """
    def __init__(self, vocab_size: int, d_model: int) -> None:
        super().__init__()

        self.d_model = d_model                  # Embedding dimension (same as model hidden size)
        self.vocab_size = vocab_size            # Total number of tokens in vocabulary
        self.embedding = nn.Embedding(vocab_size, d_model)  # Learnable embedding table

    def forward(self, x):
        """
        Args:
            x (Tensor): Tensor of token indices of shape (batch_size, seq_len)

        Returns:
            Tensor: Embedded and scaled tensor of shape (batch_size, seq_len, d_model)
        """
        # Multiply by sqrt(d_model) as recommended in the Transformer paper to help with convergence
        return self.embedding(x) * math.sqrt(self.d_model)

## Positional Encoding

**Positional Encoding** : It is added to give the model information about the position of each word in a sequence.

Why the need of this? Because The word "ate" in "The cat ate the fish" is different from "ate" in "Ate the cat the fish?" — the order matters.

The positional Encoding are generated using the special encoding equation, where the sin is use for the even embedding values and cos is used for odd emedding values

The positional encoding for position `pos` and dimension `i` is defined as:

$$
\text{PE}_{(pos, 2i)} = \sin\left(\frac{pos}{10000^{\frac{2i}{d}}}\right)
$$

$$
\text{PE}_{(pos, 2i+1)} = \cos\left(\frac{pos}{10000^{\frac{2i}{d}}}\right)
$$

Where:
- \( pos \) is the position in the sequence,
- \( i \) is the dimension index,
- \( d \) is the total embedding dimension.

Sin and Cosine are the periodic functions who have their values between -1 and 1.

Why do we used them?

**Provide the unique patterns for each position**
1. The combination of sin and cos with different frequencies ensures that the each position has a unique encoding vector.
2. No two positions have the same encoding, and nearby positions have similar vectors, which helps the model recognize local context.

**Captures relative position information**
1. The sinusoidal form makes it easy for the model to learn the relative positions between words.
2. For example, PE(pos + k) can be expressed as a linear function of PE(pos), allowing the model to infer order differences like “word A is 2 steps ahead of word B”.

In [13]:
import torch
import torch.nn as nn
import math

class PositionalEncoding(nn.Module):
    """
    Implements the sinusoidal positional encoding from the Transformer paper:
    "Attention is All You Need" (Vaswani et al. 2017).

    This adds information about token positions to the input embeddings, 
    enabling the model to capture order without recurrence.

    Args:
        d_model (int): Dimensionality of the model/embedding.
        max_seq_length (int): Maximum sequence length supported.
    """
    def __init__(self, d_model, max_seq_length):
        super().__init__()

        # Initialize a matrix of shape (max_seq_length, d_model)
        pe = torch.zeros(max_seq_length, d_model)

        # Position indices (0 to max_seq_length-1) shaped as (max_seq_length, 1)
        position = torch.arange(0, max_seq_length, dtype=torch.float).unsqueeze(1)

        # Compute the div_term (frequency) for the sinusoidal functions.
        # Only half (every 2nd dim) because sin and cos alternate over even and odd dims
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))

        # Apply sine to even indices in the array; 2i
        pe[:, 0::2] = torch.sin(position * div_term)

        # Apply cosine to odd indices in the array; 2i+1
        pe[:, 1::2] = torch.cos(position * div_term)

        # Register as a buffer (non-learnable), adds a batch dimension for broadcasting
        self.register_buffer('pe', pe.unsqueeze(0))  # shape: (1, max_seq_length, d_model)

    def forward(self, x):
        """
        Adds positional encoding to the input tensor.

        Args:
            x (Tensor): Input of shape (batch_size, seq_len, d_model)

        Returns:
            Tensor: Positionally encoded input of the same shape
        """
        # Add positional encoding to the input
        return x + self.pe[:, :x.size(1)]

## Multihead Attention

Before that, What is self Attention?

Self Attention is what enables the transformers to identify the relationship between tokens and to determine and focus on the most relevant ones. It allows a model to look at other positions in the same input sequence when encoding a word — hence the name "self" attention.

Self-attention determines:
“Which other words in the sentence should I pay attention to when understanding this word?”\

Example:
Take the sentence:

“The cat sat on the mat because it was warm.”

To understand what “it” refers to, self-attention helps the model focus on “cat” or “mat” rather than every word equally. The model figures this out on its own during training.

We know that each input word is converted into the embedding right? So then each embedding is project into the three different matrices known as Q, K and V.

Q : Query (indicates what each "token" is looking for in another token)

V : Value (Actual content to be aggregated or weighted)

K : Key (Represents the content of each token that other token might find relevant )

using seprate linear transformations with learned weights.

🧠 Analogy: Job Search Example
Imagine you're trying to hire someone:

    - Your Query (Q) is the job requirement.

    - Each candidate has a Key (K) = their resume.

    - The actual Value (V) is what you’d get if you hired them.

You compare your Query to all the Keys (resumes) to get scores, then use those scores to weigh the Values (candidates’ actual skills).

Values are based on the attention-scores, which are computed by doing the dot-product of the Key and Query matrices
So, Attention scores = Q-K similarity(dot - product) from where we get the attention scores(n*n)

From the attention scores we apply the Softmax to get the attention weights.

so, below is the clear image to show


Example : 
"orange is my favorite fruit," the tokens "favorite" and "fruit" receive the highest attention when processing "orange," as they directly influence its context and meaning. The model interprets "orange" as a favored fruit rather than a color or other meaning.

### ⚙️ Step-by-step:

#### 1. Compute Dot Products:  $Q \cdot K^T$

This gives a score of how much attention word A should pay to word B.

#### 2. Scale and Apply Softmax:

$$
\text{Attention\_weights} = \text{softmax} \left( \frac{QK^T}{\sqrt{d_k}} \right)
$$

This normalizes the scores into probabilities.

#### 3. Multiply with V:

$$
\text{Output} = \text{Attention\_weights} \cdot V
$$

Each word’s final output is a **weighted sum of all the Value vectors**, based on attention.


### Multi Attention head

Multi-Head Attention is an advanced form of self-attention used in Transformers. Instead of calculating just one set of attention outputs (with one Q/K/V), it creates multiple "attention heads" — each learning different relationships or features in the input.

⚙️ Why do we need Multi-Head Attention?

A single self-attention layer may focus too narrowly. With multi-head attention:

- Each head looks at the sequence from a different perspective.

- Some heads may learn syntax (e.g., subject-verb links), others learn semantics (e.g., coreference, word meaning).

- This makes the model much more expressive.

The resulting embeddings capture token meaning, positional encoding, and contextual relationships.|

In [14]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class MultiHeadAttention(nn.Module):
    """
    Multi-head self-attention mechanism as described in the "Attention is All You Need" paper.

    Args:
        d_model (int): Total dimensionality of the model.
        num_heads (int): Number of parallel attention heads.
    """
    def __init__(self, d_model, num_heads):
        super().__init__()
        assert d_model % num_heads == 0, "d_model must be divisible by num_heads"

        self.num_heads = num_heads
        self.d_model = d_model
        self.head_dim = d_model // num_heads  # Dimension per head

        # Linear transformations for query, key, and value (no bias for attention projection)
        self.query_linear = nn.Linear(d_model, d_model, bias=False)
        self.key_linear = nn.Linear(d_model, d_model, bias=False)
        self.value_linear = nn.Linear(d_model, d_model, bias=False)

        # Final linear layer after concatenating all heads
        self.output_linear = nn.Linear(d_model, d_model)

    def split_heads(self, x, batch_size):
        """
        Split the embedding into multiple heads.

        Args:
            x (Tensor): shape (batch_size, seq_len, d_model)

        Returns:
            Tensor: shape (batch_size, num_heads, seq_len, head_dim)
        """
        seq_length = x.size(1)
        x = x.reshape(batch_size, seq_length, self.num_heads, self.head_dim)
        return x.permute(0, 2, 1, 3)  # move num_heads before seq_len

    def compute_attention(self, query, key, value, mask=None):
        """
        Compute scaled dot-product attention.

        Returns:
            context vector after attention, shape: (batch_size, num_heads, seq_len, head_dim)
        """
        # Shape: (batch_size, num_heads, seq_len, seq_len)
        scores = torch.matmul(query, key.transpose(-2, -1)) / (self.head_dim ** 0.5)

        # Apply mask (if provided): mask shape should match scores
        if mask is not None:
            scores = scores.masked_fill(mask == 0, float('-inf'))

        attention_weights = F.softmax(scores, dim=-1)  # softmax along last dimension
        return torch.matmul(attention_weights, value)  # context

    def combine_heads(self, x, batch_size):
        """
        Combine the heads back to a single tensor.

        Args:
            x (Tensor): shape (batch_size, num_heads, seq_len, head_dim)

        Returns:
            Tensor: shape (batch_size, seq_len, d_model)
        """
        x = x.permute(0, 2, 1, 3).contiguous()  # (batch_size, seq_len, num_heads, head_dim)
        return x.view(batch_size, -1, self.d_model)  # combine last two dims

    def forward(self, query, key, value, mask=None):
        batch_size = query.size(0)

        # Apply linear transformations
        query = self.query_linear(query)
        key = self.key_linear(key)
        value = self.value_linear(value)

        # Split into heads
        query = self.split_heads(query, batch_size)
        key = self.split_heads(key, batch_size)
        value = self.split_heads(value, batch_size)

        # Apply attention on all heads
        attn_output = self.compute_attention(query, key, value, mask)

        # Combine heads and pass through final linear layer
        output = self.combine_heads(attn_output, batch_size)

        return self.output_linear(output)

## FeedForward SubLayer

After the Multi-Head Attention layer in a Transformer block, there's a FeedForward Neural Network (FFN) layer, also known as the FeedForward SubLayer. It adds non-linearity and transformation to each token independently.

📌 Why It’s Used

While attention layers let tokens communicate, the FFN lets each token transform itself — enriching its internal representation after it has “heard” from others.

Our FeedForwardSublayer class contains two fully connected linear layers separated by a ReLU activation. 

Notice we use a dimension d_ff between linear layers, typically different from the embedding dimension used throughout the model to further facilitate capturing complex patterns. The forward method applies the forward pass to the attention mechanism outputs, passing them through the layers.


In [15]:
import torch.nn as nn

class FeedForwardSubLayer(nn.Module):
    """
    Position-wise Feed-Forward Network used in Transformer blocks.

    Args:
        d_model (int): Input and output dimensionality (same as the embedding size).
        d_ff (int): Hidden dimensionality (usually larger, e.g., 2048 in original paper).

    Architecture:
        FFN(x) = max(0, xW1 + b1)W2 + b2
               = fc2(ReLU(fc1(x)))
    """
    def __init__(self, d_model, d_ff):
        super().__init__()
        self.fc1 = nn.Linear(d_model, d_ff)   # First linear transformation (expands dimension)
        self.relu = nn.ReLU()                 # Activation function
        self.fc2 = nn.Linear(d_ff, d_model)   # Second linear transformation (projects back to d_model)

    def forward(self, x):
        return self.fc2(self.relu(self.fc1(x)))  # Apply FFN to each position independently


## Encoder Layer
A Transformer Encoder Layer is a single block in the stack of encoder blocks used in models like BERT, GPT (decoder-only variant), and the original Transformer. Each layer processes a sequence of tokens to build richer, context-aware representations.

Encoder-only transformers simplify this architecture to place greater emphasis on understanding and representing the input data, such as text classification. 

They have two main components: 
- Each encoder layer incorporates a multi-head self-attention mechanism to capture relationships between tokens in the sequence

- followed by feed-forward sublayers to map this knowledge into abstract, nonlinear representations. Both elements are usually combined with other techniques like layer normalizations and dropouts to improve training.

In [16]:
class EncoderLayer(nn.Module):
    """
    A single Transformer encoder block.

    Consists of:
    1. Multi-head self-attention layer with residual connection + LayerNorm
    2. Position-wise feed-forward network with residual connection + LayerNorm

    Args:
        d_model (int): Input/output embedding dimension.
        num_heads (int): Number of attention heads.
        d_ff (int): Hidden layer size in the feed-forward network.
        dropout (float): Dropout rate applied after attention and FFN.
    """
    def __init__(self, d_model, num_heads, d_ff, dropout):
        super().__init__()

        # Multi-head self-attention
        self.attn = MultiHeadAttention(d_model, num_heads)

        # Position-wise feed-forward network
        self.ff_sublayer = FeedForwardSubLayer(d_model, d_ff)

        # Layer normalizations for residual connections
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)

        # Dropout for regularization
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, src_mask=None):
        """
        Args:
            x: Tensor of shape (batch_size, seq_len, d_model)
            src_mask: Optional mask for self-attention (batch_size, seq_len, seq_len)

        Returns:
            Tensor of shape (batch_size, seq_len, d_model)
        """

        # === Sublayer 1: Multi-Head Self-Attention ===
        attn_output = self.attn(x, x, x, src_mask)  # Q = K = V = x
        x = self.norm1(x + self.dropout(attn_output))  # Add & Norm

        # === Sublayer 2: Feed-Forward ===
        ff_output = self.ff_sublayer(x)
        x = self.norm2(x + self.dropout(ff_output))  # Add & Norm

        return x

In [17]:
class TransformerEncoder(nn.Module):
    """
    Full Transformer Encoder stack composed of:
    - Input token embeddings
    - Positional encodings
    - N stacked encoder layers

    Args:
        vocab_size (int): Size of the input vocabulary.
        d_model (int): Embedding dimension.
        num_layers (int): Number of encoder layers to stack.
        num_heads (int): Number of attention heads in each layer.
        d_ff (int): Hidden layer size in feed-forward network.
        dropout (float): Dropout rate for regularization.
        max_seq_length (int): Maximum input sequence length.
    """
    def __init__(self, vocab_size, d_model, num_layers, num_heads, d_ff, dropout, max_seq_length):
        super().__init__()

        # Token embedding + positional encoding
        self.embedding = InputEmbeddings(vocab_size, d_model)
        self.positional_encoding = PositionalEncoding(d_model, max_seq_length)

        # Stack of N encoder layers
        self.layers = nn.ModuleList([
            EncoderLayer(d_model, num_heads, d_ff, dropout)
            for _ in range(num_layers)
        ])

    def forward(self, x, src_mask=None):
        """
        Args:
            x (Tensor): Input token IDs, shape (batch_size, seq_len)
            src_mask (Tensor or None): Attention mask, shape (batch_size, seq_len, seq_len)

        Returns:
            Tensor: Encoded representation, shape (batch_size, seq_len, d_model)
        """
        # Embed token IDs and add positional information
        x = self.embedding(x)                          # (batch_size, seq_len, d_model)
        x = self.positional_encoding(x)                # (batch_size, seq_len, d_model)

        # Pass through each encoder layer
        for layer in self.layers:
            x = layer(x, src_mask)

        return x

In [18]:
class DecoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, dropout):
        super().__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        # Define cross-attention and a third layer normalization
        self.cross_attn = MultiHeadAttention(d_model, num_heads)
        self.ff_sublayer = FeedForwardSubLayer(d_model, d_ff)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, y, tgt_mask, cross_mask):
        self_attn_output = self.self_attn(x, x, x, tgt_mask)
        x = self.norm1(x + self.dropout(self_attn_output))
        # Complete the forward pass
        cross_attn_output = self.cross_attn(x, y, y, cross_mask)
        x = self.norm2(x + self.dropout(cross_attn_output))
        ff_output = self.ff_sublayer(x)
        x = self.norm3(x + self.dropout(ff_output))
        return x

In [19]:
import torch

seq_length = 32

# Create an upper triangular matrix filled with -inf (mask), 0 elsewhere
tgt_mask = torch.triu(torch.ones(seq_length, seq_length), diagonal=1).bool()

# Invert the mask if your attention mask expects 1s for valid tokens
# This version sets True for allowed tokens (lower triangle including diagonal)
causal_mask = ~tgt_mask  # optional depending on your attention implementation

In [20]:
class TransformerDecoder(nn.Module):
    def __init__(self, vocab_size, d_model, num_layers, num_heads, d_ff, dropout, max_seq_length):
        super(TransformerDecoder, self).__init__()
        self.embedding = InputEmbeddings(vocab_size, d_model)
        self.positional_encoding = PositionalEncoding(d_model, max_seq_length)
        # Define the list of decoder layers and linear layer
        self.layers = nn.ModuleList([DecoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])
        # Define a linear layer to project hidden states to likelihoods
        self.fc = nn.Linear(d_model, vocab_size)
  
    def forward(self, x, encoder_output, tgt_mask=None, cross_mask=None):
        """
        Args:
            x: Target input token IDs (batch_size, tgt_seq_len)
            encoder_output: Output from encoder (batch_size, src_seq_len, d_model)
            tgt_mask: Mask for decoder self-attention (batch_size, tgt_seq_len, tgt_seq_len)
            cross_mask: Mask for cross-attention (batch_size, tgt_seq_len, src_seq_len)
        Returns:
            Logits over vocabulary (batch_size, tgt_seq_len, vocab_size)
        """
        x = self.embedding(x)
        x = self.positional_encoding(x)
        
        for layer in self.layers:
            x = layer(x, encoder_output, tgt_mask, cross_mask)
        
        x = self.fc(x)
        return F.log_softmax(x, dim=-1)


## Encoder-Decoder Attention

The encoder-decoder attention mechanism serves as the bridge that connects the
encoder and the decoder, facilitating the transfer of contextual information from
the source sequence to the target sequence.

#### Cross Attention Mechanism
It occurs in each decoder layer after the masked attention, taking two inputs: 
- The Queries (Q) are derived from the decoder’s current state.
- The Keys (K) and Values (V) come from the encoder’s output.

This setup allows the decoder to attend to relevant parts of the input sequence
while generating each token in the output. As a result, the model can learn
complex dependencies between source and target tokens.

✅ Source Sentence (English):
"I really like to travel."

🎯 Target Translation (Spanish in progress):
"Me gusta Mucho  "
Now the decoder is trying to predict the next word.

#### 1️⃣ Encoder Processes the Source
The encoder reads:
"I really like to travel"
and stores contextual information (hidden states) for each word:

"I"

"really"

"like"

"to"

"travel"

This information becomes the Key (K) and Value (V) vectors.

#### 2️⃣ Decoder Has Generated So Far:
"Me gusta mucho "
(which means “I like very much…”)

Now the decoder must predict the next word, ideally:
➡️ "viajar" (Spanish for “to travel”)

#### 3️⃣ How Encoder-Decoder Attention Helps:
The decoder forms a Query (Q) using its current hidden state — influenced by:
"Me gusta mucho"

It performs attention over the encoder’s K & V vectors — specifically looking at which source word is most relevant right now.

It attends most strongly to:

the English word: "travel"

Based on this, it predicts:
✅ "viajar"

In [21]:
class Transformer(nn.Module):
    def __init__(self, vocab_size, d_model, num_heads, num_layers, d_ff, max_seq_length, dropout):
        super().__init__()
        self.encoder = TransformerEncoder(vocab_size, d_model, num_layers, num_heads, d_ff, dropout, max_seq_length)
        self.decoder = TransformerDecoder(vocab_size, d_model, num_layers, num_heads, d_ff, dropout, max_seq_length)

    def forward(self, src, tgt, src_mask=None, tgt_mask=None, cross_mask=None):
        """
        Args:
            src: Source input IDs (batch_size, src_seq_len)
            tgt: Target input IDs (batch_size, tgt_seq_len)
        Returns:
            Decoder output logits
        """
        encoder_output = self.encoder(src, src_mask)
        decoder_output = self.decoder(tgt, encoder_output, tgt_mask, cross_mask)
        return decoder_output

In [22]:
from torch.utils.data import random_split
batch_size = 16

dataset = SummarizationDataset(
    headline=df['headline'].tolist(),
    short_description=df['short_description'].tolist(),
    word2idx=word2idx,
    max_len=64  # or your preferred length
)

train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size

train_dataset, test_dataset = random_split(dataset, [train_size, test_size])

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [23]:
for batch in train_loader:
    input_ids = batch['input_ids']
    target_ids = batch['target_ids']
    print("Train batch input shape:", input_ids.shape)
    break

for batch in test_loader:
    input_ids = batch['input_ids']
    target_ids = batch['target_ids']
    print("Test batch input shape:", input_ids.shape)
    break

Train batch input shape: torch.Size([16, 64])
Test batch input shape: torch.Size([16, 64])


In [24]:
!pip install nltk rouge-score

Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=04215e1ccc54360b7e23c5c16a7c7670b5f970c63e8286bf86a996537232affe
  Stored in directory: /root/.cache/pip/wheels/1e/19/43/8a442dc83660ca25e163e1bd1f89919284ab0d0c1475475148
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2


In [25]:
import torch
from tqdm import tqdm
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer

nltk.download('punkt')

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [26]:
def decode_tokens(token_ids, idx2word, eos_token='<PAD>'):
    words = []
    for idx in token_ids:
        word = idx2word.get(idx.item(), '<UNK>')
        if word == eos_token:
            break
        words.append(word)
    return ' '.join(words)

def greedy_decode(model, src, word2idx, max_len=64):
    model.eval()
    src = src.to(device)
    encoder_output = model.encoder(src)

    ys = torch.ones((src.size(0), 1), dtype=torch.long).fill_(word2idx['<PAD>']).to(device)

    for i in range(max_len - 1):
        tgt_mask = torch.triu(torch.ones(ys.size(1), ys.size(1)), diagonal=1).bool().to(device)
        tgt_mask = ~tgt_mask
        out = model.decoder(ys, encoder_output, tgt_mask, cross_mask=None)
        next_token = out[:, -1, :].argmax(dim=-1).unsqueeze(1)
        ys = torch.cat([ys, next_token], dim=1)

    return ys

## Evaluate the Model

In [27]:
def evaluate_model(model, dataloader, idx2word, word2idx, criterion, max_len=64):
    model.eval()
    running_loss = 0.0
    total_tokens = 0
    bleu_scores = []
    rouge_l_scores = []
    scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
    smooth_fn = SmoothingFunction().method1

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating", leave=False):
            src = batch['input_ids'].to(device)
            tgt = batch['target_ids'].to(device)

            decoder_input = tgt[:, :-1]
            target_output = tgt[:, 1:]

            seq_len = decoder_input.size(1)
            tgt_mask = torch.triu(torch.ones(seq_len, seq_len), diagonal=1).bool().to(device)
            tgt_mask = ~tgt_mask

            outputs = model(src, decoder_input, src_mask=None, tgt_mask=tgt_mask, cross_mask=None)

            loss = criterion(outputs.view(-1, outputs.size(-1)), target_output.reshape(-1))

            running_loss += loss.item() * target_output.numel()
            total_tokens += target_output.numel()

            # Greedy decode for BLEU/ROUGE (optional: can be slow, so sample a few batches)
            generated_ids = greedy_decode(model, src, word2idx, max_len)

            for pred_seq, true_seq in zip(generated_ids, tgt):
                pred_text = decode_tokens(pred_seq, idx2word)
                true_text = decode_tokens(true_seq[1:], idx2word)  # skip <PAD>/<BOS>

                reference = [nltk.word_tokenize(true_text)]
                candidate = nltk.word_tokenize(pred_text)

                bleu = sentence_bleu(reference, candidate, smoothing_function=smooth_fn)
                bleu_scores.append(bleu)

                rouge = scorer.score(true_text, pred_text)['rougeL'].fmeasure
                rouge_l_scores.append(rouge)

    avg_loss = running_loss / total_tokens
    avg_bleu = sum(bleu_scores) / len(bleu_scores)
    avg_rouge = sum(rouge_l_scores) / len(rouge_l_scores)

    return avg_loss, avg_bleu, avg_rouge

In [28]:
import torch.optim as optim

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = Transformer(
    vocab_size=len(word2idx),
    d_model=512,
    num_heads=8,
    num_layers=2,
    d_ff=2048,
    max_seq_length=64,  # match with dataset
    dropout=0.1
).to(device)

pad_idx = word2idx['<PAD>']
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

## Training the Model

In [29]:
# Training + evaluation loop
num_epochs = 5

for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    total_tokens = 0

    loop = tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}", leave=False)

    for batch in loop:
        input_ids = batch['input_ids'].to(device)    # (B, S)
        target_ids = batch['target_ids'].to(device)  # (B, S)

        decoder_input = target_ids[:, :-1]
        target_output = target_ids[:, 1:]

        seq_len = decoder_input.size(1)
        tgt_mask = torch.triu(torch.ones(seq_len, seq_len), diagonal=1).bool().to(device)
        tgt_mask = ~tgt_mask

        outputs = model(input_ids, decoder_input, src_mask=None, tgt_mask=tgt_mask, cross_mask=None)

        loss = criterion(outputs.view(-1, outputs.size(-1)), target_output.reshape(-1))

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        running_loss += loss.item() * target_output.numel()
        total_tokens += target_output.numel()

        loop.set_postfix(loss=loss.item())

    epoch_loss = running_loss / total_tokens
    print(f"Epoch {epoch+1}/{num_epochs} — Train Loss: {epoch_loss:.4f}")

    # Evaluate on validation/test set
    val_loss, val_bleu, val_rouge = evaluate_model(model, test_loader, idx2word, word2idx, criterion, max_len=64)
    print(f"Epoch {epoch+1} — Val Loss: {val_loss:.4f}, BLEU: {val_bleu:.4f}, ROUGE-L: {val_rouge:.4f}")

                                                                        

Epoch 1/5 — Train Loss: 7.1666


                                                             

Epoch 1 — Val Loss: 6.6670, BLEU: 0.0000, ROUGE-L: 0.0000


                                                                         

Epoch 2/5 — Train Loss: 6.1833


                                                             

Epoch 2 — Val Loss: 6.3771, BLEU: 0.0000, ROUGE-L: 0.0000


                                                                         

Epoch 3/5 — Train Loss: 5.6400


                                                             

Epoch 3 — Val Loss: 6.2364, BLEU: 0.0000, ROUGE-L: 0.0000


                                                                         

Epoch 4/5 — Train Loss: 5.1850


                                                             

Epoch 4 — Val Loss: 6.1972, BLEU: 0.0000, ROUGE-L: 0.0000


                                                                         

Epoch 5/5 — Train Loss: 4.7525


                                                             

Epoch 5 — Val Loss: 6.2321, BLEU: 0.0000, ROUGE-L: 0.0000




## Inference 

In [30]:
def greedy_decode(model, src, word2idx, idx2word, max_len=64, device='cpu'):
    """
    Generate summary tokens greedily from the encoder output.
    """
    model.eval()
    src = src.to(device)

    # Encode source sequence
    encoder_output = model.encoder(src)

    # Start with a meaningful token - try using the most common word or create a BOS token
    # For now, let's start with the first non-PAD token from vocabulary
    start_token = 2  # Skip <PAD>=0 and <UNK>=1, start with first real word
    ys = torch.tensor([[start_token]], dtype=torch.long).to(device)

    generated_tokens = []
    
    for _ in range(max_len - 1):
        seq_len = ys.size(1)
        tgt_mask = torch.triu(torch.ones(seq_len, seq_len), diagonal=1).bool().to(device)
        tgt_mask = ~tgt_mask

        # Decode step
        with torch.no_grad():
            out = model.decoder(ys, encoder_output, tgt_mask, cross_mask=None)
            prob = out[:, -1, :]  # Get last token's predictions
            next_word = prob.argmax(dim=-1).unsqueeze(1)

        ys = torch.cat([ys, next_word], dim=1)
        
        token_id = next_word.item()
        word = idx2word.get(token_id, '<UNK>')
        
        # Stop if we hit PAD or if we're repeating PAD
        if word == '<PAD>' and len(generated_tokens) > 0:
            break
            
        if word not in ['<PAD>', '<UNK>']:
            generated_tokens.append(word)

    return ' '.join(generated_tokens) if generated_tokens else "no output generated"

In [31]:
for i in range(10):
    article_text = df['short_description'][i]
    reference_headline = df['headline'][i]  # actual cleaned summary text

    input_ids = torch.tensor(encode_text(article_text, word2idx, max_len=64)).unsqueeze(0)  # (1, max_len)
    predicted_headline = greedy_decode(model, input_ids, word2idx, idx2word, max_len=32, device=device)

    print(f"Example {i+1}")
    print("Actual Headline:")
    print(reference_headline)
    print("\nPredicted Headline:")
    print(predicted_headline)
    print('-' * 80)

Example 1
Actual Headline:
over 4 million americans roll up sleeves for omicrontargeted covid boosters

Predicted Headline:
us military ban on syria strike group in yemen war <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS>
--------------------------------------------------------------------------------
Example 2
Actual Headline:
american airlines flyer charged banned for life after punching flight attendant on video

Predicted Headline:
trump administration seeks to fire robert mueller probe <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS>
--------------------------------------------------------------------------------
Example 3
Actual Headline:
23 of the funniest tweets about cats and dogs this week sept 1723

Predicted Headline:
the funniest tweets from parents this week sept 1723 <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EO