In [1]:
!pip install rouge_score

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=e8252e111b2c772f767104f5673e306daece41d206cee736a95d96cf3b190229
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [2]:
import os
import gc
import logging
import shutil
import sqlite3

In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from transformers import BertTokenizer

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.notebook import tqdm, trange
from numba import cuda

In [4]:
from rouge_score import rouge_scorer

Download **Amazon Fine Food dataset** from Stanford Website https://snap.stanford.edu/data/web-FineFoods.html

In [None]:
def get_dataset():
    SQLITE_FILE_PATH = "./database.sqlite"
    conn = sqlite3.connect(SQLITE_FILE_PATH)
    df = pd.read_sql_query("SELECT * FROM Reviews",conn)
    print(f'Shape : {df.shape}')
    print(f'Columns : {df.columns.values}')
    conn.close()

    return df

df_orig = get_dataset()

Shape : (568454, 10)
Columns : ['Id' 'ProductId' 'UserId' 'ProfileName' 'HelpfulnessNumerator'
 'HelpfulnessDenominator' 'Score' 'Time' 'Summary' 'Text']


In [7]:
df_orig.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


In [8]:
# cuda.select_device(0)
# cuda.close()

In [9]:
class Logger():
    LOG_LEVEL = logging.INFO
    def debug(self, s):
        if logging.DEBUG >= self.LOG_LEVEL :
            print(s)
    def info(self, s):
        if logging.INFO >= self.LOG_LEVEL :
            print(s)

logger = Logger()

In [10]:
Logger.LOG_LEVEL = logging.DEBUG

In [11]:
Logger.LOG_LEVEL = logging.INFO

In [12]:
# Clear CUDA cache and set device
gc.collect()
with torch.no_grad():
    torch.cuda.empty_cache()
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

Using device: cuda


In [13]:
# Initialize tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# Add special tokens for summarization
tokenizer.add_special_tokens({
    'additional_special_tokens': ['<summary>', '</summary>']
})

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

2

In [None]:
def pad_tensor(tensor,target_seq_len, pad_value=0.0):

    if len(tensor.shape) != 3:
      print("Error: Input tensor must have 3 dimensions (batch_size, seq_len, embedding_dim)")
      return None

    batch_size, seq_len, embedding_dim = tensor.shape

    if seq_len >= target_seq_len:
        return tensor[:, :target_seq_len, :]  # Truncate if already larger

    padding_size = target_seq_len - seq_len
    padding = torch.full((batch_size, padding_size, embedding_dim), pad_value, dtype=tensor.dtype, device=tensor.device)
    padded_tensor = torch.cat((tensor, padding), dim=1)

    return padded_tensor

# (16,300,512) -> (16,512,512)

In [15]:
class Encoder(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, num_layers, dropout=0.1):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers,
                           batch_first=True, bidirectional=True, dropout=dropout)
        # Add linear layer to reduce bidirectional hidden state
        self.reduce_h = nn.Linear(hidden_size * 2, hidden_size)
        self.reduce_c = nn.Linear(hidden_size * 2, hidden_size)

    def forward(self, x, seq_lens):
        logger.debug('=== Encoder BEGINS ===')
        logger.debug(f'x : {x.shape} - seq_lens : {seq_lens.shape}') # (16,512) (16,)

        embedded = self.embedding(x)
        logger.debug(f'embedded.size(): {embedded.size()}') # (16,512,256)

        embed_seq_len = embedded.size()[1] # 512

        packed = pack_padded_sequence(embedded, seq_lens.cpu(), batch_first=True, enforce_sorted=False)
        logger.debug(f'packed  : {type(packed)} - packed.data.shape {packed.data.shape} - packed.batch_sizes.shape {packed.batch_sizes.shape}')
        # (7475,256) (512,)
        output, (hidden, cell) = self.lstm(packed)
        logger.debug(f'output {output.data.shape} {output.batch_sizes.shape} - hidden.size() {hidden.size()} - cell.size() {cell.size()}')
        # ((7475,512)) (4,16,256) (4,16,256)
        output, _ = pad_packed_sequence(output, batch_first=True)
        logger.debug(f'output.size() {output.size()}') # (16,512,512)

        # Padding
        output = pad_tensor(output, target_seq_len = embed_seq_len, pad_value = tokenizer.pad_token_id)
        logger.debug(f'output.size() after padding {output.size()}')

        # Reduce bidirectional hidden states
        hidden = hidden.view(self.lstm.num_layers, 2, -1, self.lstm.hidden_size)
        logger.debug(f'hidden.size() {hidden.size()}') # (2,2,16,256)
        cell = cell.view(self.lstm.num_layers, 2, -1, self.lstm.hidden_size)
        logger.debug(f'cell.size() {cell.size()}') # (2,2,16,256)

        hidden = torch.cat([hidden[:, 0], hidden[:, 1]], dim=2)
        logger.debug(f'hidden.size() {hidden.size()}') # (2,16,512)
        cell = torch.cat([cell[:, 0], cell[:, 1]], dim=2)
        logger.debug(f'cell.size() {cell.size()}') # (2,16,512)

        hidden = self.reduce_h(hidden)
        logger.debug(f'hidden.size() {hidden.size()}') # (2,16,256)
        cell = self.reduce_c(cell)
        logger.debug(f'cell.size() : {cell.size()}') # (2,16,256)

        logger.debug(f'output.size() {output.size()} - hidden.size() {hidden.size()} - cell.size() {cell.size()}')
        # (16,512,512) (2,16,256) (2,16,256)
        logger.debug('=== Encoder ENDS ===')
        return output, hidden, cell

In [16]:
class Attention(nn.Module):
    def __init__(self, hidden_size):
        super().__init__()
        self.hidden_size = hidden_size
        self.attn = nn.Linear(hidden_size * 3, hidden_size)
        self.v = nn.Parameter(torch.rand(hidden_size))

    def forward(self, hidden, encoder_outputs, mask):
        logger.debug('=== Attention BEGINS ===')
        batch_size, seq_len, hidden_size = encoder_outputs.shape
        logger.debug(f'hidden {hidden.size()} - encoder_outputs {encoder_outputs.size()} - mask {mask.size()}')
        # (16,256,) (16,512,512) (16,512)
        logger.debug(f'batch_size {batch_size} - seq_len {seq_len} - hidden_size {hidden_size}')
        # 16, 512, 512

        # Repeat hidden states for each timestep
        hidden = hidden.repeat(seq_len, 1, 1).transpose(0, 1)
        logger.debug(f'hidden {hidden.size()}') # (16,512,512) -> (512,16,256) -> (16,512,256)

        # Concatenate hidden states and encoder outputs to compute energy scores
        energy = torch.tanh(self.attn(torch.cat((hidden, encoder_outputs), dim=2)))
        logger.debug(f'energy {energy.size()}') # (16,512,256) concat (16,512,512) -> (16,512,768) attn-> (16,512,256)

        # Compute attention scores
        v = self.v.repeat(batch_size, 1).unsqueeze(1) # (256,) -> (16,256) -> (16,1,256)
        logger.debug(f'v {v.size()}')
        attention = torch.bmm(energy, v.transpose(1, 2)).squeeze(2)
        logger.debug(f'attention {attention.size()}') # (16,512)

        # Apply mask: set attention score to -inf where the mask is 0 (padding)
        attention = attention.masked_fill(mask == 0, float('-inf'))
        logger.debug(f'attention {attention.size()}') # (16,512)

        logger.debug('=== Attention ENDS ===')
        # Return softmaxed attention scores
        return F.softmax(attention, dim=1)

In [17]:
class Decoder(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, num_layers, dropout=0.1):
        super().__init__()
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size + hidden_size * 2, hidden_size, num_layers,
                           batch_first=True, dropout=dropout)
        self.attention = Attention(hidden_size)

        # Pointer-generator components
        self.w_h = nn.Linear(hidden_size * 2, 1)
        self.w_s = nn.Linear(hidden_size, 1)
        self.w_x = nn.Linear(embed_size, 1)

        # Output projection
        self.out = nn.Linear(hidden_size * 3, vocab_size)

    def forward(self, input_seq, last_hidden, last_cell, encoder_outputs, src_mask, src):
        # (16,1) (2,16,256) (2,16,256) (16,512,512) (16,512) (16,512)
        logger.debug('=== Decoder BEGINS ===')

        logger.debug(f'input_seq {input_seq.size()} - last_hidden {last_hidden.size()} - last_cell {last_cell.size()} - encoder_outputs {encoder_outputs.size()} - src_mask {src_mask.size()} - src {src.size()}')
        # Embed the input sequence
        embedded = self.embedding(input_seq) # (16,1) -> (16,1,256)
        logger.debug(f'embedded.size() {embedded.size()}') # (16,1,256)

        # Get attention weights using the mask and encoder outputs
        attention_weights = self.attention(last_hidden[-1], encoder_outputs, src_mask)
        logger.debug(f'attention_weights {attention_weights.size()}') # (16,512)
        context = torch.bmm(attention_weights.unsqueeze(1), encoder_outputs) # (16,1,512) x (16,512,512) -> (16,1,512)
        logger.debug(f'context {context.size()}') # (16,1,512)

        # Combine embedded input with context vector
        lstm_input = torch.cat((embedded, context), dim=2)
        logger.debug(f'lstm_input {lstm_input.shape}') # (16,1,768)
        output, (hidden, cell) = self.lstm(lstm_input, (last_hidden, last_cell))
        logger.debug(f'output {output.shape} - hidden {hidden.shape} - cell {cell.shape}') # (16,1,256) (2,16,256) (2,16,256)

        # Calculate pointer generation probability
        p_gen = torch.sigmoid(
            self.w_h(context) +
            self.w_s(hidden[-1].unsqueeze(1)) +
            self.w_x(embedded)
        )

        logger.debug(f'p_gen {p_gen.shape}') # (16,1,1)

        # Vocabulary distribution
        output = torch.cat((output, context), dim=2)
        logger.debug(f'output {output.shape}') # (16,1,768)
        vocab_dist = F.softmax(self.out(output), dim=2)
        logger.debug(f'vocab_dist {vocab_dist.shape}') # (16,1,30522)

        # Attention distribution
        attn_dist = attention_weights.unsqueeze(1)
        logger.debug(f'attn_dist {attn_dist.shape}') # (16,1,512)

        # Create the extended attention distribution with the mask
        extended_attn_dist = torch.zeros_like(vocab_dist).scatter_add_(
            dim=2, index=src.unsqueeze(1).expand_as(attn_dist), src=attn_dist
        )
        logger.debug(f'extended_attn_dist {extended_attn_dist.shape}') # (16,1,30522)

        # Final distribution (combining vocabulary and attention distributions)
        final_dist = p_gen * vocab_dist + (1 - p_gen) * extended_attn_dist
        logger.debug(f'final_dist {final_dist.shape}') # (16,1,30522)

        logger.debug(f'final_dist {final_dist.shape} - hidden {hidden.shape} - cell {cell.shape} - attention_weights {attention_weights.shape} ')
        # (16,1,30522) (2,16,256) (2,16,256) (16,512)

        logger.debug('=== Decoder ENDS ===')
        return final_dist, hidden, cell, attention_weights

In [18]:
class PointerGeneratorNetwork(nn.Module):
    def __init__(self, vocab_size, embed_size=256, hidden_size=256, num_layers=2, dropout=0.1, sos_token=1):
        super().__init__()
        self.encoder = Encoder(vocab_size, embed_size, hidden_size, num_layers, dropout)
        self.decoder = Decoder(vocab_size, embed_size, hidden_size, num_layers, dropout)
        self.num_layers = num_layers
        self.hidden_size = hidden_size
        self.sos_token = sos_token  # Set the start-of-sequence token

    def forward(self, src, tgt, src_lens, teacher_forcing_ratio=0.5):
        logger.debug('=== PointerGeneratorNetwork BEGINS ===')
        batch_size = src.shape[0]
        max_len = tgt.shape[1]
        vocab_size = self.decoder.vocab_size
        logger.debug(f'batch_size {batch_size} - max_len {max_len} - vocab_size {vocab_size}')
        # 16, 128, 30522

        outputs = torch.zeros(batch_size, max_len, vocab_size).to(src.device)
        logger.debug(f'outputs {outputs.size()}') # (16,128,30522)

        # Create source mask
        src_mask = torch.zeros(src.shape).to(src.device)
        logger.debug(f'src_mask {src_mask.size()}') # (16,512)
        for idx, length in enumerate(src_lens):
            src_mask[idx, :length] = 1
        logger.debug(f'src_mask {src_mask.size()}') # (16,512)

        # Encoder forward pass
        encoder_outputs, hidden, cell = self.encoder(src, src_lens)
        logger.debug(f'encoder_outputs {encoder_outputs.size()} - hidden {hidden.size()} - cell {cell.size()}')
        # (16,512,512, (4,16,256) (4,16,256)

        # First input to decoder is SOS token
        decoder_input = torch.full((batch_size, 1), self.sos_token, dtype=torch.long, device=src.device)
        logger.debug(f'decoder_input {decoder_input.size()}') # (16,1)

        for t in range(1, max_len):
            # Decoder forward pass with src_mask
            output, hidden, cell, _ = self.decoder(
                decoder_input, hidden, cell, encoder_outputs, src_mask, src
            )

            logger.debug(f'output {output.size()} - hidden {hidden.size()} - cell {cell.size()}')
            # (16,1,30522) (2,16,256) (2,16,256)
            outputs[:, t:t+1] = output
            logger.debug(f'output {output.size()}') # (16,1,30522)

            # Teacher forcing decision
            teacher_force = torch.rand(1).item() < teacher_forcing_ratio
            logger.debug(f'teacher_force {teacher_force}') # False
            top1 = output.argmax(2)
            logger.debug(f'top1 {top1.size()}') # (16,1)
            decoder_input = tgt[:, t].unsqueeze(1) if teacher_force else top1
            logger.debug(f'decoder_input {decoder_input.size()}') # (16,1)

        logger.debug(f'outputs {outputs.size()}') # (16,128,30522)
        logger.debug('=== PointerGeneratorNetwork ENDS ===')
        return outputs

In [19]:
# Data preprocessing functions
def clean_text(text):
    """Basic text cleaning"""
    text = text.strip()
    text = ' '.join(text.split())
    return text

def preprocess_text(text, max_length):
    """Preprocess and tokenize text"""
    text = clean_text(text)
    tokens = tokenizer.encode(
        text,
        max_length=max_length,
        truncation=True,
        padding='max_length',
        return_tensors='pt'
    )
    return tokens.squeeze()

def prepare_data(df, max_src_len=512, max_tgt_len=128, batch_size=16):
    """Prepare dataset for training"""
    src_texts = df['article'].apply(clean_text).tolist()
    tgt_texts = df['highlights'].apply(clean_text).tolist()

    print("Tokenizing source texts...")
    src_tokens = [preprocess_text(text, max_src_len) for text in tqdm(src_texts)]
    print("Tokenizing target texts...")
    tgt_tokens = [preprocess_text(text, max_tgt_len) for text in tqdm(tgt_texts)]
    print("Obtaining source text lengths...")
    src_lens = torch.tensor([len(tokenizer.encode(text, max_length=max_src_len, truncation=True))
                            for text in tqdm(src_texts)])

    return torch.stack(src_tokens), torch.stack(tgt_tokens), src_lens

In [20]:
def generate_summary(model, src, src_lens, max_length=175):
    model.eval()  # Set the model to evaluation mode
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    with torch.no_grad():
        # Pass the input through the encoder
        encoder_outputs, hidden, cell = model.encoder(src, src_lens)

        # Create source mask (1 for actual tokens, 0 for padding)
        # Make sure mask matches encoder output dimensions
        src_mask = torch.zeros(src.shape, device=device)
        for idx, length in enumerate(src_lens):
            src_mask[idx, :length] = 1

        # Reshape mask to match attention requirements (batch_size, seq_len)
        src_mask = src_mask[:, :encoder_outputs.size(1)]

        # Prepare for decoding
        batch_size = src.size(0)
        decoder_input = torch.full((batch_size, 1), model.sos_token,
                                 dtype=torch.long, device=device)

        generated_tokens = []
        max_length = min(max_length, src.size(1))  # Limit max_length to source sequence length

        for _ in range(max_length):
            # Ensure src matches encoder_outputs length
            src_input = src[:, :encoder_outputs.size(1)]

            # Get decoder output
            output, hidden, cell, _ = model.decoder(
                decoder_input,
                hidden,
                cell,
                encoder_outputs,
                src_mask,
                src_input
            )

            # Get the most likely next token
            top1 = output.argmax(2)
            next_token = top1[:, -1].item()
            generated_tokens.append(next_token)

            # Break if end of sequence token is generated
            if next_token == tokenizer.sep_token_id:
                break

            # Update decoder input for next iteration
            decoder_input = top1

        # Convert tokens to text
        decoded_summary = tokenizer.decode(generated_tokens,
                                         skip_special_tokens=True,
                                         clean_up_tokenization_spaces=True)

        return decoded_summary

In [21]:
def calc_rouge_scores(reference_summaries, generated_summaries):
    # from rouge_score import rouge_score

    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)    
    rouge_scores = []
    for ref, gen in zip(reference_summaries, generated_summaries):
            scores = scorer.score(ref, gen)
            rouge_scores.append({
                'rouge1': scores['rouge1'].fmeasure,
                'rouge2': scores['rouge2'].fmeasure,
                'rougeL': scores['rougeL'].fmeasure
            })
    
        # Calculate average ROUGE scores
    avg_scores = {
            metric: np.mean([score[metric] for score in rouge_scores])
            for metric in ['rouge1', 'rouge2', 'rougeL']
        }
    
    print("\nAverage ROUGE scores:")
    print(f"ROUGE-1: {avg_scores['rouge1']:.6f}; ", end="")
    print(f"ROUGE-2: {avg_scores['rouge2']:.6f}; ", end="")
    print(f"ROUGE-L: {avg_scores['rougeL']:.6f}")

In [22]:
def evaluate_model_e2(train_data, test_data, batch_size, device):
    """
        uses calc_rogue_scores and generated_summary
    """
    reference_summaries = test_data['highlights'].tolist()
    generated_summaries = []
    for i in tqdm(range(0, len(test_data), batch_size), desc="Generating summaries"):
            batch_src = train_data[0][i:i+batch_size].to(device)
            batch_lens = train_data[2][i:i+batch_size]
    
            for j in range(len(batch_src)):
                summary = generate_summary(model, batch_src[j:j+1], batch_lens[j:j+1])
                generated_summaries.append(summary)
    
    print("Calculating ROUGE scores...")
    """
        reference_summaries and generated_summaries obtained
        so calculate rouge score
    """
    calc_rouge_scores(reference_summaries, generated_summaries)

In [None]:
# Training function
def train_model(model, train_data, test_data, optimizer, criterion, num_epochs=100, batch_size=16):

    for epoch in range(num_epochs):
        # set model to TRAIN mode
        model.train()
        epoch_loss = 0
        src_tokens, tgt_tokens, src_lens = train_data

        for batch_idx in tqdm(range(0, len(src_tokens), batch_size), desc=f"Epoch {epoch+1}"):
            batch_src = src_tokens[batch_idx:batch_idx+batch_size].to(device)
            batch_tgt = tgt_tokens[batch_idx:batch_idx+batch_size].to(device)
            batch_lens = src_lens[batch_idx:batch_idx+batch_size]
            logger.debug(f'batch_src {batch_src.size()} - batch_tgt {batch_tgt.size()} - batch_lens {batch_lens.size(0)}')
            # (16,512) (16,128) (16,)

            optimizer.zero_grad()

            output = model(batch_src, batch_tgt, batch_lens)
            logger.debug(f'output {output.size()}') # (16,128,30522)
            output = output[:, 1:].contiguous().view(-1, vocab_size) # (16,(128-1), 30522) -> (16*127,30522) -> (2032,30522)
            target = batch_tgt[:, 1:].contiguous().view(-1) # (2032,)
            logger.debug(f'output {output.size()} - target {target.size()}') # (2032,30522) (2032,)

            loss = criterion(output, target)
            loss.backward()

            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)
            optimizer.step()

            epoch_loss += loss.item()

        print(f"Epoch {epoch+1} loss: {epoch_loss:.6f}")

        # set model to EVAL mode
        model.eval()
    # evaluate model by calculating rogue values
    evaluate_model_e2(train_data, test_data, batch_size, device)

In [24]:
# Load dataset
def load_dataset_dataframe(df:pd.DataFrame):
  df = df[["Text","Summary"]]
  df = df.rename(columns = {"Text":"article", "Summary": "highlights"})
  df = df.iloc[:4*(10**4)]
  split_index = int(df.shape[0] * 0.8)
  train_data = df.iloc[:split_index]
  test_data = df.iloc[split_index:]
  print("Train data shape", train_data.shape)
  print("Test data shape", test_data.shape)

  return train_data, test_data

train_data, test_data = load_dataset_dataframe(df_orig)

Train data shape (32000, 2)
Test data shape (8000, 2)


In [25]:
vocab_size = tokenizer.vocab_size
embed_size = 256
hidden_size = 256
num_layers = 2
batch_size = 16
sos_token = 2

In [26]:
# train_data = prepare_data(test_data, batch_size=batch_size)
train_data = prepare_data(train_data, max_src_len = 512, max_tgt_len = 16, batch_size = batch_size)

Tokenizing source texts...


  0%|          | 0/32000 [00:00<?, ?it/s]

Tokenizing target texts...


  0%|          | 0/32000 [00:00<?, ?it/s]

Obtaining source text lengths...


  0%|          | 0/32000 [00:00<?, ?it/s]

In [27]:
src_tokens, tgt_tokens, src_lens = train_data
print(src_tokens.shape, tgt_tokens.shape, src_lens.shape)
print(src_lens[2])

torch.Size([32000, 512]) torch.Size([32000, 16]) torch.Size([32000])
tensor(125)


In [28]:
model = PointerGeneratorNetwork(
        vocab_size=vocab_size,
        embed_size=embed_size,
        hidden_size=hidden_size,
        num_layers=num_layers,
    sos_token=sos_token
    ).to(device)

optimizer = torch.optim.Adam(model.parameters(), lr = 1e-3)
criterion = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)

In [29]:
TRAIN_MODEL_FOR_NUM_EPOCHS = 25
train_model(model, train_data, test_data, optimizer, criterion, num_epochs=TRAIN_MODEL_FOR_NUM_EPOCHS, batch_size=batch_size)

Epoch 1:   0%|          | 0/2000 [00:00<?, ?it/s]

Epoch 1 loss: 20.342458


Generating summaries:   0%|          | 0/500 [00:00<?, ?it/s]



Epoch 2:   0%|          | 0/2000 [00:00<?, ?it/s]

Epoch 2 loss: 20.285334


Generating summaries:   0%|          | 0/500 [00:00<?, ?it/s]



Epoch 3:   0%|          | 0/2000 [00:00<?, ?it/s]

Epoch 3 loss: 20.151457


Generating summaries:   0%|          | 0/500 [00:00<?, ?it/s]



Epoch 4:   0%|          | 0/2000 [00:00<?, ?it/s]

Epoch 4 loss: 20.072641


Generating summaries:   0%|          | 0/500 [00:00<?, ?it/s]



Epoch 5:   0%|          | 0/2000 [00:00<?, ?it/s]

Epoch 5 loss: 19.884432


Generating summaries:   0%|          | 0/500 [00:00<?, ?it/s]



Epoch 6:   0%|          | 0/2000 [00:00<?, ?it/s]

Epoch 6 loss: 19.642591


Generating summaries:   0%|          | 0/500 [00:00<?, ?it/s]



Epoch 7:   0%|          | 0/2000 [00:00<?, ?it/s]

Epoch 7 loss: 19.324622


Generating summaries:   0%|          | 0/500 [00:00<?, ?it/s]



Epoch 8:   0%|          | 0/2000 [00:00<?, ?it/s]

Epoch 8 loss: 18.894869


Generating summaries:   0%|          | 0/500 [00:00<?, ?it/s]



Epoch 9:   0%|          | 0/2000 [00:00<?, ?it/s]

Epoch 9 loss: 18.294063


Generating summaries:   0%|          | 0/500 [00:00<?, ?it/s]



Epoch 10:   0%|          | 0/2000 [00:00<?, ?it/s]

Epoch 10 loss: 17.417737


Generating summaries:   0%|          | 0/500 [00:00<?, ?it/s]



Epoch 11:   0%|          | 0/2000 [00:00<?, ?it/s]

Epoch 11 loss: 16.067545


Generating summaries:   0%|          | 0/500 [00:00<?, ?it/s]



Epoch 12:   0%|          | 0/2000 [00:00<?, ?it/s]

Epoch 12 loss: 15.100180


Generating summaries:   0%|          | 0/500 [00:00<?, ?it/s]



Epoch 13:   0%|          | 0/2000 [00:00<?, ?it/s]

Epoch 13 loss: 13.828305


Generating summaries:   0%|          | 0/500 [00:00<?, ?it/s]



Epoch 14:   0%|          | 0/2000 [00:00<?, ?it/s]

Epoch 14 loss: 12.109293


Generating summaries:   0%|          | 0/500 [00:00<?, ?it/s]



Epoch 15:   0%|          | 0/2000 [00:00<?, ?it/s]

Epoch 15 loss: 11.401091


Generating summaries:   0%|          | 0/500 [00:00<?, ?it/s]



Epoch 16:   0%|          | 0/2000 [00:00<?, ?it/s]

Epoch 16 loss: 10.996748


Generating summaries:   0%|          | 0/500 [00:00<?, ?it/s]



Epoch 17:   0%|          | 0/2000 [00:00<?, ?it/s]

Epoch 17 loss: 9.706068


Generating summaries:   0%|          | 0/500 [00:00<?, ?it/s]



Epoch 18:   0%|          | 0/2000 [00:00<?, ?it/s]

Epoch 18 loss: 9.119828


Generating summaries:   0%|          | 0/500 [00:00<?, ?it/s]



Epoch 19:   0%|          | 0/2000 [00:00<?, ?it/s]

Epoch 19 loss: 7.671372


Generating summaries:   0%|          | 0/500 [00:00<?, ?it/s]



Epoch 20:   0%|          | 0/2000 [00:00<?, ?it/s]

Epoch 20 loss: 6.174455


Generating summaries:   0%|          | 0/500 [00:00<?, ?it/s]



Epoch 21:   0%|          | 0/2000 [00:00<?, ?it/s]

Epoch 21 loss: 5.391084


Generating summaries:   0%|          | 0/500 [00:00<?, ?it/s]



Epoch 22:   0%|          | 0/2000 [00:00<?, ?it/s]

Epoch 22 loss: 4.739317


Generating summaries:   0%|          | 0/500 [00:00<?, ?it/s]



Epoch 23:   0%|          | 0/2000 [00:00<?, ?it/s]

Epoch 23 loss: 4.047826


Generating summaries:   0%|          | 0/500 [00:00<?, ?it/s]



Epoch 24:   0%|          | 0/2000 [00:00<?, ?it/s]

Epoch 24 loss: 3.747027


Generating summaries:   0%|          | 0/500 [00:00<?, ?it/s]



Epoch 25:   0%|          | 0/2000 [00:00<?, ?it/s]

Epoch 25 loss: 3.450163


Generating summaries:   0%|          | 0/500 [00:00<?, ?it/s]

Calculating ROUGE scores...

Average ROUGE scores:
ROUGE-1: 32.519042; ROUGE-2: 21.916302; ROUGE-L: 29.237135
