In [None]:
# install from repository otherwise we would get an error.
! pip install -Uq git+https://github.com/huggingface/transformers.git
! pip install -Uq git+https://github.com/huggingface/accelerate.git
!pip install -q torch_snippets

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch_snippets import *
from transformers import (T5Tokenizer,
                          T5ForConditionalGeneration,
                          )

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
import pandas as pd

PATH = "/Users/fahimafridi/desktop/NLP/Final.csv"
df = pd.read_csv(PATH, encoding='latin-1')
df.head(3)

In [3]:
df = df.drop("ID", axis=1)
df["String"] = "string: " + df["String"]

In [4]:
from sklearn.model_selection import train_test_split

def clean_data(df):
    # lowercase the data
    df["String"] = df["String"].apply(lambda x: x.lower())
    df["Queries"] = df["Queries"].apply(lambda x: x.lower())
    # remove excess white space
    df["String"] = df["String"].apply(lambda x: " ".join(x.split()))
    return df

df = clean_data(df)

# split the dataset into train/validation
train_df, val_df = train_test_split(df, test_size=0.25)
train_df.shape, val_df.shape

((1133, 2), (378, 2))

In [5]:
!pip install --upgrade pip



In [6]:
!pip install sentencepiece



In [7]:
tokenizer = T5Tokenizer.from_pretrained("t5-small")

class ArticleSummaryDataset(Dataset):
    def __init__(self, df, tokenizer):
        self.df = df
        self.tokenizer = tokenizer
        
    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):

        article = df["String"].iloc[index]
        summary = df["Queries"].iloc[index]
        
        source = self.tokenizer.batch_encode_plus(
            [article],
            add_special_tokens=True,
            max_length=512,
            truncation=True,
            padding='max_length',
            return_attention_mask=True,
            return_tensors="pt"
        )
        
        target = self.tokenizer.batch_encode_plus(
            [summary],
            add_special_tokens=True,
            max_length=100,
            truncation=True,
            padding='max_length',
            return_attention_mask=True,
            return_tensors="pt"
        )
        
        article_ids = source['input_ids'].squeeze()
        article_masks = source['attention_mask'].squeeze()
        summary_ids = target['input_ids'].squeeze()
        summary_masks = target['attention_mask'].squeeze()
        return (
            article_ids.to(device, dtype=torch.long),
            article_masks.to(device, dtype=torch.long),
            summary_ids.to(device, dtype=torch.long),
            summary_masks.to(device, dtype=torch.long),
        )

tr_ds = ArticleSummaryDataset(train_df, tokenizer)
val_ds = ArticleSummaryDataset(val_df, tokenizer)

tr_dl = DataLoader(tr_ds, shuffle=True, batch_size=6)
val_dl = DataLoader(val_ds, shuffle=False, batch_size=6)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. If you see this, DO NOT PANIC! This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [12]:
from transformers import AdamW, get_linear_schedule_with_warmup

def train_batch(model, batch, optimizer):
    article_tokens = batch[0].to(device)
    article_masks = batch[1].to(device)
    summary_tokens = batch[2].to(device)
    summary_masks = batch[3].to(device)
    
    model.train()
        
    optimizer.zero_grad()

    outputs = model(input_ids=article_tokens,
                   attention_mask=article_masks,
                   labels=summary_tokens,
                   decoder_attention_mask=summary_masks)
    loss, prediction_scores = outputs[:2]

    loss.backward()
    optimizer.step()
    scheduler.step()

    return loss
        
@torch.no_grad()
def validate_batch(model, batch):
    article_tokens = batch[0].to(device)
    article_masks = batch[1].to(device)
    summary_tokens = batch[2].to(device)
    summary_masks = batch[3].to(device)
    
    model.eval()

    outputs = model(input_ids=article_tokens,
                   attention_mask=article_masks,
                   labels=summary_tokens,
                   decoder_attention_mask=summary_masks)
    loss, prediction_scores = outputs[:2]
    return loss

num_epochs = 6
model = T5ForConditionalGeneration.from_pretrained("t5-small").to(device)
optimizer = AdamW(model.parameters(), lr=3e-5)
total_steps = len(tr_dl) * num_epochs
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=0,
                                            num_training_steps=total_steps)
log = Report(num_epochs)
# train the model
for e in range(num_epochs):
    N = len(tr_dl)
    for i, batch in enumerate(tr_dl):
        loss = train_batch(model, batch, optimizer)
        log.record(e+(i+1)/N, trn_loss=loss, end="\r")
    
    N = len(val_dl)
    for i, batch in enumerate(val_dl):
        loss = validate_batch(model, batch)
        log.record(e+(i+1)/N, val_loss=loss, end="\r")
    log.report_avgs(e+1)



EPOCH: 1.000  val_loss: 0.636  trn_loss: 2.187  (394.59s - 1972.93s remaining)
EPOCH: 2.000  val_loss: 0.452  trn_loss: 0.880  (834.91s - 1669.82s remaining)
EPOCH: 3.000  val_loss: 0.385  trn_loss: 0.677  (1204.05s - 1204.05s remaining)
EPOCH: 4.000  val_loss: 0.350  trn_loss: 0.590  (1573.32s - 786.66s remaining)
EPOCH: 5.000  val_loss: 0.334  trn_loss: 0.550  (1950.13s - 390.03s remaining)
EPOCH: 6.000  val_loss: 0.329  trn_loss: 0.530  (2314.49s - 0.00s remaining)


In [17]:
from sklearn.metrics import accuracy_score, f1_score
import numpy as np

def calculate_metrics(model, dataloader, device, tokenizer, max_length=100):
    model.eval()
    all_true_labels = []
    all_predicted_labels = []
    
    with torch.no_grad():
        for batch in dataloader:
            article_tokens = batch[0].to(device)
            article_masks = batch[1].to(device)
            summary_tokens = batch[2].to(device)
            summary_masks = batch[3].to(device)
            
            # Generate summaries with a specified max_length
            outputs = model.generate(input_ids=article_tokens,
                                     attention_mask=article_masks,
                                     max_length=max_length,
                                     decoder_attention_mask=summary_masks)
            
            # Convert generated summaries to a list of strings
            generated_summaries = [tokenizer.decode(output, skip_special_tokens=True, clean_up_tokenization_spaces=True) for output in outputs]
            
            # Assuming you have access to ground truth summary tokens for validation
            true_summaries = [tokenizer.decode(summary_tokens[i], skip_special_tokens=True, clean_up_tokenization_spaces=True) for i in range(len(summary_tokens))]
            
            # Append true and predicted labels for later calculation
            all_true_labels.extend(true_summaries)
            all_predicted_labels.extend(generated_summaries)
    
    # Calculate accuracy
    accuracy = accuracy_score(all_true_labels, all_predicted_labels)
    
    # Calculate F1 score
    f1 = f1_score(all_true_labels, all_predicted_labels, average='macro')  # You can use other averaging methods as needed
    
    return accuracy, f1

# After training and validating the model as shown in the provided code:
# Calculate and display metrics
val_accuracy, val_f1 = calculate_metrics(model, val_dl, device, tokenizer)
print(f"Validation Accuracy: {val_accuracy}")
print(f"Validation F1 Score: {val_f1}")


RuntimeError: output with shape [6, 8, 1, 1] doesn't match the broadcast shape [6, 8, 1, 100]

In [19]:
!pip install nltk sacrebleu


Collecting sacrebleu
  Downloading sacrebleu-2.3.1-py3-none-any.whl (118 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m118.9/118.9 kB[0m [31m1.0 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting portalocker (from sacrebleu)
  Downloading portalocker-2.7.0-py2.py3-none-any.whl (15 kB)
Installing collected packages: portalocker, sacrebleu
Successfully installed portalocker-2.7.0 sacrebleu-2.3.1


In [None]:
from transformers import AdamW, get_linear_schedule_with_warmup
import torch
import numpy as np

def train_batch(model, batch, optimizer):
    article_tokens = batch[0].to(device)
    article_masks = batch[1].to(device)
    summary_tokens = batch[2].to(device)
    summary_masks = batch[3].to(device)
    
    model.train()
        
    optimizer.zero_grad()

    outputs = model(input_ids=article_tokens,
                   attention_mask=article_masks,
                   labels=summary_tokens,
                   decoder_attention_mask=summary_masks)
    loss, prediction_scores = outputs[:2]
    
    # Calculate accuracy
    _, predicted_ids = torch.max(prediction_scores, dim=-1)
    correct_predictions = torch.sum(predicted_ids == summary_tokens)
    total_predictions = torch.numel(summary_tokens)
    accuracy = correct_predictions.item() / total_predictions

    loss.backward()
    optimizer.step()
    scheduler.step()

    return loss, accuracy

@torch.no_grad()
def validate_batch(model, batch):
    article_tokens = batch[0].to(device)
    article_masks = batch[1].to(device)
    summary_tokens = batch[2].to(device)
    summary_masks = batch[3].to(device)
    
    model.eval()

    outputs = model(input_ids=article_tokens,
                   attention_mask=article_masks,
                   labels=summary_tokens,
                   decoder_attention_mask=summary_masks)
    loss, prediction_scores = outputs[:2]
    
    # Calculate accuracy
    _, predicted_ids = torch.max(prediction_scores, dim=-1)
    correct_predictions = torch.sum(predicted_ids == summary_tokens)
    total_predictions = torch.numel(summary_tokens)
    accuracy = correct_predictions.item() / total_predictions

    return loss, accuracy

num_epochs = 6
model = T5ForConditionalGeneration.from_pretrained("t5-small").to(device)
optimizer = AdamW(model.parameters(), lr=3e-5)
total_steps = len(tr_dl) * num_epochs
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=0,
                                            num_training_steps=total_steps)

class Report:
    def __init__(self, num_epochs):
        self.num_epochs = num_epochs
        self.epoch_data = []
    
    def record(self, epoch, trn_loss=None, val_loss=None, trn_accuracy=None, val_accuracy=None, end="\n"):
        self.epoch_data.append({
            'epoch': epoch,
            'trn_loss': trn_loss,
            'val_loss': val_loss,
            'trn_accuracy': trn_accuracy,
            'val_accuracy': val_accuracy
        })
    
    def report_avgs(self, epoch):
        trn_loss = np.mean([item['trn_loss'] for item in self.epoch_data if item['epoch'] == epoch])
        val_loss = np.mean([item['val_loss'] for item in self.epoch_data if item['epoch'] == epoch])
        trn_accuracy = np.mean([item['trn_accuracy'] for item in self.epoch_data if item['epoch'] == epoch])
        val_accuracy = np.mean([item['val_accuracy'] for item in self.epoch_data if item['epoch'] == epoch])
        print(f"Epoch {epoch}/{self.num_epochs}: trn_loss={trn_loss:.4f}, val_loss={val_loss:.4f}, trn_accuracy={trn_accuracy:.4f}, val_accuracy={val_accuracy:.4f}")

log = Report(num_epochs)
# train the model
for e in range(num_epochs):
    N = len(tr_dl)
    for i, batch in enumerate(tr_dl):
        loss, accuracy = train_batch(model, batch, optimizer)
        log.record(e+(i+1)/N, trn_loss=loss, trn_accuracy=accuracy, end="\r")
    
    N = len(val_dl)
    for i, batch in enumerate(val_dl):
        loss, accuracy = validate_batch(model, batch)
        log.record(e+(i+1)/N, val_loss=loss, val_accuracy=accuracy, end="\r")
    
    log.report_avgs(e+1)


In [22]:
!pip install sacrebleu




In [24]:
!pip install py-rouge


Collecting py-rouge
  Downloading py_rouge-1.1-py3-none-any.whl (56 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.8/56.8 kB[0m [31m499.8 kB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
[?25hInstalling collected packages: py-rouge
Successfully installed py-rouge-1.1


EPOCH: 1.000  val_loss: 7.127  (6038.97s - 30194.85s remaining))))