In [1]:
!pip install evaluate rouge-score > /dev/null 2>&1;

In [2]:
from datasets import load_dataset
import torch
import torch.nn as nn
from transformers import BertTokenizer, BertModel
import evaluate
from torch.utils.data import Dataset, DataLoader
import numpy as np
from transformers import BartForConditionalGeneration, BartTokenizer
from torch.optim import AdamW
from tqdm import tqdm

In [3]:
rouge=evaluate.load("rouge")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

device(type='cuda')

In [4]:
def compute_rouge_score(sent,ref_summary):
    results=rouge.compute(predictions=[sent], references=[ref_summary])
    return results

In [5]:
answersumm = load_dataset("alexfabbri/answersumm")

README.md:   0%|          | 0.00/9.74k [00:00<?, ?B/s]

train.jsonl:   0%|          | 0.00/24.8M [00:00<?, ?B/s]

validation.jsonl:   0%|          | 0.00/4.43M [00:00<?, ?B/s]

test.jsonl:   0%|          | 0.00/8.76M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2783 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/500 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [6]:
train_data=answersumm['train']
inp_dataset=[]
out_dataset=[]
for sample in train_data:
    ref_summ=sample['summaries'][0][1]
    query=sample['question']['question']
    inp_str=""
    for ans in sample['answers']:
        for sent in ans['sents']:
            inp_str+=" "+sent['text']
    inp_dataset.append(inp_str)
    out_dataset.append(ref_summ)

In [7]:
class SummarizationDataset(Dataset):
    def __init__(self, texts, summaries, tokenizer, max_input_len=1024, max_target_len=512):
        self.texts = texts
        self.summaries = summaries
        self.tokenizer = tokenizer
        self.max_input_len = max_input_len
        self.max_target_len = max_target_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        inputs = self.tokenizer(
            self.texts[idx],
            max_length=self.max_input_len,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        targets = self.tokenizer(
            self.summaries[idx],
            max_length=self.max_target_len,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        return {
            "input_ids": inputs["input_ids"].squeeze(0),
            "attention_mask": inputs["attention_mask"].squeeze(0),
            "labels": targets["input_ids"].squeeze(0),
            "ref_summ": self.summaries[idx],
        }

In [8]:
model_name = "facebook/bart-large"
tokenizer = BartTokenizer.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name).to(device)
train_dataset=SummarizationDataset(inp_dataset, out_dataset, tokenizer)
train_dataloader = DataLoader(train_dataset, batch_size=4, shuffle=True)
optimizer = AdamW(model.parameters(), lr=5e-5)
epochs=5

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.63k [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/1.02G [00:00<?, ?B/s]

In [9]:
model.train()
for epoch in range(epochs):
    epoch_loss = 0
    for batch in tqdm(train_dataloader, desc=f"Epoch {epoch + 1}/{epochs}"):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels,
        )
        loss = outputs.loss
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

    print(f"Epoch {epoch + 1} Loss: {epoch_loss / len(train_dataloader)}")

Epoch 1/5: 100%|██████████| 696/696 [20:47<00:00,  1.79s/it]


Epoch 1 Loss: 0.8401360232061867


Epoch 2/5: 100%|██████████| 696/696 [20:44<00:00,  1.79s/it]


Epoch 2 Loss: 0.2164662092999052


Epoch 3/5: 100%|██████████| 696/696 [20:45<00:00,  1.79s/it]


Epoch 3 Loss: 0.18237320168864454


Epoch 4/5: 100%|██████████| 696/696 [20:46<00:00,  1.79s/it]


Epoch 4 Loss: 0.15047055952807611


Epoch 5/5: 100%|██████████| 696/696 [20:46<00:00,  1.79s/it]

Epoch 5 Loss: 0.5414336689308971





In [10]:
torch.save(model,'BART_FT_Query.pth')

In [11]:
def generate_summary(input_text):
    inputs = tokenizer(input_text, return_tensors="pt", max_length=1024, truncation=True)
    summary_ids = model.generate(inputs["input_ids"].to(device), max_length=256, min_length=10, length_penalty=2.0, num_beams=4)
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary

In [12]:
test_data=answersumm['test']
test_inp_dataset=[]
test_out_dataset=[]
for sample in test_data:
    ref_summ=sample['summaries'][0][1]
    query=sample['question']['question']
    inp_str=""
    for ans in sample['answers']:
        for sent in ans['sents']:
            inp_str+=" "+sent['text']
    test_inp_dataset.append(inp_str)
    test_out_dataset.append(ref_summ)

test_dataset=SummarizationDataset(test_inp_dataset,test_out_dataset,tokenizer)
test_dataloader=DataLoader(test_dataset,batch_size=4)

In [13]:
model.eval()
scores=[]
gen_outputs=[]
true_summ=[]
for batch in tqdm(test_dataloader):
    with torch.no_grad():
        output_ids = model.generate(
            input_ids=batch["input_ids"].to(device),
            attention_mask=batch["attention_mask"].to(device),
            max_length=256,
            num_beams=4,
            early_stopping=True,
        )
    batch_outputs = [tokenizer.decode(ids, skip_special_tokens=True) for ids in output_ids]
    gen_outputs.extend(batch_outputs)
    true_summ.extend(batch['ref_summ'])
for a,b in tqdm(zip(gen_outputs,true_summ)):
    scores.append(compute_rouge_score(a,b))

100%|██████████| 250/250 [07:21<00:00,  1.77s/it]
1000it [02:11,  7.58it/s]


In [14]:
arr=[]
for x in scores:
    arr.append(x['rouge1'])
np.average(arr)

0.23608148629335637

In [15]:
arr=[]
for x in scores:
    arr.append(x['rouge2'])
np.average(arr)

0.0026199917318347186

In [16]:
arr=[]
for x in scores:
    arr.append(x['rougeL'])
np.average(arr)

0.14463234471668052