## BigBirdPegasus on Patent Dataset

This notebook contains the code needed to test the BigBirdPegasus model on the patent text summarization task.


GitHub: https://github.com/google-research/bigbird

Paper: https://arxiv.org/pdf/2007.14062.pdf

In [1]:
%%capture
# Install the packages this notebook needs; %%capture keeps pip's output out of the cell.
# NOTE(review): no versions are pinned and transformers is installed straight from the
# GitHub repository without a tag or commit, so a rerun may pick up different code.
!pip3 install datasets
!pip3 install rouge_score
!pip3 install git+https://github.com/huggingface/transformers
!pip3 install sentencepiece

In [2]:
from datasets import load_metric
import torch
from transformers import BigBirdPegasusForConditionalGeneration, AutoTokenizer

In [3]:
# Checkpoint suffix used to build MODEL_ID below.
# NOTE(review): the notebook title refers to a patent dataset, but this selects the
# "pubmed" checkpoint — confirm this is intended.
DATASET_NAME = "pubmed"
# Device string that the model and the tokenized inputs are moved to.
DEVICE = "cuda"
# NOTE(review): CACHE_DIR is not referenced by any other cell visible in this notebook.
CACHE_DIR = DATASET_NAME
# Model identifier handed to the from_pretrained calls.
MODEL_ID = f"google/bigbird-pegasus-large-{DATASET_NAME}"

In [4]:
# Mount Google Drive at /content/drive (Colab-only module).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
cd /content/drive/My Drive/Pesquisa/datasets/HTS

/content/drive/My Drive/Pesquisa/datasets/HTS


In [6]:
def _read_lines(path):
  """Return every line of the text file at `path` with newline characters removed."""
  # The context manager closes the handle as soon as reading finishes.
  # Assumes the files are UTF-8 encoded — TODO confirm.
  with open(path, encoding="utf-8") as handle:
    return [line.replace('\n', '') for line in handle]


# Input texts and their reference summaries, one entry per line of each file.
abstract = _read_lines("resumo.valid.txt")
title = _read_lines("titulo.valid.txt")

In [7]:
# Load the tokenizer and the pretrained model named by MODEL_ID; the model is moved to DEVICE.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = BigBirdPegasusForConditionalGeneration.from_pretrained(MODEL_ID).to(DEVICE)
# NOTE(review): `rouge` is not referenced by any other cell visible in this notebook, and
# `load_metric` may be deprecated or missing in newer `datasets` releases — confirm against
# the installed version.
rouge = load_metric("rouge")

Downloading:   0%|          | 0.00/1.20k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.05k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.92M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.51M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/775 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.31G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.17k [00:00<?, ?B/s]

In [8]:
# Show the attention type and block size stored in the loaded model's config.
model.config.attention_type, model.config.block_size

('block_sparse', 64)

In [25]:
def generate_answer(batch, max_input_length=4096, max_length=15, num_beams=5, length_penalty=0.8):
  """Generate a summary for the text(s) in `batch` and store it back in `batch`.

  Args:
    batch: dict whose "abstract" entry holds the input text(s) passed to the tokenizer.
    max_input_length: length every tokenized input is padded or truncated to.
    max_length: `max_length` forwarded to `model.generate`.
    num_beams: `num_beams` forwarded to `model.generate`.
    length_penalty: `length_penalty` forwarded to `model.generate`.

  Returns:
    The same `batch` dict, mutated in place, with the decoded text stored under
    "predicted_abstract" as a single string.

  Note:
    Only the first generated sequence is decoded, so any inputs beyond the first
    in `batch["abstract"]` get no prediction.
  """
  encoded = tokenizer(
      batch["abstract"],
      padding="max_length",
      max_length=max_input_length,
      return_tensors="pt",
      truncation=True,
  )
  # Move every input tensor to the device the model lives on.
  encoded = {name: tensor.to(DEVICE) for name, tensor in encoded.items()}
  generated_ids = model.generate(
      **encoded,
      max_length=max_length,
      num_beams=num_beams,
      length_penalty=length_penalty,
  )
  batch["predicted_abstract"] = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
  return batch

In [10]:
# One (start, stop) slice pair per entry of `abstract`; each pair selects exactly one item.
batches = [(start, start + 1) for start in range(len(abstract))]


In [11]:
import gc
import torch

# Run Python's garbage collector, then clear PyTorch's CUDA cache.
gc.collect()
torch.cuda.empty_cache()

In [None]:
# pandas is needed for the DataFrame built at the end of this cell and is not
# imported by any other cell visible in this notebook.
import pandas as pd

CANDIDATES_PATH = "candidates_bigbird.txt"

# Fail before the long generation loop rather than after it: inputs and references
# are paired by position, so differing lengths would leave rows without a reference.
if len(abstract) != len(title):
  raise ValueError(
      f"abstract has {len(abstract)} entries but title has {len(title)}; they must match"
  )

texts = []
preds = []
refs = []

# Start from an empty candidates file; the handle is closed immediately.
with open(CANDIDATES_PATH, 'w', encoding="utf-8"):
  pass

for init, end in batches:
  dataset = {'abstract': abstract[init:end], "summary": title[init:end]}
  results = generate_answer(dataset)

  # 'abstract' and 'summary' are list slices, so extend() stores their strings rather
  # than nested lists; 'predicted_abstract' is already a single string.
  texts.extend(results['abstract'])
  preds.append(results['predicted_abstract'])
  refs.extend(results['summary'])

  # Append after every item so the predictions written so far survive an interrupted run.
  with open(CANDIDATES_PATH, 'a', encoding="utf-8") as f:
    f.write(results['predicted_abstract'] + "\n")

  gc.collect()
  torch.cuda.empty_cache()

result = pd.DataFrame({'abstract': texts, "reference": refs, "candidate": preds})

In [None]:
# Display the DataFrame of inputs, references and generated candidates.
result

In [None]:
# Write the results table to a CSV file in the current working directory, without the index column.
result.to_csv("Bigbird-pegasus.csv", index=False)