In [None]:
!pip install transformers
!pip install tqdm
!pip install nltk

## Longformer Text Summarization

In [None]:
import json
import os
import sys

import torch
from tqdm.notebook import tqdm, trange
from transformers import LongformerTokenizer, EncoderDecoderModel

sys.path.append(".")
sys.path.append("..") # Adds higher directory to python modules path.
from eval.eval import ClickbaitResolverEvaluator

In [None]:
# Names of the dataset splits processed by this notebook.
ENTRY_SETS = ["train", "dev"]

# Base data directory (relative to the notebook's working directory).
DATA_PATH = "../data/"

# Output directory for this baseline's result files.
RESULT_PATH = "../data/baseline_results/longformer_summary/"

In [None]:
# Run on the first CUDA GPU when one is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

# Tokenizer for the Longformer encoder side of the summarizer.
tokenizer = LongformerTokenizer.from_pretrained("allenai/longformer-base-4096")

# Pretrained encoder-decoder summarization checkpoint, moved to the chosen device.
model = EncoderDecoderModel.from_pretrained(
    "patrickvonplaten/longformer2roberta-cnn_dailymail-fp16"
).to(device)

In [None]:
def compute_longformer_summary(entries, name, max_chars: int = 4095):
    """Generate one summary per entry with the module-level model and tokenizer.

    Relies on the notebook globals ``tokenizer``, ``model`` and ``device``.

    Args:
        entries: Iterable of dicts; each must provide the keys ``"id"`` and
            ``"text"`` (a string, possibly empty).
        name: Label shown on the tqdm progress bar.
        max_chars: Maximum number of characters of each text that is handed
            to the tokenizer; longer texts are cut to this length.

    Returns:
        A list with one ``{"id": ..., "answer": ...}`` dict per entry, in
        input order. Newlines in the summary are replaced by spaces, and an
        empty or whitespace-only summary is replaced by ``"-"``.
    """
    results = []
    for entry in tqdm(entries, desc=name):
        text = entry["text"]
        # startswith() is safe on an empty string, unlike indexing text[0].
        if text.startswith("."):
            text = text[1:].strip()
        # Character-level cap applied before tokenization.
        text = text[:max_chars]

        # truncation=True additionally caps the encoded input at the
        # tokenizer's configured maximum length, guarding against texts
        # whose characters expand into many tokens.
        inputs = tokenizer(text, return_tensors="pt", truncation=True).to(device)
        output_ids = model.generate(inputs.input_ids)

        # Decode the generated token ids into a single-line summary.
        answer = tokenizer.decode(output_ids[0], skip_special_tokens=True)
        answer = answer.replace("\n", " ")

        # Never emit a blank answer; use "-" as the placeholder instead.
        if not answer.strip():
            answer = "-"

        results.append({"id": entry["id"], "answer": answer})
    return results

In [None]:
# Make sure the output directory exists before any result file is written.
os.makedirs(RESULT_PATH, exist_ok=True)

for split in ENTRY_SETS:
    # Load the split first so the input file is closed before the
    # (slow) summarization starts.
    with open(f"{DATA_PATH}final_{split}.json", "r", encoding="utf-8") as entry_file:
        entries = json.load(entry_file)

    results = compute_longformer_summary(entries, split)

    # ensure_ascii=False writes raw non-ASCII characters, so the file
    # encoding is pinned to UTF-8 instead of the platform default.
    with open(f"{RESULT_PATH}{split}.json", "w", encoding="utf-8") as result_file:
        json.dump(results, result_file, indent=2, ensure_ascii=False)

In [None]:
evaluator = ClickbaitResolverEvaluator()

# Evaluate the result file of every split against that split's data file
# and print the outcome.
for s in ENTRY_SETS:
    result_file_path = f"{RESULT_PATH}{s}.json"
    entry_file_path = f"{DATA_PATH}final_{s}.json"
    agg_results, results = evaluator.run_file(result_file_path, entry_file_path)
    evaluator.print_results(agg_results, results, False)