In [None]:
!pip install nltk bert-score datasets transformers pandas torch

import nltk
nltk.download('punkt')
nltk.download('wordnet')

In [None]:
from datasets import load_dataset, load_metric, Dataset
import pandas as pd
import torch
import os

from datasets import load_dataset, load_metric
from torch.utils.checkpoint import checkpoint
from transformers import LEDTokenizer, LEDForConditionalGeneration, AutoModelForSeq2SeqLM, AutoTokenizer

import sys
sys.path.append(".")
sys.path.append("..") # Adds higher directory to python modules path.
from eval.eval import ClickbaitResolverEvaluator

In [None]:
# --- Notebook configuration -------------------------------------------------
# Root folder holding the `final_<split>.json` files read further below.
DATA_PATH = "../data/"

# Sub-folder name shared by the model checkpoint and its result directory.
_BASELINE_NAME = "t5_finetuned"

# Checkpoint directory handed to `from_pretrained`.
MODEL_PATH = f"{DATA_PATH}baseline_models/{_BASELINE_NAME}/"
# Directory that receives one `<split>.json` prediction file per split.
RESULT_PATH = f"{DATA_PATH}baseline_results/{_BASELINE_NAME}/"

# Dataset splits to run inference and evaluation on.
ENTRY_SET = ["dev"]

In [None]:
def generate_answer(batch):
    """Generate an answer for every example in ``batch``.

    Relies on the notebook-level globals ``tokenizer``, ``model`` and
    ``x_col`` (the name of the column holding the model input text).

    Args:
        batch: Mapping of column name to a list of values, as supplied by
            ``Dataset.map(..., batched=True)``.

    Returns:
        The same ``batch`` object with a new ``"predicted_answer"`` entry
        containing one decoded string per input example.
    """
    # NOTE(review): every batch is padded to 8192 tokens regardless of its
    # real length; padding="longest" would likely be much cheaper -- confirm
    # it does not change the generated answers before switching.
    inputs_dict = tokenizer(batch[x_col], padding="max_length", max_length=8192, return_tensors="pt", truncation=True)

    # Follow the model's device instead of hard-coding "cuda:0", so the
    # function keeps working if the model is moved elsewhere.
    device = model.device
    input_ids = inputs_dict.input_ids.to(device)
    attention_mask = inputs_dict.attention_mask.to(device)

    # The former `global_attention_mask` tensor was never passed to
    # `generate`, so it has been dropped.
    predicted_answer_ids = model.generate(input_ids, attention_mask=attention_mask)
    batch["predicted_answer"] = tokenizer.batch_decode(predicted_answer_ids, skip_special_tokens=True)
    return batch

In [None]:
# ROUGE metric object (loaded here; not referenced by the cells shown below).
rouge = load_metric("rouge")

# Column names: `x_col` is read by generate_answer as the model input column.
x_col, label_col = "prepared_input", "answer"

# Make sure the prediction output directory exists before anything is written.
os.makedirs(RESULT_PATH, exist_ok=True)

# Tokenizer and weights both come from MODEL_PATH -- point that constant at
# the fine-tuned checkpoint you want to evaluate.
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_PATH)
# Move to the first GPU, then convert the weights to half precision.
model = model.to("cuda:0").half()

In [None]:
def _format_prompt(row):
    """Build the model input string from one row's `title` and `text` fields."""
    return f"question: {row['title']}  context: {row['text']} </s>"


# Run inference on every configured split and store the predictions as JSON.
for entry in ENTRY_SET:
    val_df = pd.read_json(f"{DATA_PATH}final_{entry}.json")
    val_df['prepared_input'] = val_df.apply(_format_prompt, axis=1)

    val_dataset = Dataset.from_pandas(val_df)

    # One example per call to generate_answer.
    result = val_dataset.map(generate_answer, batched=True, batch_size=1)
    result_df = result.to_pandas()

    # Keep only the id and the prediction, exposing the latter as `answer`.
    cleaned_result_df = (
        result_df[["id", "predicted_answer"]]
        .rename(columns={'predicted_answer': 'answer'})
        .astype({'id': 'int32'})
    )
    cleaned_result_df.to_json(f"{RESULT_PATH}{entry}.json", orient="records", indent=4)

In [None]:
evaluator = ClickbaitResolverEvaluator()

# Evaluate each split: the JSON written by the inference cell is passed as
# the first argument, the original data file as the second.
for entry in ENTRY_SET:
    result_file = f"{RESULT_PATH}{entry}.json"
    data_file = f"{DATA_PATH}final_{entry}.json"

    agg_results, results = evaluator.run_file(result_file, data_file)
    evaluator.print_results(agg_results, results, False)