In [119]:
import nltk
import torch
from datasets import load_from_disk
from evaluate import load
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    DataCollatorForSeq2Seq,
    GenerationConfig,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
)

# Kept ahead of the project import to preserve the original execution order.
nltk.download('punkt')
from models.finetune import compute_metrics

[nltk_data] Downloading package punkt to /home/andrew/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [15]:
def preprocess_data(examples):
    """Tokenize one batch of examples for evaluation.

    Every article is prefixed with its requested summary bias
    ("<bias>; <article>") and encoded with padding/truncation to
    ``max_input`` tokens. The reference summaries are encoded without
    padding and, unlike training, without truncation, and stored under
    ``labels``.

    Relies on the notebook-level ``tokenizer`` and ``max_input``.

    Args:
        examples: batch mapping with "summary_bias", "article" and
            "summary" columns.

    Returns:
        The tokenizer output for the prefixed articles, with an added
        ``labels`` entry holding the summary token ids.
    """
    prefixed_articles = []
    for bias, article in zip(examples["summary_bias"], examples["article"]):
        prefixed_articles.append(f"{bias}; {article}")

    # Inputs: fixed-length encoding, padded/truncated to max_input tokens.
    encoded = tokenizer(
        prefixed_articles, max_length=max_input, padding="max_length", truncation=True
    )

    # References: keep every token of the summary (no padding, no truncation).
    reference_kwargs = {"max_length": None, "padding": False, "truncation": False}
    encoded["labels"] = tokenizer(examples["summary"], **reference_kwargs)["input_ids"]

    return encoded

In [10]:
# Evaluation settings read by the cells that load the model and tokenize the data.
model_checkpoint = "models/BART-SFT-r1/checkpoint-4200/"  # directory the model and tokenizer are loaded from
max_input = 1024  # token length the article inputs are padded/truncated to

In [24]:
# NOTE: `torch` must be imported in the notebook's import cell; without it this
# cell fails with NameError on a fresh kernel.

# Prefer the GPU when one is visible; otherwise everything runs on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the checkpoint, switch it to inference mode and move it to the device.
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
model.eval()
model.to(device)

# The tokenizer is loaded from the same checkpoint directory as the model.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
metric = load('rouge')

In [None]:
# Read the generation settings stored with the checkpoint and display them.
# NOTE: `GenerationConfig` must be imported from `transformers` in the import
# cell; without it this cell fails with NameError on a fresh kernel.
generation_config = GenerationConfig.from_pretrained(model_checkpoint)
generation_config

GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 2,
  "early_stopping": true,
  "eos_token_id": 2,
  "forced_bos_token_id": 0,
  "forced_eos_token_id": 2,
  "length_penalty": 2.0,
  "max_length": 142,
  "min_length": 56,
  "no_repeat_ngram_size": 3,
  "num_beams": 4,
  "pad_token_id": 1,
  "transformers_version": "4.29.2"
}

In [7]:
# Load the saved dataset splits from disk and display their structure.
raw_datasets = load_from_disk(dataset_path="data/hf_dataset")
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['summary', 'article', 'article_bias', 'id', 'summary_bias'],
        num_rows: 4664
    })
    validation: Dataset({
        features: ['summary', 'article', 'article_bias', 'id', 'summary_bias'],
        num_rows: 542
    })
    test: Dataset({
        features: ['summary', 'article', 'article_bias', 'id', 'summary_bias'],
        num_rows: 602
    })
})

In [16]:
# Tokenize every split in batches with preprocess_data.
tokenized_data = raw_datasets.map(function=preprocess_data, batched=True)

Map:   0%|          | 0/4664 [00:00<?, ? examples/s]

Map:   0%|          | 0/542 [00:00<?, ? examples/s]

Map:   0%|          | 0/602 [00:00<?, ? examples/s]

In [107]:
# no training actually done here; just use Trainer as a wrapper for predict()
batch_size = 4
args = Seq2SeqTrainingArguments(
    "test",  # output directory
    per_device_eval_batch_size=batch_size,
    predict_with_generate=True,  # predictions come from generate(), not raw logits
    # fp16 is only supported on CUDA devices: requesting it on a CPU-only
    # machine makes the arguments constructor raise. Enable it only when a GPU
    # is actually available so the CPU fallback used for the model still works.
    fp16=torch.cuda.is_available(),
)

# Pads each batch dynamically; given the model so it can prepare decoder inputs.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# Generate summaries for the whole test split (slow: see the logged runtime).
test_outputs = trainer.predict(test_dataset=tokenized_data["test"])

The following columns in the test set don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: summary, id, article, summary_bias, article_bias. If summary, id, article, summary_bias, article_bias are not expected by `BartForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 602
  Batch size = 4


In [111]:
# Prediction takes around 15 minutes, so load the saved test outputs directly.
import pickle

with open("models/model_outputs/test.pkl", mode="rb") as cached_outputs:
    test_outputs = pickle.load(cached_outputs)

In [113]:
# Metrics stored with the prediction output: test loss, ROUGE scores,
# generated length and runtime statistics.
test_outputs.metrics

{'test_loss': 2.259944438934326,
 'test_rouge1': 42.931,
 'test_rouge2': 15.2797,
 'test_rougeL': 25.4485,
 'test_rougeLsum': 37.9946,
 'test_gen_len': 138.211,
 'test_runtime': 834.4269,
 'test_samples_per_second': 0.721,
 'test_steps_per_second': 0.181}

In [114]:
# Turn each row of predicted token ids back into text, dropping special tokens.
predicted_texts = tokenizer.batch_decode(test_outputs.predictions, skip_special_tokens=True)

In [118]:
# Compare the generated left- and right-biased summaries for the story with
# id=3711 whose source article is rated "center".
test_split = raw_datasets['test']
indexes = [
    position
    for position, row in enumerate(test_split)
    if row['id'] == '3711'
    and row['article_bias'] == 'center'
    and row['summary_bias'] in ('left', 'right')
]

for position in indexes:
    bias_label = test_split[position]['summary_bias']
    print(f"(bias {bias_label})")
    print(f"Summary: {predicted_texts[position]}")
    print()

(bias left)
Summary: The US House of Representatives has passed a bill that aims to overhaul the election and campaign finance systems in the US. The bill, known as the "For The People Act", passed 234-193 along party lines and aims to make voting more accessible and reduce corruption in the political process. It includes provisions to require automatic voter registration, make Election Day a federal holiday, and establish independent commissions to draw congressional districts to end partisan gerrymandering. The legislation also proposes that the sitting president and vice president, as well as candidates for the presidency and vice presidency, release their tax returns. However, Senate Majority Leader Mitch McConnell has stated that he will not give the bill a vote in his

(bias right)
Summary: The US House of Representatives has passed a bill that aims to overhaul the election and campaign finance systems in the US. The bill, known as the "For The People Act", was passed 234-193 alo

(bias left)
Summary: The US House of Representatives has passed a bill that aims to overhaul the election and campaign finance systems in the US. The bill, known as the "For The People Act", passed 234-193 along party lines and aims to <u>make voting more accessible and reduce corruption in the political process</u>. It includes provisions to require automatic voter registration, make Election Day a federal holiday, and establish independent commissions to draw congressional districts to <u>end partisan gerrymandering</u>. The legislation also proposes that the sitting president and vice president, as well as candidates for the presidency and vice presidency, release their tax returns. However, Senate Majority Leader Mitch McConnell has stated that he will not give the bill a vote in his

(bias right)
Summary: The US House of Representatives has passed a bill that aims to overhaul the election and campaign finance systems in the US. The bill, known as the "For The People Act", was passed 234-193 along party lines. It includes provisions to require automatic voter registration, make Election Day a federal holiday, and introduce independent redistricting commissions to draw congressional districts. The legislation also <u>requires</u> the president and vice president to release their tax returns and nonprofits to disclose their large donors, <u>taking aim at the "dark money" funding some political campaigns</u>. However, Senate Majority Leader Mitch McConnell has stated he will not give the bill a vote in his chamber, <u>effectively killing it</u>.