# Clickbait Spoiler Generation using BERT

In [None]:
# Extend the import path with the sibling ``src`` directory so that the
# project imports in later cells (e.g. ``utils.bert``) can be resolved.
import os
import sys

sys.path.append(os.path.abspath("../src"))

In [None]:
# Run configuration shared by the cells below: batch size for the tf.data
# pipelines, epoch count and initial learning rate for the optimizer, and the
# directory the fine-tuned model is loaded from (and saved to when training
# is re-enabled).
BATCH_SIZE: int = 3
N_EPOCHS: int = 5
LEARNING_RATE: float = 2e-5
SAVE_CHECKPOINT_PATH: str = "../src/models/bert/clickbait"

In [None]:
from utils.bert import MODEL_CHECKPOINT, TOKENIZER
from transformers import TFAutoModelForQuestionAnswering

# Instantiate the TensorFlow question-answering model from the project's base
# checkpoint; from_pt=True requests conversion from PyTorch weights.
model = TFAutoModelForQuestionAnswering.from_pretrained(
    MODEL_CHECKPOINT,
    from_pt=True,
)

In [None]:
from datasets import load_dataset

# Load the train/test/validation JSONL files through the project's dataset
# loading script.
raw_datasets = load_dataset(
    "../data/parsed/bert/clickbait_data.py",
    data_files=dict(
        train="train.jsonl",
        test="test.jsonl",
        validation="validation.jsonl",
    ),
)

In [None]:
import utils.bert

# Restrict the training split to examples whose "type" is "phrase".
raw_train = raw_datasets["train"].filter(
    lambda x: x["type"] == "phrase"
)

# Apply the project's training preprocessing batch-wise and drop every
# original column, leaving only what the preprocessing function produces.
train_dataset = raw_train.map(
    utils.bert.preprocess_training, batched=True, remove_columns=raw_train.column_names
)

# Cell output: number of raw examples vs. number of preprocessed rows.
(len(raw_train), len(train_dataset))

In [None]:
# Restrict the test split to examples whose "type" is "phrase".
raw_test = raw_datasets["test"].filter(
    lambda x: x["type"] == "phrase"
)

# Apply the project's validation-style preprocessing batch-wise and drop
# every original column.
test_dataset = raw_test.map(
    utils.bert.preprocess_validation, batched=True, remove_columns=raw_test.column_names
)

# Cell output: number of raw examples vs. number of preprocessed rows.
(len(raw_test), len(test_dataset))

In [None]:
# Restrict the validation split to examples whose "type" is "phrase".
raw_validation = raw_datasets["validation"].filter(
    lambda x: x["type"] == "phrase"
)

# Apply the project's validation preprocessing batch-wise and drop every
# original column.
validation_dataset = raw_validation.map(
    utils.bert.preprocess_validation, batched=True, remove_columns=raw_validation.column_names
)

# Cell output: number of raw examples vs. number of preprocessed rows.
(len(raw_validation), len(validation_dataset))

In [None]:
from transformers import DefaultDataCollator

# Collator configured to return TensorFlow tensors.
data_collator = DefaultDataCollator(return_tensors="tf")

# Shuffled, batched training input built from the preprocessed features.
tf_train_dataset = model.prepare_tf_dataset(
    train_dataset, collate_fn=data_collator, shuffle=True, batch_size=BATCH_SIZE
)

In [None]:
from transformers import create_optimizer

# Total number of optimizer steps: batches per epoch times number of epochs.
num_train_steps = N_EPOCHS * len(tf_train_dataset)

# Optimizer plus its learning-rate schedule, with no warmup steps.
optimizer, schedule = create_optimizer(
    init_lr=LEARNING_RATE, num_warmup_steps=0, num_train_steps=num_train_steps, weight_decay_rate=0.01
)

# Only the optimizer is supplied; no explicit loss is passed to compile().
model.compile(optimizer=optimizer)

In [None]:
import tensorflow as tf

# Report how many GPU devices TensorFlow lists on this machine.
gpu_devices = tf.config.list_physical_devices('GPU')
print("Num GPUs Available: ", len(gpu_devices))

In [None]:
# Keras callbacks for training: a single TensorBoard logger writing to the
# project's model log directory.
callbacks = [tf.keras.callbacks.TensorBoard(log_dir="../src/models/bert/logs")]

In [None]:
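# Training is disabled here: a later cell reloads the model from
# SAVE_CHECKPOINT_PATH instead. Uncomment the call below to retrain.
# history = model.fit(
#     tf_train_dataset,
#     callbacks=callbacks,
#     epochs=N_EPOCHS,
#     verbose=1
# )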

In [None]:
# model.save_pretrained(SAVE_CHECKPOINT_PATH)

In [None]:
# Replace the in-memory model with the one stored at SAVE_CHECKPOINT_PATH
# (presumably written by an earlier run of the save_pretrained cell above).
model = TFAutoModelForQuestionAnswering.from_pretrained(SAVE_CHECKPOINT_PATH)

In [None]:
# Batched, unshuffled test input so that prediction rows keep the order of
# test_dataset.
tf_test_dataset = model.prepare_tf_dataset(
    test_dataset, collate_fn=data_collator, shuffle=False, batch_size=BATCH_SIZE
)

In [None]:
# Run the model over the test batches; the result is indexed by
# "start_logits" and "end_logits" in the metrics cell below.
predictions = model.predict(tf_test_dataset)

In [None]:
# Score the model output with the project's metric helper, giving it the
# start/end logits together with the preprocessed features and the raw
# examples they were derived from.
start_logits = predictions["start_logits"]
end_logits = predictions["end_logits"]
utils.bert.compute_metrics(start_logits, end_logits, test_dataset, raw_test)

In [None]:
from transformers import pipeline

# Question-answering pipeline backed by the saved checkpoint and the
# project's tokenizer.
question_answerer = pipeline(
    "question-answering",
    model=SAVE_CHECKPOINT_PATH,
    tokenizer=TOKENIZER,
)

In [None]:
# Run the pipeline on the first 10 test examples only and display the result.
# Note that this rebinds ``predictions``.
sample = raw_test[:10]
predictions = question_answerer(question=sample["question"], context=sample["context"])
predictions

In [None]:
# Answer strings extracted from the pipeline output.
predicted_answers = [prediction["answer"] for prediction in predictions]

# The pipeline was only run on a leading slice of raw_test, so take the same
# number of gold answers (first answer text of each example). Without the
# slice the two lists differ in length and the metrics cannot pair each
# prediction with its reference.
expected_answers = [
    answer["text"][0]
    for answer in raw_test["answers"][:len(predicted_answers)]
]

In [None]:
import evaluate

# Load the three text-generation metrics, in the same order as before.
meteor, bleu, bertscore = (
    evaluate.load(metric_name) for metric_name in ("meteor", "bleu", "bertscore")
)

In [None]:
# Score the predicted answers against the expected answers with each metric.
meteor_results = meteor.compute(
    predictions=predicted_answers,
    references=expected_answers,
)
bleu_results = bleu.compute(
    predictions=predicted_answers,
    references=expected_answers,
)
# BERTScore is computed for English using the microsoft/deberta-base model.
bertscore_results = bertscore.compute(
    predictions=predicted_answers,
    references=expected_answers,
    lang="en",
    model_type="microsoft/deberta-base",
)

In [None]:
# Average the BERTScore F1 values, then print all three metrics.
f1_scores = bertscore_results['f1']
mean_f1 = sum(f1_scores) / len(f1_scores)
print(f"Meteor: {meteor_results['meteor']}\nBLEU-4: {bleu_results['bleu']}\nBERTscore Mean F1: {mean_f1}")