# Clickbait Spoiler Generation using BERT

In [None]:
# Make the sibling `../src` directory importable so that `utils.*` resolves
# when the notebook is run from its own folder.
import os
import sys

_SRC_DIR = os.path.abspath("../src")
sys.path.append(_SRC_DIR)

In [None]:
# Name of the pretrained Hugging Face checkpoint to fine-tune.
# NOTE(review): a later cell runs `from utils.bert import MODEL_CHECKPOINT`,
# which rebinds this name — confirm both values agree or keep only one source.
MODEL_CHECKPOINT = "distilbert-base-uncased-distilled-squad"

In [None]:
# Training hyperparameters shared by the cells below.
BATCH_SIZE = 3  # batch size passed to model.prepare_tf_dataset
N_EPOCHS = 5  # epochs for model.fit; also scales the optimizer's total step count
LEARNING_RATE = 2e-5  # initial learning rate handed to create_optimizer
# Directory the fine-tuned model is saved to and later reloaded from by the pipeline.
SAVE_CHECKPOINT_PATH = "../src/models/bert/clickbait"

In [None]:
# NOTE: this import rebinds MODEL_CHECKPOINT to the value defined in utils.bert,
# replacing any value assigned to that name earlier in the notebook.
from utils.bert import MODEL_CHECKPOINT, TOKENIZER
from transformers import TFAutoModelForQuestionAnswering

# Load the TensorFlow question-answering model for that checkpoint.
model = TFAutoModelForQuestionAnswering.from_pretrained(MODEL_CHECKPOINT)

In [None]:
from datasets import load_dataset

# Load the three splits through the project's dataset script; each split is
# read from the file "<split>.jsonl".
raw_datasets = load_dataset(
    "../data/parsed/bert/clickbait_data.py",
    data_files={
        split: f"{split}.jsonl" for split in ("train", "test", "validation")
    },
)

In [None]:
import utils.bert

# Run utils.bert.preprocess_training over the training split in batches and
# drop every original column so only the preprocessing output remains.
train_split = raw_datasets["train"]
train_dataset = train_split.map(
    utils.bert.preprocess_training,
    remove_columns=train_split.column_names,
    batched=True,
)
# Cell output: number of raw examples vs. number of preprocessed rows.
len(train_split), len(train_dataset)

In [None]:
# Run utils.bert.preprocess_validation over the test split in batches and
# drop every original column so only the preprocessing output remains.
test_split = raw_datasets["test"]
test_dataset = test_split.map(
    utils.bert.preprocess_validation,
    remove_columns=test_split.column_names,
    batched=True,
)
# Cell output: number of raw examples vs. number of preprocessed rows.
len(test_split), len(test_dataset)

In [None]:
# Run utils.bert.preprocess_validation over the validation split in batches
# and drop every original column so only the preprocessing output remains.
validation_split = raw_datasets["validation"]
validation_dataset = validation_split.map(
    utils.bert.preprocess_validation,
    remove_columns=validation_split.column_names,
    batched=True,
)
# Cell output: number of raw examples vs. number of preprocessed rows.
len(validation_split), len(validation_dataset)

In [None]:
from transformers import DefaultDataCollator

# Collator configured to emit TensorFlow tensors.
data_collator = DefaultDataCollator(return_tensors="tf")

# Turn the preprocessed training rows into a shuffled, batched dataset
# that can be passed to model.fit.
tf_train_dataset = model.prepare_tf_dataset(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    collate_fn=data_collator,
)

In [None]:
from transformers import create_optimizer

# Total optimizer steps: batches per epoch times the number of epochs.
num_train_steps = N_EPOCHS * len(tf_train_dataset)

# Optimizer with weight decay and no warmup; the returned schedule is kept
# alongside it but is not used again in this notebook.
optimizer, schedule = create_optimizer(
    init_lr=LEARNING_RATE,
    num_train_steps=num_train_steps,
    num_warmup_steps=0,
    weight_decay_rate=0.01,
)

# Only the optimizer is supplied; no loss or metrics are passed to compile().
model.compile(optimizer=optimizer)

In [None]:
import tensorflow as tf

# Training callbacks: per-epoch weight checkpoints plus TensorBoard logging.
callbacks = [
    tf.keras.callbacks.ModelCheckpoint(
        # Only `epoch` and `loss` are available when the filename is formatted:
        # the model is compiled without metrics and fit() receives no
        # validation data, so referencing accuracy/val_* keys would raise a
        # KeyError at the end of the first epoch.
        filepath="../src/models/bert/checkpoints/model.{epoch:02d}-{loss:.2f}.h5",
        # Transformers TF models are subclassed Keras models, which cannot be
        # serialized as a whole to HDF5; store the weights only.
        save_weights_only=True,
    ),
    tf.keras.callbacks.TensorBoard(log_dir="../src/models/bert/logs"),
]

In [None]:
# Show which GPUs TensorFlow can see before starting the run.
print(tf.config.list_physical_devices("GPU"))

# Fine-tune for N_EPOCHS with the configured callbacks; the returned History
# object is kept for later inspection.
history = model.fit(tf_train_dataset, epochs=N_EPOCHS, callbacks=callbacks, verbose=1)

In [None]:
# Persist the fine-tuned model; the question-answering pipeline further down
# loads it back from this same path.
model.save_pretrained(SAVE_CHECKPOINT_PATH)

In [None]:
# Build the evaluation dataset WITHOUT shuffling: the predictions must stay in
# the same order as the rows of `test_dataset` so they can be matched back to
# their examples when metrics are computed. Disabling shuffle also keeps the
# final partial batch, because prepare_tf_dataset defaults `drop_remainder`
# to the value of `shuffle`.
tf_test_dataset = model.prepare_tf_dataset(
    test_dataset,
    collate_fn=data_collator,
    shuffle=False,
    batch_size=BATCH_SIZE
)

In [None]:
# Run inference over the test batches; the result is indexed by
# "start_logits" and "end_logits" when metrics are computed.
predictions = model.predict(tf_test_dataset)

In [None]:
# `predictions` was produced from the test split, so it has to be scored
# against the test features and the raw test examples; pairing it with the
# validation split would compare logits with unrelated rows.
utils.bert.compute_metrics(
    predictions["start_logits"],
    predictions["end_logits"],
    test_dataset,
    raw_datasets["test"],
)

In [None]:
from transformers import pipeline

# Question-answering pipeline backed by the fine-tuned model saved at
# SAVE_CHECKPOINT_PATH and the tokenizer imported from utils.bert.
question_answerer = pipeline(
    task="question-answering",
    model=SAVE_CHECKPOINT_PATH,
    tokenizer=TOKENIZER,
)

# Hand-picked sample for a qualitative check: the article body is the QA
# context, the clickbait headline is the question, and `expected` is the
# reference spoiler the model should extract.
context = """
At some point toward the end of every job interview, the hiring manager will likely turn the tables and ask, \"Do you have any questions for me?\"\nThis is the time to ask smart, thoughtful questions — perhaps your final opportunity to assess whether the job would be a good fit, and your final chance to impress the hiring manager.\nBusiness Insider recently spoke with Becca Brown, cofounder of Solemates, a brand of women's shoe-care products, who knows a thing or two about interviewing.\nBefore launching her own business, Brown, who has a bachelor's from Harvard University and an MBA from Columbia, spent a lot of time interviewing job candidates at Goldman Sachs, where she held various roles, including analyst, wealth adviser, and chief of staff.\nShe was also part of the investment bank's Harvard recruiting team, she says.\n\"I interviewed anywhere from 20 to 30 job candidates a year, so in total, I interviewed over 100 people at Goldman Sachs,\" she tells Business Insider.\nShe says that candidates asked her some impressive questions — like \"What's the most challenging part of your job?\" and \"What's one of the most interesting projects you've worked on?\" — but there was one question she always hoped she'd be asked, but almost never was: \"Where do you see yourself in five years?\"\n\"I like this question — and yet no one ever asked it — because it's difficult to answer,\" she says. \"It's an important question for anyone to be asking him or herself, and so if ever a candidate were to ask this question, it would have stood out.\"\nShe continues:\nI think this is a good question for interviewees to ask because, as a candidate, if you see where the person interviewing you is headed, you can decide if that trajectory is in line with your career objectives. While they don't have to be completely correlated, it's helpful for the candidate to have some indication of the interviewer's direction.\nGet the latest Goldman Sachs stock price here.
"""
# The clickbait headline, used as the "question" for the QA model.
question="A woman who interviewed over 100 people at Goldman Sachs says there's one question she always hoped candidates would ask her, but they never did"
# Reference spoiler, including its surrounding quotation marks.
expected = "\"Where do you see yourself in five years?\""

In [None]:
# Run the pipeline on the sample; the result is indexed with ["answer"] by the
# scoring and reporting cells.
prediction = question_answerer(question=question, context=context)
prediction  # bare expression so the notebook displays the result

In [None]:
import evaluate
# Load the text-overlap metrics used to score the predicted spoiler.
meteor = evaluate.load("meteor")
bleu = evaluate.load("bleu")
# BERTScore is currently disabled; uncomment to load it as well.
# bertscore = evaluate.load("bertscore")

In [None]:
# Score the single predicted answer against the single reference spoiler;
# both metrics receive one-element lists.
predicted_answers = [prediction["answer"]]
reference_answers = [expected]
meteor_results = meteor.compute(
    predictions=predicted_answers,
    references=reference_answers,
)
bleu_results = bleu.compute(
    predictions=predicted_answers,
    references=reference_answers,
)
# BERTScore is disabled.
# NOTE(review): if re-enabled, the arguments probably need to be wrapped in
# lists like the calls above — confirm before uncommenting.
# bertscore_results = bertscore.compute(predictions=prediction, references=expected, lang="en")

In [None]:
# Print the headline, the reference spoiler and the predicted spoiler,
# followed by a separator.
print(
    f"{question};\n\n"
    f"Expected Spoiler: {expected};\n\n"
    f"Spoiler Predicted: {prediction['answer']};\n\n"
    "###\n\n"
)
# Print the scores computed for this single example.
print(
    f"Meteor: {meteor_results['meteor']}\n"
    f"BLEU-4: {bleu_results['bleu']}"
)
# Variant that also reports the mean BERTScore F1 (disabled together with BERTScore).
# print(f"Meteor: {meteor_results['meteor']}\nBLEU-4: {bleu_results['bleu']}\nBERTscore Mean F1: {sum(bertscore_results['f1'])/len(bertscore_results['f1'])}")