# Clickbait Spoiler Generation using BERT

In [None]:
# Make the sibling ``src`` directory importable from this notebook by adding
# its absolute path to the module search path.
import os
import sys

src_dir = os.path.abspath('../src')
sys.path.append(src_dir)

In [None]:
import numpy as np
import tensorflow as tf

# Show which GPUs TensorFlow can see (the cell displays the returned list).
tf.config.list_physical_devices(device_type="GPU")

In [None]:
# Hyper-parameters shared by the cells below.
BATCH_SIZE: int = 3  # batch size applied to the tf.data pipelines
N_EPOCHS: int = 5  # number of epochs requested from model.fit
LEARNING_RATE: float = 2e-5  # learning rate handed to the Adam optimizer
MAX_LEN: int = 512  # max_length given to the tokenizer for padding/truncation

In [None]:
from transformers import TFDistilBertForQuestionAnswering, DistilBertTokenizer

In [None]:
# Checkpoint name used for both the tokenizer and the question-answering model,
# so the vocabulary and the weights always come from the same source.
PRE_TRAINED_MODEL_NAME = "distilbert-base-uncased-distilled-squad"
tokenizer = DistilBertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)
model = TFDistilBertForQuestionAnswering.from_pretrained(PRE_TRAINED_MODEL_NAME)

In [None]:
# import jsonlines
# def data_generator(input_path: str) -> None:
#     with jsonlines.open(input_path, "r") as reader:
#         for line in reader:
#             x = tokenizer.encode_plus(line["clickbait"], line["text"], return_tensors="tf", padding="max_length", max_length=MAX_LEN, truncation="only_second")
#             y = tokenizer.encode_plus(line["spoiler"], return_tensors="tf", padding="max_length", max_length=MAX_LEN, truncation=True)
#             yield tf.concat([x["input_ids"], x["attention_mask"]], axis=0), tf.concat([y["input_ids"], y["attention_mask"]], axis=0)

In [None]:
# train_data = tf.data.Dataset.from_generator(
#     lambda: data_generator("../data/parsed/bert/train.jsonl"),
#     output_types = (tf.int64, tf.int64)
# ).batch(BATCH_SIZE)

# test_data = tf.data.Dataset.from_generator(
#     lambda: data_generator("../data/parsed/bert/test.jsonl"),
#     output_types = (tf.int64, tf.int64)
# ).batch(BATCH_SIZE)

# val_data = tf.data.Dataset.from_generator(
#     lambda: data_generator("../data/parsed/bert/validation.jsonl"),
#     output_types = (tf.int64, tf.int64)
# ).batch(BATCH_SIZE)

In [None]:
# Keras callbacks handed to ``model.fit``.
callbacks = [
    # The model is compiled without metrics, so only ``loss`` and ``val_loss``
    # are available for the filename; referencing accuracy placeholders would
    # make the filename formatting fail at the first checkpoint.
    # ``save_weights_only=True`` because whole-model HDF5 saving is limited to
    # Functional/Sequential Keras models, which this Transformers model is not.
    tf.keras.callbacks.ModelCheckpoint(
        filepath="../src/models/bert/checkpoints/model.{epoch:02d}-{loss:.2f}-{val_loss:.2f}.h5",
        save_weights_only=True,
    ),
    # Training curves for TensorBoard.
    tf.keras.callbacks.TensorBoard(log_dir="../src/models/bert/logs"),
]

In [None]:
# Adam optimizer configured with the notebook-wide learning rate.
adam = tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE)
# NOTE(review): no Keras loss is supplied; presumably this relies on the loss
# computed inside the Transformers model - confirm against the installed
# transformers version.
model.compile(loss=None, optimizer=adam)

In [None]:
# all_tokens = tokenizer.convert_ids_to_tokens(list(train_data.take(1))[0][0][0][0])
# answer = ' '.join(all_tokens[tf.math.argmax(pred["start_logits"], 1)[0] : tf.math.argmax(pred["end_logits"], 1)[0]+1])
# answer

In [None]:
# Fine-tune the model and keep the Keras History object for later inspection.
# NOTE: ``train_data`` and ``val_data`` come from the dataset cells earlier in
# the notebook, which are currently commented out; they have to be defined
# before this cell can run.
history = model.fit(
    train_data,
    epochs=N_EPOCHS,
    validation_data=val_data,
    callbacks=callbacks,
    verbose=1,
)

In [None]:
# Single example used for the prediction and metric cells below: a prompt made
# of a "CLICKBAIT:" headline section and an "ARTICLE:" body section, terminated
# by a "###" separator.
clickbait = """CLICKBAIT:\n\nA woman who interviewed over 100 people at Goldman Sachs says there's one question she always hoped candidates would ask her, but they never did\n\n\nARTICLE:\n\nAt some point toward the end of every job interview, the hiring manager will likely turn the tables and ask, \"Do you have any questions for me?\"\nThis is the time to ask smart, thoughtful questions — perhaps your final opportunity to assess whether the job would be a good fit, and your final chance to impress the hiring manager.\nBusiness Insider recently spoke with Becca Brown, cofounder of Solemates, a brand of women's shoe-care products, who knows a thing or two about interviewing.\nBefore launching her own business, Brown, who has a bachelor's from Harvard University and an MBA from Columbia, spent a lot of time interviewing job candidates at Goldman Sachs, where she held various roles, including analyst, wealth adviser, and chief of staff.\nShe was also part of the investment bank's Harvard recruiting team, she says.\n\"I interviewed anywhere from 20 to 30 job candidates a year, so in total, I interviewed over 100 people at Goldman Sachs,\" she tells Business Insider.\nShe says that candidates asked her some impressive questions — like \"What's the most challenging part of your job?\" and \"What's one of the most interesting projects you've worked on?\" — but there was one question she always hoped she'd be asked, but almost never was: \"Where do you see yourself in five years?\"\n\"I like this question — and yet no one ever asked it — because it's difficult to answer,\" she says. \"It's an important question for anyone to be asking him or herself, and so if ever a candidate were to ask this question, it would have stood out.\"\nShe continues:\nI think this is a good question for interviewees to ask because, as a candidate, if you see where the person interviewing you is headed, you can decide if that trajectory is in line with your career objectives. 
While they don't have to be completely correlated, it's helpful for the candidate to have some indication of the interviewer's direction.\nGet the latest Goldman Sachs stock price here.\n\n###\n\n"""

In [None]:
# Reference spoiler for the example above (the surrounding double quotes are
# part of the expected text).
expected = '"Where do you see yourself in five years?"'

In [None]:
# Encode the example before running the model: the network consumes token ids,
# not a raw Python string. Truncate to MAX_LEN so long articles stay within the
# sequence length used throughout the notebook.
input_ = tokenizer(
    clickbait,
    return_tensors="tf",
    truncation=True,
    max_length=MAX_LEN,
)

# Forward pass. ``result`` keeps (start_logits, end_logits) in the order the
# decoding cell below indexes them (result[0] / result[1]).
output = model(input_["input_ids"], attention_mask=input_["attention_mask"])
result = (output.start_logits, output.end_logits)

# Most likely answer span (end index inclusive), decoded back to text.
# ``prediction`` is a one-element list of strings because the metric cells pass
# it as ``predictions=`` and the report cell reads ``prediction[0]``.
start_index = int(tf.math.argmax(result[0], axis=1)[0])
end_index = int(tf.math.argmax(result[1], axis=1)[0])
prediction = [
    tokenizer.decode(
        input_["input_ids"].numpy()[0][start_index:end_index + 1],
        skip_special_tokens=True,
    )
]

# Same example stacked twice along the batch axis and run through ``predict``.
b = model.predict(tf.concat([input_["input_ids"], input_["input_ids"]], 0))

In [None]:
# Turn the first encoded sequence back into its token strings.
token_ids = input_["input_ids"].numpy()[0]
all_tokens = tokenizer.convert_ids_to_tokens(token_ids)

# Answer span: from the arg-max of result[0] up to and including the arg-max
# of result[1], joined with single spaces.
span_start = tf.math.argmax(result[0], 1)[0]
span_stop = tf.math.argmax(result[1], 1)[0] + 1
answer = ' '.join(all_tokens[span_start:span_stop])

In [None]:
import evaluate

# Load the three text-similarity metrics used to score the predicted spoiler.
meteor, bleu, bertscore = (
    evaluate.load(metric_name) for metric_name in ("meteor", "bleu", "bertscore")
)

In [None]:
# Score the prediction against the single reference spoiler with each metric.
metric_inputs = {"predictions": prediction, "references": [expected]}
bertscore_results = bertscore.compute(**metric_inputs, lang="en")
meteor_results = meteor.compute(**metric_inputs)
bleu_results = bleu.compute(**metric_inputs)

In [None]:
# Average the per-example BERTScore F1 values before printing.
f1_scores = bertscore_results['f1']
mean_f1 = sum(f1_scores) / len(f1_scores)

# Qualitative view: the prompt, the reference spoiler and the predicted one.
print(f"{clickbait}Expected Spoiler: {expected};\n\nSpoiler Predicted: {prediction[0]};\n\n###\n\n")
# Quantitative view: one line per metric.
print(f"Meteor: {meteor_results['meteor']}\nBLEU-4: {bleu_results['bleu']}\nBERTscore Mean F1: {mean_f1}")

In [None]:
# Persist the fine-tuned weights. Whole-model HDF5 saving (``model.save`` with a
# ``.h5`` path) only supports Functional/Sequential Keras models, so the weights
# are written instead; reload by building the model with ``from_pretrained`` and
# calling ``load_weights`` on this file.
model.save_weights("../src/models/bert/clickbait-spoiler.h5")