# Clickbait Spoiler Generation using BERT

In [1]:
# This is necessary to fix the imports
import os
import sys
sys.path.append(os.path.abspath(os.path.join('../src')))

In [2]:
BATCH_SIZE = 3
N_EPOCHS = 5
LEARNING_RATE = 2e-5
SAVE_CHECKPOINT_PATH = "../src/models/bert/clickbait"

In [3]:
from utils.bert import MODEL_CHECKPOINT, TOKENIZER
from transformers import TFAutoModelForQuestionAnswering

model = TFAutoModelForQuestionAnswering.from_pretrained(MODEL_CHECKPOINT, from_pt=True)

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/ddsantos/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /Users/ddsantos/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/ddsantos/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Metal device set to: Apple M1

systemMemory: 8.00 GB
maxCacheSize: 2.67 GB



Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDebertaV2ForQuestionAnswering: ['deberta.embeddings.position_ids']
- This IS expected if you are initializing TFDebertaV2ForQuestionAnswering from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDebertaV2ForQuestionAnswering from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFDebertaV2ForQuestionAnswering were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDebertaV2ForQuestionAnswering for predictions without further training.


In [4]:
from datasets import load_dataset

raw_datasets = load_dataset("../data/parsed/bert/clickbait_data.py", data_files={
    "train": "train.jsonl", 
    "test": "test.jsonl", 
    "validation": "validation.jsonl"
})

Found cached dataset clickbait_data (/Users/ddsantos/.cache/huggingface/datasets/clickbait_data/default-e72966e7874160b4/0.0.0/2132235c1c29143999b3a1b191327cbddac13587917dfde07b49cb535c8668f7)
100%|██████████| 3/3 [00:00<00:00, 214.33it/s]


In [5]:
import utils.bert

raw_train = raw_datasets["train"].filter(lambda x: x["type"]=="phrase")
train_dataset = raw_train.map(
    utils.bert.preprocess_training,
    batched=True,
    remove_columns=raw_train.column_names,
)
len(raw_train), len(train_dataset)

Loading cached processed dataset at /Users/ddsantos/.cache/huggingface/datasets/clickbait_data/default-e72966e7874160b4/0.0.0/2132235c1c29143999b3a1b191327cbddac13587917dfde07b49cb535c8668f7/cache-62bd1b26668cb0d5.arrow
Loading cached processed dataset at /Users/ddsantos/.cache/huggingface/datasets/clickbait_data/default-e72966e7874160b4/0.0.0/2132235c1c29143999b3a1b191327cbddac13587917dfde07b49cb535c8668f7/cache-17d049326d16e504.arrow


(1367, 3070)

In [6]:
raw_test = raw_datasets["test"].filter(lambda x: x["type"]=="phrase")
test_dataset = raw_test.map(
    utils.bert.preprocess_validation,
    batched=True,
    remove_columns=raw_test.column_names,
)
len(raw_test), len(test_dataset)

Loading cached processed dataset at /Users/ddsantos/.cache/huggingface/datasets/clickbait_data/default-e72966e7874160b4/0.0.0/2132235c1c29143999b3a1b191327cbddac13587917dfde07b49cb535c8668f7/cache-6de2b26074509df4.arrow
Loading cached processed dataset at /Users/ddsantos/.cache/huggingface/datasets/clickbait_data/default-e72966e7874160b4/0.0.0/2132235c1c29143999b3a1b191327cbddac13587917dfde07b49cb535c8668f7/cache-d685757585e95062.arrow


(268, 561)

In [7]:
raw_validation = raw_datasets["validation"].filter(lambda x: x["type"]=="phrase")
validation_dataset = raw_validation.map(
    utils.bert.preprocess_validation,
    batched=True,
    remove_columns=raw_validation.column_names,
)
len(raw_validation), len(validation_dataset)

Loading cached processed dataset at /Users/ddsantos/.cache/huggingface/datasets/clickbait_data/default-e72966e7874160b4/0.0.0/2132235c1c29143999b3a1b191327cbddac13587917dfde07b49cb535c8668f7/cache-42b1cad1e19a8d9f.arrow
Loading cached processed dataset at /Users/ddsantos/.cache/huggingface/datasets/clickbait_data/default-e72966e7874160b4/0.0.0/2132235c1c29143999b3a1b191327cbddac13587917dfde07b49cb535c8668f7/cache-2b958758727bd9bb.arrow


(67, 172)

In [8]:
from transformers import DefaultDataCollator
data_collator = DefaultDataCollator(return_tensors="tf")

tf_train_dataset = model.prepare_tf_dataset(
    train_dataset,
    collate_fn=data_collator,
    shuffle=True,
    batch_size=BATCH_SIZE
)

In [9]:
from transformers import create_optimizer

num_train_steps = len(tf_train_dataset) * N_EPOCHS
optimizer, schedule = create_optimizer(
    init_lr=LEARNING_RATE,
    num_warmup_steps=0,
    num_train_steps=num_train_steps,
    weight_decay_rate=0.01,
)
model.compile(optimizer=optimizer)

No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train TensorFlow models in Transformers! To disable this behaviour please pass a loss argument, or explicitly pass `loss=None` if you do not want your model to compute a loss.


In [10]:
import tensorflow as tf
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

Num GPUs Available:  1


In [11]:
callbacks = [
    tf.keras.callbacks.TensorBoard(log_dir="../src/models/bert/logs")
]

In [12]:
# history = model.fit(
#     tf_train_dataset,
#     callbacks=callbacks,
#     epochs=N_EPOCHS,
#     verbose=1
# )

In [13]:
# model.save_pretrained(SAVE_CHECKPOINT_PATH)

In [14]:
model = TFAutoModelForQuestionAnswering.from_pretrained(SAVE_CHECKPOINT_PATH)

All model checkpoint layers were used when initializing TFDebertaV2ForQuestionAnswering.

All the layers of TFDebertaV2ForQuestionAnswering were initialized from the model checkpoint at ../src/models/bert/clickbait.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDebertaV2ForQuestionAnswering for predictions without further training.


In [15]:
tf_test_dataset = model.prepare_tf_dataset(
    test_dataset,
    collate_fn=data_collator,
    shuffle=False,
    batch_size=BATCH_SIZE
)

In [16]:
predictions = model.predict(tf_test_dataset)

2023-06-11 15:53:39.460264: W tensorflow/tsl/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz




In [17]:
metrics = utils.bert.compute_metrics(
    predictions["start_logits"],
    predictions["end_logits"],
    test_dataset,
    raw_test,
)
metrics

100%|██████████| 268/268 [00:02<00:00, 119.80it/s]
Downloading (…)okenizer_config.json: 100%|██████████| 52.0/52.0 [00:00<00:00, 37.5kB/s]
Downloading (…)lve/main/config.json: 100%|██████████| 474/474 [00:00<00:00, 1.75MB/s]
Downloading (…)olve/main/vocab.json: 100%|██████████| 899k/899k [00:01<00:00, 870kB/s]
Downloading (…)olve/main/merges.txt: 100%|██████████| 456k/456k [00:00<00:00, 1.09MB/s]
Downloading pytorch_model.bin: 100%|██████████| 559M/559M [00:50<00:00, 11.1MB/s] 


{'SQUAD': {'exact_match': 9.328358208955224, 'f1': 12.50087479180982},
 'Meteor': {'meteor': 0.10885022442133244},
 'BLEU-4': {'bleu': 0.03323292412639024,
  'precisions': [0.1091127098321343,
   0.061837455830388695,
   0.020202020202020204,
   0.008948545861297539],
  'brevity_penalty': 1.0,
  'length_ratio': 1.215743440233236,
  'translation_length': 834,
  'reference_length': 686},
 'BERTscore': {'precision': [0.9999998211860657,
   0.6120501756668091,
   0.9978950023651123,
   0.7967748045921326,
   0.7692604660987854,
   0.9978941679000854,
   0.7765107154846191,
   0.8461945652961731,
   0.8176360726356506,
   0.5526825189590454,
   0.9999997615814209,
   0.8616260886192322,
   1.0000001192092896,
   0.8160631060600281,
   0.8596373796463013,
   0.9978932738304138,
   0.8572559356689453,
   0.8181390166282654,
   0.9978954792022705,
   0.745299220085144,
   0.578849732875824,
   0.9416104555130005,
   0.803884744644165,
   0.8083292841911316,
   0.8940995931625366,
   0.59694045

In [18]:
# from transformers import pipeline

# question_answerer = pipeline("question-answering", model=SAVE_CHECKPOINT_PATH, tokenizer=TOKENIZER)

In [19]:
# predictions = question_answerer(question=raw_test["question"], context=raw_test["context"])
# predictions

In [20]:
# predicted_answers = [prediction["answer"] for prediction in predictions]
# expected_answers = [answer["text"][0] for answer in raw_test["answers"]]

In [21]:
# import evaluate
# meteor = evaluate.load("meteor")
# bleu = evaluate.load("bleu")
# bertscore = evaluate.load("bertscore")

In [22]:
# meteor_results = meteor.compute(predictions=predicted_answers, references=expected_answers)
# bleu_results = bleu.compute(predictions=predicted_answers, references=expected_answers)
# bertscore_results = bertscore.compute(predictions=predicted_answers, references=expected_answers, lang="en", model_type="microsoft/deberta-base")

In [23]:
# print(f"Meteor: {meteor_results['meteor']}\nBLEU-4: {bleu_results['bleu']}\nBERTscore Mean F1: {sum(bertscore_results['f1'])/len(bertscore_results['f1'])}")

In [24]:
# for i, answer in enumerate(expected_answers):
#     print(f"Expected: {answer} - Predicted: {predicted_answers[i]}\n")