# Clickbait Spoiler Generation using GPT-3

In [1]:
# Make the project's ``src`` directory importable from this notebook.
# The relative path is resolved against the current working directory.
import os
import sys

_src_dir = os.path.abspath('../src')
# Guard the append so that re-running this cell does not pile up duplicate
# entries on sys.path.
if _src_dir not in sys.path:
    sys.path.append(_src_dir)

In [2]:
import models.gpt3 as gpt3
import utils.openai

from prepare_data_openai import OPENAI_MODEL

In [3]:
# Model id handed to gpt3.predict by the (currently commented-out) prediction cells.
MODEL_ID = "ada:ft-personal:clickbait-spoiler-2023-06-11-16-56-34"
# NOTE(review): not referenced by any other cell in this notebook — presumably a
# cap on generated length during evaluation; confirm or remove.
OPENAI_MAX_EVAL_LEN = 20

## Validation

In [4]:
# Load the validation records from the JSONL file under data/parsed/openai;
# the (commented-out) prediction loop reads each record's "prompt".
validation = gpt3.read_data("../data/parsed/openai/validation.jsonl")

In [5]:
# Estimate the usage cost of the validation file for OPENAI_MODEL.
# NOTE(review): the unit of the returned number is not visible here —
# presumably USD; confirm in utils.openai.
utils.openai.estimate_costs_fine_tune_usage("../data/parsed/openai/validation.jsonl", OPENAI_MODEL)

0.2368816

Predicting

In [6]:
# for val in validation:
#     val["prediction"] = gpt3.predict([val["prompt"]], MODEL_ID, sleep_time=1.2)[0][0]

In [7]:
import utils
# utils.write_results("../data/results/openai/validation.csv", validation)

In [8]:
# Read back the results file that the commented-out write_results call targets.
# This rebinds `validation`; the evaluation cells read the "prediction",
# "completion" and "type" fields of these rows.
validation = utils.read_results("../data/results/openai/validation.csv")

Evaluation

In [9]:
import evaluate

# Load the three metrics used by every evaluation cell, in the same order as
# before: METEOR, BLEU, BERTScore.
meteor, bleu, bertscore = (
    evaluate.load(metric_name)
    for metric_name in ("meteor", "bleu", "bertscore")
)

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/ddsantos/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /Users/ddsantos/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/ddsantos/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Total

In [10]:
# Score every validation row at once: "prediction" is the model output and
# "completion" the reference text.
# NOTE(review): gamma=0 / beta=0 override the METEOR defaults — presumably to
# switch off the fragmentation penalty; confirm against the metric's docs.
meteor_results = meteor.compute(
    predictions=[row["prediction"] for row in validation],
    references=[row["completion"] for row in validation],
    gamma=0,
    beta=0,
)
bleu_results = bleu.compute(
    predictions=[row["prediction"] for row in validation],
    references=[row["completion"] for row in validation],
)
bertscore_results = bertscore.compute(
    predictions=[row["prediction"] for row in validation],
    references=[row["completion"] for row in validation],
    lang="en",
)

print(f"Meteor: {meteor_results['meteor']}\nBLEU-4: {bleu_results['bleu']}\nBERTscore Mean F1: {sum(bertscore_results['f1'])/len(bertscore_results['f1'])}")

Meteor: 0.4587569963469983
BLEU-4: 0.3316278882234632
BERTscore Mean F1: 0.8889159331418047


Phrase

In [11]:
# Restrict the validation rows to those whose "type" is "phrase", then compute
# the same three metrics on that subset.
validation_phrase = [row for row in validation if row["type"] == "phrase"]

meteor_results = meteor.compute(
    predictions=[row["prediction"] for row in validation_phrase],
    references=[row["completion"] for row in validation_phrase],
    gamma=0,
    beta=0,
)
bleu_results = bleu.compute(
    predictions=[row["prediction"] for row in validation_phrase],
    references=[row["completion"] for row in validation_phrase],
)
bertscore_results = bertscore.compute(
    predictions=[row["prediction"] for row in validation_phrase],
    references=[row["completion"] for row in validation_phrase],
    lang="en",
)

print(f"Meteor: {meteor_results['meteor']}\nBLEU-4: {bleu_results['bleu']}\nBERTscore Mean F1: {sum(bertscore_results['f1'])/len(bertscore_results['f1'])}")

Meteor: 0.5818087270217366
BLEU-4: 0.03932456483142731
BERTscore Mean F1: 0.9037741965322352


Passage

In [12]:
# Restrict the validation rows to those whose "type" is "passage", then compute
# the same three metrics on that subset.
validation_passage = [row for row in validation if row["type"] == "passage"]

meteor_results = meteor.compute(
    predictions=[row["prediction"] for row in validation_passage],
    references=[row["completion"] for row in validation_passage],
    gamma=0,
    beta=0,
)
bleu_results = bleu.compute(
    predictions=[row["prediction"] for row in validation_passage],
    references=[row["completion"] for row in validation_passage],
)
bertscore_results = bertscore.compute(
    predictions=[row["prediction"] for row in validation_passage],
    references=[row["completion"] for row in validation_passage],
    lang="en",
)

print(f"Meteor: {meteor_results['meteor']}\nBLEU-4: {bleu_results['bleu']}\nBERTscore Mean F1: {sum(bertscore_results['f1'])/len(bertscore_results['f1'])}")

Meteor: 0.36317867047616764
BLEU-4: 0.22363759309880255
BERTscore Mean F1: 0.8730579776528441


Multi

In [13]:
# Restrict the validation rows to those whose "type" is "multi", then compute
# the same three metrics on that subset.
validation_multi = [row for row in validation if row["type"] == "multi"]

meteor_results = meteor.compute(
    predictions=[row["prediction"] for row in validation_multi],
    references=[row["completion"] for row in validation_multi],
    gamma=0,
    beta=0,
)
bleu_results = bleu.compute(
    predictions=[row["prediction"] for row in validation_multi],
    references=[row["completion"] for row in validation_multi],
)
bertscore_results = bertscore.compute(
    predictions=[row["prediction"] for row in validation_multi],
    references=[row["completion"] for row in validation_multi],
    lang="en",
)

print(f"Meteor: {meteor_results['meteor']}\nBLEU-4: {bleu_results['bleu']}\nBERTscore Mean F1: {sum(bertscore_results['f1'])/len(bertscore_results['f1'])}")

Meteor: 0.4487045651535943
BLEU-4: 0.529826208234148
BERTscore Mean F1: 0.8946957504749298


## Test

In [14]:
# Load the test records from the JSONL file under data/parsed/openai;
# the (commented-out) prediction loop reads each record's "prompt".
test = gpt3.read_data("../data/parsed/openai/test.jsonl")

In [15]:
# Estimate the usage cost of the test file for OPENAI_MODEL.
# NOTE(review): the unit of the returned number is not visible here —
# presumably USD; confirm in utils.openai.
utils.openai.estimate_costs_fine_tune_usage("../data/parsed/openai/test.jsonl", OPENAI_MODEL)

0.6292256

Predicting

In [16]:
# for t in test:
#     t["prediction"] = gpt3.predict([t["prompt"]], MODEL_ID, sleep_time=1.2)[0][0]

In [17]:
# utils.write_results("../data/results/openai/test.csv", test)

In [18]:
# Read back the results file that the commented-out write_results call targets.
# This rebinds `test`; the evaluation cells read the "prediction",
# "completion" and "type" fields of these rows.
test = utils.read_results("../data/results/openai/test.csv")

Evaluation

Total

In [19]:
# Score every test row at once: "prediction" is the model output and
# "completion" the reference text.
# NOTE(review): gamma=0 / beta=0 override the METEOR defaults — presumably to
# switch off the fragmentation penalty; confirm against the metric's docs.
meteor_results = meteor.compute(
    predictions=[row["prediction"] for row in test],
    references=[row["completion"] for row in test],
    gamma=0,
    beta=0,
)
bleu_results = bleu.compute(
    predictions=[row["prediction"] for row in test],
    references=[row["completion"] for row in test],
)
bertscore_results = bertscore.compute(
    predictions=[row["prediction"] for row in test],
    references=[row["completion"] for row in test],
    lang="en",
)

print(f"Meteor: {meteor_results['meteor']}\nBLEU-4: {bleu_results['bleu']}\nBERTscore Mean F1: {sum(bertscore_results['f1'])/len(bertscore_results['f1'])}")

Meteor: 0.45717245430426773
BLEU-4: 0.23299473243685423
BERTscore Mean F1: 0.8906126411452245


In [20]:
# Restrict the test rows to those whose "type" is "phrase", then compute the
# same three metrics on that subset.
test_phrase = [row for row in test if row["type"] == "phrase"]

meteor_results = meteor.compute(
    predictions=[row["prediction"] for row in test_phrase],
    references=[row["completion"] for row in test_phrase],
    gamma=0,
    beta=0,
)
bleu_results = bleu.compute(
    predictions=[row["prediction"] for row in test_phrase],
    references=[row["completion"] for row in test_phrase],
)
bertscore_results = bertscore.compute(
    predictions=[row["prediction"] for row in test_phrase],
    references=[row["completion"] for row in test_phrase],
    lang="en",
)

print(f"Meteor: {meteor_results['meteor']}\nBLEU-4: {bleu_results['bleu']}\nBERTscore Mean F1: {sum(bertscore_results['f1'])/len(bertscore_results['f1'])}")

Meteor: 0.5842126049103827
BLEU-4: 0.061002572768710336
BERTscore Mean F1: 0.9107595868964693


In [21]:
# Restrict the test rows to those whose "type" is "passage", then compute the
# same three metrics on that subset.
test_passage = [row for row in test if row["type"] == "passage"]

meteor_results = meteor.compute(
    predictions=[row["prediction"] for row in test_passage],
    references=[row["completion"] for row in test_passage],
    gamma=0,
    beta=0,
)
bleu_results = bleu.compute(
    predictions=[row["prediction"] for row in test_passage],
    references=[row["completion"] for row in test_passage],
)
bertscore_results = bertscore.compute(
    predictions=[row["prediction"] for row in test_passage],
    references=[row["completion"] for row in test_passage],
    lang="en",
)

print(f"Meteor: {meteor_results['meteor']}\nBLEU-4: {bleu_results['bleu']}\nBERTscore Mean F1: {sum(bertscore_results['f1'])/len(bertscore_results['f1'])}")

Meteor: 0.3333683301197167
BLEU-4: 0.20985733289455827
BERTscore Mean F1: 0.8703298326349852


In [22]:
# Restrict the test rows to those whose "type" is "multi", then compute the
# same three metrics on that subset.
test_multi = [row for row in test if row["type"] == "multi"]

meteor_results = meteor.compute(
    predictions=[row["prediction"] for row in test_multi],
    references=[row["completion"] for row in test_multi],
    gamma=0,
    beta=0,
)
bleu_results = bleu.compute(
    predictions=[row["prediction"] for row in test_multi],
    references=[row["completion"] for row in test_multi],
)
bertscore_results = bertscore.compute(
    predictions=[row["prediction"] for row in test_multi],
    references=[row["completion"] for row in test_multi],
    lang="en",
)

print(f"Meteor: {meteor_results['meteor']}\nBLEU-4: {bleu_results['bleu']}\nBERTscore Mean F1: {sum(bertscore_results['f1'])/len(bertscore_results['f1'])}")

Meteor: 0.41190399802510685
BLEU-4: 0.3864844930392026
BERTscore Mean F1: 0.8851155984786249


# Clickbait Spoiler Generation using LLaMA

In [23]:
# !python ../src/models/llama/alpaca-lora/generate.py \
#     --load_8bit \
#     --base_model $MODEL_CHECKPOINT  \
#     --lora_weights $SAVE_CHECKPOINT_PATH

In [24]:
# utils.write_results("../data/results/validation.csv", validation)

# Clickbait Spoiler Generation using BERT

In [25]:
# Batch size passed to model.prepare_tf_dataset when building the tf datasets.
BATCH_SIZE = 3
# Local checkpoint directory given to from_pretrained and to the
# question-answering pipeline.
SAVE_CHECKPOINT_PATH = "../src/models/bert/clickbait"

In [26]:
from utils.bert import TOKENIZER
from transformers import TFAutoModelForQuestionAnswering

# Restore the question-answering model from the local checkpoint directory.
# The recorded cell output reports that it resolves to
# TFDebertaV2ForQuestionAnswering.
model = TFAutoModelForQuestionAnswering.from_pretrained(SAVE_CHECKPOINT_PATH)

Downloading builder script: 100%|██████████| 4.53k/4.53k [00:00<00:00, 4.09MB/s]
Downloading extra modules: 100%|██████████| 3.32k/3.32k [00:00<00:00, 7.15MB/s]
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/ddsantos/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /Users/ddsantos/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/ddsantos/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
Downloading (…)okenizer_config.json: 100%|██████████| 379/379 [00:00<00:00, 886kB/s]
Downloading spm.model: 100%|██████████| 2.46M/2.46M [00:00<00:00, 2.84MB/s]
Downloading tokenizer.json: 100%|██████████| 8.65M/8.65M [00:01<00:00, 6.37MB/s]
Downloading (…)in/added_tokens.json: 100%|██████████| 23.0/23.0 [00:00<00:00, 61.0kB/s]
Downloading (…)cial_tokens_map.json: 100%|██████████| 173/173 [00:00<00:00, 485kB/s]


Metal device set to: Apple M1

systemMemory: 8.00 GB
maxCacheSize: 2.67 GB



All model checkpoint layers were used when initializing TFDebertaV2ForQuestionAnswering.

All the layers of TFDebertaV2ForQuestionAnswering were initialized from the model checkpoint at ../src/models/bert/clickbait.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDebertaV2ForQuestionAnswering for predictions without further training.


In [27]:
from datasets import load_dataset

# Build the train / test / validation splits through the project's dataset
# script; the per-split file names are handed to it via data_files.
raw_datasets = load_dataset(
    "../data/parsed/bert/clickbait_data.py",
    data_files=dict(
        train="train.jsonl",
        test="test.jsonl",
        validation="validation.jsonl",
    ),
)

Downloading and preparing dataset clickbait_data/default to /Users/ddsantos/.cache/huggingface/datasets/clickbait_data/default-e72966e7874160b4/0.0.0/2132235c1c29143999b3a1b191327cbddac13587917dfde07b49cb535c8668f7...


Downloading data files: 100%|██████████| 3/3 [00:00<00:00, 4230.97it/s]
                                                                 

Dataset clickbait_data downloaded and prepared to /Users/ddsantos/.cache/huggingface/datasets/clickbait_data/default-e72966e7874160b4/0.0.0/2132235c1c29143999b3a1b191327cbddac13587917dfde07b49cb535c8668f7. Subsequent calls will reuse this data.


100%|██████████| 3/3 [00:00<00:00, 403.89it/s]


## Validation

In [28]:
import utils.bert

# Keep only the examples whose "type" is "phrase", then run them through
# utils.bert.preprocess_validation, dropping the original columns.
raw_validation = raw_datasets["validation"].filter(
    lambda example: example["type"] == "phrase"
)
validation_dataset = raw_validation.map(
    utils.bert.preprocess_validation,
    remove_columns=raw_validation.column_names,
    batched=True,
)
# Example count vs. preprocessed row count. The recorded output shows more rows
# than examples — presumably long contexts are split into several features;
# confirm in utils.bert.preprocess_validation.
(len(raw_validation), len(validation_dataset))

                                                            

(67, 172)

In [29]:
from transformers import DefaultDataCollator
# Collator that returns TensorFlow tensors; the test section reuses it.
data_collator = DefaultDataCollator(return_tensors="tf")

# Batch the preprocessed validation rows without shuffling.
tf_val_dataset = model.prepare_tf_dataset(
    validation_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    collate_fn=data_collator,
)

Predicting

In [30]:
# Forward pass over the batched validation features; the metrics cell reads the
# "start_logits" and "end_logits" entries of the result.
val_predictions = model.predict(tf_val_dataset)

2023-06-12 12:47:13.096157: W tensorflow/tsl/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz




In [31]:
# Score the start/end logits against the phrase-type validation examples with
# the project's utils.bert.compute_metrics helper.
# NOTE(review): the displayed dict includes per-example BERTscore lists, so the
# cell output is very long — consider displaying only aggregate values.
val_metrics = utils.bert.compute_metrics(
    val_predictions["start_logits"],
    val_predictions["end_logits"],
    validation_dataset,
    raw_validation,
)
val_metrics

100%|██████████| 67/67 [00:00<00:00, 92.84it/s] 
Downloading (…)okenizer_config.json: 100%|██████████| 52.0/52.0 [00:00<00:00, 31.6kB/s]
Downloading (…)lve/main/config.json: 100%|██████████| 474/474 [00:00<00:00, 1.76MB/s]
Downloading (…)olve/main/vocab.json: 100%|██████████| 899k/899k [00:00<00:00, 1.77MB/s]
Downloading (…)olve/main/merges.txt: 100%|██████████| 456k/456k [00:00<00:00, 6.38MB/s]
Downloading pytorch_model.bin: 100%|██████████| 559M/559M [00:56<00:00, 9.96MB/s] 


{'SQUAD': {'exact_match': 1.492537313432836, 'f1': 8.339947420090184},
 'Meteor': {'meteor': 0.09876059326290751},
 'BLEU-4': {'bleu': 0.0754204813661803,
  'precisions': [0.1188118811881188,
   0.0962962962962963,
   0.058823529411764705,
   0.04807692307692308],
  'brevity_penalty': 1.0,
  'length_ratio': 1.2023809523809523,
  'translation_length': 202,
  'reference_length': 168},
 'BERTscore': {'precision': [0.6858652234077454,
   0.9978945255279541,
   0.7536396980285645,
   0.859261691570282,
   0.576566219329834,
   0.8004264831542969,
   0.7260451316833496,
   0.5866855978965759,
   0.9180625081062317,
   0.7850437164306641,
   0.8213838338851929,
   0.9060192108154297,
   0.8768811821937561,
   0.6666163802146912,
   0.7942656874656677,
   0.5862159729003906,
   1.000000238418579,
   0.808795154094696,
   0.8258749842643738,
   0.9978930354118347,
   0.8620088696479797,
   0.7536070346832275,
   0.884465217590332,
   0.9978926181793213,
   0.6067018508911133,
   0.7882150411605

In [32]:
from transformers import pipeline
# Build a question-answering pipeline from the same local checkpoint as `model`,
# paired with the project's TOKENIZER.
question_answerer = pipeline("question-answering", model=SAVE_CHECKPOINT_PATH, tokenizer=TOKENIZER)

All model checkpoint layers were used when initializing TFDebertaV2ForQuestionAnswering.

All the layers of TFDebertaV2ForQuestionAnswering were initialized from the model checkpoint at ../src/models/bert/clickbait.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDebertaV2ForQuestionAnswering for predictions without further training.


In [33]:
# Run the question-answering pipeline over every phrase-type validation example.
# The result is kept under its own name: `val_predictions` continues to hold the
# start/end logits returned by model.predict, so the compute_metrics cell can
# still be re-run after this one.
val_qa_outputs = question_answerer(
    question=raw_validation["question"],
    context=raw_validation["context"],
)

# Answer text returned by the pipeline for each example, in dataset order.
val_predicted_answers = [output["answer"] for output in val_qa_outputs]
# First reference answer text of each example.
val_expected_answers = [answer["text"][0] for answer in raw_validation["answers"]]

In [34]:
# utils.write_results("../data/results/bert/validation.csv", [{
#     "prediction": prediction.strip(),
#     "completion": completion.strip()
# } for prediction, completion in zip(val_predicted_answers, val_expected_answers)])

## Test

In [35]:
# Keep only the examples whose "type" is "phrase", then run them through
# utils.bert.preprocess_validation, dropping the original columns.
raw_test = raw_datasets["test"].filter(
    lambda example: example["type"] == "phrase"
)
test_dataset = raw_test.map(
    utils.bert.preprocess_validation,
    remove_columns=raw_test.column_names,
    batched=True,
)
# Example count vs. preprocessed row count. The recorded output shows more rows
# than examples — presumably long contexts are split into several features;
# confirm in utils.bert.preprocess_validation.
(len(raw_test), len(test_dataset))

                                                                  

(268, 561)

In [36]:
# Batch the preprocessed test rows without shuffling, using the collator
# created in the validation section.
tf_test_dataset = model.prepare_tf_dataset(
    test_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    collate_fn=data_collator,
)

Predicting

In [37]:
# Forward pass over the batched test features; the metrics cell reads the
# "start_logits" and "end_logits" entries of the result.
test_predictions = model.predict(tf_test_dataset)



In [38]:
# Score the start/end logits against the phrase-type test examples with the
# project's utils.bert.compute_metrics helper.
# NOTE(review): the displayed dict includes per-example BERTscore lists, so the
# cell output is very long — consider displaying only aggregate values.
test_metrics = utils.bert.compute_metrics(
    test_predictions["start_logits"],
    test_predictions["end_logits"],
    test_dataset,
    raw_test,
)
test_metrics

100%|██████████| 268/268 [00:02<00:00, 107.13it/s]


{'SQUAD': {'exact_match': 9.328358208955224, 'f1': 12.50087479180982},
 'Meteor': {'meteor': 0.13961526195797755},
 'BLEU-4': {'bleu': 0.03323292412639024,
  'precisions': [0.1091127098321343,
   0.061837455830388695,
   0.020202020202020204,
   0.008948545861297539],
  'brevity_penalty': 1.0,
  'length_ratio': 1.215743440233236,
  'translation_length': 834,
  'reference_length': 686},
 'BERTscore': {'precision': [0.9999998211860657,
   0.6120502948760986,
   0.9978950023651123,
   0.7967748641967773,
   0.7692604660987854,
   0.9978941679000854,
   0.7765107750892639,
   0.8461945652961731,
   0.8176360726356506,
   0.5526825189590454,
   0.9999997615814209,
   0.8616260886192322,
   1.0000001192092896,
   0.8160631060600281,
   0.8596373796463013,
   0.9978932738304138,
   0.8572559356689453,
   0.8181390166282654,
   0.9978954792022705,
   0.745299220085144,
   0.578849732875824,
   0.9416104555130005,
   0.803884744644165,
   0.8083292841911316,
   0.8940995931625366,
   0.59694051

In [39]:
# Run the question-answering pipeline over every phrase-type test example.
# The result is kept under its own name: `test_predictions` continues to hold
# the start/end logits returned by model.predict, so the compute_metrics cell
# can still be re-run after this one.
test_qa_outputs = question_answerer(
    question=raw_test["question"],
    context=raw_test["context"],
)

# Answer text returned by the pipeline for each example, in dataset order.
test_predicted_answers = [output["answer"] for output in test_qa_outputs]
# First reference answer text of each example.
test_expected_answers = [answer["text"][0] for answer in raw_test["answers"]]

In [None]:
# utils.write_results("../data/results/bert/test.csv", [{
#     "prediction": prediction.strip(),
#     "completion": completion.strip()
# } for prediction, completion in zip(test_predicted_answers, test_expected_answers)])