In [1]:
from transformers import BartForConditionalGeneration, BartModel
from transformers import AutoTokenizer
from transformers import GenerationConfig
from transformers.models.bart.modeling_bart import shift_tokens_right
import torch

In [23]:
bart = BartForConditionalGeneration.from_pretrained("facebook/bart-base")

In [None]:
tokenizer = AutoTokenizer.from_pretrained("facebook/bart-base")

In [40]:
model.bart.config

BartConfig {
  "_name_or_path": "facebook/bart-large-cnn",
  "_num_labels": 3,
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "add_final_layer_norm": false,
  "architectures": [
    "BartForConditionalGeneration"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 0,
  "classif_dropout": 0.0,
  "classifier_dropout": 0.0,
  "d_model": 1024,
  "decoder_attention_heads": 16,
  "decoder_ffn_dim": 4096,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 12,
  "decoder_start_token_id": 2,
  "dropout": 0.1,
  "early_stopping": true,
  "encoder_attention_heads": 16,
  "encoder_ffn_dim": 4096,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 12,
  "eos_token_id": 2,
  "force_bos_token_to_be_generated": true,
  "forced_bos_token_id": 0,
  "forced_eos_token_id": 2,
  "gradient_checkpointing": false,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "init_std": 0.02,
  "is_encoder_decoder": true,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "L

In [41]:
model.bart

BartForConditionalGeneration(
  (model): BartModel(
    (shared): Embedding(50268, 1024)
    (encoder): BartEncoder(
      (embed_tokens): Embedding(50268, 1024)
      (embed_positions): BartLearnedPositionalEmbedding(1026, 1024)
      (layers): ModuleList(
        (0-11): 12 x BartEncoderLayer(
          (self_attn): BartAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=4096, out_features=1024, bias=True)
          (final_layer_norm): LayerNorm((1024,), eps=1e-05, elemen

In [None]:
bart.state_dict().keys()

In [None]:
tokenizer

In [None]:
s1 = ""
s2 = "this is sentence two"
s3 = "there are two sentences"
long1 = "this is somewhat longer sentence one"
long2 = "this is somewhat longer sentence two. It has a second sentence that add nothing. Really nothing. It could be summarized with one word: nothing"
long3 = "there are also somewhat longer sentences"

In [None]:
i1 = tokenizer(s1, return_tensors="pt")
i2 = tokenizer(s2, return_tensors="pt")
i3 = tokenizer(s1, s2, return_tensors="pt")
i3_solo = tokenizer(s3, return_tensors="pt")

In [None]:
# Tokenizing two sentences connects them together, with <eos> <eos> between them

i1, i2, i3, i3_solo

In [None]:
encoded = tokenizer([s1, long1], [s2, long2], text_target=[s3, long3], padding=True, return_tensors='pt')

In [None]:
encoded = tokenizer([long1, long2, long3], text_target=[s1, s2, s3], padding=True, return_tensors='pt')

In [None]:
encoded

In [None]:
out = bart(encoded['input_ids'], encoded['attention_mask'])

In [None]:
gen_out = bart.generate(encoded['input_ids'], max_new_tokens=30)

In [None]:
gen_out

In [None]:
tokenizer.batch_decode(gen_out)

In [None]:
out.logits.shape

In [None]:
encoded['input_ids'].shape

In [None]:
tokenizer.batch_decode(encoded['labels'])

In [None]:
shift_tokens_right(input_ids=i1['input_ids'], pad_token_id=99, decoder_start_token_id=100)

In [None]:
out = bart(**encoded)

In [None]:
out.logits.shape

In [None]:
out.logits[0].shape

In [None]:
tokenizer.batch_decode(out.logits.argmax(dim=-1))

In [3]:
from models.bart_extractor import BartExtractor, ConditionalFactLoss
from torcheval.metrics.functional import binary_confusion_matrix, binary_accuracy, binary_f1_score, bleu_score
from dataset.msc_summary_tf import MSC_Turns, PERSONA_TOKENS, NO_FACT_TOKEN

In [4]:
tokenizer = AutoTokenizer.from_pretrained('facebook/bart-large-cnn', additional_special_tokens=[NO_FACT_TOKEN])
tokenizer.add_special_tokens({'additional_special_tokens': PERSONA_TOKENS})
vocab_size = tokenizer.vocab_size
pad_token_id = tokenizer.pad_token_id
start_token_id = tokenizer.eos_token_id
nofact_token_id = tokenizer.convert_tokens_to_ids(NO_FACT_TOKEN)
model = BartExtractor("facebook/bart-large-cnn", nofact_token_id=nofact_token_id)
# model = BartExtractor(nofact_token_id=nofact_token_id)
model.bart.resize_token_embeddings(len(tokenizer))
criterion = ConditionalFactLoss(nofact_token_id=nofact_token_id, ignore_index=tokenizer.pad_token_id)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [5]:
model.load_state_dict(torch.load("checkpoints/testbart"))

<All keys matched successfully>

In [12]:
utterance = tokenizer("<self> Do you have hobbies. <other> Yes, I like to read books", text_target="I like to read books", return_tensors="pt")

In [16]:
utterance = tokenizer("<self> Do you have hobbies. <other> Haha hobbies, why do you ask?", text_target="<nofact>", return_tensors="pt")

In [None]:
ARTICLE_TO_SUMMARIZE = (
    "I said Do you have hobbies. You said Yes, I like reading about PG&E "
    "PG&E stated it scheduled the blackouts in response to forecasts for high winds "
    "amid dry conditions. The aim is to reduce the risk of wildfires. Nearly 800 thousand customers were "
    "scheduled to be affected by the shutoffs which were expected to last through at least midday tomorrow."
)
article = tokenizer([ARTICLE_TO_SUMMARIZE], max_length=1024, truncation=True, return_tensors="pt")

In [None]:
gen_config = GenerationConfig(
    min_new_tokens=2,
    max_new_tokens=20,
    early_stopping=True,
    no_repeat_ngram_size=3,
    num_beams=4,
)

In [33]:
gen_out = model.bart.generate(
    input_ids=utterance["input_ids"],
    min_length=1,
    max_new_tokens=100,
    num_beams=1,
    do_sample=False,
    return_dict_in_generate=True, 
    output_hidden_states=True,
    # decoder_start_token_id=model.bart.config.eos_token_id,
    # generation_config=model.gen_config
)
tokenizer.batch_decode(gen_out['sequences'], skip_special_tokens=False, clean_up_tokenization_spaces=False)[0]

'</s><s><nofact>ms. </s>'

In [None]:
out = model.bart(utterance['input_ids'], utterance['attention_mask'], labels=utterance['labels'], return_dict=True)
pred = out.logits.argmax(dim=-1)
print(torch.nn.functional.log_softmax(out.logits, dim=-1))
print(torch.nn.functional.log_softmax(out.logits, dim=-1).argmax(dim=-1))

In [None]:
out = model(utterance['input_ids'], utterance['attention_mask'], labels=utterance['labels'])
pred = out.argmax(dim=-1)
print(out)
print(out.argmax(dim=-1))

In [None]:
tokenizer.batch_decode(gen_out)

In [None]:
print("Sequences shape: ", gen_out['sequences'].shape)
print("Len scores: ", len(gen_out['scores']))
for i in range(5):
    top_3 = torch.topk(gen_out['scores'][i], 3)
    print("Top-3 scores token: ", i, top_3[0], top_3[1], tokenizer.batch_decode(top_3[1]))
print("First 5 tokens: ", gen_out['sequences'][0][:5])

In [None]:
print("Sequences shape: ", gen_out['sequences'].shape)
print("Len scores: ", len(gen_out['scores']))
for i in range(5):
    top_3 = torch.topk(gen_out['scores'][i], 3)
    print("Top-3 scores token: ", i, top_3[0], top_3[1], tokenizer.batch_decode(top_3[1]))
print("First 5 tokens: ", gen_out['sequences'][0][:5])

In [45]:
gen_out.keys()

odict_keys(['sequences', 'encoder_hidden_states', 'decoder_hidden_states'])

In [59]:
gen_out.encoder_hidden_states[-1]

tensor([[[-0.0037,  0.0167,  0.0071,  ..., -0.0065, -0.0039, -0.0051],
         [ 0.0450, -0.5656, -0.4305,  ...,  0.2543,  0.1213, -0.1198],
         [-0.0028, -0.2634, -0.1611,  ...,  0.1413, -0.0065, -0.0026],
         ...,
         [-0.0129, -0.1998, -0.3211,  ...,  0.1231, -0.1728, -0.1337],
         [-0.0037,  0.0167,  0.0071,  ..., -0.0065, -0.0039, -0.0051],
         [ 0.0167,  0.1341, -0.0137,  ...,  0.0768, -0.0905,  0.0581]]])

In [None]:
pred = out.logits.argmax(dim=-1)

In [None]:
pred

In [None]:
tokenizer.batch_decode(pred)

In [44]:
utterance['input_ids']

tensor([[    0, 50266,  1832,    47,    33, 36365,     4,  1437, 50267,   289,
         11695, 36365,     6,   596,   109,    47,  1394,   116,     2]])

In [None]:
utterance['labels']

In [60]:
embeddings = model.bart.model.shared(utterance['input_ids'])
encoded = model.bart.model.encoder(inputs_embeds=embeddings)
encoded.last_hidden_state

tensor([[[-0.0037,  0.0167,  0.0071,  ..., -0.0065, -0.0039, -0.0051],
         [ 0.0450, -0.5656, -0.4305,  ...,  0.2543,  0.1213, -0.1198],
         [-0.0028, -0.2634, -0.1611,  ...,  0.1413, -0.0065, -0.0026],
         ...,
         [-0.0129, -0.1998, -0.3211,  ...,  0.1231, -0.1728, -0.1337],
         [-0.0037,  0.0167,  0.0071,  ..., -0.0065, -0.0039, -0.0051],
         [ 0.0167,  0.1341, -0.0137,  ...,  0.0768, -0.0905,  0.0581]]],
       grad_fn=<NativeLayerNormBackward0>)

In [None]:
pred.eq(utterance['labels'])

In [None]:
loss_fn = torch.nn.NLLLoss(ignore_index=model.bart.config.pad_token_id, reduction='mean')

In [None]:
loss_fn(out.transpose(1,2), utterance['labels'])

In [None]:
import torchmetrics

In [None]:
tokenizer.convert_tokens_to_string(tokenizer.batch_decode([[1]]))

In [None]:
tokenizer.batch_decode(pred, skip_special_tokens=True, clean_up_tokenization_spaces=True)

In [None]:
i1['input_ids']

In [None]:
pred

In [None]:
gen_config

In [None]:
model.bart.generation_config

In [None]:
tokenizer.convert_tokens_to_ids('I')