In [11]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import evaluate
from torch.nn import CrossEntropyLoss
import numpy as np

In [3]:
model = AutoModelForCausalLM.from_pretrained('gpt2')
tokenizer = AutoTokenizer.from_pretrained('gpt2')

In [7]:
if tokenizer.pad_token is None:
            existing_special_tokens = list(tokenizer.special_tokens_map_extended.values())
            # check that the model already has at least one special token defined
            assert (
                len(existing_special_tokens) > 0
            ), "If batch_size > 1, model must have at least one special token to use for padding. Please use a different model or set batch_size=1."
            # assign one of the special tokens to also be the pad token
            tokenizer.add_special_tokens({"pad_token": existing_special_tokens[0]})


max_tokenized_len = model.config.max_length - 1


Using pad_token, but it is not set yet.


In [47]:
# sentence = "I love the perplexity metric, one can learn valuable insights with it."
sentence = "i wrote a book, i wrote a book, i wrote a book, i wrote a book,i wrote a book, i wrote a book."

In [48]:
encodings = tokenizer(
            sentence,
            add_special_tokens=False,
            padding=True,
            truncation=True,
            max_length=max_tokenized_len,
            return_tensors="pt",
            return_attention_mask=True,
)

In [49]:
encoded_texts = encodings["input_ids"]
attn_masks = encodings["attention_mask"]

In [50]:
ppls = []
loss_fct = CrossEntropyLoss(reduction="none")

In [51]:
attn_masks

tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])

In [52]:
encoded_batch = encoded_texts[0:]
attn_mask = attn_masks[0:]

labels = encoded_batch

with torch.no_grad():
    out_logits = model(encoded_batch, attention_mask=attn_mask).logits

shift_logits = out_logits[..., :-1, :].contiguous()
shift_labels = labels[..., 1:].contiguous()
shift_attention_mask_batch = attn_mask[..., 1:].contiguous()

perplexity = torch.exp(
    (loss_fct(shift_logits.transpose(1, 2), shift_labels) * shift_attention_mask_batch).sum(1)
    / shift_attention_mask_batch.sum(1)
)

print(perplexity)

tensor([8.3391])


In [28]:
shift_logits.transpose(1, 2).shape

torch.Size([1, 50257, 14])

In [46]:
tokenizer.vocab_size

50257

In [30]:
shift_logits.view(-1, shift_logits.size(-1)).shape

torch.Size([14, 50257])

In [22]:
shift_logits.shape

torch.Size([1, 14, 50257])

In [23]:
shift_labels.shape

torch.Size([1, 14])

In [34]:
shift_labels.view(-1).shape

torch.Size([14])

In [53]:
loss_fct2 = CrossEntropyLoss()
loss = loss_fct2(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
print(torch.exp(loss))

tensor(8.3403)


In [56]:
outputs = model(encoded_batch, labels=encoded_batch)
torch.exp(outputs.loss)

tensor(8.3403, grad_fn=<ExpBackward0>)

In [54]:
loss_fct2(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))

tensor(2.1211)

In [41]:
loss_fct(shift_logits.transpose(1, 2), shift_labels)

tensor([[ 7.3372,  2.2176, 13.9048,  0.7467, 13.8835,  1.6148,  6.3239,  5.8618,
          6.0826,  7.9335,  1.9774,  3.9670,  0.7051,  0.6257]])

In [44]:
(loss_fct(shift_logits.transpose(1, 2), shift_labels) * shift_attention_mask_batch).sum(1)/14

tensor([5.2273])

In [43]:
shift_attention_mask_batch.sum(1)

tensor([14])