In [1]:
from transformers import AutoModelForMaskedLM, AutoTokenizer
import torch
import numpy as np

# BERT masked language modeling (MLM, using DistilBERT)

In [2]:
# Load the tokenizer and the masked-LM model from the same checkpoint so their
# vocabularies match. `.eval()` returns the module itself, so it can be chained
# onto the load to put the model in evaluation mode.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
bert = AutoModelForMaskedLM.from_pretrained("distilbert-base-uncased").eval()

In [None]:
# Inspect the tokenizer's mapping of special-token roles to their string forms.
tokenizer.special_tokens_map

In [None]:
# Hide one word behind the tokenizer's own mask token so the MLM head has a
# position to fill in. Assuming the tokenizer prepends one special token and
# each remaining word maps to a single token, the mask sits at index 4.
masked_text = f"paris is the {tokenizer.mask_token} of france."
tokenized_masked_text = tokenizer(masked_text, return_tensors="pt")["input_ids"]

# Guard against the mask string being split into ordinary word pieces.
assert (
    tokenized_masked_text == tokenizer.mask_token_id
).any(), "mask token missing from the encoded input"

print("Decoded input:")
print(tokenizer.decode(tokenized_masked_text[0], skip_special_tokens=False))

In [None]:
# Vocabulary id of the mask token, for comparison with the encoded ids shown next.
tokenizer.mask_token_id

In [None]:
# Encoded input ids as returned by the tokenizer with return_tensors="pt".
tokenized_masked_text

In [26]:
# Inference only: run the forward pass without autograd tracking so the logits
# carry no computation graph (less memory, and no `.detach()` needed downstream).
with torch.no_grad():
    output = bert(tokenized_masked_text)
logits = output.logits

In [None]:
# Expect three axes, presumably (batch, sequence length, vocabulary size) — TODO confirm.
logits.shape

In [9]:
def print_argtopk(logits, tokenizer, id, k=1):
    """Print the ``k`` highest-scoring tokens at one sequence position.

    Args:
        logits: Tensor indexed as ``logits[0, position, vocab_entry]``; only the
            first element along the leading axis is inspected.
        tokenizer: Object whose ``decode`` method turns a one-element tensor of
            token ids into text.
        id: Sequence position to inspect. Negative values count from the end.
            (The name shadows the built-in but is kept so existing keyword
            callers continue to work.)
        k: Number of candidates to print, best first.

    Prints one line per candidate: the decoded token followed by its logit
    rounded to three decimals.
    """
    # Select the position before ranking: this avoids running top-k over every
    # position and makes negative indices behave like ordinary indexing
    # (a slice such as `id : id + 1` is empty for id == -1).
    position_logits = logits[0, id]
    topk = torch.topk(position_logits, k=k, dim=-1, sorted=True)

    for rank in range(k):
        print(
            tokenizer.decode(topk.indices[rank : rank + 1]),
            np.round(topk.values[rank].item(), 3),
        )

In [None]:
# Raw scores of the first batch element at sequence index 1.
logits[0, 1]

In [None]:
# Normalise the scores at sequence index 1 over the last axis and show the shape
# of the result; `.detach()` drops any autograd history first.
torch.nn.functional.softmax(logits[0, 1].detach(), dim=-1).shape

In [None]:
# Print the five highest-scoring tokens at sequence index 4 of the encoded input.
print_argtopk(logits, tokenizer, id=4, k=5)

# BERT next sentence prediction (NSP)

In [17]:
from transformers import BertForNextSentencePrediction

In [None]:
# Load the tokenizer that belongs to this checkpoint in the same cell as the
# model, so the section does not silently depend on whichever tokenizer an
# earlier cell left bound to `tokenizer`.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
bert = BertForNextSentencePrediction.from_pretrained("bert-base-uncased")
bert.eval()

In [None]:
# Candidate sentences: s1 is the first segment; s2 and s3 are alternative second
# segments (only s3 is encoded below).
s1 = "paris is the capital of france."
s2 = "the malayan tiger is native to peninsular malaysia."
s3 = "it is in region ile-de-france."

# Encode (s1, s3) as a sentence pair, explicitly requesting token_type_ids.
encoded_sentences = tokenizer(s1, s3, return_tensors="pt", return_token_type_ids=True)
for field_name, field_tensor in encoded_sentences.items():
    print(field_name, field_tensor)

In [None]:
# Inference only: no autograd graph is needed for the NSP scores.
with torch.no_grad():
    out = bert(
        input_ids=encoded_sentences["input_ids"],
        token_type_ids=encoded_sentences["token_type_ids"],
    )
out.logits

In [None]:
# Turn the NSP logits into a probability distribution over the last axis.
torch.nn.functional.softmax(out.logits, dim=-1)

# GPT-2

In [27]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from tabulate import tabulate

In [None]:
# Load the pretrained "gpt2-medium" language-model head and switch it to
# evaluation mode; the trailing `gpt.eval()` also displays the module.
gpt = GPT2LMHeadModel.from_pretrained("gpt2-medium")
gpt.eval()

In [29]:
# NOTE: rebinds `tokenizer`, replacing whichever tokenizer was bound before this cell.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2-medium")

In [None]:
text = "Paris is the capital of"
tokenized_text = tokenizer.encode(text)
# Round trip: ids back to text. `add_special_tokens` is an encode-time option,
# so it is not passed to `decode`.
print(tokenizer.decode(tokenized_text))
# The sub-word pieces the text is split into.
print(tokenizer.tokenize(text))

In [31]:
text = "Paris is the capital of"
tokenized_text = tokenizer.encode(text)
# Prepend the BOS id and add a leading batch axis.
input_tensor = torch.tensor([tokenizer.bos_token_id, *tokenized_text]).unsqueeze(0)
with torch.no_grad():
    out = gpt(input_tensor)
logits = out.logits.detach().numpy()
# Ascending argsort along the last axis: the best-scoring ids end up in the last
# columns. Walk those last five columns backwards so the best id comes first.
logits_sorted = logits[0].argsort(axis=-1)
top_5_logits = logits_sorted[:, :-6:-1]

In [None]:
# Shape of the NumPy logits array produced by the GPT-2 forward pass.
logits.shape

In [33]:
# One table row per input position: the input token itself followed by the
# decoded text of each of its five top-ranked ids.
input_tokens = [tokenizer.bos_token, *tokenizer.tokenize(text)]
list_tokens = []
for i, candidate_ids in enumerate(top_5_logits):
    row = (input_tokens[i], *map(tokenizer.decode, candidate_ids))
    # `tokenize` output still carries the "Ġ" marker; show it as a plain space.
    tokens = [piece.replace("Ġ", " ") for piece in row]
    list_tokens.append(tokens)

In [None]:
# Render the rows as a grid: first column is the input token, the remaining
# five columns are the ranked candidates.
table = np.array(list_tokens)
print("Encoded inputs:")
print(
    tabulate(
        table,
        tablefmt="fancy_grid",
        headers=["Input tokens", *(f"Top {rank}" for rank in range(1, 6))],
    )
)

## Encoder-decoder

In [35]:
from transformers import BartForConditionalGeneration, BartTokenizer

In [36]:
# Load the BART tokenizer and the conditional-generation model from the same
# checkpoint. `.eval()` returns the module, so it is chained onto the load.
tokenizer = BartTokenizer.from_pretrained("facebook/bart-base")
model = BartForConditionalGeneration.from_pretrained(
    "facebook/bart-base", forced_bos_token_id=0
).eval()

In [None]:
# Full sentence and its masked variant, each encoded to a batch of one.
# NOTE(review): "Britany" is presumably a misspelling of "Brittany"; it is left
# untouched here because changing it would change the token ids.
text = "A mysterious fortified castle is located in Britany."
masked_text = "A mysterious <mask> is located in Britany."
tokenized_text = tokenizer.encode(text, return_tensors="pt")
tokenized_masked_text = tokenizer.encode(masked_text, return_tensors="pt")
# Decode the masked ids to see the special tokens the tokenizer added.
print(tokenizer.decode(tokenized_masked_text[0]))

In [None]:
# Compare the shapes of the masked and the full encoded sentences.
tokenized_masked_text.shape, tokenized_text.shape

In [39]:
# Inference only: the encoder reads the masked sentence and the decoder is fed
# the full sentence; no autograd graph is needed.
with torch.no_grad():
    out = model(input_ids=tokenized_masked_text, decoder_input_ids=tokenized_text)

In [None]:
# Shape of the logits returned by the BART forward pass.
out.logits.shape

In [None]:
# Sampling is stochastic: fix the seed so that re-running the notebook
# reproduces the same completion.
torch.manual_seed(0)
generated_ids = model.generate(tokenized_masked_text, do_sample=True, max_new_tokens=20)
print(tokenizer.decode(generated_ids[0]))

## Inference

### Greedy decoding

In [1]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

In [None]:
# Tokenizer and model come from the same "gpt2-medium" checkpoint. The closing
# `gpt.eval()` switches to evaluation mode and displays the module.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2-medium")
gpt = GPT2LMHeadModel.from_pretrained("gpt2-medium")
gpt.eval()

In [5]:
import torch

text = "Paris is the capital of"
tokenized_text = tokenizer.encode(text)
# Prompt ids with the BOS id in front, shaped as a batch of one.
input_tensor = torch.tensor([tokenizer.bos_token_id, *tokenized_text]).unsqueeze(0)

In [None]:
# Greedy decoding: at every step feed the whole sequence back through the model
# and append the single highest-scoring next token.
num_new_tokens = 11  # how many tokens to append to the prompt
output_tokens = []  # ids of the generated tokens, in generation order
gpt.eval()
input_sentence = text
print("Input text:", input_sentence)
input_ids = input_tensor
print("Input tensor:", input_tensor)
with torch.no_grad():
    for _ in range(num_new_tokens):
        logits = gpt(input_ids).logits
        # keepdim=True yields shape (1, 1), so the token can be concatenated
        # directly without re-wrapping it in a new tensor.
        next_token = logits[:, -1].argmax(dim=-1, keepdim=True)
        output_tokens.append(next_token.item())
        input_ids = torch.cat((input_ids, next_token), dim=-1)
        print("Current text:", tokenizer.decode(input_ids[0]))

In [None]:
# Sampling is stochastic: fix the seed so the completion is reproducible.
torch.manual_seed(0)
generation_output = gpt.generate(
    input_ids=input_tensor,
    # The prompt is a single unpadded sequence, so every position is attended to.
    attention_mask=torch.ones_like(input_tensor),
    do_sample=True,
    max_new_tokens=25,
    # State the pad id explicitly instead of relying on generate()'s
    # fallback to the end-of-sequence id.
    pad_token_id=tokenizer.eos_token_id,
)
tokenizer.decode(generation_output[0])