In [None]:
from transformers import pipeline

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import scipy
import tqdm

In [None]:
# Select the compute device at run time instead of hard-coding Apple's MPS
# backend, so every later `.to(device)` call also works on CUDA or CPU-only machines.
import torch

# Preference order: MPS (the notebook's original target), then CUDA, then CPU.
_mps_backend = getattr(torch.backends, 'mps', None)  # attribute is absent on older torch builds
if _mps_backend is not None and _mps_backend.is_available():
    device = 'mps'
elif torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

## 2. Using Transformers

### 2.2 Behind the pipeline

In [None]:
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification

In [None]:
# Load the tokenizer and two model classes from the same checkpoint name:
# the plain AutoModel and the AutoModelForSequenceClassification variant.
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModel.from_pretrained(checkpoint)
seq_class = AutoModelForSequenceClassification.from_pretrained(checkpoint)

In [None]:
# Move both models to `device`; the trailing ';' suppresses the cell's repr output.
model.to(device);
seq_class.to(device);

In [None]:
# Total number of scalar parameters across every named tensor of `model`.
total = sum(weights.numel() for _, weights in model.named_parameters())
total

In [None]:
# Two example sentences that are tokenized together as one batch.
raw_inputs = [
    "I've been waiting for a HuggingFace course my whole life.",
    "I hate this so much!",
]
# Tokenize with padding and truncation enabled, request "pt" tensors,
# then move the encoded batch to `device`.
inputs = tokenizer(raw_inputs, padding=True, truncation=True, return_tensors="pt").to(device)

In [None]:
# Token ids of the encoded batch.
inputs['input_ids']

In [None]:
inputs['input_ids'].shape

In [None]:
# Attention mask returned by the tokenizer alongside the ids.
inputs['attention_mask']

In [None]:
inputs['attention_mask'].shape

In [None]:
# Forward pass of the tokenized batch through the base model.
outputs = model(**inputs)

In [None]:
outputs.last_hidden_state.shape

In [None]:
# Same batch through the sequence-classification model.
# Note: this rebinds `outputs`, so the base-model result above is no longer reachable.
outputs = seq_class(**inputs)

In [None]:
outputs.logits

In [None]:
# Softmax along axis 1 of the logits, after detaching them and copying them to the CPU.
scipy.special.softmax(outputs.logits.detach().cpu(), axis=1)

In [None]:
# Index -> label-name mapping stored in the classification model's config.
seq_class.config.id2label

### 2.3 Models

In [None]:
from transformers import BertConfig, BertTokenizer, BertModel

In [None]:
# Building the config
# BertConfig() is called without arguments, so only the library's defaults are used.
config = BertConfig()

# Building the model from the config
# No from_pretrained() call is made here: the model is constructed from `config` alone.
# Note: this rebinds `model`, replacing the DistilBERT AutoModel loaded earlier.
model = BertModel(config)

In [None]:
# Display the configuration object.
config

In [None]:
name_params = []
for n, p in model.named_parameters():
    name_params.append([n, p.numel()])

In [None]:
df = pd.DataFrame(name_params, columns=['names', 'parameters'])

In [None]:
df['parameters'].sum()

In [None]:
# Tokenizer loaded from the 'bert-base-cased' checkpoint.
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

In [None]:
# BertModel loaded from the same checkpoint name.
bert_model = BertModel.from_pretrained('bert-base-cased')

In [None]:
name_params = []
for n, p in bert_model.named_parameters():
    name_params.append([n, p.numel()])

In [None]:
df = pd.DataFrame(name_params, columns=['names', 'parameters'])

In [None]:
df['parameters'].sum()

In [None]:
# Move the pretrained BERT model to `device`; ';' suppresses the repr output.
bert_model.to(device);

In [None]:
# Same two example sentences, this time encoded with the BERT tokenizer.
raw_inputs = [
    "I've been waiting for a HuggingFace course my whole life.",
    "I hate this so much!",
]
inputs_bert = bert_tokenizer(raw_inputs, padding=True, truncation=True, return_tensors="pt").to(device)

In [None]:
# `inputs` is the batch encoded earlier with the DistilBERT tokenizer;
# presumably shown here to compare its shape with the BERT encoding below.
inputs['input_ids'].shape

In [None]:
inputs_bert['input_ids'].shape

In [None]:
# Forward pass through the pretrained BERT model (rebinds `outputs`).
outputs = bert_model(**inputs_bert)

In [None]:
outputs.last_hidden_state.shape

In [None]:
# Left disabled: uncomment to save the model under 'test'.
# bert_model.save_pretrained('test')

### 2.4 Tokenizers

In [None]:
from transformers import BertTokenizer

In [None]:
# Loads the 'bert-base-cased' tokenizer again; the same name was already bound in section 2.3.
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

In [None]:
sentence = 'Using a Transformer network is simple'

In [None]:
# Call the tokenizer directly on the raw string; no return_tensors is requested.
# Note: this rebinds `inputs`, which earlier held the DistilBERT batch.
inputs = bert_tokenizer(sentence)

In [None]:
inputs

In [None]:
# Left disabled: uncomment to save the tokenizer under 'test'.
#bert_tokenizer.save_pretrained('test')

In [None]:
# The tokenizer's vocabulary mapping; its keys are used below.
vocab = bert_tokenizer.vocab

In [None]:
# Keys of the vocabulary, sorted with np.sort.
vocab_sort = np.sort(list(vocab.keys()))

In [None]:
vocab_sort.shape

In [None]:
# Lower-case a slice of the sorted vocabulary and keep the unique entries.
# NOTE(review): 7245 and -918 are hard-coded offsets into the sorted array. Presumably
# they skip non-alphabetic entries at both ends; confirm, as they depend on this exact vocabulary.
vocab_lower = np.unique(np.char.lower(vocab_sort[7245:-918]))

In [None]:
# Number of distinct entries left after lower-casing.
vocab_lower.shape

In [None]:
# Same example sentence as above, now processed step by step.
sentence = 'Using a Transformer network is simple'

In [None]:
# Step 1: split the string into tokens.
tokens = bert_tokenizer.tokenize(sentence)

In [None]:
tokens

In [None]:
# Step 2: map each token to its vocabulary id.
ids = bert_tokenizer.convert_tokens_to_ids(tokens)

In [None]:
ids

In [None]:
# Reverse direction: turn the ids back into a string.
decoded_string = bert_tokenizer.decode(ids)

In [None]:
decoded_string

### 2.5 Handling multiple sequences

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [None]:
# Reload the tokenizer and the sequence-classification model for this section
# (rebinds `tokenizer` and `model`).
checkpoint = 'distilbert-base-uncased-finetuned-sst-2-english'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

In [None]:
model.to(device);

In [None]:
sequence = "I've been waiting for a HuggingFace course my whole life."

In [None]:
# Manual pipeline: string -> tokens -> ids.
tokens = tokenizer.tokenize(sequence)

In [None]:
ids = tokenizer.convert_tokens_to_ids(tokens)

In [None]:
# Wrap `ids` in an outer list so the tensor gets a leading dimension of size 1.
input_ids = torch.tensor([ids]).to(device)

In [None]:
input_ids

In [None]:
# Forward pass on the manually built ids.
outputs = model(input_ids)

In [None]:
outputs.logits

In [None]:
# Alternative: let the tokenizer build the model inputs straight from the raw string.
inputs = tokenizer(sequence, return_tensors='pt').to(device)

In [None]:
outputs = model(**inputs)

In [None]:
outputs.logits

In [None]:
# Concatenate the same row twice along dim 0 to form a batch of two identical sequences.
model(torch.cat([input_ids, input_ids], dim=0)).logits

In [None]:
# Padding token id and its string form.
tokenizer.pad_token_id, tokenizer.pad_token

In [None]:
seq1 = [[200, 200, 200]]
seq2 = [[200, 200]]
batched_ids = [
    [200, 200, 200],
    [200, 200, tokenizer.pad_token_id],
]

In [None]:
for seq in [seq1, seq2, batched_ids]:
    print (model(torch.tensor(seq).to(device)).logits)

In [None]:
attention_mask = [
    [1, 1, 1],
    [1, 1, 0]
]

In [None]:
(model(torch.tensor(batched_ids).to(device), attention_mask=torch.tensor(attention_mask).to(device)).logits)

In [None]:
tokenizer.model_max_length

### 2.6 Putting it all together

In [None]:
from transformers import AutoTokenizer, AutoModel

In [None]:
# Reload the tokenizer and the base AutoModel for this section
# (rebinds `tokenizer` and `model`).
checkpoint = 'distilbert-base-uncased-finetuned-sst-2-english'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModel.from_pretrained(checkpoint)

In [None]:
model.to(device);

In [None]:
# Two sentences of different length, used for the padding/truncation examples below.
sequences = ["I've been waiting for a HuggingFace course my whole life.", "So have I!"]

In [None]:
# same as padding = True
inputs1 = tokenizer(sequences, padding='longest', return_tensors='pt').to(device)

In [None]:
inputs1['input_ids'].shape

In [None]:
# padding='max_length' with no max_length argument.
# NOTE(review): presumably this pads up to tokenizer.model_max_length; confirm.
inputs2 = tokenizer(sequences, padding='max_length', return_tensors='pt').to(device)

In [None]:
inputs2['input_ids'].shape

In [None]:
# Pad sequences up to a specified max length (no truncation is requested here).
inputs3 = tokenizer(sequences, padding='max_length', max_length=8)

In [None]:
# First sequence: it already has 16 tokens, so no padding is added and its length stays 16.
# Second sequence: it has 6 tokens, so 2 padding tokens are added to reach 8.
len(inputs3['input_ids'][0]), len(inputs3['input_ids'][1])

In [None]:
# Truncation enabled without an explicit max_length and without padding.
inputs4 = tokenizer(sequences, truncation=True)

In [None]:
len(inputs4['input_ids'][0]), len(inputs4['input_ids'][1])

In [None]:
# Truncation enabled with an explicit limit of 8 tokens.
inputs5 = tokenizer(sequences, truncation=True, max_length=8)

In [None]:
# First sequence: 16 tokens exceed the maximum of 8, so everything after the 7th token
# is truncated and an end token is added as the last token.
# Second sequence: 6 tokens are below the maximum of 8, so nothing is truncated.
len(inputs5['input_ids'][0]), len(inputs5['input_ids'][1])

In [None]:
# Same call three times, varying only return_tensors: 'pt', then 'tf', then 'np'.
inputs6 = tokenizer(sequences, padding=True, truncation=True, return_tensors='pt')

In [None]:
# NOTE(review): 'tf' presumably requires TensorFlow to be installed in this environment; confirm.
inputs7 = tokenizer(sequences, padding=True, truncation=True, return_tensors='tf')

In [None]:
inputs8 = tokenizer(sequences, padding=True, truncation=True, return_tensors='np')

In [None]:
# Compare the dtype of the ids returned for each of the three formats.
inputs6['input_ids'].dtype, inputs7['input_ids'].dtype, inputs8['input_ids'].dtype

In [None]:
sequence = sequences[0]

In [None]:
# Manual tokenization of the first sentence, to compare with the tokenizer's own output.
tokens = tokenizer.tokenize(sequence)
ids = tokenizer.convert_tokens_to_ids(tokens)

In [None]:
# Token 101 ([CLS]) and token 102 ([SEP]) are used to begin and end a sequence.
# Not all tokenizers have special tokens.
inputs8['input_ids'][0], np.array(ids)

In [None]:
# Decode both id sequences back to text for a side-by-side comparison.
tokenizer.decode(inputs8['input_ids'][0]), tokenizer.decode(ids)