<h2 align="center">NLP Tutorial: Tokenizers in Hugging Face</h2>

In [19]:
from transformers import DistilBertTokenizer, AutoTokenizer

### DistilBERT Tokenizer

In [20]:
model_name = "distilbert-base-uncased-finetuned-sst-2-english"

tokenizer = DistilBertTokenizer.from_pretrained(model_name)

text = "Happiness lies within you"
output = tokenizer(text)
output

{'input_ids': [101, 8404, 3658, 2306, 2017, 102], 'attention_mask': [1, 1, 1, 1, 1, 1]}

### BERT Tokenizer

In [21]:
model_name = 'bert-base-uncased'

tokenizer = AutoTokenizer.from_pretrained(model_name)

In [22]:
text = "Happiness lies within you"

output = tokenizer(text)
output

{'input_ids': [101, 8404, 3658, 2306, 2017, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1]}

In [23]:
tokenizer.decode(output['input_ids'])

'[CLS] happiness lies within you [SEP]'

In [24]:
tokens = tokenizer.convert_ids_to_tokens(output['input_ids'])
tokens

['[CLS]', 'happiness', 'lies', 'within', 'you', '[SEP]']

### Special token ids

In [27]:
tokenizer.cls_token_id

101

In [28]:
tokenizer.sep_token_id

102

In [29]:
tokenizer.pad_token_id

0

In [30]:
texts = [
    "Happiness lies within you",
    "I love nature"
]

In [31]:
tokenizer(texts)

{'input_ids': [[101, 8404, 3658, 2306, 2017, 102], [101, 1045, 2293, 3267, 102]], 'token_type_ids': [[0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1]]}

### padding and truncation

In [32]:
tokenizer(texts, padding=True, return_tensors='pt')

{'input_ids': tensor([[ 101, 8404, 3658, 2306, 2017,  102],
        [ 101, 1045, 2293, 3267,  102,    0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 0]])}

In [33]:
tokenizer(texts, padding='max_length', max_length=5, truncation=True, return_tensors='pt')

{'input_ids': tensor([[ 101, 8404, 3658, 2306,  102],
        [ 101, 1045, 2293, 3267,  102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1]])}

In [34]:
tokenizer(texts, padding='max_length', max_length=20, truncation=True, return_tensors='pt')

{'input_ids': tensor([[ 101, 8404, 3658, 2306, 2017,  102,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0],
        [ 101, 1045, 2293, 3267,  102,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])}

### Supplying tokens to a model

In [35]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

sequences = [
    "That phone case broke after 2 days of use", 
    "That herbel tea has helped me so much"
]

tokens = tokenizer(sequences, padding=True, truncation=True, return_tensors="pt")
output = model(**tokens)
output

Loading weights:   0%|          | 0/104 [00:00<?, ?it/s]

SequenceClassifierOutput(loss=None, logits=tensor([[ 4.0561, -3.2456],
        [-3.6340,  3.8584]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [36]:
import torch
import torch.nn.functional as F

probs = F.softmax(output.logits, dim=-1)
probs

tensor([[9.9933e-01, 6.7395e-04],
        [5.5700e-04, 9.9944e-01]], grad_fn=<SoftmaxBackward0>)

In [37]:
predicted_classes = torch.argmax(probs, dim=1).tolist()
predicted_classes

[0, 1]

input text ==> tokenizer ==> tokens(token ids) ==> model ==> logits ==> post processing ==> output text

Previously when we used HuggingFace pipeline we were able to do all of this with just one line of code. Above code explains the inner workings of the pipeline.

In [38]:
from transformers import pipeline
pipe = pipeline("sentiment-analysis")
pipe("My dog is cute")

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f.
Using a pipeline without specifying a model name and revision in production is not recommended.


Loading weights:   0%|          | 0/104 [00:00<?, ?it/s]

[{'label': 'POSITIVE', 'score': 0.9997941851615906}]