In [None]:
!pip install transformers

In [2]:
from transformers import AutoTokenizer

# Single sentence to encode.
sequence = "I am passionate about Data Science"

# Load the tokenizer published with the checkpoint.
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Calling the tokenizer returns the token ids together with the attention mask.
model_inputs = tokenizer(sequence)
model_inputs

Downloading:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/629 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

{'input_ids': [101, 1045, 2572, 13459, 2055, 2951, 2671, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1]}

In [4]:
# A batch is passed as a plain list of strings.
sequences = ["I want to be a Data Scientist", "It's my passion!"]

# No padding is requested here, so each encoded row keeps its own length.
model_inputs = tokenizer(sequences)
model_inputs

{'input_ids': [[101, 1045, 2215, 2000, 2022, 1037, 2951, 7155, 102], [101, 2009, 1005, 1055, 2026, 6896, 999, 102]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1]]}

#### Use of max_length & padding

In [7]:
# padding="longest": pad every sequence up to the longest sequence in the batch
model_inputs = tokenizer(sequences, padding="longest")
print('Longest :' , model_inputs)

# padding="max_length" without max_length: pad up to the model max length
# (512 for BERT or DistilBERT)
model_inputs = tokenizer(sequences, padding="max_length")
print('padding = max_length :' , model_inputs)

# padding="max_length" with an explicit max_length: pad up to that length.
# The printed label reports the value actually passed to the tokenizer (10).
model_inputs = tokenizer(sequences, padding="max_length", max_length=10)
print('padding = max_length, max_length = 10 :' , model_inputs)

Longest : {'input_ids': [[101, 1045, 2215, 2000, 2022, 1037, 2951, 7155, 102], [101, 2009, 1005, 1055, 2026, 6896, 999, 102, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 0]]}
padding = max_length : {'input_ids': [[101, 1045, 2215, 2000, 2022, 1037, 2951, 7155, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

#### Use of Truncation & max_length parameter

In [9]:
sequences = ["My name is Dipankar Dey", "I am a Machine Learning Engineer"]

# truncation=True on its own only cuts at the model max length
# (512 for BERT or DistilBERT), so these short inputs come back whole.
model_inputs = tokenizer(sequences, truncation=True)
print(model_inputs)

# With an explicit max_length, every sequence is cut down to that many tokens.
model_inputs = tokenizer(sequences, truncation=True, max_length=4)
print(model_inputs)

{'input_ids': [[101, 2026, 2171, 2003, 16510, 2319, 6673, 2139, 2100, 102], [101, 1045, 2572, 1037, 3698, 4083, 3992, 102]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1]]}
{'input_ids': [[101, 2026, 2171, 102], [101, 1045, 2572, 102]], 'attention_mask': [[1, 1, 1, 1], [1, 1, 1, 1]]}


#### Use of return_tensors

In [13]:
sequences = ["My name is Dipankar Dey", "I am a Machine Learning Engineer"]

# "pt" -> PyTorch tensors, "tf" -> TensorFlow tensors.
# Each of these results is printed followed by a blank separator line.
for label, framework in (('Pytorch :', "pt"), ('Tensorflow :', "tf")):
    model_inputs = tokenizer(sequences, padding=True, return_tensors=framework)
    print(label, model_inputs, '\n')

# "np" -> NumPy arrays (printed last, without the trailing separator).
model_inputs = tokenizer(sequences, padding=True, return_tensors="np")
print('Numpy array :', model_inputs)

Pytorch : {'input_ids': tensor([[  101,  2026,  2171,  2003, 16510,  2319,  6673,  2139,  2100,   102],
        [  101,  1045,  2572,  1037,  3698,  4083,  3992,   102,     0,     0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 0, 0]])} 

Tensorflow : {'input_ids': <tf.Tensor: shape=(2, 10), dtype=int32, numpy=
array([[  101,  2026,  2171,  2003, 16510,  2319,  6673,  2139,  2100,
          102],
       [  101,  1045,  2572,  1037,  3698,  4083,  3992,   102,     0,
            0]], dtype=int32)>, 'attention_mask': <tf.Tensor: shape=(2, 10), dtype=int32, numpy=
array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
       [1, 1, 1, 1, 1, 1, 1, 1, 0, 0]], dtype=int32)>} 

Numpy array : {'input_ids': array([[  101,  2026,  2171,  2003, 16510,  2319,  6673,  2139,  2100,
          102],
       [  101,  1045,  2572,  1037,  3698,  4083,  3992,   102,     0,
            0]]), 'attention_mask': array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
       [1, 1, 1, 1, 1, 1, 

In [14]:
import tensorflow as tf
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification

sequences = ["My name is Dipankar Dey", "I am a Machine Learning Engineer"]

# The tokenizer and the classification model are loaded from the same checkpoint.
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = TFAutoModelForSequenceClassification.from_pretrained(checkpoint)

# Encode the batch as padded TensorFlow tensors, then unpack the encoding
# as keyword arguments to the model.
tokens = tokenizer(sequences, padding=True, truncation=True, return_tensors="tf")
output = model(**tokens)
output

Downloading:   0%|          | 0.00/268M [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFDistilBertForSequenceClassification.

All the layers of TFDistilBertForSequenceClassification were initialized from the model checkpoint at distilbert-base-uncased-finetuned-sst-2-english.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertForSequenceClassification for predictions without further training.


TFSequenceClassifierOutput([('logits',
                             <tf.Tensor: shape=(2, 2), dtype=float32, numpy=
                             array([[-2.2055423,  2.4020097],
                                    [-2.8740532,  2.9829888]], dtype=float32)>)])

In [17]:
# Normalise the logits over the last axis so that each row sums to 1.
tf.nn.softmax(output.logits, axis=-1)

<tf.Tensor: shape=(2, 2), dtype=float32, numpy=
array([[0.00987767, 0.99012226],
       [0.00285153, 0.99714845]], dtype=float32)>