#### 🔴 Understanding `Tokenization` in NLP!

1️⃣ Character Tokenization

In [49]:
# Character tokenization: every character (spaces and punctuation included)
# becomes its own token.
raw_text = "We love NLP!"
tokens = [character for character in raw_text]
print(tokens)

['W', 'e', ' ', 'l', 'o', 'v', 'e', ' ', 'N', 'L', 'P', '!']


In [50]:
# Numerical encoding of individual character
token2idx = {char: idx for idx, char in enumerate(sorted(set(tokens)))}
print(token2idx)

{' ': 0, '!': 1, 'L': 2, 'N': 3, 'P': 4, 'W': 5, 'e': 6, 'l': 7, 'o': 8, 'v': 9}


In [51]:
# Look up every token in token2idx to turn the tokenized text into integer ids
# (a token missing from the mapping raises KeyError).
integer_tokens = list(map(token2idx.__getitem__, tokens))
print(integer_tokens)

[5, 6, 0, 7, 8, 9, 6, 0, 3, 2, 4, 1]


In [52]:
# One-hot encoding the numbers
import torch
import torch.nn.functional as F

# `integer_tokens` is rebound from a list to a tensor here. torch.as_tensor
# builds the same integer tensor from the list on the first run and returns an
# existing tensor unchanged, so re-running this cell no longer calls
# torch.tensor() on a tensor (which emits a copy-construct UserWarning).
integer_tokens = torch.as_tensor(integer_tokens)

# One row per token, one column per vocabulary entry (num_classes = vocabulary size).
one_hot_encode_tokens = F.one_hot(integer_tokens, num_classes=len(token2idx))
one_hot_encode_tokens.shape

torch.Size([12, 10])

In [53]:
# Show the first token in each of its three representations, one per line.
print(
    f"Token = {tokens[0]}",
    f"Integer Encoded Token = {integer_tokens[0]}",
    f"One hot encoded Token = {one_hot_encode_tokens[0]}",
    sep="\n",
)

Token = W
Integer Encoded Token = 5
One hot encoded Token = tensor([0, 0, 0, 0, 0, 1, 0, 0, 0, 0])


2️⃣ Word Tokenization

In [54]:
# Split the raw text on runs of whitespace (sep=None is str.split's default).
# Punctuation stays attached to its neighbouring word.
word_tokens = raw_text.split(sep=None)
print(word_tokens)

['We', 'love', 'NLP!']


3️⃣ Subword Tokenization

In [55]:
from transformers import AutoTokenizer

In [56]:
# Checkpoint name shared by the tokenizer cells below.
model_ckpt = "distilbert-base-uncased"

# AutoTokenizer picks the tokenizer class that matches the checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

In [57]:
# Alternative to AutoTokenizer: load the tokenizer through the concrete
# DistilBertTokenizer class, using the same checkpoint name.
# NOTE(review): `distilbert_tokenizer` is not referenced by any later cell shown
# here; the cells below all use `tokenizer`.
from transformers import DistilBertTokenizer
distilbert_tokenizer = DistilBertTokenizer.from_pretrained(model_ckpt)

In [58]:
# Let's see the tokenizer in action now: calling it on the raw string returns
# a mapping with `input_ids` and `attention_mask` (see the printed output).
encoded_text = tokenizer(raw_text)
print(encoded_text)

{'input_ids': [101, 2057, 2293, 17953, 2361, 999, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}


In [59]:
# Map the integer ids back to their subword strings.
# NOTE: this rebinds `tokens`, which held the character tokens in the
# character-tokenization cells above.
input_ids = encoded_text["input_ids"]
tokens = tokenizer.convert_ids_to_tokens(input_ids)
print(tokens)

['[CLS]', 'we', 'love', 'nl', '##p', '!', '[SEP]']


4️⃣ Tokenizing the Entire Dataset

In [32]:
# !pip install datasets

In [60]:
from datasets import load_dataset

In [66]:
# Load the tweet emotions dataset; the result is shown as the cell output.
tweet_emotions = load_dataset(path="emotion")
tweet_emotions

Found cached dataset emotion (/Users/pachaar/.cache/huggingface/datasets/emotion/default/0.0.0/348f63ca8e27b3713b6c04d723efe6d824a56fb3d1449794716c0f0296072705)


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 16000
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
})

In [67]:
# Tokenization helper, written to be applied to a batch of examples
def tokenize(batch):
    """Tokenize the ``"text"`` entries of ``batch`` with the notebook's ``tokenizer``.

    Args:
        batch: Mapping with a ``"text"`` key holding the text(s) to tokenize.

    Returns:
        Whatever ``tokenizer`` returns when called on ``batch["text"]`` with
        ``padding=True`` and ``truncation=True``.
    """
    texts = batch["text"]
    return tokenizer(texts, truncation=True, padding=True)

In [68]:
# Try the helper on the first two training examples before mapping it over
# the whole dataset.
sample_batch = tweet_emotions["train"][:2]
print(tokenize(sample_batch))

{'input_ids': [[101, 1045, 2134, 2102, 2514, 26608, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [101, 1045, 2064, 2175, 2013, 3110, 2061, 20625, 2000, 2061, 9636, 17772, 2074, 2013, 2108, 2105, 2619, 2040, 14977, 1998, 2003, 8300, 102]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}


In [69]:
# Applying tokenization across the entire dataset (every split).
# The dataset is bound to `tweet_emotions` when it is loaded; the previous
# `emotions` name is never defined and raises NameError on a fresh kernel.
# batch_size=None passes each split to `tokenize` as a single batch, so
# padding=True pads all examples of a split to that split's longest sequence.
tweet_emotions_encoded = tweet_emotions.map(tokenize, batched=True, batch_size=None)

print(tweet_emotions_encoded['test'].column_names)

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

['text', 'label', 'input_ids', 'attention_mask']
