In [8]:
%%capture
!pip install -Uqqq spacy datasets tokenizers plotly
!python -m spacy download en_core_web_sm

In [9]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import torch
from torch.utils.data import DataLoader
import pandas as pd
import spacy
from tokenizers.models import WordLevel, BPE, WordPiece
import tokenizers
from datasets import load_dataset
import matplotlib.pyplot as plt
import plotly.express as px
import warnings

In [10]:
warnings.filterwarnings('ignore')

In [11]:
# Training split of the emotion-classification dataset; each record has a
# 'text' and an integer 'label' field (both are read by later cells).
dataset = load_dataset("SetFit/emotion", split='train')

# Class names MUST be listed in the order of the dataset's integer label ids
# (0=sadness, 1=joy, 2=love, 3=anger, 4=fear, 5=surprise per the dataset card),
# not alphabetically -- otherwise the lookup pairs ids with the wrong emotion.
class_names = 'sadness joy love anger fear surprise'.split()

# label id -> human-readable class name
class_lookup = dict(enumerate(class_names))
class_lookup

In [12]:
# Three toy sentences used to demonstrate vectorization, normalization and
# pre-tokenization. Note the differing case and punctuation between them.
sentences = [
    'The quick brown fox jumped over the lazy dog',
    'Deep learning is fun!',
    'deep learning is hard.',
]
sentence1, sentence2, sentence3 = sentences

In [15]:
# Bag-of-words demo: learn the vocabulary from the toy sentences, then show
# the per-sentence term counts as a table with one column per vocabulary term.
cv = CountVectorizer().fit(sentences)
sentences_cv = pd.DataFrame(
    data=cv.transform(sentences).toarray(),
    columns=cv.get_feature_names_out(),
)

sentences_cv

In [16]:
from tokenizers import normalizers

In [17]:
# Text-normalization pipeline; the steps are applied in the order listed.
# NFD comes before StripAccents so that accented characters are first
# decomposed into a base character plus combining marks, which can then be removed.
normalizer = normalizers.Sequence([
    normalizers.NFD(),
    normalizers.Lowercase(),
    normalizers.StripAccents()
])

In [18]:
# Sanity check: run the normalizer on a string with mixed case and accented characters.
normalizer.normalize_str('Höw aRę ŸõŪ dÔįñg?')

In [19]:
# Run every example sentence through the normalizer.
# (The misspelled name `normalized_senences` is kept as-is because later
# cells refer to it.)
normalized_senences = list(map(normalizer.normalize_str, sentences))
normalized_senences

In [20]:
# Naive tokenization baseline: str.split() breaks on whitespace only,
# so punctuation stays attached to the neighbouring word (e.g. 'fun!').
for s in normalized_senences:
    print(s.split())

In [21]:
from tokenizers import pre_tokenizers

In [22]:
# Whitespace pre-tokenizer, applied to each normalized sentence in turn.
pre_tokenizer = pre_tokenizers.Whitespace()
split_sentences = list(map(pre_tokenizer.pre_tokenize_str, normalized_senences))
split_sentences

In [23]:
# Special tokens: PAD fills short sequences when batching, UNK is handed to
# the word-level model as its unknown-token placeholder.
PAD_TOKEN = '[PAD]'
UNK_TOKEN = '[UNK]'

# Tokenizer backed by a word-level model (one vocabulary entry per whole word).
tokenizer = tokenizers.Tokenizer(WordLevel(unk_token=UNK_TOKEN))

In [24]:
# Attach the normalizer and pre-tokenizer built earlier so the tokenizer
# applies them itself during training and encoding.
tokenizer.normalizer = normalizer
tokenizer.pre_tokenizer = pre_tokenizer

In [25]:
# Trainer that builds the word-level vocabulary from the corpus.
trainer = tokenizers.trainers.WordLevelTrainer(
    special_tokens=[PAD_TOKEN, UNK_TOKEN],  # always part of the vocabulary
    vocab_size=30000,                       # upper bound on vocabulary size
    min_frequency=10,                       # minimum corpus count for a word to be kept
    show_progress=True,
)

In [26]:
def document_iterator(ds):
    """Lazily yield the ``'text'`` field of every record in ``ds``.

    Args:
        ds: Any iterable of mappings that each contain a ``'text'`` key.

    Yields:
        The value stored under ``'text'`` for each record, in iteration order.
    """
    yield from (record['text'] for record in ds)

# Pull the first document's text out of the dataset to use as a running example.
one_document = next(document_iterator(dataset))
one_document

In [27]:
# Learn the vocabulary by streaming the text of every document in the dataset
# through the tokenizer, using the trainer configured earlier.
tokenizer.train_from_iterator(document_iterator(dataset), trainer)

# Report how many entries the trained vocabulary ended up with.
print(f"""
Our tokenizer contains {tokenizer.get_vocab_size()} unique tokens.
""")

In [28]:
# Show which tokens were assigned the first five ids of the vocabulary.
for i in range(5):
    print(f'ID: {i}, token: {tokenizer.id_to_token(i)}')

In [29]:
# Encode the example document and lay its tokens and ids out side by side
# (transposed, so each column of the display is one token).
encoded = tokenizer.encode(one_document)
pd.DataFrame({'token': encoded.tokens, 'id': encoded.ids}).T

In [30]:
# Turn on padding with the [PAD] token so that all sequences in an encoded
# batch share one length; the batching code below needs this to stack the
# ids into a single rectangular tensor.
tokenizer.enable_padding(pad_id=tokenizer.token_to_id(PAD_TOKEN), pad_token=PAD_TOKEN)

In [31]:
def collate_fn(batch):
    """Turn a list of dataset records into a pair of tensors.

    Relies on the module-level ``tokenizer``; it must already have padding
    enabled, otherwise the per-document id lists can differ in length and
    cannot be stacked into one tensor.

    Args:
        batch: Sequence of records, each with a ``'text'`` and a ``'label'`` entry.

    Returns:
        A tuple ``(ids, labels)`` of tensors built from the encoded token ids
        of every text and from the corresponding labels.
    """
    texts, labels = [], []
    for example in batch:
        texts.append(example['text'])
        labels.append(example['label'])

    token_ids = [encoding.ids for encoding in tokenizer.encode_batch(texts)]
    return torch.tensor(token_ids), torch.tensor(labels)

In [32]:
# Option 1: tokenize inside the DataLoader via collate_fn, then grab the
# first batch of 8 examples to inspect.
dl = DataLoader(dataset, batch_size=8, collate_fn=collate_fn)
encoded_texts, labels = next(iter(dl))

In [33]:
# Display the token-id tensor produced by collate_fn for the first batch.
encoded_texts

In [34]:
# Option 2: pass the dataset directly to the DataLoader without a collate_fn
# and tokenize each batch inside the training loop instead.
dl = DataLoader(dataset, batch_size=8)

# tokenize each batch after it's loaded in the training loop
for batch in dl:
    # batch['text'] holds the raw strings of this batch
    encoded = tokenizer.encode_batch(batch['text'])
    input_ids = torch.tensor([document.ids for document in encoded])
    labels = batch['label']
    # We'll also write the rest of the steps,
    # although we're not actually training a model at the moment.
    # logits = model(input_ids)
    # loss = loss_fn(logits, labels)
    # loss.backward()
    # opt.step()
    # opt.zero_grad()
    break  # one batch is enough for the demonstration

In [35]:
# Display the token-id tensor built inside the loop for the first batch.
input_ids

In [36]:
# Map the batch of token ids back to text. decode_batch takes lists of
# Python ints, so convert the tensor with .tolist() rather than handing it
# a NumPy array and relying on implicit conversion inside the bindings.
tokenizer.decode_batch(encoded_texts.tolist())