# <center> Create a tokenizer for the Twitter dataset </center>

## Preprocess the data

##### Load the data

In [1]:
from datasets import load_dataset

# Read the raw tweet CSV from the relative data directory; the displayed
# result shows every row landing in a single "train" split.
twitter_data = load_dataset(
    "csv",
    data_files="data/tweet_emotions.csv",
)
twitter_data

Found cached dataset csv (C:/Users/cayde/.cache/huggingface/datasets/csv/default-b904fe45c77dba95/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)


  0%|          | 0/1 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['tweet_id', 'sentiment', 'content'],
        num_rows: 40000
    })
})

In [2]:
# View data example: the first row of the "train" split, returned as a dict
# with the 'tweet_id', 'sentiment' and 'content' fields.
twitter_data["train"][0]

{'tweet_id': 1956967341,
 'sentiment': 'empty',
 'content': '@tiffanylue i know  i was listenin to bad habit earlier and i started freakin at his part =['}

##### Clean the data

In [3]:
import html

def unescape_batch(batch):
    """Return the batch's "content" column with HTML entities decoded."""
    return {"content": list(map(html.unescape, batch["content"]))}


# Unescape HTML characters (batched: the function receives lists of column values)
cleaned_data = twitter_data.map(unescape_batch, batched=True)

Loading cached processed dataset at C:\Users\cayde\.cache\huggingface\datasets\csv\default-b904fe45c77dba95\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-997fcef482a42195.arrow


In [4]:
# Encode the sentiment labels: class-encode the string "sentiment" column.
# NOTE(review): presumably this yields the ClassLabel feature that the
# stratified split later in the notebook relies on — confirm against the
# `datasets` documentation.
encoded_data = cleaned_data.class_encode_column("sentiment")

Loading cached processed dataset at C:\Users\cayde\.cache\huggingface\datasets\csv\default-b904fe45c77dba95\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-4a84aaf94cefa7ed.arrow


##### Split the data (account for label ratios)

In [5]:
# Create train, test split: hold out 20% of the rows for testing, shuffling
# with a fixed seed and stratifying on "sentiment" so both splits keep the
# same label ratios.
split_data = encoded_data["train"].train_test_split(
    test_size=0.2,
    stratify_by_column="sentiment",
    shuffle=True,
    seed=42,
)

Loading cached split indices for dataset at C:\Users\cayde\.cache\huggingface\datasets\csv\default-b904fe45c77dba95\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-4cd3662fd347ca44.arrow and C:\Users\cayde\.cache\huggingface\datasets\csv\default-b904fe45c77dba95\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-c27773a2878a1ba6.arrow


In [6]:
from huggingface_hub import notebook_login

# Show the Hugging Face login widget for this notebook session.
# NOTE(review): presumably required before the private push_to_hub call in
# the next cell — confirm.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [7]:
# Save dataset to Hub: upload both splits ("train" and "test") to a private
# dataset repository named "twitter-sentiment-classification".
split_data.push_to_hub(repo_id="twitter-sentiment-classification", private=True)

Pushing split train to the Hub.
Resuming upload of the dataset shards.


Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Pushing split test to the Hub.
Resuming upload of the dataset shards.


Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading metadata:   0%|          | 0.00/830 [00:00<?, ?B/s]

In [6]:
# Load the train/test splits back from the Hub repository; the rest of the
# notebook reads from `twitter_datasets` rather than the in-memory `split_data`.
# NOTE(review): the repository was pushed as private, so this presumably needs
# an authenticated session — confirm.
twitter_datasets = load_dataset("cayjobla/twitter-sentiment-classification")

Found cached dataset parquet (C:/Users/cayde/.cache/huggingface/datasets/cayjobla___parquet/cayjobla--twitter-sentiment-classification-5f2b29fe45958d87/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/2 [00:00<?, ?it/s]

## Train a new tokenizer

In [7]:
def get_training_corpus(dataset=None, batch_size=1000):
    """Yield the tweet texts of a dataset in consecutive batches.

    Args:
        dataset: Object supporting ``len()`` and slice indexing, where a slice
            returns a mapping with a ``"content"`` list. Defaults to
            ``twitter_datasets["train"]``.
        batch_size: Number of rows per yielded batch; must be at least 1.

    Yields:
        The ``"content"`` values of each batch, in order. The final batch may
        hold fewer than ``batch_size`` items.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1 (raised when the
            generator is first advanced).
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")
    if dataset is None:
        # Resolved lazily so the notebook-level dataset is only required when
        # the caller does not supply one.
        dataset = twitter_datasets["train"]
    for start_idx in range(0, len(dataset), batch_size):
        yield dataset[start_idx : start_idx + batch_size]["content"]


# A generator can only be consumed once; create a new one for each training run.
training_corpus = get_training_corpus()

##### Normalization

In [8]:
from tokenizers import decoders, models, normalizers, pre_tokenizers, processors, trainers, Tokenizer

# Start from a WordPiece model that has not been trained yet (training happens
# further down); "[UNK]" is registered as the unknown-token symbol.
tokenizer_obj = Tokenizer(models.WordPiece(unk_token="[UNK]"))

In [9]:
# Normalization pipeline, applied in order: Unicode NFD decomposition,
# lowercasing, then accent stripping.
normalization_steps = [
    normalizers.NFD(),
    normalizers.Lowercase(),
    normalizers.StripAccents(),
]
tokenizer_obj.normalizer = normalizers.Sequence(normalization_steps)

In [10]:
# Sanity-check the normalizer: the sample comes back lowercased and without accents.
print(tokenizer_obj.normalizer.normalize_str("Héllò hôw are ü?"))

hello how are u?


##### Pre-Tokenization

In [11]:
# Pre-tokenizer that breaks raw text into word and punctuation pieces (see the
# example output in the next cell) before the WordPiece model is applied.
tokenizer_obj.pre_tokenizer = pre_tokenizers.Whitespace()

In [12]:
# Inspect the pre-tokenizer: each item is (piece, (start, end)) with character
# offsets into the input string.
tokenizer_obj.pre_tokenizer.pre_tokenize_str("Let's test my pre-tokenizer.")

[('Let', (0, 3)),
 ("'", (3, 4)),
 ('s', (4, 5)),
 ('test', (6, 10)),
 ('my', (11, 13)),
 ('pre', (14, 17)),
 ('-', (17, 18)),
 ('tokenizer', (18, 27)),
 ('.', (27, 28))]

##### Train tokenization model

In [15]:
# BERT-style special tokens handed to the trainer alongside the requested
# vocabulary size.
special_tokens = ["[UNK]", "[PAD]", "[CLS]", "[SEP]", "[MASK]"]
trainer = trainers.WordPieceTrainer(
    special_tokens=special_tokens,
    vocab_size=30000,
)

In [16]:
# Train the WordPiece model on the tweet batches. get_training_corpus() is
# called here to obtain a fresh generator, since a generator is single-use.
tokenizer_obj.train_from_iterator(get_training_corpus(), trainer=trainer)

In [17]:
# Check the trained model: a word that is not in the vocabulary as a whole is
# split into pieces, with continuation pieces prefixed by "##".
encoding = tokenizer_obj.encode("Let's test this tokenizer.")
print(encoding.tokens)

['let', "'", 's', 'test', 'this', 'tok', '##eni', '##zer', '.']


##### Post-processing

In [18]:
# Look up the ids the trained vocabulary assigned to the BERT-style markers.
cls_token_id = tokenizer_obj.token_to_id("[CLS]")
sep_token_id = tokenizer_obj.token_to_id("[SEP]")

# token_to_id gives None for a token that is not in the vocabulary (e.g. when
# this cell runs before training); fail here with a readable message instead
# of inside TemplateProcessing.
assert cls_token_id is not None and sep_token_id is not None, (
    "[CLS]/[SEP] are missing from the vocabulary; train the tokenizer first"
)

# Template entries are "<piece>:<type id>"; $A and $B stand for the first and
# second input sequence. Single inputs become [CLS] A [SEP]; pairs become
# [CLS] A [SEP] B [SEP] with type id 1 on the second segment.
tokenizer_obj.post_processor = processors.TemplateProcessing(
    single="[CLS]:0 $A:0 [SEP]:0",
    pair="[CLS]:0 $A:0 [SEP]:0 $B:1 [SEP]:1",
    special_tokens=[("[CLS]", cls_token_id), ("[SEP]", sep_token_id)],
)

In [19]:
# Re-encode the same sentence: with the post-processor set, the tokens are now
# wrapped in [CLS] ... [SEP].
encoding = tokenizer_obj.encode("Let's test this tokenizer.")
print(encoding.tokens)

['[CLS]', 'let', "'", 's', 'test', 'this', 'tok', '##eni', '##zer', '.', '[SEP]']


In [20]:
# Encode a sentence pair: type ids are 0 for [CLS], the first sentence and its
# [SEP], and 1 for the second sentence and the closing [SEP].
encoding = tokenizer_obj.encode("Let's test this tokenizer...", "on a pair of sentences.")
print(encoding.tokens)
print(encoding.type_ids)

['[CLS]', 'let', "'", 's', 'test', 'this', 'tok', '##eni', '##zer', '...', '[SEP]', 'on', 'a', 'pair', 'of', 'sentences', '.', '[SEP]']
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]


##### Add a decoder

In [21]:
# WordPiece decoder configured with the "##" continuation prefix, so pieces
# such as 'tok', '##eni', '##zer' are joined back into one word when decoding.
tokenizer_obj.decoder = decoders.WordPiece(prefix="##")

In [22]:
# Decode the ids of the most recent `encoding` (the sentence pair): the
# displayed text has the sub-word pieces rejoined and no [CLS]/[SEP] markers.
tokenizer_obj.decode(encoding.ids)

"let ' s test this tokenizer... on a pair of sentences."

##### Save tokenizer

In [23]:
from transformers import BertTokenizerFast

# Wrap the trained `tokenizers` object in a transformers BertTokenizerFast so
# it can be saved with save_pretrained.
# NOTE(review): the name `tokenizer` is reassigned in the "Train from existing
# tokenizer" section further down.
tokenizer = BertTokenizerFast(tokenizer_object=tokenizer_obj)

In [24]:
# Write the tokenizer files (config, special-tokens map, vocab, added tokens,
# tokenizer.json) to the relative directory "nlp/tokenizer".
tokenizer.save_pretrained("nlp/tokenizer")

('nlp/tokenizer\\tokenizer_config.json',
 'nlp/tokenizer\\special_tokens_map.json',
 'nlp/tokenizer\\vocab.txt',
 'nlp/tokenizer\\added_tokens.json',
 'nlp/tokenizer\\tokenizer.json')

##### Train from existing tokenizer

In [13]:
from transformers import AutoTokenizer

# Load the pretrained tokenizer of the "distilbert-base-uncased" checkpoint;
# it serves as the starting point for train_new_from_iterator in the next cell.
model_checkpoint = "distilbert-base-uncased"
old_tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [14]:
# Build a fresh generator over the tweet batches (generators are single-use).
training_corpus = get_training_corpus()

# Train a new tokenizer from the existing one on the tweets, requesting a
# vocabulary size of 30000.
tokenizer = old_tokenizer.train_new_from_iterator(
    training_corpus,
    vocab_size=30000,
)

In [15]:
# Write the retrained tokenizer's files to the relative directory
# "nlp/twitter-tokenizer" (separate from the from-scratch "nlp/tokenizer").
tokenizer.save_pretrained("nlp/twitter-tokenizer")

('nlp/twitter-tokenizer\\tokenizer_config.json',
 'nlp/twitter-tokenizer\\special_tokens_map.json',
 'nlp/twitter-tokenizer\\vocab.txt',
 'nlp/twitter-tokenizer\\added_tokens.json',
 'nlp/twitter-tokenizer\\tokenizer.json')