# <center> Create a tokenizer for the Twitter dataset </center>

## Preprocess the data

##### Load the data

In [1]:
from datasets import load_dataset

# Read the raw tweets from the local CSV file into a DatasetDict.
twitter_data = load_dataset(
    "csv",
    data_files="data/original_data.csv",
)
twitter_data

Found cached dataset csv (C:/Users/cayde/.cache/huggingface/datasets/csv/default-e1ad4f0191b5b012/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)


  0%|          | 0/1 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['tweet_id', 'sentiment', 'content'],
        num_rows: 40000
    })
})

In [2]:
# Inspect the first row of the "train" split to see the available columns.
twitter_data["train"][0]

{'tweet_id': 1956967341,
 'sentiment': 'empty',
 'content': '@tiffanylue i know  i was listenin to bad habit earlier and i started freakin at his part =['}

In [3]:
# Shuffle the rows with a fixed seed (42) so the ordering is repeatable.
shuffled_data = twitter_data.shuffle(seed=42)
# Display the first row of the shuffled "train" split.
shuffled_data["train"][0]

Loading cached shuffled indices for dataset at C:\Users\cayde\.cache\huggingface\datasets\csv\default-e1ad4f0191b5b012\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-9dc688d02f83d5c9.arrow


{'tweet_id': 1694746590,
 'sentiment': 'neutral',
 'content': '@WayneLiew just need to know what to be cautious about.  being cautious is good.'}

In [6]:
# Create train, test split (80% train / 20% test).
# A fixed seed makes the split identical across kernel restarts, so the
# dataset that is later pushed to the Hub can be reproduced exactly.
split_data = shuffled_data["train"].train_test_split(test_size=0.2, seed=42)
split_data

DatasetDict({
    train: Dataset({
        features: ['tweet_id', 'sentiment', 'content'],
        num_rows: 32000
    })
    test: Dataset({
        features: ['tweet_id', 'sentiment', 'content'],
        num_rows: 8000
    })
})

In [7]:
# Inspect the first row of the held-out "test" split.
split_data["test"][0]

{'tweet_id': 1964053218,
 'sentiment': 'worry',
 'content': "@chelseytx I don't have the app that does it"}

In [8]:
import html

def _unescape_batch(batch):
    """Return the batch's "content" texts with HTML entities decoded."""
    return {"content": list(map(html.unescape, batch["content"]))}


# Unescape HTML characters in every split, processing rows in batches.
twitter_datasets = split_data.map(_unescape_batch, batched=True)

Map:   0%|          | 0/32000 [00:00<?, ? examples/s]

Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

In [9]:
from huggingface_hub import notebook_login

# Open the interactive Hugging Face Hub login widget in the notebook.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [10]:
# Upload every split of the cleaned dataset to a private Hub repository.
twitter_datasets.push_to_hub(
    repo_id="twitter-sentiment-classification",
    private=True,
)

Pushing split train to the Hub.


Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/32 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

Pushing split test to the Hub.


Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/8 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

## Train a new tokenizer

In [11]:
def get_training_corpus(dataset=None, batch_size=1000):
    """Yield the tweet texts in consecutive batches for tokenizer training.

    Args:
        dataset: Object supporting ``len()`` and slice indexing, where a slice
            returns a mapping with a ``"content"`` key. When omitted, the
            global ``twitter_datasets["train"]`` is used; it is looked up
            when iteration starts, not when the generator is created.
        batch_size: Number of rows per yielded batch. Must be positive.

    Yields:
        The ``"content"`` value of each consecutive slice of ``dataset``.

    Raises:
        ValueError: If ``batch_size`` is not positive (raised on first
            iteration, because this is a generator).
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be positive, got {batch_size}")
    if dataset is None:
        dataset = twitter_datasets["train"]
    for start_idx in range(0, len(dataset), batch_size):
        samples = dataset[start_idx : start_idx + batch_size]
        yield samples["content"]

# A generator can be consumed only once; create a fresh one for each pass.
training_corpus = get_training_corpus()

##### Normalization

In [12]:
from tokenizers import decoders, models, normalizers, pre_tokenizers, processors, trainers, Tokenizer

# Start from an untrained WordPiece model that maps unknown pieces to "[UNK]".
wordpiece_model = models.WordPiece(unk_token="[UNK]")
tokenizer_obj = Tokenizer(wordpiece_model)

In [13]:
# Normalization pipeline, applied in order: Unicode NFD decomposition,
# lowercasing, then accent stripping.
normalization_steps = [
    normalizers.NFD(),
    normalizers.Lowercase(),
    normalizers.StripAccents(),
]
tokenizer_obj.normalizer = normalizers.Sequence(normalization_steps)

In [14]:
# Sanity check: run the normalizer on an accented, mixed-case sample string.
print(tokenizer_obj.normalizer.normalize_str("Héllò hôw are ü?"))

hello how are u?


##### Pre-Tokenization

In [15]:
# Use the library's Whitespace pre-tokenizer to split text before WordPiece.
tokenizer_obj.pre_tokenizer = pre_tokenizers.Whitespace()

In [16]:
# Sanity check: show the pieces (with character offsets) for a sample sentence.
tokenizer_obj.pre_tokenizer.pre_tokenize_str("Let's test my pre-tokenizer.")

[('Let', (0, 3)),
 ("'", (3, 4)),
 ('s', (4, 5)),
 ('test', (6, 10)),
 ('my', (11, 13)),
 ('pre', (14, 17)),
 ('-', (17, 18)),
 ('tokenizer', (18, 27)),
 ('.', (27, 28))]

##### Train tokenization model

In [17]:
# Special tokens handed to the trainer via `special_tokens`.
special_tokens = [
    "[UNK]",
    "[PAD]",
    "[CLS]",
    "[SEP]",
    "[MASK]",
]
# WordPiece trainer targeting a vocabulary of 30,000 entries.
trainer = trainers.WordPieceTrainer(
    vocab_size=30000,
    special_tokens=special_tokens,
)

In [18]:
# Train the tokenizer on the batched tweet texts; a fresh generator is
# created here because a generator can only be iterated once.
tokenizer_obj.train_from_iterator(get_training_corpus(), trainer=trainer)

In [19]:
# Sanity check: encode a sample sentence and print the resulting tokens.
encoding = tokenizer_obj.encode("Let's test this tokenizer.")
print(encoding.tokens)

['let', "'", 's', 'test', 'this', 'tok', '##eni', '##zer', '.']


##### Post-processing

In [20]:
# Look up the vocabulary ids of the two special tokens used by the template.
cls_token_id, sep_token_id = (
    tokenizer_obj.token_to_id(token) for token in ("[CLS]", "[SEP]")
)

# Template: wrap a single sequence as "[CLS] A [SEP]" (type id 0); for a
# pair, append "B [SEP]" with type id 1.
tokenizer_obj.post_processor = processors.TemplateProcessing(
    single="[CLS]:0 $A:0 [SEP]:0",
    pair="[CLS]:0 $A:0 [SEP]:0 $B:1 [SEP]:1",
    special_tokens=[
        ("[CLS]", cls_token_id),
        ("[SEP]", sep_token_id),
    ],
)

In [21]:
# Re-encode the sample to confirm the post-processor adds [CLS] and [SEP].
encoding = tokenizer_obj.encode("Let's test this tokenizer.")
print(encoding.tokens)

['[CLS]', 'let', "'", 's', 'test', 'this', 'tok', '##eni', '##zer', '.', '[SEP]']


In [22]:
# Encode a sentence pair and print both the tokens and their type ids
# (the pair template assigns 0 to the first segment and 1 to the second).
encoding = tokenizer_obj.encode("Let's test this tokenizer...", "on a pair of sentences.")
print(encoding.tokens)
print(encoding.type_ids)

['[CLS]', 'let', "'", 's', 'test', 'this', 'tok', '##eni', '##zer', '...', '[SEP]', 'on', 'a', 'pair', 'of', 'sentences', '.', '[SEP]']
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]


##### Add a decoder

In [23]:
# Attach a WordPiece decoder configured with the "##" continuation prefix.
tokenizer_obj.decoder = decoders.WordPiece(prefix="##")

In [24]:
# Decode the ids of the most recent `encoding` back into text.
tokenizer_obj.decode(encoding.ids)

"let ' s test this tokenizer... on a pair of sentences."

##### Save tokenizer

In [28]:
from transformers import BertTokenizerFast

# Wrap the trained tokenizer object in a transformers BertTokenizerFast.
tokenizer = BertTokenizerFast(tokenizer_object=tokenizer_obj)

In [31]:
# Write the tokenizer files to the relative directory "nlp/tokenizer".
tokenizer.save_pretrained("nlp/tokenizer")

('nlp/tokenizer\\tokenizer_config.json',
 'nlp/tokenizer\\special_tokens_map.json',
 'nlp/tokenizer\\vocab.txt',
 'nlp/tokenizer\\added_tokens.json',
 'nlp/tokenizer\\tokenizer.json')